Example n. 1
def get_distance_histogram(data_folder, adaID, fragment, maxreads=1000, VERBOSE=0,
                           filtered=False):
    '''Get the distance of reads from their consensus'''
    reffilename = get_consensus_filename(data_folder, adaID, fragment)
    refseq = SeqIO.read(reffilename, 'fasta')
    ref = np.array(refseq)

    bamfilename = get_mapped_filename(data_folder, adaID, fragment, type='bam',
                                      filtered=filtered)

    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        n_pairs = 0
        read_pairs = []
        for (i, rp) in enumerate(pair_generator(bamfile)):
            if n_pairs >= maxreads:
                break

            r1 = rp[0]
            if not r1.is_proper_pair:
                continue

            read_pairs.append(rp)
            n_pairs += 1

        ds = get_distance_from_reference(ref, read_pairs, threshold=30)

    h = np.bincount(ds)
    return h
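
Every example here iterates over pair_generator from hivwholeseq.utils.mapping, which is not included in these excerpts. A minimal sketch of such a helper, assuming mate pairs are stored adjacently in the BAM file (the real implementation may buffer reads by name), could look like:

def pair_generator(bamfile):
    '''Yield reads as (read1, read2) pairs (sketch; assumes adjacent mates)'''
    read_iter = iter(bamfile)
    while True:
        try:
            read1 = next(read_iter)
            read2 = next(read_iter)
        except StopIteration:
            return
        yield (read1, read2)
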
Example n. 2
def get_insert_size_distribution(data_folder, adaID, fragment, bins=None,
                                 maxreads=-1, VERBOSE=0, density=True):
    '''Get the distribution of insert sizes'''

    if maxreads <= 0:
        # The cap is used as an array shape below, so it must be an integer
        maxreads = int(1e6)
    
    insert_sizes = np.zeros(maxreads, np.int16)

    # Open BAM file
    if fragment == 'premapped':
        bamfilename = get_premapped_filename(data_folder, adaID, type='bam')
    else:
        bamfilename = get_mapped_filename(data_folder, adaID, fragment, type='bam',
                                          filtered=True)

    # Convert from SAM if necessary
    if not os.path.isfile(bamfilename):
        convert_sam_to_bam(bamfilename)

    # Open file
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        # Iterate over read pairs (the insert size is a property of the pair)
        n_written = 0
        for i, reads in enumerate(pair_generator(bamfile)):

            if i == maxreads:
                if VERBOSE >= 2:
                    print 'Max reads reached:', maxreads
                break
        
            # Print progress
            if (VERBOSE >= 3) and (not ((i + 1) % 10000)):
                print (i + 1)

            # Discard unmapped or improperly paired read pairs
            if reads[0].is_unmapped or (not reads[0].is_proper_pair) or \
               reads[1].is_unmapped or (not reads[1].is_proper_pair):
                continue
            
            # Store insert size contiguously (pairs may have been skipped),
            # so the final truncation keeps exactly the written values.
            # The fwd read carries the positive isize.
            i_fwd = reads[0].is_reverse
            insert_sizes[n_written] = reads[i_fwd].isize
            n_written += 1

    insert_sizes = insert_sizes[:n_written]
    insert_sizes.sort()

    # Bin it
    if bins is None:
        h = np.histogram(insert_sizes, density=density)
    else:
        h = np.histogram(insert_sizes, bins=bins, density=density)

    return insert_sizes, h
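
A hypothetical invocation (the folder and adapter ID are placeholders); note that np.histogram returns a (counts, bin_edges) pair:

insert_sizes, (counts, bin_edges) = get_insert_size_distribution(
    'data/run42/', 'TS2', 'F1', bins=range(0, 1000, 50), VERBOSE=1)
# insert_sizes is sorted, so the median is the middle element
print 'Median insert size:', insert_sizes[len(insert_sizes) // 2]
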
Example n. 3
def get_coverage_tuples(data_folder, adaID, fragment, mtuples,
                        maxreads=-1, VERBOSE=0):
    '''Get the joint coverage of a list of positions'''
    # Prepare data structures
    mtuples = [np.asarray(tup, int) for tup in mtuples]
    coverage = np.zeros(len(mtuples), int)

    # TODO: what to do if it is covered multiple times? or only some sites?
    covs_pair = [np.zeros(len(tup), bool) for tup in mtuples]

    # Open BAM
    bamfilename = get_mapped_filename(data_folder, adaID, fragment, type='bam',
                                      filtered=True)
    if not os.path.isfile(bamfilename):
        convert_sam_to_bam(bamfilename)
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
    
        # Iterate over all pairs
        for irp, reads in enumerate(pair_generator(bamfile)):

            # Limit to the first reads
            if irp == maxreads:
                if VERBOSE:
                    print 'Max reads reached:', maxreads
                break

            if VERBOSE >= 3:
                if not ((irp + 1) % 10000):
                    print irp + 1

            # Reinitialize temporary structure
            for cov_pair in covs_pair:
                cov_pair[:] = False

            # Look in both reads
            for read in reads:

                # NOTE: deletions count as covered, because in principle
                # we see that part of the reference
                cigar = read.cigar
                ref_start = read.pos
                ref_end = ref_start + sum(bl for (bt, bl) in cigar if bt in (0, 2))

                # A numba-accelerated add_read was considered but is disabled:
                # the plain loop below (izip is itertools.izip) is fast enough
                if False:
                    add_read(ref_start, ref_end, mtuples, covs_pair)
                else:
                    for cov_pair, mtuple in izip(covs_pair, mtuples):
                        cov_pair[(mtuple >= ref_start) & (mtuple < ref_end)] = True

            # Check which tuples are fully covered
            for i, cov_pair in enumerate(covs_pair):
                if cov_pair.all():
                    coverage[i] += 1

    return coverage 
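
A hypothetical call (folder, adapter ID, and positions are placeholders); each tuple lists reference positions that must all be covered by the same read pair:

mtuples = [(100, 250), (100, 250, 400)]
cov = get_coverage_tuples('data/run42/', 'TS2', 'F2', mtuples, maxreads=100000)
print 'Pairs jointly covering each tuple:', cov
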
Example n. 4
def split_reads(data_folder,
                adaID,
                fragment,
                chunk_size=10000,
                maxreads=-1,
                VERBOSE=0):
    '''Split reads into chunks for mapping'''

    input_filename = get_divided_filename(data_folder,
                                          adaID,
                                          fragment,
                                          type='bam')
    with pysam.Samfile(input_filename, 'rb') as bamfile:
        if VERBOSE:
            if maxreads == -1:
                n_reads = get_number_reads_open(bamfile) // 2
            else:
                n_reads = maxreads

            print 'Expected number of chunks:', 1 + (n_reads // chunk_size)

        chunk_number = 0
        chunkfile = None
        for irp, read_pair in enumerate(pair_generator(bamfile)):
            if irp == maxreads:
                break

            if VERBOSE >= 2:
                if not ((irp + 1) % 10000):
                    print irp + 1

            if not (irp % chunk_size):
                if chunkfile is not None:
                    chunkfile.close()
                chunk_number += 1
                chunk_filename = get_divided_filename(data_folder,
                                                      adaID,
                                                      fragment,
                                                      type='bam',
                                                      chunk=chunk_number)
                chunkfile = pysam.Samfile(chunk_filename,
                                          'wb',
                                          template=bamfile)
                if VERBOSE >= 2:
                    print 'Chunk n', chunk_number, 'started'

            chunkfile.write(read_pair[0])
            chunkfile.write(read_pair[1])

        if chunkfile is not None:
            chunkfile.close()

    if VERBOSE:
        print 'Chunking finished'
Example n. 5
def split_reads(data_folder, adaID, fragment, chunk_size=10000, maxreads=-1, VERBOSE=0):
    '''Split reads into chunks for mapping'''

    input_filename = get_divided_filename(data_folder, adaID, fragment, type='bam')
    with pysam.Samfile(input_filename, 'rb') as bamfile:
        if VERBOSE:
            if maxreads == -1:
                n_reads = get_number_reads_open(bamfile) // 2
            else:
                n_reads = maxreads

            print 'Expected number of chunks:', 1 + (n_reads // chunk_size)

        chunk_number = 0
        chunkfile = None
        for irp, read_pair in enumerate(pair_generator(bamfile)):
            if irp == maxreads:
                break

            if VERBOSE >= 2:
                if not ((irp+1) % 10000):
                    print irp+1

            if not (irp % chunk_size):
                if chunkfile is not None:
                    chunkfile.close()
                chunk_number += 1
                chunk_filename = get_divided_filename(data_folder, adaID, fragment, type='bam', chunk=chunk_number)
                chunkfile = pysam.Samfile(chunk_filename, 'wb', template=bamfile)
                if VERBOSE >= 2:
                    print 'Chunk n', chunk_number, 'started'

            chunkfile.write(read_pair[0])
            chunkfile.write(read_pair[1])

        if chunkfile is not None:
            chunkfile.close()

    if VERBOSE:
        print 'Chunking finished'
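
A hypothetical invocation of either variant above (all arguments are placeholders); chunk files are written via get_divided_filename next to the input BAM:

split_reads('data/run42/', 'TS2', 'F3', chunk_size=10000, VERBOSE=1)
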
Example n. 6
    seq_run = args.run
    adaID = args.adaID
    VERBOSE = args.verbose

    # Specify the dataset
    dataset = load_sequencing_run(seq_run)
    data_folder = dataset.folder

    # Get the BAM filename 
    input_filename = get_premapped_filename(data_folder, adaID, type='bam')

    # Get unmapped reads and BLAST them
    reads_unmapped = []
    n_unmapped = 0
    with pysam.Samfile(input_filename, 'rb') as input_file:
        for reads in pair_generator(input_file):
            if not reads[0].is_unmapped:
                continue

            n_unmapped += 1

            # Take only the first part of read1, to make sure quality is high
            seq = reads[reads[1].is_read1].seq[:200]
            seqb = Seq(seq, IUPAC.ambiguous_dna)

            # Save to file, to test local blast
            reads_unmapped.append(reads[reads[1].is_read1])
            if len(reads_unmapped) >= 100:
                break

            continue
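
The snippet collects up to 100 unmapped reads "to test local blast", but the writing step is not shown. A plausible continuation using Biopython (the output filename is a placeholder):

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

# Keep the high-quality first 200 bases, as above
records = [SeqRecord(Seq(read.seq[:200]), id=read.qname, description='')
           for read in reads_unmapped]
SeqIO.write(records, 'unmapped_reads.fasta', 'fasta')
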
Example n. 7
def filter_reads(data_folder,
                 adaID,
                 fragment,
                 VERBOSE=0,
                 maxreads=-1,
                 contaminants=None,
                 n_cycles=600,
                 max_mismatches=30,
                 susp_mismatches=20,
                 summary=True,
                 plot=False):
    '''Filter the reads to good chunks'''
    frag_gen = fragment[:2]

    reffilename = get_consensus_filename(data_folder, adaID, frag_gen)
    refseq = SeqIO.read(reffilename, 'fasta')
    ref = np.array(refseq)

    bamfilename = get_mapped_filename(data_folder,
                                      adaID,
                                      frag_gen,
                                      type='bam',
                                      filtered=False)
    if not os.path.isfile(bamfilename):
        samfilename = get_mapped_filename(data_folder,
                                          adaID,
                                          frag_gen,
                                          type='sam',
                                          filtered=False)
        if os.path.isfile(samfilename):
            convert_sam_to_bam(bamfilename)
        else:
            if VERBOSE >= 1:
                print 'ERROR: ' + adaID + ', mapped file not found.'
            return

    outfilename = get_mapped_filename(data_folder,
                                      adaID,
                                      frag_gen,
                                      type='bam',
                                      filtered=True)
    suspiciousfilename = get_mapped_suspicious_filename(
        data_folder, adaID, frag_gen)
    trashfilename = outfilename[:-4] + '_trashed.bam'

    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        with pysam.Samfile(outfilename, 'wb', template=bamfile) as outfile,\
             pysam.Samfile(suspiciousfilename, 'wb', template=bamfile) as suspfile,\
             pysam.Samfile(trashfilename, 'wb', template=bamfile) as trashfile:

            # Iterate over all pairs
            n_good = 0
            n_wrongname = 0
            n_unmapped = 0
            n_unpaired = 0
            n_mutator = 0
            n_suspect = 0
            n_mismapped_edge = 0
            n_badcigar = 0
            histogram_distance_from_consensus = np.zeros(n_cycles + 1, int)
            binsize = 200
            histogram_dist_along = np.zeros(
                (len(ref) // binsize + 1, n_cycles + 1), int)
            for irp, reads in enumerate(pair_generator(bamfile)):

                # Limit to the first reads
                if irp == maxreads:
                    break

                # Assign names
                (read1, read2) = reads
                i_fwd = reads[0].is_reverse

                # Check a few things to make sure we are looking at paired reads
                if read1.qname != read2.qname:
                    n_wrongname += 1
                    raise ValueError('Read pair ' + str(irp) +
                                     ': reads have different names!')

                # Ignore unmapped reads
                if read1.is_unmapped or read2.is_unmapped:
                    if VERBOSE >= 2:
                        print 'Read pair ' + read1.qname + ': unmapped'
                    n_unmapped += 1
                    map(trashfile.write, reads)
                    continue

                # Ignore not properly paired reads (this includes mates sitting on
                # different fragments)
                if (not read1.is_proper_pair) or (not read2.is_proper_pair):
                    if VERBOSE >= 2:
                        print 'Read pair ' + read1.qname + ': not properly paired'
                    n_unpaired += 1
                    map(trashfile.write, reads)
                    continue

                # Mismappings are sometimes at fragment edges:
                # Check for overhangs beyond the edge
                skip = check_overhanging_reads(reads, len(ref))
                if skip:
                    n_mismapped_edge += 1
                    map(trashfile.write, reads)
                    continue

                # Mismappings are often characterized by many mutations:
                # check the number of mismatches of the whole pair and skip reads with too many
                dc = get_distance_from_consensus(ref, reads, VERBOSE=VERBOSE)
                histogram_distance_from_consensus[dc.sum()] += 1
                hbin = (reads[i_fwd].pos + reads[i_fwd].isize / 2) // binsize
                histogram_dist_along[hbin, dc.sum()] += 1
                if (dc.sum() > max_mismatches):
                    if VERBOSE >= 2:
                        print n_mutator+1, irp, '{:2.1f}'.format(100.0 * (n_mutator + 1) / (irp + 1))+'%',\
                                'Read pair '+read1.qname+': too many mismatches '+\
                                '('+str(dc[0])+' + '+str(dc[1])+')'
                    n_mutator += 1
                    map(trashfile.write, reads)
                    continue

                # Check for contamination from other PCR plates. Typically,
                # contamination happens for only one fragment, whereas superinfection
                # happens for all. At this stage, we can only give clues about
                # cross-contamination, the rest will be done in a script downstream
                # (here we could TAG suspicious reads for contamination)
                elif (dc.sum() > susp_mismatches):
                    if contaminants is not None:
                        skip = check_suspect(reads,
                                             contaminants,
                                             VERBOSE=VERBOSE)
                    else:
                        skip = True
                    if skip:
                        n_suspect += 1
                        map(suspfile.write, reads)
                        continue

                # Trim the bad CIGARs from the sides, if there are any good ones
                # (match_len_min and trim_bad_cigars are module-level constants
                # in the original source, not shown in this excerpt)
                skip = trim_bad_cigar(reads,
                                      match_len_min=match_len_min,
                                      trim_left=trim_bad_cigars,
                                      trim_right=trim_bad_cigars)
                if skip:
                    n_badcigar += 1
                    map(trashfile.write, reads)
                    continue

                # TODO: we might want to incorporate some more stringent
                # criterion here, to avoid short reads, cross-overhang, etc.

                # Write the output
                n_good += 1
                map(outfile.write, reads)

    if VERBOSE >= 1:
        print 'Read pairs: '
        print 'Good:', n_good
        print 'Unmapped:', n_unmapped
        print 'Unpaired:', n_unpaired
        print 'Mismapped at edge:', n_mismapped_edge
        print 'Many-mutations:', n_mutator
        print 'Suspect contaminations:', n_suspect
        print 'Bad CIGARs:', n_badcigar

    if summary:
        summary_filename = get_filter_mapped_summary_filename(
            data_folder, adaID, fragment)
        with open(summary_filename, 'a') as f:
            f.write('Filter results: adaID ' + adaID + ' ' + fragment + '\n')
            f.write('Total:\t\t\t' + str(irp + 1) + '\n')
            f.write('Good:\t\t\t' + str(n_good) + '\n')
            f.write('Unmapped:\t\t' + str(n_unmapped) + '\n')
            f.write('Unpaired:\t\t' + str(n_unpaired) + '\n')
            f.write('Mismapped at edge:\t' + str(n_mismapped_edge) + '\n')
            f.write('Many-mutations:\t\t' + str(n_mutator) + '\n')
            f.write('Suspect contaminations:\t' + str(n_suspect) + '\n')
            f.write('Bad CIGARs:\t\t' + str(n_badcigar) + '\n')

    if plot:
        plot_distance_histogram(data_folder,
                                adaID,
                                frag_gen,
                                histogram_distance_from_consensus,
                                savefig=True)

        plot_distance_histogram_sliding_window(data_folder,
                                               adaID,
                                               frag_gen,
                                               len(ref),
                                               histogram_dist_along,
                                               binsize=binsize,
                                               savefig=True)
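
A hypothetical invocation (folder and adapter ID are placeholders); the filtered, suspicious, and trashed BAM files are derived from the mapped filename as shown above:

filter_reads('data/run42/', 'TS2', 'F3', VERBOSE=1,
             max_mismatches=30, susp_mismatches=20,
             summary=True, plot=False)
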
Example n. 8
def filter_mapped_reads(sample,
                        fragment,
                        PCR=1,
                        maxreads=-1,
                        VERBOSE=0,
                        n_cycles=600,
                        max_mismatches=100,
                        match_len_min=30,
                        trim_bad_cigars=3,
                        summary=True):
    '''Filter the reads to good chunks'''
    pname = sample.patient
    samplename_pat = sample.name
    samplenames_seq = sample.samples_seq.index.tolist()

    if VERBOSE >= 1:
        print 'Filtering reads:', pname, samplename_pat, fragment, PCR

    reffilename = get_initial_reference_filename(pname, fragment)
    refseq = SeqIO.read(reffilename, 'fasta')
    ref = np.array(refseq)

    outfilename = get_mapped_filtered_filename(pname,
                                               samplename_pat,
                                               fragment,
                                               type='bam',
                                               PCR=PCR,
                                               decontaminated=False)
    trashfilename = outfilename[:-4] + '_trashed.bam'

    infilenames = [
        get_mapped_to_initial_filename(pname,
                                       samplename_pat,
                                       samplename,
                                       fragment,
                                       type='bam',
                                       PCR=PCR)
        for samplename in samplenames_seq
    ]
    infilenames = filter(os.path.isfile, infilenames)
    if not len(infilenames):
        print('WARNING: No mapped files found: ' +
              ', '.join([pname, samplename_pat, fragment,
                         str(PCR)]))
        return

    # Take reads evenly distributed across sequencing repetitions
    # (Python 2 floor division; the maxreads=-1 sentinel stays -1)
    maxreads /= len(infilenames)

    if VERBOSE >= 2:
        print 'Input mapped filenames:',
        if len(infilenames) >= 2:
            print ''
        print '\n'.join(infilenames)

    # Use first file as template for the new bamfile
    infilename = infilenames[0]
    if not os.path.isfile(infilename):
        convert_sam_to_bam(infilename)

    with pysam.Samfile(infilename, 'rb') as bamfile:
        with pysam.Samfile(outfilename, 'wb', template=bamfile) as outfile,\
             pysam.Samfile(trashfilename, 'wb', template=bamfile) as trashfile:

            n_good = 0
            n_wrongname = 0
            n_unmapped = 0
            n_unpaired = 0
            n_mutator = 0
            n_badcigar = 0
            n_tiny = 0
            binsize = 200
            hist_distance_from_consensus = np.zeros(n_cycles + 1, int)
            hist_dist_along = np.zeros((len(ref) // binsize + 1, n_cycles + 1),
                                       int)

            # Iterate over input files, the first is already open
            for infilename in infilenames:

                # All files except the first (already open) are opened anew
                if infilename != infilenames[0]:
                    file_open = lambda: pysam.Samfile(infilename, 'rb')
                    file_close = lambda f: f.close()

                    if not os.path.isfile(infilename):
                        convert_sam_to_bam(infilename)

                else:
                    file_open = lambda: bamfile
                    file_close = lambda f: None

                try:
                    bamfile = file_open()

                    for irp, reads in enumerate(pair_generator(bamfile)):
                        if irp == maxreads:
                            break

                        pair_type = filter_read_pair(
                            reads,
                            ref,
                            hist_distance_from_consensus,
                            hist_dist_along,
                            binsize,
                            max_mismatches=max_mismatches,
                            match_len_min=match_len_min,
                            trim_bad_cigars=trim_bad_cigars,
                            VERBOSE=VERBOSE)

                        if pair_type == 'unmapped':
                            n_unmapped += 1
                            map(trashfile.write, reads)

                        elif pair_type == 'unpaired':
                            n_unpaired += 1
                            map(trashfile.write, reads)

                        elif pair_type == 'mutator':
                            n_mutator += 1
                            map(trashfile.write, reads)

                        elif pair_type == 'bad_cigar':
                            n_badcigar += 1
                            map(trashfile.write, reads)

                        elif pair_type == 'tiny':
                            n_tiny += 1
                            map(trashfile.write, reads)

                        else:
                            n_good += 1
                            map(outfile.write, reads)

                finally:
                    file_close(bamfile)

    if VERBOSE >= 1:
        print 'Read pairs: '
        print 'Good:', n_good
        print 'Unmapped:', n_unmapped
        print 'Unpaired:', n_unpaired
        print 'Many-mutations:', n_mutator
        print 'Bad CIGARs:', n_badcigar
        print 'Tiny:', n_tiny
        print

    if summary:
        sfn = get_filter_mapped_init_summary_filename(pname,
                                                      samplename_pat,
                                                      fragment,
                                                      PCR=PCR)
        with open(sfn, 'a') as f:
            f.write('Filter results: pname ' + pname + ', ' + samplename_pat +
                    ', ' + fragment + '\n')
            f.write('Total:\t\t\t' + str(irp + 1) + '\n')
            f.write('Good:\t\t\t' + str(n_good) + '\n')
            f.write('Unmapped:\t\t' + str(n_unmapped) + '\n')
            f.write('Unpaired:\t\t' + str(n_unpaired) + '\n')
            f.write('Many-mutations:\t\t' + str(n_mutator) + '\n')
            f.write('Bad CIGARs:\t\t' + str(n_badcigar) + '\n')
            f.write('Tiny:\t\t\t' + str(n_tiny) + '\n')
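
A hypothetical call, assuming sample is a patient-sample object exposing the attributes used above (patient, name, samples_seq):

filter_mapped_reads(sample, 'F4', PCR=1, maxreads=2000000, VERBOSE=1)
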
Example n. 9
def filter_reads(data_folder,
                 adaID,
                 fragment,
                 VERBOSE=0,
                 maxreads=-1,
                 contaminants=None,
                 n_cycles=600,
                 max_mismatches=30,
                 susp_mismatches=20,
                 summary=True, plot=False):
    '''Filter the reads to good chunks'''
    frag_gen = fragment[:2]

    reffilename = get_consensus_filename(data_folder, adaID, frag_gen)
    refseq = SeqIO.read(reffilename, 'fasta')
    ref = np.array(refseq)

    bamfilename = get_mapped_filename(data_folder, adaID, frag_gen, type='bam',
                                      filtered=False)
    if not os.path.isfile(bamfilename):
        samfilename = get_mapped_filename(data_folder, adaID, frag_gen, type='sam',
                                          filtered=False)
        if os.path.isfile(samfilename):
            convert_sam_to_bam(bamfilename)
        else:
            if VERBOSE >= 1:
                print 'ERROR: '+adaID+', mapped file not found.'
            return

    outfilename = get_mapped_filename(data_folder, adaID, frag_gen, type='bam',
                                     filtered=True)
    suspiciousfilename = get_mapped_suspicious_filename(data_folder, adaID, frag_gen)
    trashfilename = outfilename[:-4]+'_trashed.bam'
 
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        with pysam.Samfile(outfilename, 'wb', template=bamfile) as outfile,\
             pysam.Samfile(suspiciousfilename, 'wb', template=bamfile) as suspfile,\
             pysam.Samfile(trashfilename, 'wb', template=bamfile) as trashfile:
 
            # Iterate over all pairs
            n_good = 0
            n_wrongname = 0
            n_unmapped = 0
            n_unpaired = 0
            n_mutator = 0
            n_suspect = 0
            n_mismapped_edge = 0
            n_badcigar = 0
            histogram_distance_from_consensus = np.zeros(n_cycles + 1, int)
            binsize = 200
            histogram_dist_along = np.zeros((len(ref) // binsize + 1,
                                             n_cycles + 1), int)
            for irp, reads in enumerate(pair_generator(bamfile)):

                # Limit to the first reads
                if irp == maxreads:
                    break
            
                # Assign names
                (read1, read2) = reads
                i_fwd = reads[0].is_reverse

                # Check a few things to make sure we are looking at paired reads
                if read1.qname != read2.qname:
                    n_wrongname += 1
                    raise ValueError('Read pair '+str(irp)+': reads have different names!')

                # Ignore unmapped reads
                if read1.is_unmapped or read2.is_unmapped:
                    if VERBOSE >= 2:
                        print 'Read pair '+read1.qname+': unmapped'
                    n_unmapped += 1
                    map(trashfile.write, reads)
                    continue
            
                # Ignore not properly paired reads (this includes mates sitting on
                # different fragments)
                if (not read1.is_proper_pair) or (not read2.is_proper_pair):
                    if VERBOSE >= 2:
                        print 'Read pair '+read1.qname+': not properly paired'
                    n_unpaired += 1
                    map(trashfile.write, reads)
                    continue

                # Mismappings are sometimes at fragment edges:
                # Check for overhangs beyond the edge
                skip = check_overhanging_reads(reads, len(ref))
                if skip:
                    n_mismapped_edge += 1
                    map(trashfile.write, reads)
                    continue
                    
                # Mismappings are often characterized by many mutations:
                # check the number of mismatches of the whole pair and skip reads with too many
                dc = get_distance_from_consensus(ref, reads, VERBOSE=VERBOSE)
                histogram_distance_from_consensus[dc.sum()] += 1
                hbin = (reads[i_fwd].pos + reads[i_fwd].isize / 2) // binsize
                histogram_dist_along[hbin, dc.sum()] += 1
                if (dc.sum() > max_mismatches):
                    if VERBOSE >= 2:
                        print n_mutator+1, irp, '{:2.1f}'.format(100.0 * (n_mutator + 1) / (irp + 1))+'%',\
                                'Read pair '+read1.qname+': too many mismatches '+\
                                '('+str(dc[0])+' + '+str(dc[1])+')'
                    n_mutator += 1
                    map(trashfile.write, reads)
                    continue

                # Check for contamination from other PCR plates. Typically,
                # contamination happens for only one fragment, whereas superinfection
                # happens for all. At this stage, we can only give clues about
                # cross-contamination, the rest will be done in a script downstream
                # (here we could TAG suspicious reads for contamination)
                elif (dc.sum() > susp_mismatches):
                    if contaminants is not None:
                        skip = check_suspect(reads, contaminants, VERBOSE=VERBOSE)
                    else:
                        skip = True
                    if skip:
                        n_suspect += 1
                        map(suspfile.write, reads)
                        continue

                # Trim the bad CIGARs from the sides, if there are any good ones
                # (match_len_min and trim_bad_cigars are module-level constants
                # in the original source, not shown in this excerpt)
                skip = trim_bad_cigar(reads, match_len_min=match_len_min,
                                       trim_left=trim_bad_cigars,
                                       trim_right=trim_bad_cigars)
                if skip:
                    n_badcigar += 1
                    map(trashfile.write, reads)
                    continue

                # TODO: we might want to incorporate some more stringent
                # criterion here, to avoid short reads, cross-overhang, etc.

                # Write the output
                n_good += 1
                map(outfile.write, reads)

    if VERBOSE >= 1:
        print 'Read pairs: '
        print 'Good:', n_good
        print 'Unmapped:', n_unmapped
        print 'Unpaired:', n_unpaired
        print 'Mismapped at edge:', n_mismapped_edge
        print 'Many-mutations:', n_mutator
        print 'Suspect contaminations:', n_suspect
        print 'Bad CIGARs:', n_badcigar

    if summary:
        summary_filename = get_filter_mapped_summary_filename(data_folder, adaID, fragment)
        with open(summary_filename, 'a') as f:
            f.write('Filter results: adaID '+adaID+' '+fragment+'\n')
            f.write('Total:\t\t\t'+str(irp + 1)+'\n')
            f.write('Good:\t\t\t'+str(n_good)+'\n')
            f.write('Unmapped:\t\t'+str(n_unmapped)+'\n')
            f.write('Unpaired:\t\t'+str(n_unpaired)+'\n')
            f.write('Mismapped at edge:\t'+str(n_mismapped_edge)+'\n')
            f.write('Many-mutations:\t\t'+str(n_mutator)+'\n')
            f.write('Suspect contaminations:\t'+str(n_suspect)+'\n')
            f.write('Bad CIGARs:\t\t'+str(n_badcigar)+'\n')

    if plot:
        plot_distance_histogram(data_folder, adaID, frag_gen,
                                histogram_distance_from_consensus,
                                savefig=True)

        plot_distance_histogram_sliding_window(data_folder, adaID, frag_gen,
                                               len(ref),
                                               histogram_dist_along,
                                               binsize=binsize,
                                               savefig=True)
Example n. 10
def get_minimal_distance_hist(bamfilename, consensi, maxreads=1000, VERBOSE=0):
    '''Get histogram of minimal distance of reads from consensi'''

    conssi = map(''.join, consensi)
    m = np.zeros(len(consensi), int)
    n_good = 0
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        for irp, reads in enumerate(pair_generator(bamfile)):
            if n_good == maxreads:
                break

            if VERBOSE >= 3:
                print n_good + 1, 'Checking mindist for:', reads[0].qname,

            # Assign names
            (read1, read2) = reads
            i_fwd = reads[0].is_reverse

            # Check a few things to make sure we are looking at paired reads
            if read1.qname != read2.qname:
                raise ValueError('Read pair ' + str(irp) +
                                 ': reads have different names!')

            # Ignore unmapped reads
            if read1.is_unmapped or read2.is_unmapped:
                if VERBOSE >= 2:
                    print 'Read pair ' + read1.qname + ': unmapped'
                continue

            # Ignore not properly paired reads (this includes mates sitting on
            # different fragments)
            if (not read1.is_proper_pair) or (not read2.is_proper_pair):
                if VERBOSE >= 2:
                    print 'Read pair ' + read1.qname + ': not properly paired'
                continue

            n_good += 1

            # Get all distances
            ds_pair = np.zeros_like(m)
            for ic, consensus in enumerate(consensi):
                conss = conssi[ic]
                dpair = 0
                for read in reads:
                    seq = read.seq
                    ali = align_overlap(conss, seq)

                    # NOTE: it is possible that we start before conss' start or end after
                    # its end, but that IS evidence that it's not contamination from there.

                    pos = conss.find(ali[1].replace('-', ''))
                    alim0 = np.fromstring(ali[1], 'S1')
                    alim1 = np.fromstring(ali[2], 'S1')

                    # Score subst
                    d = ((alim0 != alim1) & (alim0 != '-') &
                         (alim1 != '-')).sum()

                    # Score insertions
                    gaps = alim0 == '-'
                    if gaps.sum():
                        n_gaps_borders = np.diff(gaps).sum()
                        n_gaps_borders += alim0[0] == '-'
                        n_gaps_borders += alim0[-1] == '-'
                        n_insertions = n_gaps_borders // 2
                        d += n_insertions

                    # Score deletions
                    gaps = alim1 == '-'
                    if gaps.sum():
                        n_gaps_borders = np.diff(gaps).sum()
                        n_gaps_borders -= alim1[0] == '-'
                        n_gaps_borders -= alim1[-1] == '-'
                        n_deletions = n_gaps_borders // 2
                        d += n_deletions

                    dpair += d

                ds_pair[ic] = dpair

                if VERBOSE >= 3:
                    print 'OK',

            m[ds_pair.argmin()] += 1
            if VERBOSE >= 3:
                print ''

    return m
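
The indel scoring above counts gap runs by summing np.diff over the boolean gap mask (each run has two borders, with a correction at the array edges). A self-contained sanity check of that logic, not part of the original code:

import numpy as np

alim = np.array(list('AC--GT-A'))        # toy alignment row with two gap runs
gaps = alim == '-'
# np.diff on a boolean mask counts run borders (neighbour inequality),
# written out explicitly here:
n_gaps_borders = (gaps[1:] != gaps[:-1]).sum()   # 4 internal run borders
n_gaps_borders += gaps[0]                        # run touching the left edge
n_gaps_borders += gaps[-1]                       # run touching the right edge
print n_gaps_borders // 2                        # -> 2 indels
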
Example n. 11
def filter_contamination(
    bamfilename,
    bamfilename_out,
    contseqs,
    samplename,
    VERBOSE=0,
    deltascore_max_self=60,
    deltascore_max_other=24,
    maxreads=-1,
    **kwargs
):
    """Fish contaminated reads from mapped reads

    The function first checks the distance to the sample's own consensus; only
    if it exceeds the threshold does it check all other samples.
    
    Args:
      deltascore_max_self (int): the maximal delta in alignment score to the 
                                 consensus to be considered pure
      deltascore_max_other (int): the maximal delta in alignment score to any other
                                  sample to be considered a contamination
      **kwargs: passed down to the pairwise alignment function
    """
    import sys
    import pysam
    from collections import defaultdict
    from operator import itemgetter
    from seqanpy import align_overlap

    from hivwholeseq.utils.mapping import pair_generator, get_number_reads

    if "score_match" in kwargs:
        score_match = kwargs["score_match"]
    else:
        score_match = 3

    bamfilename_trash = bamfilename_out[:-4] + "_trashed.bam"

    contseqs = contseqs.copy()
    consseq = contseqs.pop(samplename)

    if VERBOSE >= 2:
        print "Scanning reads (" + str(get_number_reads(bamfilename) // 2) + ")"

    with pysam.Samfile(bamfilename, "rb") as bamfile:
        with pysam.Samfile(bamfilename_out, "wb", template=bamfile) as bamfileout, pysam.Samfile(
            bamfilename_trash, "wb", template=bamfile
        ) as bamfiletrash:
            n_good = 0
            n_cont = defaultdict(int)

            for irp, reads in enumerate(pair_generator(bamfile)):
                if irp == maxreads:
                    break

                if VERBOSE >= 2:
                    if not ((irp + 1) % 100):
                        if not ((irp + 1) == 100):
                            sys.stdout.write("\x1b[1A")
                        print irp + 1

                for read in reads:

                    # Check the distance to the read's own consensus;
                    # if that's small, move on
                    alignments_read = {}
                    deltas_read = {}
                    (score, alis1, alis2) = align_overlap(consseq, read.seq, **kwargs)
                    (alis1, alis2) = trim_align_overlap((alis1, alis2))
                    scoremax = len(alis1) * score_match
                    delta_read = scoremax - score
                    deltas_read[samplename] = delta_read
                    alignments_read[samplename] = (alis1, alis2)
                    if delta_read <= deltascore_max_self:
                        if VERBOSE >= 4:
                            print "Read is very close to its own consensus", scoremax, score, delta_read
                            pretty_print_pairwise_ali([alis1, alis2], width=90, name1="ref", name2="read")
                        continue

                    # Otherwise, move on to all other sequences and find the neighbour
                    for contname, contseq in contseqs.iteritems():
                        (score, ali1, ali2) = align_overlap(contseq, read.seq, **kwargs)
                        (ali1, ali2) = trim_align_overlap((ali1, ali2))
                        scoremax = len(ali1) * score_match
                        delta_read = scoremax - score
                        deltas_read[contname] = delta_read
                        alignments_read[contname] = (ali1, ali2)

                    if VERBOSE >= 5:
                        print samplename
                        for key, d in deltas_read.iteritems():
                            print key, d

                    (contname, delta_read) = min(deltas_read.iteritems(), key=itemgetter(1))

                    # Again, the correct consensus has precedence
                    if deltas_read[samplename] == delta_read:
                        contname = samplename

                    (ali1, ali2) = alignments_read[contname]

                    # The read may be closest to its own consensus, if not very close
                    if contname == samplename:
                        if VERBOSE >= 4:
                            print "Read is closest to its consensus", scoremax, score, delta_read
                            pretty_print_pairwise_ali([ali1, ali2], width=90, name1="ref", name2="read")

                    # The read may come from another consensus (contamination)
                    elif delta_read <= deltascore_max_other:
                        n_cont[contname] += 1
                        bamfiletrash.write(reads[0])
                        bamfiletrash.write(reads[1])

                        if VERBOSE >= 2:
                            print "Contaminated read found! Good:", n_good, "cont:", sum(
                                n_cont.itervalues()
                            ), "sources:", n_cont

                        if VERBOSE >= 3:
                            print "Read is contaminated by", contname, scoremax, score, delta_read
                            pretty_print_pairwise_ali([alis1, alis2], width=90, name1="self", name2="read")
                            print ""
                            pretty_print_pairwise_ali([ali1, ali2], width=90, name1="ref", name2="read")

                        if VERBOSE >= 2:
                            print ""

                        break

                    # Finally, the read is not really close to anything: accept
                    else:
                        if VERBOSE >= 4:
                            print "Read is close to nothing really", scoremax, score, delta_read
                            pretty_print_pairwise_ali([ali1, ali2], width=90, name1="ref", name2="read")

                else:
                    n_good += 1
                    bamfileout.write(reads[0])
                    bamfileout.write(reads[1])

    n_cont = dict(n_cont)

    return (n_good, n_cont)
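
Both versions of filter_contamination rely on trim_align_overlap, which is not part of the excerpt. Since align_overlap pads the flanks of the shorter sequence with gaps, a plausible sketch simply cuts the flanks where the read row is gapped (the real helper may differ):

def trim_align_overlap(ali):
    '''Trim gap-padded flanks from an overlap alignment (sketch only)'''
    (ali1, ali2) = ali
    start = len(ali2) - len(ali2.lstrip('-'))
    end = len(ali2.rstrip('-'))
    return (ali1[start:end], ali2[start:end])
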
Example n. 12
def filter_contamination(bamfilename,
                         bamfilename_out,
                         contseqs,
                         samplename,
                         VERBOSE=0,
                         deltascore_max_self=60,
                         deltascore_max_other=24,
                         maxreads=-1,
                         **kwargs):
    '''Fish contaminated reads from mapped reads

    The function first checks the distance to the sample's own consensus; only
    if it exceeds the threshold does it check all other samples.
    
    Args:
      deltascore_max_self (int): the maximal delta in alignment score to the 
                                 consensus to be considered pure
      deltascore_max_other (int): the maximal delta in alignment score to any other
                                  sample to be considered a contamination
      **kwargs: passed down to the pairwise alignment function
    '''
    import sys
    import pysam
    from collections import defaultdict
    from operator import itemgetter
    from seqanpy import align_overlap

    from hivwholeseq.utils.mapping import pair_generator, get_number_reads

    if 'score_match' in kwargs:
        score_match = kwargs['score_match']
    else:
        score_match = 3

    bamfilename_trash = bamfilename_out[:-4] + '_trashed.bam'

    contseqs = contseqs.copy()
    consseq = contseqs.pop(samplename)

    if VERBOSE >= 2:
        print 'Scanning reads (' + str(
            get_number_reads(bamfilename) // 2) + ')'

    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        with pysam.Samfile(bamfilename_out, 'wb', template=bamfile) as bamfileout, \
             pysam.Samfile(bamfilename_trash, 'wb', template=bamfile) as bamfiletrash:
            n_good = 0
            n_cont = defaultdict(int)

            for irp, reads in enumerate(pair_generator(bamfile)):
                if irp == maxreads:
                    break

                if VERBOSE >= 2:
                    if not ((irp + 1) % 100):
                        if not ((irp + 1) == 100):
                            sys.stdout.write('\x1b[1A')
                        print irp + 1

                for read in reads:

                    # Check the distance to the read's own consensus;
                    # if that's small, move on
                    alignments_read = {}
                    deltas_read = {}
                    (score, alis1,
                     alis2) = align_overlap(consseq, read.seq, **kwargs)
                    (alis1, alis2) = trim_align_overlap((alis1, alis2))
                    scoremax = len(alis1) * score_match
                    delta_read = scoremax - score
                    deltas_read[samplename] = delta_read
                    alignments_read[samplename] = (alis1, alis2)
                    if delta_read <= deltascore_max_self:
                        if VERBOSE >= 4:
                            print 'Read is very close to its own consensus', scoremax, score, delta_read
                            pretty_print_pairwise_ali([alis1, alis2],
                                                      width=90,
                                                      name1='ref',
                                                      name2='read')
                        continue

                    # Otherwise, move on to all other sequences and find the neighbour
                    for contname, contseq in contseqs.iteritems():
                        (score, ali1,
                         ali2) = align_overlap(contseq, read.seq, **kwargs)
                        (ali1, ali2) = trim_align_overlap((ali1, ali2))
                        scoremax = len(ali1) * score_match
                        delta_read = scoremax - score
                        deltas_read[contname] = delta_read
                        alignments_read[contname] = (ali1, ali2)

                    if VERBOSE >= 5:
                        print samplename
                        for key, d in deltas_read.iteritems():
                            print key, d

                    (contname, delta_read) = min(deltas_read.iteritems(),
                                                 key=itemgetter(1))

                    # Again, the correct consensus has precedence
                    if deltas_read[samplename] == delta_read:
                        contname = samplename

                    (ali1, ali2) = alignments_read[contname]

                    # The read may be closest to its own consensus, if not very close
                    if contname == samplename:
                        if VERBOSE >= 4:
                            print 'Read is closest to its consensus', scoremax, score, delta_read
                            pretty_print_pairwise_ali([ali1, ali2],
                                                      width=90,
                                                      name1='ref',
                                                      name2='read')

                    # The read may come from another consensus (contamination)
                    elif (delta_read <= deltascore_max_other):
                        n_cont[contname] += 1
                        bamfiletrash.write(reads[0])
                        bamfiletrash.write(reads[1])

                        if VERBOSE >= 2:
                            print 'Contaminated read found! Good:', n_good, 'cont:', sum(
                                n_cont.itervalues()), 'sources:', n_cont

                        if VERBOSE >= 3:
                            print 'Read is contaminated by', contname, scoremax, score, delta_read
                            pretty_print_pairwise_ali([alis1, alis2],
                                                      width=90,
                                                      name1='self',
                                                      name2='read')
                            print ''
                            pretty_print_pairwise_ali([ali1, ali2],
                                                      width=90,
                                                      name1='ref',
                                                      name2='read')

                        if VERBOSE >= 2:
                            print ''

                        break

                    # Finally, the read is not really close to anything: accept
                    else:
                        if VERBOSE >= 4:
                            print 'Read is close to nothing really', scoremax, score, delta_read
                            pretty_print_pairwise_ali([ali1, ali2],
                                                      width=90,
                                                      name1='ref',
                                                      name2='read')

                else:
                    n_good += 1
                    bamfileout.write(reads[0])
                    bamfileout.write(reads[1])

    n_cont = dict(n_cont)

    return (n_good, n_cont)
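
A hypothetical call (filenames, sample names, and consensus strings are placeholders); contseqs maps sample names to consensus sequences and must include samplename itself:

contseqs = {'sample1': consseq1, 'sample2': consseq2}  # placeholder strings
(n_good, n_cont) = filter_contamination('mapped.bam', 'decontaminated.bam',
                                        contseqs, 'sample1',
                                        VERBOSE=1, score_match=3)
print 'Good pairs:', n_good, 'contamination sources:', n_cont
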
Example n. 13
        # read file
        bamfilename = get_mapped_filename(data_folder,
                                          adaID,
                                          fragment,
                                          type='bam',
                                          filtered=True)

        if not os.path.isfile(bamfilename):
            convert_sam_to_bam(bamfilename)
        bamfile = pysam.Samfile(bamfilename, 'rb')

        # Get the coverage for reads which have long insert sizes
        # (to be sure about their identity)
        cov_new = 0
        cov_old = 0
        for i_pairs, reads in enumerate(pair_generator(bamfile)):
            if i_pairs > 5000000:
                break

            if reads[0].isize < 300:
                continue

            for read in reads:
                if read.seq.find(primer_new) != -1:
                    cov_new += 1
                if read.seq.find(primer_old) != -1:
                    cov_old += 1

        print 'old:', cov_old, 'new:', cov_new

        bamfile.close()
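
The snippet above assumes primer_new and primer_old are plain sequence strings defined earlier; hypothetical placeholders (not the real primers):

primer_new = 'GCGAAAGTCGACTTAAGCCA'   # placeholder, not from the source
primer_old = 'TTGCCAATGGGAAGATCTCC'   # placeholder, not from the source
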
Example n. 14
def get_minimal_distance_hist(bamfilename, consensi, maxreads=1000, VERBOSE=0):
    '''Get histogram of minimal distance of reads from consensi'''

    conssi = map(''.join, consensi)
    m = np.zeros(len(consensi), int)
    n_good = 0
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        for irp, reads in enumerate(pair_generator(bamfile)):
            if n_good == maxreads:
                break

            if VERBOSE >= 3:
                print n_good + 1, 'Checking mindist for:', reads[0].qname,

            # Assign names
            (read1, read2) = reads
            i_fwd = reads[0].is_reverse

            # Check a few things to make sure we are looking at paired reads
            if read1.qname != read2.qname:
                raise ValueError('Read pair '+str(irp)+': reads have different names!')

            # Ignore unmapped reads
            if read1.is_unmapped or read2.is_unmapped:
                if VERBOSE >= 2:
                    print 'Read pair '+read1.qname+': unmapped'
                continue
            
            # Ignore not properly paired reads (this includes mates sitting on
            # different fragments)
            if (not read1.is_proper_pair) or (not read2.is_proper_pair):
                if VERBOSE >= 2:
                    print 'Read pair '+read1.qname+': not properly paired'
                continue

            n_good += 1

            # Get all distances
            ds_pair = np.zeros_like(m)
            for ic, consensus in enumerate(consensi):
                conss = conssi[ic]
                dpair = 0
                for read in reads:
                    seq = read.seq
                    ali = align_overlap(conss, seq)
    
                    # NOTE: it is possible that we start before conss' start or end after
                    # its end, but that IS evidence that it's not contamination from there.
    
                    pos = conss.find(ali[1].replace('-', ''))
                    alim0 = np.fromstring(ali[1], 'S1')
                    alim1 = np.fromstring(ali[2], 'S1')
    
                    # Score subst
                    d = ((alim0 != alim1) & (alim0 != '-') & (alim1 != '-')).sum()
    
                    # Score insertions
                    gaps = alim0 == '-'
                    if gaps.sum():
                        n_gaps_borders = np.diff(gaps).sum()
                        n_gaps_borders += alim0[0] == '-'
                        n_gaps_borders += alim0[-1] == '-'
                        n_insertions = n_gaps_borders // 2
                        d += n_insertions
    
                    # Score deletions
                    gaps = alim1 == '-'
                    if gaps.sum():
                        n_gaps_borders = np.diff(gaps).sum()
                        n_gaps_borders -= alim1[0] == '-'
                        n_gaps_borders -= alim1[-1] == '-'
                        n_deletions = n_gaps_borders // 2
                        d += n_deletions
    
                    dpair += d
    
                ds_pair[ic] = dpair

                if VERBOSE >= 3:
                    print 'OK',

            m[ds_pair.argmin()] += 1
            if VERBOSE >= 3:
                print ''

    return m
Example n. 15
def filter_mapped_reads(sample, fragment,
                        PCR=1,
                        maxreads=-1,
                        VERBOSE=0,
                        n_cycles=600,
                        max_mismatches=100,
                        match_len_min=30,
                        trim_bad_cigars=3,
                        summary=True):
    '''Filter the reads to good chunks'''
    pname = sample.patient
    samplename_pat = sample.name
    samplenames_seq = sample.samples_seq.index.tolist()

    if VERBOSE >= 1:
        print 'Filtering reads:', pname, samplename_pat, fragment, PCR

    reffilename = get_initial_reference_filename(pname, fragment)
    refseq = SeqIO.read(reffilename, 'fasta')
    ref = np.array(refseq)

    outfilename = get_mapped_filtered_filename(pname, samplename_pat, fragment,
                                               type='bam', PCR=PCR,
                                               decontaminated=False)
    trashfilename = outfilename[:-4]+'_trashed.bam'

    infilenames = [get_mapped_to_initial_filename(pname, samplename_pat,
                                                 samplename, fragment,
                                                 type='bam', PCR=PCR)
                   for samplename in samplenames_seq]
    infilenames = filter(os.path.isfile, infilenames)
    if not len(infilenames):
        print ('WARNING: No mapped files found: '+', '.join([pname, samplename_pat,
                                                              fragment, str(PCR)]))
        return

    # Take reads evenly distributed across sequencing repetitions
    # (Python 2 floor division; the maxreads=-1 sentinel stays -1)
    maxreads /= len(infilenames)

    if VERBOSE >= 2:
        print 'Input mapped filenames:',
        if len(infilenames) >= 2:
            print ''
        print '\n'.join(infilenames)

    # Use first file as template for the new bamfile
    infilename = infilenames[0]
    if not os.path.isfile(infilename):
        convert_sam_to_bam(infilename)
 
    with pysam.Samfile(infilename, 'rb') as bamfile:
        with pysam.Samfile(outfilename, 'wb', template=bamfile) as outfile,\
             pysam.Samfile(trashfilename, 'wb', template=bamfile) as trashfile:
 
            n_good = 0
            n_wrongname = 0
            n_unmapped = 0
            n_unpaired = 0
            n_mutator = 0
            n_badcigar = 0
            n_tiny = 0
            binsize = 200
            hist_distance_from_consensus = np.zeros(n_cycles + 1, int)
            hist_dist_along = np.zeros((len(ref) // binsize + 1, n_cycles + 1), int)

            # Iterate over input files, the first is already open
            for infilename in infilenames:

                # All files except the first (already open) are opened anew
                if infilename != infilenames[0]:
                    file_open = lambda: pysam.Samfile(infilename, 'rb')
                    file_close = lambda f: f.close()

                    if not os.path.isfile(infilename):
                        convert_sam_to_bam(infilename)

                else:
                    file_open = lambda: bamfile
                    file_close = lambda f: None

                try:
                    bamfile = file_open()
    
                    for irp, reads in enumerate(pair_generator(bamfile)):
                        if irp == maxreads:
                            break

                        pair_type = filter_read_pair(reads, ref,
                                                     hist_distance_from_consensus,
                                                     hist_dist_along,
                                                     binsize,
                                                     max_mismatches=max_mismatches,
                                                     match_len_min=match_len_min,
                                                     trim_bad_cigars=trim_bad_cigars,
                                                     VERBOSE=VERBOSE)
                    
                        if pair_type == 'unmapped':
                            n_unmapped += 1
                            map(trashfile.write, reads)

                        elif pair_type == 'unpaired':
                            n_unpaired += 1
                            map(trashfile.write, reads)

                        elif pair_type == 'mutator':
                            n_mutator += 1
                            map(trashfile.write, reads)

                        elif pair_type == 'bad_cigar':
                            n_badcigar += 1
                            map(trashfile.write, reads)

                        elif pair_type == 'tiny':
                            n_tiny += 1
                            map(trashfile.write, reads)

                        else:
                            n_good += 1
                            map(outfile.write, reads)

                finally:
                    file_close(bamfile)

    if VERBOSE >= 1:
        print 'Read pairs: '
        print 'Good:', n_good
        print 'Unmapped:', n_unmapped
        print 'Unpaired:', n_unpaired
        print 'Many-mutations:', n_mutator
        print 'Bad CIGARs:', n_badcigar
        print 'Tiny:', n_tiny
        print

    if summary:
        sfn = get_filter_mapped_init_summary_filename(pname, samplename_pat, fragment, PCR=PCR)
        with open(sfn, 'a') as f:
            f.write('Filter results: pname '+pname+', '+samplename_pat+', '+fragment+'\n')
            f.write('Total:\t\t\t'+str(irp + 1)+'\n')
            f.write('Good:\t\t\t'+str(n_good)+'\n')
            f.write('Unmapped:\t\t'+str(n_unmapped)+'\n')
            f.write('Unpaired:\t\t'+str(n_unpaired)+'\n')
            f.write('Many-mutations:\t\t'+str(n_mutator)+'\n')
            f.write('Bad CIGARs:\t\t'+str(n_badcigar)+'\n')
            f.write('Tiny:\t\t\t'+str(n_tiny)+'\n')
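The two histograms filled during the filtering above follow a simple scheme: one global count of mismatches per pair, and one count binned along the reference. A minimal sketch of the update, with hypothetical names mirroring the arrays used above:

import numpy as np

def update_distance_histograms_sketch(hist_total, hist_along, pos, n_mismatches,
                                      binsize=200):
    '''Hypothetical helper: record one read pair with its mismatch count'''
    hist_total[n_mismatches] += 1                     # global histogram
    hist_along[pos // binsize, n_mismatches] += 1     # binned along the reference

# Toy setup: 500 cycles, 1500 bp reference, 200 bp bins
n_cycles, ref_len, binsize = 500, 1500, 200
hist_total = np.zeros(n_cycles + 1, int)
hist_along = np.zeros((ref_len // binsize + 1, n_cycles + 1), int)
update_distance_histograms_sketch(hist_total, hist_along, pos=430, n_mismatches=3)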
Esempio n. 16
0
def get_local_haplotypes(bamfilename,
                         start,
                         end,
                         VERBOSE=0,
                         maxreads=-1,
                         label=''):
    '''Extract reads fully covering the region, discarding insertions'''
    import sys
    import pysam
    from hivwholeseq.utils.mapping import pair_generator
    from hivwholeseq.utils.mapping import extract_mapped_reads_subsample_open

    from collections import Counter
    haplotypes = Counter()

    with pysam.Samfile(bamfilename, 'rb') as bamfile:

        if maxreads == -1:
            reads_iter = pair_generator(bamfile)
        else:
            reads_iter = extract_mapped_reads_subsample_open(bamfile,
                                                             maxreads,
                                                             VERBOSE=VERBOSE,
                                                             pairs=True)

        for irp, reads in enumerate(reads_iter):
            if VERBOSE >= 2:
                if not ((irp + 1) % 10000):
                    if irp + 1 != 10000:
                        sys.stdout.write("\x1b[1A\n")
                    if label:
                        sys.stdout.write(label + '\t')
                    sys.stdout.write(str(irp + 1))
                    sys.stdout.flush()

            # Sort fwd read first: this is important because with our insert
            # size we know the fwd read starts <= the rev read
            is_fwd = reads[0].is_reverse
            reads = [reads[is_fwd], reads[not is_fwd]]

            # Check for coverage of the region
            start_fwd = reads[0].pos
            end_fwd = start_fwd + sum(
                bl for (bt, bl) in reads[0].cigar if bt in (0, 2))
            start_rev = reads[1].pos
            end_rev = start_rev + sum(
                bl for (bt, bl) in reads[1].cigar if bt in (0, 2))
            overlap_len = max(0, end_fwd - start_rev)

            # Various scenarios possible
            if start_fwd > start:
                continue

            if end_rev < end:
                continue

            # No single read covers the whole region AND (the insert has a hole
            # OR a very short overlap)
            if (end_fwd < end) and (start_rev > start) and (overlap_len < 20):
                continue

            # Now the good cases
            if (start_fwd <= start) and (end_fwd >= end):
                seq = trim_read_roi(reads[0], start, end)

            elif (start_rev <= start) and (end_rev >= end):
                seq = trim_read_roi(reads[1], start, end)

            else:
                seqs = [trim_read_roi(read, start, end) for read in reads]
                seq = merge_read_pair(*seqs)

            haplotypes[seq] += 1
            if VERBOSE >= 4:
                import ipdb
                ipdb.set_trace()

    if VERBOSE >= 2:
        if irp >= 10000:
            sys.stdout.write('\n')
            sys.stdout.flush()

    return haplotypes
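A hypothetical call to the function above (the BAM path is a placeholder); the returned Counter maps each haplotype in the region to its read-pair support:

haplos = get_local_haplotypes('mapped_filtered.bam', start=1000, end=1200,
                              maxreads=10000, VERBOSE=1)
for seq, abundance in haplos.most_common(5):
    print seq, abundance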
Esempio n. 17
0
def get_coallele_counts(data_folder, adaID, fragment, maxreads=-1, VERBOSE=0):
    '''Extract allele and insert counts from a bamfile'''
    if maxreads <= 0:
        maxreads = 1e6

    # Read reference
    reffilename = get_consensus_filename(data_folder, adaID, fragment,
                                         trim_primers=True)
    refseq = SeqIO.read(reffilename, 'fasta')
    
    # Allele counts and inserts (TODO: compress this data?)
    # Note: the pair is of 2 types only, while the single reads usually are of 4
    counts = np.zeros((len(read_pair_types),
                       len(alpha), len(alpha),
                       len(refseq), len(refseq)), int)
    positions = np.zeros(501, int)
    ais = np.zeros_like(positions)
    # TODO: no inserts for now

    # Open BAM file
    # Note: the reads should already be filtered of unmapped stuff at this point
    bamfilename = get_mapped_filename(data_folder, adaID, fragment, type='bam',
                                      filtered=True)
    if not os.path.isfile(bamfilename):
        convert_sam_to_bam(bamfilename)
    with pysam.Samfile(bamfilename, 'rb') as bamfile:

        # Iterate over read pairs
        for i, reads in enumerate(pair_generator(bamfile)):

            # Limit to some reads for testing
            if i > maxreads:
                if VERBOSE:
                    print 'Max read number reached:', maxreads
                break
        
            # Print output
            if (VERBOSE >= 3) and (not ((i + 1) % 10)):
                print (i + 1)

            # Divide by read 1/2 and forward/reverse
            js = reads[0].is_reverse
            count = counts[js]

            # List of mutations
            positions[:] = -1
            ais[:] = -1
            imut = 0

            # Collect from the pair of reads
            for read in reads:
        
                # Sequence and position
                # Note: stampy takes the reverse complement already
                seq = read.seq
                pos = read.pos
    
                # Iterate over CIGARs
                len_cig = len(read.cigar)
                for ic, (block_type, block_len) in enumerate(read.cigar):
    
                    # Check for pos: it should never exceed the length of the fragment
                    if (block_type in [0, 1, 2]) and (pos > len(refseq)):
                        raise ValueError('Pos exceeded the length of the fragment')
                
                    # Inline block
                    if block_type == 0:
 
                        # Add the aligned positions and bases of this block only
                        indb = map(alphal.index, seq[:block_len])
                        positions[imut: imut + len(indb)] = \
                                pos + np.arange(len(indb))
                        ais[imut: imut + len(indb)] = indb
                        imut += len(indb)

                        # Chop off this block
                        if ic != len_cig - 1:
                            seq = seq[block_len:]
                            pos += block_len
 
                    # Deletion
                    elif block_type == 2:                
                        # Chop off pos, but not sequence
                        pos += block_len
                
                    # Insertion
                    # an insert @ pos 391 means that seq[:391] is BEFORE the insert,
                    # THEN the insert, FINALLY comes seq[391:]
                    elif block_type == 1:
                        # Chop off seq, but not pos
                        if ic != len_cig - 1:
                            seq = seq[block_len:]
                
                    # Other types of cigar?
                    else:
                        raise ValueError('CIGAR type '+str(block_type)+' not recognized')

            if VERBOSE >= 4:
                for pos, ai in izip(positions, ais):
                    if pos == -1:
                        break
                    print pos, ai

            # Put the mutations into the matrix
            for ai1 in xrange(len(alpha)):
                for ai2 in xrange(len(alpha)):
                    coun = count[ai1, ai2]
                    pos1 = positions[ais == ai1]
                    if ai1 == ai2:
                        pos2 = pos1
                    else:
                        pos2 = positions[ais == ai2]
                    coords = np.meshgrid(pos1, pos2)
                    ind = coords[0].ravel() * coun.shape[0] + coords[1].ravel()
                    coun.ravel()[ind] += 1                                        

    return counts
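The CIGAR bookkeeping above follows the usual pattern: match blocks advance both read and reference, deletions advance only the reference, insertions only the read. A self-contained sketch of that walk (illustrative, not the package's own helper):

def walk_cigar_sketch(cigar, pos, seq):
    '''Illustrative: yield (reference position, base) for aligned bases only'''
    pos_read = 0
    for (bt, bl) in cigar:
        if bt == 0:                   # match/mismatch: both advance
            for i in xrange(bl):
                yield (pos + i, seq[pos_read + i])
            pos += bl
            pos_read += bl
        elif bt == 2:                 # deletion: only the reference advances
            pos += bl
        elif bt == 1:                 # insertion: only the read advances
            pos_read += bl

# 3M 1I 2M starting at reference position 10:
print list(walk_cigar_sketch([(0, 3), (1, 1), (0, 2)], 10, 'ACGTAC'))
# [(10, 'A'), (11, 'C'), (12, 'G'), (13, 'A'), (14, 'C')]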
Esempio n. 18
0
            for sample in patient.itersamples():
                if VERBOSE >= 1:
                    print sample.name,

                if sample[fragment] not in ['ok', 'low']:
                    if VERBOSE >= 1:
                        print 'not "ok". skipping'
                    continue

                if VERBOSE >= 1:
                    print 'ok'
                               
                bamfilename = sample.get_mapped_filtered_filename(fragment, decontaminated=True)
                with pysam.Samfile(bamfilename, 'rb') as bamfile:
                    if maxreads == -1:
                        reads = pair_generator(bamfile)
                    else:
                        reads = extract_mapped_reads_subsample_open(bamfile, maxreads,
                                                                    VERBOSE=VERBOSE)

                    dists[sample.name] = get_distance_reads_sequence(refseq, reads,
                                                                     VERBOSE=VERBOSE,
                                                                     score_match=3,
                                                                     score_mismatch=-3)


            hs = {}
            binmax = max(map(max, dists.itervalues()))
            bins = np.arange(0, binmax, 6)
            bincenters = 0.5 * (bins[1:] + bins[:-1])
            for samplename, dist in dists.iteritems():
Esempio n. 19
0
    seq_run = args.run
    adaID = args.adaID
    VERBOSE = args.verbose

    # Specify the dataset
    dataset = load_sequencing_run(seq_run)
    data_folder = dataset.folder

    # Get the BAM filename
    input_filename = get_premapped_filename(data_folder, adaID, type='bam')

    # Get unmapped reads and BLAST them
    reads_unmapped = []
    n_unmapped = 0
    with pysam.Samfile(input_filename, 'rb') as input_file:
        for reads in pair_generator(input_file):
            if not reads[0].is_unmapped:
                continue

            n_unmapped += 1

            # Take only the first part of read1, to make sure quality is high
            seq = reads[reads[1].is_read1].seq[:200]
            seqb = Seq(seq, IUPAC.ambiguous_dna)

            # Save to file, to test local blast
            reads_unmapped.append(reads[reads[1].is_read1])
            if len(reads_unmapped) >= 100:
                break

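To actually run a local BLAST on those reads, one could write them to FASTA along these lines (a sketch; the output filename is a placeholder):

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

records = [SeqRecord(Seq(read.seq[:200]), id=read.qname, description='')
           for read in reads_unmapped]
SeqIO.write(records, 'unmapped_reads.fasta', 'fasta')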
Esempio n. 20
0
def fish_distant_reads(bamfilename, ref,
                       min_mismatches=20, max_mismatches=30,
                       VERBOSE=0, maxseqs=-1):
    '''Fish distant reads from the trash'''
    import numpy as np

    from hivwholeseq.utils.mapping import pair_generator, reads_to_seqrecord
    from hivwholeseq.sequencing.filter_mapped_reads import check_overhanging_reads, \
            get_distance_from_consensus

    distances = []
    seqs = []
    edges = []
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        for irp, reads in enumerate(pair_generator(bamfile)):
            if VERBOSE >= 2:
                if not ((irp + 1) % 10000):
                    print irp + 1

            (read1, read2) = reads
            i_fwd = reads[0].is_reverse

            # Check a few things to make sure we are looking at paired reads
            if read1.qname != read2.qname:
                raise ValueError('Read pair '+str(irp)+': reads have different names!')

            # Ignore unmapped reads
            if read1.is_unmapped or read2.is_unmapped:
                continue
            
            # Ignore not properly paired reads (this includes mates sitting on
            # different fragments)
            if (not read1.is_proper_pair) or (not read2.is_proper_pair):
                continue

            # Check for overhangs beyond the edge
            skip = check_overhanging_reads(reads, len(ref))
            if skip:
                continue

            # Fish out our reads
            dc = get_distance_from_consensus(ref, reads, VERBOSE=VERBOSE)
            if (min_mismatches <= dc.sum() <= max_mismatches):
                if VERBOSE >= 3:
                    print 'Gotcha!', reads[0].qname
                seqs.append(reads[0])
                seqs.append(reads[1])
                distances.append(dc)
                edge = [(read.pos, read.pos + sum(bl for bt, bl in read.cigar if bt in (0, 2)))
                        for read in reads]
                edges.append(edge)

                if len(seqs) // 2 == maxseqs:
                    if VERBOSE >= 2:
                        print 'Max seqs reached:', maxseqs
                    break

        seqs = list(pair_generator(reads_to_seqrecord(seqs)))

    distances = np.array(distances, int)
    return (distances, edges, seqs)
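Illustrative usage of the function above; the file names are placeholders and the consensus is read the same way as elsewhere in this codebase:

import numpy as np
from Bio import SeqIO

ref = np.array(SeqIO.read('consensus_F1.fasta', 'fasta'))
(distances, edges, seqs) = fish_distant_reads('reads_trashed.bam', ref,
                                              min_mismatches=20,
                                              max_mismatches=30,
                                              VERBOSE=2, maxseqs=100)
print distances.sum(axis=1)   # total mismatches per fished read pair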
Esempio n. 21
0
def trim_and_divide_reads(data_folder, adaID, n_cycles, fragments,
                          maxreads=-1, VERBOSE=0,
                          minisize=100,
                          include_tests=False, summary=True):
    '''Trim reads and divide them into fragments'''
    if VERBOSE:
        print 'Trim and divide into fragments: adaID '+adaID+', fragments: '+\
                ' '.join(fragments)

    if summary:
        with open(get_divide_summary_filename(data_folder, adaID), 'a') as f:
            f.write('Fragments used: '+' '.join(fragments)+'\n')

    ref_filename = get_reference_premap_filename(data_folder, adaID)
    refseq = SeqIO.read(ref_filename, 'fasta')
    smat = np.array(refseq, 'S1')
    len_reference = len(refseq)

    # Get the positions of fragment start/end, w/ and w/o primers
    frags_pos = get_fragment_positions(smat, fragments)
    store_reference_fragmented(data_folder, adaID, refseq,
                               dict(zip(fragments, frags_pos['trim'])))
    if summary:
        with open(get_divide_summary_filename(data_folder, adaID), 'a') as f:
            f.write('Primer positions (for fragments):\n')
            for (fragment, poss_full, poss_trim) in izip(fragments,
                                                         frags_pos['full'],
                                                         frags_pos['trim']):
                f.write(fragment+': fwd '+str(poss_full[0])+' '+str(poss_trim[0])+\
                                 ', rev '+str(poss_trim[1])+' '+str(poss_full[1])+'\n')
    write_fragment_positions(data_folder, adaID, fragments, frags_pos)

    # Get the positions of the unwanted outer primers (in case we DO nested PCR
    # for that fragment)
    # NOTE: the LTRs make no problem, because the rev outer primer of F6
    # is not in the reference anymore if F6 has undergone nested PCR
    # FIXME: this might not work if we have mixed fragments (e.g. F5a+b) AND nesting
    from re import findall
    primers_out = {'fwd': [], 'rev': []}
    for i, fr in enumerate(fragments):
        if (i != 0) and findall(r'F[2-6][a-z]?i', fr):
            primers_out['fwd'].append(fr[:-1]+'o')
        if (i != len(fragments) - 1) and findall(r'F[1-5][a-z]?i', fr):
            primers_out['rev'].append(fr[:-1]+'o')

    # Get all possible unambiguous primers for the unwanted outer primers
    from hivwholeseq.data.primers import primers_PCR
    from hivwholeseq.utils.sequence import expand_ambiguous_seq as eas
    primers_out_seq = {'fwd': [np.array(map(list, eas(primers_PCR[fr][0])),
                                        'S1', ndmin=2)
                               for fr in primers_out['fwd']],
                       'rev': [np.array(map(list, eas(primers_PCR[fr][1])),
                                        'S1', ndmin=2)
                               for fr in primers_out['rev']],
                      }
    primers_out_pos = {'fwd': [], 'rev': []}
    if primers_out['fwd']:
        primers_out_pos['fwd'] = map(itemgetter(0),
                                     get_primer_positions(smat,
                                                          primers_out['fwd'], 'fwd'))
    if primers_out['rev']:
        primers_out_pos['rev'] = map(itemgetter(1),
                                     get_primer_positions(smat,
                                                          primers_out['rev'], 'rev'))

    # Input and output files
    input_filename = get_premapped_filename(data_folder, adaID, type='bam')
    if not os.path.isfile(input_filename):
        convert_sam_to_bam(input_filename)
    output_filenames = get_divided_filenames(data_folder, adaID, fragments, type='bam')
    with pysam.Samfile(input_filename, 'rb') as bamfile:

        try:
            file_handles = [pysam.Samfile(ofn, 'wb', template=bamfile)
                            for ofn in output_filenames[:len(fragments)]]
    
            fo_am = pysam.Samfile(output_filenames[-4], 'wb', template=bamfile)
            fo_cm = pysam.Samfile(output_filenames[-3], 'wb', template=bamfile)
            fo_um = pysam.Samfile(output_filenames[-2], 'wb', template=bamfile)
            fo_lq = pysam.Samfile(output_filenames[-1], 'wb', template=bamfile)

            # Iterate over the mapped reads and assign fragments
            n_mapped = [0 for fragment in fragments]
            n_unmapped = 0
            n_crossfrag = 0
            n_ambiguous = 0
            n_outer = 0
            n_lowq = 0
            for irp, reads in enumerate(pair_generator(bamfile)):

                if irp == maxreads:
                    if VERBOSE:
                        print 'Maximal number of read pairs reached:', maxreads
                    break

                if VERBOSE >= 2:
                    if not ((irp+1) % 10000):
                        print irp+1

                i_fwd = reads[0].is_reverse

                # If either read is unmapped or unpaired, too short, the insert
                # too small, or the pair divergent (fully cross-overlapping), discard
                if reads[0].is_unmapped or (not reads[0].is_proper_pair) or \
                   reads[1].is_unmapped or (not reads[1].is_proper_pair) or \
                   (reads[0].rlen < 50) or (reads[1].rlen < 50) or \
                   (reads[i_fwd].isize < minisize):
                    if VERBOSE >= 3:
                        print 'Read pair unmapped/unpaired/tiny/divergent:', reads[0].qname
                    n_unmapped += 1
                    fo_um.write(reads[0])
                    fo_um.write(reads[1])
                    continue

                # If the insert is a misamplification from the outer primers
                # in fragments that underwent nested PCR,
                # trash it (it will have skewed amplification anyway). We cannot
                # find all of those, rather only the ones still carrying the
                # primer itself (some others have lost it while shearing). For
                # those, no matter what happens at the end (reading into adapters,
                # etc.), ONE of the reads in the pair will start exactly with one
                # outer primer: if the rev read with a rev primer, if the fwd
                # with a fwd one. Test all six.
                if (len(primers_out_pos['fwd']) or len(primers_out_pos['rev'])) and \
                   test_outer_primer(reads,
                                     primers_out_pos, primers_out_seq,
                                     len_reference):
                    if VERBOSE >= 3:
                        print 'Read pair from outer primer:', reads[0].qname
                    n_outer += 1
                    fo_um.write(reads[0])
                    fo_um.write(reads[1])
                    continue

                # FIXME: the following becomes a bit harder when we mix parallel
                # PCRs, e.g. F5a+b, to get more product

                # Assign to a fragment now, so that primer trimming is faster 
                pair_identity = assign_to_fragment(reads, frags_pos['full'],
                                                   VERBOSE=VERBOSE)

                # 1. If no fragments are possible (e.g. one read crosses the
                # fragment boundary, they map to different fragments), dump it
                # into a special bucket
                if pair_identity == 'cross':
                    n_crossfrag += 1
                    fo_cm.write(reads[0])
                    fo_cm.write(reads[1])
                    continue

                # 2. If 2+ fragments are possible (tie), put into a special bucket
                # (essentially excluded, because we want two independent measurements
                # in the overlapping region, but we might want to recover them)
                elif pair_identity == 'ambiguous':
                    n_ambiguous += 1
                    fo_am.write(reads[0])
                    fo_am.write(reads[1])
                    continue

                # 3. If the intersection is a single fragment, good: trim the primers
                # NB: n_frag is the index IN THE POOL. If we sequence only F2-F5, F2 is n_frag = 0
                n_frag = int(pair_identity)
                frag_pos = frags_pos['trim'][n_frag]
                if not np.isscalar(frag_pos[0]):
                    frag_pos = [frag_pos[0]['inner'], frag_pos[1]['inner']]
                trashed_primers = trim_primers(reads, frag_pos,
                                               include_tests=include_tests)
                if trashed_primers or (reads[i_fwd].isize < 100):
                    n_unmapped += 1
                    if VERBOSE >= 3:
                        print 'Read pair is mismapped:', reads[0].qname
                    fo_um.write(reads[0])
                    fo_um.write(reads[1])
                    continue

                # Quality trimming: if no decently long pair survives, trash
                #trashed_quality = main_block_low_quality(reads, phred_min=20,
                #                                         include_tests=include_tests)
                trashed_quality = trim_low_quality(reads, phred_min=20,
                                                   include_tests=include_tests)
                if trashed_quality or (reads[i_fwd].isize < 100):
                    n_lowq += 1
                    if VERBOSE >= 3:
                        print 'Read pair has low phred quality:', reads[0].qname
                    fo_lq.write(reads[0])
                    fo_lq.write(reads[1])
                    continue

                # Check for cross-overhangs or COH (reading into the adapters)
                #        --------------->
                #    <-----------
                # In that case, trim to perfect overlap.
                if test_coh(reads, VERBOSE=False):
                    trim_coh(reads, trim=0, include_tests=include_tests)

                # Change coordinates into the fragmented reference (primer-trimmed)
                for read in reads:
                    read.pos -= frag_pos[0]
                    read.mpos -= frag_pos[0]

                # Here the tests
                if include_tests:
                    lfr = frags_pos['trim'][n_frag][1] - frags_pos['trim'][n_frag][0]
                    if test_sanity(reads, n_frag, lfr):
                        print 'Tests failed:', reads[0].qname
                        import ipdb; ipdb.set_trace()

                # There we go!
                n_mapped[n_frag] += 1
                file_handles[n_frag].write(reads[0])
                file_handles[n_frag].write(reads[1])

        finally:
            for f in file_handles:
                f.close()
            fo_am.close()
            fo_cm.close()
            fo_um.close()
            fo_lq.close()


    if VERBOSE:
        print 'Trim and divide results: adaID '+adaID
        print 'Total:\t\t', irp + 1
        print 'Mapped:\t\t', sum(n_mapped), n_mapped
        print 'Unmapped/unpaired/tiny:\t', n_unmapped
        print 'Outer primer\t', n_outer
        print 'Crossfrag:\t', n_crossfrag
        print 'Ambiguous:\t', n_ambiguous
        print 'Low-quality:\t', n_lowq

    # Write summary to file
    if summary:
        with open(get_divide_summary_filename(data_folder, adaID), 'a') as f:
            f.write('\n')
            f.write('Trim and divide results: adaID '+adaID+'\n')
            f.write('Total:\t\t'+str(irp + 1)+'\n')
            f.write('Mapped:\t\t'+str(sum(n_mapped))+' '+str(n_mapped)+'\n')
            f.write('Unmapped/unpaired/tiny insert:\t'+str(n_unmapped)+'\n')
            f.write('Outer primer\t'+str(n_outer)+'\n')
            f.write('Crossfrag:\t'+str(n_crossfrag)+'\n')
            f.write('Ambiguous:\t'+str(n_ambiguous)+'\n')
            f.write('Low-quality:\t'+str(n_lowq)+'\n')
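One detail worth isolating from the loop above is the coordinate change into the fragment's frame: both the read's own position and its mate's are shifted by the same constant. A toy version (hypothetical helper, same effect as the in-place shift above):

def shift_pair_to_fragment_sketch(reads, frag_start):
    '''Illustrative: move a read pair into fragment coordinates'''
    for read in reads:
        read.pos -= frag_start     # alignment start of the read
        read.mpos -= frag_start    # alignment start of its mate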
Esempio n. 22
0
def get_local_block(bamfilename,
                    start,
                    end,
                    VERBOSE=0,
                    maxreads=-1,
                    refroi=None):
    '''Extract reads fully covering the region, discarding insertions'''
    import sys
    import pysam
    from hivwholeseq.utils.mapping import pair_generator
    from hivwholeseq.utils.mapping import extract_mapped_reads_subsample_open

    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        block = []

        if maxreads == -1:
            reads_iter = pair_generator(bamfile)
        else:
            reads_iter = extract_mapped_reads_subsample_open(bamfile,
                                                             maxreads,
                                                             VERBOSE=VERBOSE,
                                                             pairs=True)

        for irp, reads in enumerate(reads_iter):
            if VERBOSE >= 2:
                if not ((irp + 1) % 10000):
                    if irp + 1 != 10000:
                        sys.stdout.write("\x1b[1A\n")
                    sys.stdout.write(str(irp + 1))
                    sys.stdout.flush()

            # Sort fwd read first
            is_fwd = reads[0].is_reverse
            reads = [reads[is_fwd], reads[not is_fwd]]

            # Check for coverage of the region
            start_fwd = reads[0].pos
            end_fwd = start_fwd + sum(
                bl for (bt, bl) in reads[0].cigar if bt in (0, 2))
            start_rev = reads[1].pos
            end_rev = start_rev + sum(
                bl for (bt, bl) in reads[1].cigar if bt in (0, 2))
            if start_fwd > start:
                continue
            if end_rev < end:
                continue
            if (end_fwd < end) and (start_rev > start) and (start_rev >
                                                            end_fwd):
                continue

            if VERBOSE >= 3:
                print ' '.join(
                    map('{:>4d}'.format,
                        [start_fwd, end_fwd, start_rev, end_rev]))

            # Gather info from both reads, merge by putting ambiguous nucleotides
            seqs = []
            st_ens = [[start_fwd, end_fwd], [start_rev, end_rev]]
            for ir, read in enumerate(reads):
                (start_read, end_read) = st_ens[ir]
                if (end_read <= start) or (start_read >= end):
                    seqs.append(None)
                    continue

                seq = []
                pos_ref = start_read
                pos_read = 0
                start_block = max(start, start_read) - start
                end_block = min(end, end_read) - start
                for (bt, bl) in read.cigar:
                    if bt == 1:
                        pos_read += bl

                    elif bt == 2:
                        if pos_ref + bl > start:
                            st = max(0, start - pos_ref)
                            en = min(bl, end - pos_ref)
                            seq.append('-' * (en - st))
                            if pos_ref + bl >= end:
                                break
                        pos_ref += bl

                    elif bt == 0:
                        if pos_ref + bl > start:
                            st = max(0, start - pos_ref)
                            en = min(bl, end - pos_ref)
                            seq.append(read.seq[pos_read + st:pos_read + en])
                            if pos_ref + bl >= end:
                                break
                        pos_ref += bl
                        pos_read += bl
                seq = ''.join(seq)
                seqs.append((start_block, end_block, seq))

            # Merge sequences if both fwd and rev cover part of it
            if seqs[0] is None:
                seq = seqs[1][2]
            elif seqs[1] is None:
                seq = seqs[0][2]
            else:
                # The fwd read starts before the rev, because of our insert sizes
                end_block_fwd = seqs[0][1]
                start_block_rev = seqs[1][0]
                overlap = [
                    seqs[0][2][start_block_rev:],
                    seqs[1][2][:end_block_fwd - start_block_rev]
                ]

                # The two reads in a pair should have the same length in the overlap
                if len(overlap[0]) != len(overlap[1]):
                    if VERBOSE >= 3:
                        print 'WARNING:', reads[0].qname, 'not same length in overlap!'
                    continue

                ol_fwd = np.fromstring(overlap[0], 'S1')
                ol_rev = np.fromstring(overlap[1], 'S1')

                ol_fwd[ol_fwd != ol_rev] = 'N'
                seq = seqs[0][2][:start_block_rev] + \
                      ol_fwd.tostring() + \
                      seqs[1][2][end_block_fwd - start_block_rev:]

            block.append(seq)

        if VERBOSE >= 2:
            print ''

    return block
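The merge step above reduces to: where the forward and reverse read disagree inside the overlap, mask the base with N. A self-contained sketch of just that operation:

import numpy as np

def merge_overlap_sketch(seq_fwd, seq_rev):
    '''Illustrative: mask disagreements between equal-length overlaps with N'''
    ol_fwd = np.fromstring(seq_fwd, 'S1')
    ol_rev = np.fromstring(seq_rev, 'S1')
    ol_fwd[ol_fwd != ol_rev] = 'N'
    return ol_fwd.tostring()

print merge_overlap_sketch('ACGT', 'ACTT')   # -> 'ACNT'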
Esempio n. 23
0
def get_coverage_tuples(data_folder,
                        adaID,
                        fragment,
                        mtuples,
                        maxreads=-1,
                        VERBOSE=0):
    '''Get the joint coverage of a list of positions'''
    # Prepare data structures
    mtuples = [np.asarray(tup, int) for tup in mtuples]
    coverage = np.zeros(len(mtuples), int)

    # TODO: what to do if it is covered multiple times? or only some sites?
    covs_pair = [np.zeros(len(tup), bool) for tup in mtuples]

    # Open BAM
    bamfilename = get_mapped_filename(data_folder,
                                      adaID,
                                      fragment,
                                      type='bam',
                                      filtered=True)
    if not os.path.isfile(bamfilename):
        convert_sam_to_bam(bamfilename)
    with pysam.Samfile(bamfilename, 'rb') as bamfile:

        # Iterate over all pairs
        for irp, reads in enumerate(pair_generator(bamfile)):

            # Limit to the first reads
            if irp == maxreads:
                if VERBOSE:
                    print 'Max reads reached:', maxreads
                break

            if VERBOSE >= 3:
                if not ((irp + 1) % 10000):
                    print irp + 1

            # Reinitialize temporary structure
            for cov_pair in covs_pair:
                cov_pair[:] = False

            # Look in both reads
            for read in reads:

                # NOTE: deletions count as covered, because in principle
                # we see that part of the reference
                cigar = read.cigar
                ref_start = read.pos
                ref_end = ref_start + sum(
                    bl for (bt, bl) in cigar if bt in (0, 2))

                # Use numba to accelerate? better not
                if False:
                    add_read(ref_start, ref_end, mtuples, covs_pair)
                else:
                    for cov_pair, mtuple in izip(covs_pair, mtuples):
                        cov_pair[(mtuple >= ref_start)
                                 & (mtuple < ref_end)] = True

            # Check which tuples are fully covered
            for i, cov_pair in enumerate(covs_pair):
                if cov_pair.all():
                    coverage[i] += 1

    return coverage
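A toy run of the covering logic above: a tuple counts as jointly covered by a pair only if every one of its positions falls inside at least one read's aligned span (deletions included):

import numpy as np

mtuples = [np.array([100, 250], int), np.array([100, 600], int)]
covs_pair = [np.zeros(len(tup), bool) for tup in mtuples]
for (ref_start, ref_end) in [(50, 300), (280, 520)]:    # aligned spans of a pair
    for cov_pair, mtuple in zip(covs_pair, mtuples):
        cov_pair[(mtuple >= ref_start) & (mtuple < ref_end)] = True

print [cov_pair.all() for cov_pair in covs_pair]   # -> [True, False]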
Esempio n. 24
0
                if VERBOSE >= 1:
                    print sample.name,

                if sample[fragment] not in ['ok', 'low']:
                    if VERBOSE >= 1:
                        print 'not "ok". skipping'
                    continue

                if VERBOSE >= 1:
                    print 'ok'

                bamfilename = sample.get_mapped_filtered_filename(
                    fragment, decontaminated=True)
                with pysam.Samfile(bamfilename, 'rb') as bamfile:
                    if maxreads == -1:
                        reads = pair_generator(bamfile)
                    else:
                        reads = extract_mapped_reads_subsample_open(
                            bamfile, maxreads, VERBOSE=VERBOSE)

                    dists[sample.name] = get_distance_reads_sequence(
                        refseq,
                        reads,
                        VERBOSE=VERBOSE,
                        score_match=3,
                        score_mismatch=-3)

            hs = {}
            binmax = max(map(max, dists.itervalues()))
            bins = np.arange(0, binmax, 6)
            bincenters = 0.5 * (bins[1:] + bins[:-1])
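A minimal, self-contained sketch of that binning, with made-up distances (the np.histogram call that would follow is assumed, since the snippet is cut short):

import numpy as np

dist = np.array([3, 10, 12, 25, 31])     # made-up read-to-reference distances
bins = np.arange(0, dist.max(), 6)       # [0, 6, 12, 18, 24, 30]
bincenters = 0.5 * (bins[1:] + bins[:-1])
h = np.histogram(dist, bins=bins, density=True)[0]
print zip(bincenters, h)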
Esempio n. 25
0
def get_local_haplotypes(bamfilename, start, end, VERBOSE=0, maxreads=-1,
                         label=''):
    '''Extract reads fully covering the region, discarding insertions'''
    import sys
    import pysam
    from hivwholeseq.utils.mapping import pair_generator
    from hivwholeseq.utils.mapping import extract_mapped_reads_subsample_open

    from collections import Counter
    haplotypes = Counter()

    with pysam.Samfile(bamfilename, 'rb') as bamfile:

        if maxreads == -1:
            reads_iter = pair_generator(bamfile)
        else:
            reads_iter = extract_mapped_reads_subsample_open(bamfile, maxreads,
                                                             VERBOSE=VERBOSE,
                                                             pairs=True)

        for irp, reads in enumerate(reads_iter):
            if VERBOSE >= 2:
                if not ((irp + 1) % 10000):
                    if irp + 1 != 10000:
                        sys.stdout.write("\x1b[1A\n")
                    if label:
                        sys.stdout.write(label+'\t')
                    sys.stdout.write(str(irp + 1))
                    sys.stdout.flush()

            # Sort fwd read first: this is important because with our insert
            # size we know the fwd read starts <= the rev read
            is_fwd = reads[0].is_reverse
            reads = [reads[is_fwd], reads[not is_fwd]]

            # Check for coverage of the region
            start_fwd = reads[0].pos
            end_fwd = start_fwd + sum(bl for (bt, bl) in reads[0].cigar if bt in (0, 2))
            start_rev = reads[1].pos
            end_rev = start_rev + sum(bl for (bt, bl) in reads[1].cigar if bt in (0, 2))
            overlap_len = max(0, end_fwd - start_rev)

            # Various scenarios possible
            if start_fwd > start:
                continue

            if end_rev < end:
                continue

            # No single read covers the whole region AND (the insert has a hole
            # OR a very short overlap)
            if (end_fwd < end) and (start_rev > start) and (overlap_len < 20):
                continue

            # Now the good cases
            if (start_fwd <= start) and (end_fwd >= end):
                seq = trim_read_roi(reads[0], start, end)

            elif (start_rev <= start) and (end_rev >= end):
                seq = trim_read_roi(reads[1], start, end)

            else:
                seqs = [trim_read_roi(read, start, end) for read in reads]
                seq = merge_read_pair(*seqs)

            haplotypes[seq] += 1
            if VERBOSE >= 4:
                import ipdb; ipdb.set_trace()

    if VERBOSE >= 2:
        if irp >= 10000:
            sys.stdout.write('\n')
            sys.stdout.flush()

    return haplotypes
Esempio n. 26
0
    # Read reference (fragmented)
    refseqs_raw = list(SeqIO.parse(get_last_reference(data_folder, adaID, ext=True), "fasta"))
    # Sort according to the chromosomal ordering
    refseqs = []
    for chromosome in chromosomes:
        for seq in refseqs_raw:
            if chromosome == seq.id:
                refseqs.append(seq)
                break
    refs = [np.array(refseq) for refseq in refseqs]

    # Prepare data structures
    muts_all = []

    # Iterate over all pairs
    for i_pairs, reads in enumerate(pair_generator(bamfile)):

        # Limit to the first reads
        if 2 * i_pairs >= maxreads:
            break

        # Log
        if VERBOSE and (not (i_pairs % 10000)):
            print i_pairs,

        # Assign names
        read1 = reads[0]
        read2 = reads[1]

        # Check a few things to make sure we are looking at paired reads
        if read1.qname != read2.qname:
Esempio n. 27
0
def quality_score_along_reads_mapped(read_len, bamfilename,
                                     insertsize_range=[400, 1000],
                                     skipreads=0,
                                     maxreads=-1,
                                     randomreads=True,
                                     VERBOSE=0):
    '''Calculate the quality score along the reads'''
    from hivwholeseq.utils.mapping import trim_read_pair_crossoverhangs as trim_coh
    from hivwholeseq.utils.mapping import pair_generator

    quality = [[[] for j in xrange(read_len)] for i in xrange(2)]

    # Precompute conversion table
    SANGER_SCORE_OFFSET = ord("!")
    q_mapping = dict()
    for letter in range(0, 255):
        q_mapping[chr(letter)] = letter - SANGER_SCORE_OFFSET

    # Iterate over all reads (using fast iterators)
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        if not randomreads:
            reads_all = []
            for i, read_pair in enumerate(pair_generator(bamfile)):
                if i < skipreads:
                    continue
    
                if i == skipreads + maxreads:
                    if VERBOSE:
                        print 'Maximal number of read pairs reached:', maxreads
                    break
    
                if VERBOSE and (not ((i + 1) % 10000)):
                    print i + 1

                reads_all.append(read_pair)

        else:
            reads_all = extract_mapped_reads_subsample_open(bamfile, maxreads,
                                                            VERBOSE=VERBOSE)

        if VERBOSE >= 2:
            print 'Read pairs to analyze:', len(reads_all)

        for reads in reads_all:

            # Check insert size
            read = reads[reads[0].is_reverse]
            if (read.is_unmapped or (not read.is_proper_pair) or \
                (read.isize < insertsize_range[0]) or \
                (read.isize >= insertsize_range[1])):
                continue

            trim_coh(reads, trim=5, include_tests=False)

            for read in reads:
                ip = read.is_read2
                pos_read = 0
                for (bt, bl) in read.cigar:
                    if bt == 1:
                        pos_read += bl
                    elif bt == 2:
                        pass
                    elif bt == 0:
                        qualb = read.qual[pos_read: pos_read + bl]
                        poss_read = np.arange(pos_read, pos_read + bl)
                        if read.is_reverse:
                            poss_read = len(read.seq) - 1 - poss_read

                        for j, qletter in izip(poss_read, qualb):
                            quality[ip][j].append(q_mapping[qletter])

    for qual in quality:
        for qpos in qual:
            qpos.sort()

    return quality
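The conversion table built above is just the Sanger/Phred+33 encoding; for a single quality string the same decoding is:

# Illustrative: decode a Sanger-encoded (Phred+33) quality string
qual_string = 'IIIHGF##'
phred = [ord(c) - ord('!') for c in qual_string]
print phred   # 'I' -> 40, '#' -> 2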