Ejemplo n.º 1
0
def get_allele_count_trajectories(pname,
                                  samplenames,
                                  fragment,
                                  use_PCR1=1,
                                  VERBOSE=0):
    '''Get allele counts for a single patient sample'''
    if VERBOSE >= 1:
        print 'Getting allele counts:', pname, fragment

    from hivwholeseq.patients.filenames import get_initial_reference_filename, \
            get_allele_counts_filename

    refseq = SeqIO.read(get_initial_reference_filename(pname, fragment),
                        'fasta')
    fns = []
    samplenames_out = []
    for samplename_pat in samplenames:

        # PCR1 filter here
        fn1 = get_allele_counts_filename(pname,
                                         samplename_pat,
                                         fragment,
                                         PCR=1)
        fn2 = get_allele_counts_filename(pname,
                                         samplename_pat,
                                         fragment,
                                         PCR=2)
        if use_PCR1 == 0:
            for PCR, fn in enumerate((fn1, fn2), 1):
                if os.path.isfile(fn):
                    fns.append(fn)
                    samplenames_out.append((samplename_pat, PCR))
                    if VERBOSE >= 3:
                        print samplename_pat, PCR

        elif use_PCR1 == 1:
            if os.path.isfile(fn1):
                fns.append(fn1)
                samplenames_out.append((samplename_pat, 1))
                if VERBOSE >= 3:
                    print samplename_pat, 1
            elif os.path.isfile(fn2):
                fns.append(fn2)
                samplenames_out.append((samplename_pat, 2))
                if VERBOSE >= 3:
                    print samplename_pat, 2
        elif use_PCR1 == 2:
            if os.path.isfile(fn1):
                fns.append(fn1)
                samplenames_out.append((samplename_pat, 1))
                if VERBOSE >= 3:
                    print samplename_pat, 1

    act = np.zeros((len(fns), len(alpha), len(refseq)), int)
    for i, fn in enumerate(fns):
        # Average directly over read types?
        act[i] = np.load(fn).sum(axis=0)

    return (samplenames_out, act)
Ejemplo n.º 2
0
def get_allele_frequency_trajectories(pname,
                                      samples,
                                      fragment,
                                      qual_min=30,
                                      VERBOSE=0):
    '''Scan the reads of all samples and write to a single file'''
    if VERBOSE >= 1:
        print 'Getting allele frequency trajectories:', pname, fragment

    from hivwholeseq.patients.filenames import get_initial_reference_filename, \
            get_mapped_to_initial_filename, get_allele_frequency_trajectories_filename, \
            get_allele_count_trajectories_filename
    from hivwholeseq.utils.one_site_statistics import get_allele_counts_insertions_from_file, \
            get_allele_counts_insertions_from_file_unfiltered, \
            filter_nus

    refseq = SeqIO.read(get_initial_reference_filename(pname, fragment),
                        'fasta')

    # Prepare output data structures
    cos_traj = np.zeros((len(samples), len(alpha), len(refseq)), int)
    nus_traj = np.zeros((len(samples), len(alpha), len(refseq)))

    for it, sample in enumerate(samples):
        if VERBOSE >= 2:
            print pname, it, sample

        input_filename = get_mapped_to_initial_filename(pname,
                                                        sample,
                                                        fragment,
                                                        type='bam')
        (counts, inserts) = get_allele_counts_insertions_from_file_unfiltered(
            input_filename, len(refseq), qual_min=qual_min, VERBOSE=VERBOSE)
        # Take the total counts, blending in the read types
        cou = counts.sum(axis=0)
        cos_traj[it] = cou

        # Take the filtered frequencies, blending in the read types
        nu = filter_nus(counts)
        nus_traj[it] = nu

    #FIXME: test, etc.

    return (cos_traj, nus_traj)
def make_index_and_hash(pname, fragment, VERBOSE=0):
    '''Make index and hash files for reference'''
    # 1. Make genome index file
    stdout = sp.check_output([stampy_bin,
                              '--overwrite',
                              '--species="HIV fragment '+fragment+'"',
                              '-G', get_initial_index_filename(pname, fragment, ext=False),
                              get_initial_reference_filename(pname, fragment),
                              ],
                              stderr=sp.STDOUT)
    if VERBOSE:
        print 'Built index: '+pname+' '+fragment
    
    # 2. Build a hash file
    stdout = sp.check_output([stampy_bin,
                              '--overwrite',
                              '-g', get_initial_index_filename(pname, fragment, ext=False),
                              '-H', get_initial_hash_filename(pname, fragment, ext=False),
                              ],
                              stderr=sp.STDOUT)
    if VERBOSE:
        print 'Built hash: '+pname+' '+fragment
Ejemplo n.º 4
0
    if VERBOSE >= 3:
        print 'fragments', fragments

    for fragment in fragments:
        inses = []
        for samplename, sample in samples.iterrows():
            if submit:
                fork_self(samplename, fragment, VERBOSE=VERBOSE, qual_min=qual_min)
                continue

            if VERBOSE >= 1:
                print fragment, samplename

            sample = SamplePat(sample)
            pname = sample.patient
            refseq = SeqIO.read(get_initial_reference_filename(pname, fragment), 'fasta')

            fn = sample.get_mapped_filtered_filename(fragment, PCR=PCR)
            if not os.path.isfile(fn):
                warn('No BAM file found', NoDataWarning)
                continue

            _, inse = gac(fn, len(refseq), qual_min=qual_min, VERBOSE=VERBOSE)
            inses.append(inse)

            if save_to_file:
                fn_out = sample.get_insertions_filename(fragment, PCR=PCR,
                                                        qual_min=qual_min)
                save_insertions(fn_out, inse)

                if VERBOSE >= 2:
Ejemplo n.º 5
0
                          maxreads=maxreads,
                          use_tests=use_tests)
        sys.exit()

    counts_all = []
    for fragment in fragments:
        counts = []
        for samplename, sample in samples.iterrows():
            sample = SamplePat(sample)
            pname = sample.patient

            if VERBOSE >= 2:
                print pname, fragment, samplename

            refseq = SeqIO.read(
                get_initial_reference_filename(pname, fragment), 'fasta')

            fn_out = sample.get_allele_cocounts_filename(fragment,
                                                         PCR=PCR,
                                                         qual_min=qual_min,
                                                         compressed=True)
            fn = sample.get_mapped_filtered_filename(
                fragment, PCR=PCR, decontaminated=True)  #FIXME
            if save_to_file:
                cocount = gac(fn,
                              len(refseq),
                              maxreads=maxreads,
                              VERBOSE=VERBOSE,
                              qual_min=qual_min,
                              use_tests=use_tests)
Ejemplo n.º 6
0
 def get_reference_filename(self, fragment, format='fasta'):
     '''Return the path of the initial reference used for mapping.

     Parameters:
        fragment: genome fragment label
        format: file format of the reference (default 'fasta')
     '''
     from hivwholeseq.patients.filenames import get_initial_reference_filename
     refname = get_initial_reference_filename(self.name, fragment, format)
     return refname
            print sample_seq.adapter

        cons_rec = SeqIO.read(get_consensus_filename(data_folder, adaID, fragment),
                              'fasta')
        frag_spec = sample_seq.regions_complete[\
                            sample_seq.regions_generic.index(fragment)]

        # Complement PCR2 initial reference with tails from a later sample
        if int(sample_seq.PCR) == 2:
            (frag_spec, cons_rec) = complement_consensus_PCR2(cons_rec, patient,
                                                              fragment,
                                                              samplen,
                                                              VERBOSE=VERBOSE)

        conss = str(cons_rec.seq)
        output_filename = get_initial_reference_filename(pname, fragment)

        seq_in = SeqRecord(Seq(conss, unambiguous_dna),
                           id='cons_init_p'+pname+'_'+frag_spec,
                           name='cons_init_p'+pname+'_'+frag_spec,
                           description='Initial consensus of patient '+pname+\
                                       ', fragment '+frag_spec)

        # If absent, just copy the thing over
        if not os.path.isfile(output_filename):
            if VERBOSE >= 1:
                print pname+': initial consensus file created for sample', \
                        sample_seq.name, 'fragment', fragment
            SeqIO.write(seq_in, output_filename, 'fasta')

        # if present, check whether the sequences are the same (if so, no
    if pnames is not None:
        samples = samples.loc[samples.patient.isin(pnames)]
    elif samplenames is not None:
        samples = samples.loc[samples.index.isin(samplenames)]

    if VERBOSE >= 2:
        print 'samples', samples.index.tolist()

    for samplename, sample in samples.iterrows():
        if VERBOSE >= 1:
            print samplename

        sample = SamplePat(sample)
        pname = sample.patient
        conss_genomewide = SeqIO.read(
            get_initial_reference_filename(pname, 'genomewide'), 'fasta')

        # Collect the allele counts (where possible)
        acs = []
        for fragment in ['F' + str(i) for i in xrange(1, 7)]:
            try:
                ref = ''.join(
                    SeqIO.read(get_initial_reference_filename(pname, fragment),
                               'fasta'))
                ac = sample.get_allele_counts(fragment, merge_read_types=False)
                acs.append((fragment, ref, ac))
            except IOError:
                continue

        if not len(acs):
            if VERBOSE >= 1:
Ejemplo n.º 9
0
def filter_mapped_reads(sample,
                        fragment,
                        PCR=1,
                        maxreads=-1,
                        VERBOSE=0,
                        n_cycles=600,
                        max_mismatches=100,
                        match_len_min=30,
                        trim_bad_cigars=3,
                        summary=True):
    '''Filter the reads to good chunks'''
    pname = sample.patient
    samplename_pat = sample.name
    samplenames_seq = sample.samples_seq.index.tolist()

    if VERBOSE >= 1:
        print 'Filtering reads:', pname, samplename_pat, fragment, PCR

    reffilename = get_initial_reference_filename(pname, fragment)
    refseq = SeqIO.read(reffilename, 'fasta')
    ref = np.array(refseq)

    outfilename = get_mapped_filtered_filename(pname,
                                               samplename_pat,
                                               fragment,
                                               type='bam',
                                               PCR=PCR,
                                               decontaminated=False)
    trashfilename = outfilename[:-4] + '_trashed.bam'

    infilenames = [
        get_mapped_to_initial_filename(pname,
                                       samplename_pat,
                                       samplename,
                                       fragment,
                                       type='bam',
                                       PCR=PCR)
        for samplename in samplenames_seq
    ]
    infilenames = filter(os.path.isfile, infilenames)
    if not len(infilenames):
        print('WARNING: No mapped files found: ' +
              ', '.join([pname, samplename_pat, fragment,
                         str(PCR)]))
        return

    # Take reads evenly distributed across sequencing repetitions
    maxreads /= len(infilenames)

    if VERBOSE >= 2:
        print 'Input mapped filenames:',
        if len(infilenames) >= 2:
            print ''
        print '\n'.join(infilenames)

    # Use first file as template for the new bamfile
    infilename = infilenames[0]
    if not os.path.isfile(infilename):
        convert_sam_to_bam(infilename)

    with pysam.Samfile(infilename, 'rb') as bamfile:
        with pysam.Samfile(outfilename, 'wb', template=bamfile) as outfile,\
             pysam.Samfile(trashfilename, 'wb', template=bamfile) as trashfile:

            n_good = 0
            n_wrongname = 0
            n_unmapped = 0
            n_unpaired = 0
            n_mutator = 0
            n_badcigar = 0
            n_tiny = 0
            binsize = 200
            hist_distance_from_consensus = np.zeros(n_cycles + 1, int)
            hist_dist_along = np.zeros((len(ref) // binsize + 1, n_cycles + 1),
                                       int)

            # Iterate over input files, the first is already open
            for infilename in infilenames:

                if infilename != infilename[0]:
                    file_open = lambda: pysam.Samfile(infilename, 'rb')
                    file_close = lambda f: f.close()

                    if not os.path.isfile(infilename):
                        convert_sam_to_bam(infilename)

                else:
                    file_open = lambda: bamfile
                    file_close = lambda f: None

                try:
                    bamfile = file_open()

                    for irp, reads in enumerate(pair_generator(bamfile)):
                        if irp == maxreads:
                            break

                        pair_type = filter_read_pair(
                            reads,
                            ref,
                            hist_distance_from_consensus,
                            hist_dist_along,
                            binsize,
                            max_mismatches=max_mismatches,
                            match_len_min=match_len_min,
                            trim_bad_cigars=trim_bad_cigars,
                            VERBOSE=VERBOSE)

                        if pair_type == 'unmapped':
                            n_unmapped += 1
                            map(trashfile.write, reads)

                        elif pair_type == 'unpaired':
                            n_unpaired += 1
                            map(trashfile.write, reads)

                        elif pair_type == 'mutator':
                            n_mutator += 1
                            map(trashfile.write, reads)

                        elif pair_type == 'bad_cigar':
                            n_badcigar += 1
                            map(trashfile.write, reads)

                        elif pair_type == 'tiny':
                            n_tiny += 1
                            map(trashfile.write, reads)

                        else:
                            n_good += 1
                            map(outfile.write, reads)

                finally:
                    file_close(bamfile)

    if VERBOSE >= 1:
        print 'Read pairs: '
        print 'Good:', n_good
        print 'Unmapped:', n_unmapped
        print 'Unpaired:', n_unpaired
        print 'Many-mutations:', n_mutator
        print 'Bad CIGARs:', n_badcigar
        print 'Tiny:', n_tiny
        print

    if summary:
        sfn = get_filter_mapped_init_summary_filename(pname,
                                                      samplename_pat,
                                                      fragment,
                                                      PCR=PCR)
        with open(sfn, 'a') as f:
            f.write('Filter results: pname ' + pname + ', ' + samplename_pat +
                    ', ' + fragment + '\n')
            f.write('Total:\t\t\t' + str(irp + 1) + '\n')
            f.write('Good:\t\t\t' + str(n_good) + '\n')
            f.write('Unmapped:\t\t' + str(n_unmapped) + '\n')
            f.write('Unpaired:\t\t' + str(n_unpaired) + '\n')
            f.write('Many-mutations:\t\t' + str(n_mutator) + '\n')
            f.write('Bad CIGARs:\t\t' + str(n_badcigar) + '\n')
            f.write('Tiny:\t\t\t' + str(n_tiny) + '\n')
Ejemplo n.º 10
0
 def get_reference_filename(self, fragment, format='fasta'):
     '''Get filename of the initial reference used for mapping this patient'''
     # Local import (presumably avoids an import cycle — TODO confirm)
     from hivwholeseq.patients.filenames import get_initial_reference_filename
     return get_initial_reference_filename(self.name, fragment, format)
Ejemplo n.º 11
0
def filter_mapped_reads(sample, fragment,
                        PCR=1,
                        maxreads=-1,
                        VERBOSE=0,
                        n_cycles=600,
                        max_mismatches=100,
                        match_len_min=30,
                        trim_bad_cigars=3,
                        summary=True):
    '''Filter the reads to good chunks'''
    pname = sample.patient
    samplename_pat = sample.name
    samplenames_seq = sample.samples_seq.index.tolist()

    if VERBOSE >= 1:
        print 'Filtering reads:', pname, samplename_pat, fragment, PCR

    reffilename = get_initial_reference_filename(pname, fragment)
    refseq = SeqIO.read(reffilename, 'fasta')
    ref = np.array(refseq)

    outfilename = get_mapped_filtered_filename(pname, samplename_pat, fragment,
                                               type='bam', PCR=PCR,
                                               decontaminated=False)
    trashfilename = outfilename[:-4]+'_trashed.bam'

    infilenames = [get_mapped_to_initial_filename(pname, samplename_pat,
                                                 samplename, fragment,
                                                 type='bam', PCR=PCR)
                   for samplename in samplenames_seq]
    infilenames = filter(os.path.isfile, infilenames)
    if not len(infilenames):
        print ('WARNING: No mapped files found: '+', '.join([pname, samplename_pat,
                                                              fragment, str(PCR)]))
        return

    # Take reads evenly distributed across sequencing repetitions
    maxreads /= len(infilenames)

    if VERBOSE >= 2:
        print 'Input mapped filenames:',
        if len(infilenames) >= 2:
            print ''
        print '\n'.join(infilenames)

    # Use first file as template for the new bamfile
    infilename = infilenames[0]
    if not os.path.isfile(infilename):
        convert_sam_to_bam(infilename)
 
    with pysam.Samfile(infilename, 'rb') as bamfile:
        with pysam.Samfile(outfilename, 'wb', template=bamfile) as outfile,\
             pysam.Samfile(trashfilename, 'wb', template=bamfile) as trashfile:
 
            n_good = 0
            n_wrongname = 0
            n_unmapped = 0
            n_unpaired = 0
            n_mutator = 0
            n_badcigar = 0
            n_tiny = 0
            binsize = 200
            hist_distance_from_consensus = np.zeros(n_cycles + 1, int)
            hist_dist_along = np.zeros((len(ref) // binsize + 1, n_cycles + 1), int)

            # Iterate over input files, the first is already open
            for infilename in infilenames:

                if infilename != infilename[0]:
                    file_open = lambda: pysam.Samfile(infilename, 'rb')
                    file_close = lambda f: f.close()

                    if not os.path.isfile(infilename):
                        convert_sam_to_bam(infilename)

                else:
                    file_open = lambda: bamfile
                    file_close = lambda f: None

                try:
                    bamfile = file_open()
    
                    for irp, reads in enumerate(pair_generator(bamfile)):
                        if irp == maxreads:
                            break

                        pair_type = filter_read_pair(reads, ref,
                                                     hist_distance_from_consensus,
                                                     hist_dist_along,
                                                     binsize,
                                                     max_mismatches=max_mismatches,
                                                     match_len_min=match_len_min,
                                                     trim_bad_cigars=trim_bad_cigars,
                                                     VERBOSE=VERBOSE)
                    
                        if pair_type == 'unmapped':
                            n_unmapped += 1
                            map(trashfile.write, reads)

                        elif pair_type == 'unpaired':
                            n_unpaired += 1
                            map(trashfile.write, reads)

                        elif pair_type == 'mutator':
                            n_mutator += 1
                            map(trashfile.write, reads)

                        elif pair_type == 'bad_cigar':
                            n_badcigar += 1
                            map(trashfile.write, reads)

                        elif pair_type == 'tiny':
                            n_tiny += 1
                            map(trashfile.write, reads)

                        else:
                            n_good += 1
                            map(outfile.write, reads)

                finally:
                    file_close(bamfile)

    if VERBOSE >= 1:
        print 'Read pairs: '
        print 'Good:', n_good
        print 'Unmapped:', n_unmapped
        print 'Unpaired:', n_unpaired
        print 'Many-mutations:', n_mutator
        print 'Bad CIGARs:', n_badcigar
        print 'Tiny:', n_tiny
        print

    if summary:
        sfn = get_filter_mapped_init_summary_filename(pname, samplename_pat, fragment, PCR=PCR)
        with open(sfn, 'a') as f:
            f.write('Filter results: pname '+pname+', '+samplename_pat+', '+fragment+'\n')
            f.write('Total:\t\t\t'+str(irp + 1)+'\n')
            f.write('Good:\t\t\t'+str(n_good)+'\n')
            f.write('Unmapped:\t\t'+str(n_unmapped)+'\n')
            f.write('Unpaired:\t\t'+str(n_unpaired)+'\n')
            f.write('Many-mutations:\t\t'+str(n_mutator)+'\n')
            f.write('Bad CIGARs:\t\t'+str(n_badcigar)+'\n')
            f.write('Tiny:\t\t\t'+str(n_tiny)+'\n')
    samples = lssp()
    if pnames is not None:
        samples = samples.loc[samples.patient.isin(pnames)]
    elif samplenames is not None:
        samples = samples.loc[samples.index.isin(samplenames)]

    if VERBOSE >= 2:
        print 'samples', samples.index.tolist()

    for samplename, sample in samples.iterrows():
        if VERBOSE >= 1:
            print samplename

        sample = SamplePat(sample)
        pname = sample.patient
        conss_genomewide = SeqIO.read(get_initial_reference_filename(pname, 'genomewide'), 'fasta')

        # Collect the allele counts (where possible)
        acs = []
        for fragment in ['F'+str(i) for i in xrange(1, 7)]:
            try:
                ref = ''.join(SeqIO.read(get_initial_reference_filename(pname, fragment), 'fasta'))
                ac = sample.get_allele_counts(fragment, merge_read_types=False)
                acs.append((fragment, ref, ac))
            except IOError:
                continue

        if not len(acs):
            if VERBOSE >= 1:
                print 'No data found: skipping'
            continue
Ejemplo n.º 13
0
        cons_rec = SeqIO.read(
            get_consensus_filename(data_folder, adaID, fragment), 'fasta')
        frag_spec = sample_seq.regions_complete[\
                            sample_seq.regions_generic.index(fragment)]

        # Complement PCR2 initial reference with tails from a later sample
        if int(sample_seq.PCR) == 2:
            (frag_spec, cons_rec) = complement_consensus_PCR2(cons_rec,
                                                              patient,
                                                              fragment,
                                                              samplen,
                                                              VERBOSE=VERBOSE)

        conss = str(cons_rec.seq)
        output_filename = get_initial_reference_filename(pname, fragment)

        seq_in = SeqRecord(Seq(conss, unambiguous_dna),
                           id='cons_init_p'+pname+'_'+frag_spec,
                           name='cons_init_p'+pname+'_'+frag_spec,
                           description='Initial consensus of patient '+pname+\
                                       ', fragment '+frag_spec)

        # If absent, just copy the thing over
        if not os.path.isfile(output_filename):
            if VERBOSE >= 1:
                print pname+': initial consensus file created for sample', \
                        sample_seq.name, 'fragment', fragment
            SeqIO.write(seq_in, output_filename, 'fasta')

        # if present, check whether the sequences are the same (if so, no