def get_other_consensi_seqrun(dataset, samplename, fragment, VERBOSE=0):
    '''Collect the consensus records of all samples but the focal one

    Parameters:
       dataset: object whose .samples table is walked with iterrows(); every
                row is wrapped into a SampleSeq to locate its consensus file.
       samplename: index label of the focal sample, which is left out.
       fragment: fragment name, forwarded to get_consensus_filename.
       VERBOSE: verbosity level; from 3 on, missing consensus files are
                reported on stdout.

    Returns:
       list of the records read via SeqIO.read(..., 'fasta'), in table order.
       Samples without a consensus file on disk contribute nothing.
    '''
    others = []
    for name, row in dataset.samples.iterrows():
        # The focal sample never contributes to the result
        if name != samplename:
            cons_fn = SampleSeq(row).get_consensus_filename(fragment)
            if os.path.isfile(cons_fn):
                others.append(SeqIO.read(cons_fn, 'fasta'))
            elif VERBOSE >= 3:
                print(name + ': consensus for fragment ' + fragment + ' not found, skipping')
    return others
Esempio n. 2
0
def get_other_consensi_seqrun(dataset, samplename, fragment, VERBOSE=0):
    '''Read the consensi of every sample in the dataset except samplename

    Parameters:
       dataset: provides the .samples table iterated row by row.
       samplename: label of the sample to exclude from the output.
       fragment: fragment whose consensus file is looked up for each sample.
       VERBOSE: verbosity level (>= 3 announces skipped samples).

    Returns:
       list with one SeqIO.read(..., 'fasta') result per other sample whose
       consensus file exists; samples lacking the file are skipped.
    '''
    found = []
    for other_name, other_row in dataset.samples.iterrows():
        if other_name == samplename:
            continue

        fasta_fn = SampleSeq(other_row).get_consensus_filename(fragment)
        if os.path.isfile(fasta_fn):
            found.append(SeqIO.read(fasta_fn, 'fasta'))
            continue

        # No consensus on disk for this sample: optionally say so, then move on
        if VERBOSE >= 3:
            message = other_name + ': consensus for fragment ' + fragment + ' not found, skipping'
            print(message)
    return found
def complement_consensus_PCR2(cons_rec, patient, fragment, samplen, VERBOSE=0):
    '''Complement consensus from PCR2 with wings from later PCR1 sample

    The first sequenced sample with PCR == 1 found among the patient samples
    after position samplen acts as donor. The first and the last 20 positions
    of cons_rec are located in the donor consensus, and whatever the donor has
    before and after them is attached to cons_rec.

    Parameters:
       cons_rec: consensus of the PCR2 sample. It is sliced and concatenated
                 with plain strings, so presumably a Biopython SeqRecord
                 (to be confirmed against the callers).
       patient: object whose .samples table is scanned from row samplen + 1
                on; each row must carry a 'samples seq' table.
       fragment: fragment name, as listed in the donor's regions_generic.
       samplen: position of the focal sample within patient.samples.
       VERBOSE: verbosity level; from 1 on, the donor sample is printed.

    Returns:
       tuple (frag_spec, extended consensus), where frag_spec is the entry of
       the donor's regions_complete that corresponds to fragment.

    Raises:
       ValueError: if no later sample sequenced with PCR1 exists. Without this
                   check the function used to fail further down with an
                   UnboundLocalError on sampleseq_later.
    '''
    # Look for the first PCR1 sequenced sample at a later time point
    sampleseq_later = None
    for _, sampletmp in patient.samples.iloc[samplen + 1:].iterrows():
        for _, sampleseqtmp in sampletmp['samples seq'].iterrows():
            sampleseqtmp = SampleSeq(sampleseqtmp)
            if int(sampleseqtmp.PCR) == 1:
                sampleseq_later = sampleseqtmp
                break
        if sampleseq_later is not None:
            break

    if sampleseq_later is None:
        raise ValueError('No later PCR1 sample found to complement the PCR2 '
                         'consensus of fragment ' + str(fragment))

    # Imported only at this point: the seed finders are needed only once a
    # donor sample is known to exist
    from hivwholeseq.utils.sequence import find_seed_imperfect, rfind_seed_imperfect

    adaID_later = sampleseq_later['adapter']
    data_folder_later = sampleseq_later.sequencing_run.folder
    cons_rec_later = SeqIO.read(
        get_consensus_filename(data_folder_later, adaID_later, fragment),
        'fasta')
    conss_later = str(cons_rec_later.seq)

    # Anchor both edges of the PCR2 consensus in the donor; end points just
    # past the 20 positions matched at the trailing edge
    start = find_seed_imperfect(cons_rec_later, cons_rec[:20])
    end = rfind_seed_imperfect(cons_rec_later, cons_rec[-20:]) + 20

    if VERBOSE >= 1:
        # Single call, same output line as two chained print statements
        print('%s %s %s %s' % ('Complementing PCR2 consensus with later PCR1:',
                               sampleseq_later.name,
                               sampleseq_later['seq run'],
                               sampleseq_later.adapter))

    frag_spec = sampleseq_later.regions_complete[
        sampleseq_later.regions_generic.index(fragment)]

    return (frag_spec, conss_later[:start] + cons_rec + conss_later[end:])
Esempio n. 4
0
    # Specify the dataset
    dataset = load_sequencing_run(seq_run)
    data_folder = dataset.folder

    # If the script is called with no adaID, iterate over all
    dataset.discard_nondivided_samples()
    samples = dataset.samples
    if adaIDs is not None:
        samples = samples.loc[samples.adapter.isin(adaIDs)]

    if VERBOSE >= 3:
        print 'adaIDs', samples.adapter

    # Iterate over all requested samples
    for samplename, sample in samples.iterrows():
        sample = SampleSeq(sample)
        adaID = sample.adapter

        # If the script is called with no fragment, iterate over all
        if not fragments:
            fragments_sample = sample.regions_complete
        else:
            from re import findall
            fragments_all = sample.regions_complete
            fragments_sample = []
            for fragment in fragments:
                frs = filter(lambda x: fragment in x, fragments_all)
                if len(frs):
                    fragments_sample.append(frs[0])
            if 'genomewide' in fragments:
                fragments_sample.append('genomewide')
        if adaIDs is not None:
            samples = samples.loc[samples.adapter.isin(adaIDs)]
    
        if use_pats:
            samples = samples.loc[samples['patient sample'] != 'nan']
    else:
        samples = samples.loc[samplenames]

    if fragments is None:
        fragments = ['F'+str(i+1) for i in xrange(6)]

    alis = {fr: AlignIO.read(get_consensi_alignment_filename('all', fr), 'fasta')
            for fr in fragments}

    for samplename, sample in samples.iterrows():
        sample = SampleSeq(sample)
        data_folder = sample.seqrun_folder
        adaID = sample.adapter
        pname = sample.patientname

        for fragment in fragments:
            if VERBOSE >= 1:
                print sample['seq run'], adaID, fragment, samplename,

            # Read the summary filename of the filter_mapped, and find out whether
            # there are many distant reads (a few are normal)
            fn = get_filter_mapped_summary_filename(data_folder, adaID, fragment)
            if os.path.isfile(fn):
                found = False
                with open(fn, 'r') as f:
                    for line in f:
Esempio n. 6
0
    samples = load_samples_sequenced()
    if pnames is not None:
        samples = samples.loc[samples.patient.isin(pnames)]
    elif samplenames is not None:
        samples = samples.loc[samples.index.isin(samplenames)]

    for samplename, sample in samples.iterrows():
        sample = SamplePat(sample)
        if VERBOSE >= 1:
            print samplename

        dist_hists = []
        samples_seq = sample.get_sequenced_samples()
        samples_seq = samples_seq.loc[samples_seq.PCR == 1]
        for samplename_seq, sample_seq in samples_seq.iterrows():
            sample_seq = SampleSeq(sample_seq)
            data_folder = sample_seq.seqrun_folder
            adaID = sample_seq.adapter

            for fragment in fragments:
                try:
                    dist_hist = get_distance_histogram(data_folder,
                                                       adaID,
                                                       fragment,
                                                       VERBOSE=VERBOSE)
                except IOError:
                    continue
                dist_hists.append((samplename_seq, fragment, dist_hist))

        dist_hists.sort(key=itemgetter(1))
            samplenames_pat = samples_pat.index[ind]
            samples_seq = samples_seq.loc[samples_seq['patient sample'].isin(samplenames_pat)]
        else:
            samples_seq = samples_seq.loc[samples_seq.index.isin(samplenames)]

    if VERBOSE >= 2:
        print 'samples', samples_seq.index.tolist()
        
    # If the script is called with no fragment, iterate over all
    if not fragments:
        fragments = ['F'+str(i) for i in xrange(1, 7)]
    if VERBOSE >= 3:
        print 'fragments', fragments

    for samplename, sample in samples_seq.iterrows():
        sample = SampleSeq(sample)

        samplename_pat = sample['patient sample']
        sample_pat = samples_pat.loc[samplename_pat] 
        sample['patient'] = pname = sample_pat.patient
        PCR = int(sample.PCR)
        fragments_sample = sorted(set(sample.regions_generic) & set(fragments))

        if VERBOSE:
            print samplename, samplename_pat, pname, PCR

        if not skip_hash:
            make_output_folders(pname, samplename_pat, PCR=PCR, VERBOSE=VERBOSE)

        for fragment in fragments_sample:
            if VERBOSE:
Esempio n. 8
0
    data_folder = dataset.folder

    samples = dataset.samples

    if not fragments:
        fragments = ['F' + str(i + 1) for i in xrange(6)]

    matrices = {}
    for fragment in fragments:
        if VERBOSE:
            print fragment

        samples_frag = samples.loc[[
            os.path.isfile(
                SampleSeq(s).get_mapped_filename(fragment,
                                                 type='bam',
                                                 filtered=False))
            for sn, s in samples.iterrows()
        ]]
        n_samples = len(samples_frag)

        consensi = [
            SeqIO.read(SampleSeq(s).get_consensus_filename(fragment), 'fasta')
            for sn, s in samples_frag.iterrows()
        ]
        labels = [(sn, s.adapter) for sn, s in samples_frag.iterrows()]
        m = np.zeros((n_samples, n_samples), int)

        for si, (samplename, sample) in enumerate(samples_frag.iterrows()):
            if VERBOSE == 1:
                print samplename,
    patient.discard_nonsequenced_samples()

    mkdirs(get_initial_reference_foldername(pname))

    if not fragments:
        fragments = ['F' + str(i) for i in xrange(1, 7)]
    if VERBOSE >= 3:
        print 'fragments', fragments

    if samplename is None:
        sample = SamplePat(patient.samples.iloc[samplen])
    else:
        sample = load_sample_sequenced(samplename)

    for fragment in fragments:
        sample_seq = SampleSeq(sample.samples_seq.iloc[repn])

        seq_run = sample_seq['seq run']
        adaID = sample_seq['adapter']
        dataset = sample_seq.sequencing_run
        data_folder = dataset.folder

        if VERBOSE:
            print 'Initial sample:', sample_seq.name, sample_seq['seq run'],
            print sample_seq.adapter

        cons_rec = SeqIO.read(
            get_consensus_filename(data_folder, adaID, fragment), 'fasta')
        frag_spec = sample_seq.regions_complete[\
                            sample_seq.regions_generic.index(fragment)]
Esempio n. 10
0
    savefig = args.savefig
    only_filt = args.only_filt

    # Specify the dataset
    dataset = load_sequencing_run(seq_run)
    data_folder = dataset['folder']

    # If the script is called with no adaID, iterate over all
    if not adaIDs:
        adaIDs = dataset['adapters']
    if VERBOSE >= 3:
        print 'adaIDs', adaIDs

    # Iterate over all requested samples
    for adaID in adaIDs:

        # If the script is called with no fragment, iterate over all
        sample = SampleSeq(dataset.samples.loc[dataset.samples.adapter == adaID].iloc[0])
        samplename = sample.name
        if not fragments:
            fragments_sample = sample.regions_generic
        else:
            fragments_sample = sorted(set(fragments) & set(sample.regions_generic))

        if not only_filt:
            plot_minor_allele_frequency(data_folder, adaID, fragments_sample,
                                        VERBOSE=VERBOSE, savefig=savefig)
        else:
            plot_minor_allele_frequency_filtered(data_folder, adaID, fragments_sample,
                                                 VERBOSE=VERBOSE, savefig=savefig)