Ejemplo n.º 1
0
def plot_cumulative_histogram(data_folder, adaID, fragment, insert_sizes,
                              title=None,
                              ax=None,
                              show=False, savefig=False,
                              **kwargs):
    '''Plot the cumulative histogram of insert sizes.

    Parameters:
       data_folder, adaID, fragment: only used to build the output filename
          and the figure folder when savefig is true.
       insert_sizes: insert sizes, plotted on the x axis against evenly spaced
          fractions between 0 and 1.
          NOTE(review): they are not sorted here, so presumably the caller
          passes them already sorted -- confirm.
       title: optional title for the axes.
       ax: axes to draw on; if None, a new figure with one axes is created.
       show: if true, switch to interactive mode and show the figure.
       savefig: if true, save the figure under the name returned by
          get_insert_size_distribution_cumulative_filename.
       **kwargs: passed on to ax.plot.
    '''
    import matplotlib.pyplot as plt
    if ax is None:
        fig, ax = plt.subplots(1, 1)
    else:
        # The caller supplied the axes: recover their parent figure, otherwise
        # `fig` is undefined and savefig=True fails with a NameError.
        fig = ax.get_figure()

    ax.plot(insert_sizes, np.linspace(0, 1, len(insert_sizes)), **kwargs)
    ax.set_xlabel('Insert size')
    ax.set_ylabel('Cumulative fraction')
    ax.set_xlim(-1, 1000)
    ax.set_ylim(-0.02, 1.02)
    if title is not None:
        ax.set_title(title)

    plt.tight_layout()

    if show:
        plt.ion()
        plt.show()

    if savefig:
        from hivwholeseq.utils.generic import mkdirs
        from hivwholeseq.sequencing.filenames import get_figure_folder
        output_filename = get_insert_size_distribution_cumulative_filename(data_folder,
                                                                           adaID,
                                                                           fragment)
        # Make sure the figure folder exists before writing into it
        mkdirs(get_figure_folder(data_folder, adaID))
        fig.savefig(output_filename)
Ejemplo n.º 2
0
def make_output_folders(data_folder, adaID, VERBOSE=0):
    '''Make output folders for the script'''
    from hivwholeseq.utils.generic import mkdirs
    dirname = data_folder + foldername_adapter(adaID) + 'map_iter/'
    mkdirs(dirname)
    if VERBOSE:
        print 'Folder created:', dirname
Ejemplo n.º 3
0
def make_output_folders(data_folder,
                        adapters_designed,
                        VERBOSE=0,
                        summary=True):
    '''Make output folders for all adapters and unclassified (e.g. PhiX)'''
    from hivwholeseq.utils.generic import mkdirs

    # Make folders for the samples
    for (adaID, s) in adapters_designed:
        dirname = foldername_adapter(adaID)
        mkdirs(data_folder + dirname)
        if VERBOSE:
            print 'Folder created:', dirname

    # Make a default directory for unclassified reads
    mkdirs(data_folder + 'unclassified_reads')
    if VERBOSE:
        print 'Folder created: unclassified reads'

    if summary:
        with open(get_demultiplex_summary_filename(data_folder), 'a') as f:
            f.write('\n')
            f.write(
                'Folders created for samples and unclassified reads (including phix).'
            )
            f.write('\n')
Ejemplo n.º 4
0
def plot_histogram(data_folder, adaID, fragment, h,
                   title=None,
                   ax=None,
                   show=False, savefig=False,
                   **kwargs):
    '''Plot the histogram of insert sizes.

    Parameters:
       data_folder, adaID, fragment: only used to build the output filename
          and the figure folder when savefig is true.
       h: pair where h[0] holds the y values and h[1] the bin edges; the
          curve is drawn at the midpoints of consecutive edges.
          NOTE(review): this looks like the output of numpy.histogram --
          confirm against the caller.
       title: optional title for the axes.
       ax: axes to draw on; if None, a new figure with one axes is created.
       show: if true, switch to interactive mode and show the figure.
       savefig: if true, save the figure under the name returned by
          get_insert_size_distribution_filename.
       **kwargs: passed on to ax.plot.
    '''
    import matplotlib.pyplot as plt
    if ax is None:
        fig, ax = plt.subplots(1, 1)
    else:
        # Keep a handle on the figure that owns the caller-supplied axes, so
        # that saving below targets it and not whatever figure is current.
        fig = ax.get_figure()

    if title is not None:
        ax.set_title(title)
    # Bin centers from the bin edges
    x = 0.5 * (h[1][1:] + h[1][:-1])
    y = h[0]
    ax.plot(x, y, **kwargs)
    ax.set_xlabel('Insert size')
    ax.set_ylabel('Density')

    plt.tight_layout()

    if show:
        plt.ion()
        plt.show()

    if savefig:
        from hivwholeseq.utils.generic import mkdirs
        from hivwholeseq.sequencing.filenames import get_figure_folder
        output_filename = get_insert_size_distribution_filename(data_folder, adaID,
                                                                fragment)
        # Make sure the figure folder exists before writing into it
        mkdirs(get_figure_folder(data_folder, adaID))
        fig.savefig(output_filename)
def make_output_folders(data_folder, adaID, VERBOSE=0):
    '''Make output folders for the script'''
    from hivwholeseq.utils.generic import mkdirs
    dirname = data_folder+foldername_adapter(adaID)+'map_iter/'
    mkdirs(dirname)
    if VERBOSE:
        print 'Folder created:', dirname
def plot_quality_along_reads(data_folder, adaID, title, quality, VERBOSE=0, savefig=False):
    """Plot quality curves along the reads, one panel per read of the pair.

    quality is iterated together with the two panels; every element of each
    item is drawn on the x axis against a ramp descending from 1 to 0, and
    colored by its index. With savefig true the figure is written to the file
    given by get_quality_along_reads_filename, otherwise it is shown
    interactively. VERBOSE is accepted but not used.
    """
    import matplotlib.pyplot as plt
    from matplotlib import cm

    fig, axs = plt.subplots(1, 2, figsize=(16, 9))
    for read_index, (ax, qual) in enumerate(izip(axs, quality)):
        n_positions = len(qual)
        for pos_index, scores in enumerate(qual):
            fraction = np.linspace(0, 1, len(scores))[::-1]
            shade = cm.jet(int(255.0 * pos_index / n_positions))
            ax.plot(scores, fraction, color=shade, alpha=0.5, lw=2)

        ax.set_xlabel("Phred quality", fontsize=14)
        ax.set_ylabel("Fraction of bases above quality x", fontsize=14)
        ax.set_title("Read" + str(read_index + 1), fontsize=16)
        legend_text = "blue to red: 0 to " + str(n_positions) + " base"
        ax.text(2, 0.03, legend_text, fontsize=18)

    fig.suptitle(title, fontsize=20)

    if not savefig:
        # Interactive display only
        plt.tight_layout()
        plt.ion()
        plt.show()
        return

    from hivwholeseq.utils.generic import mkdirs
    from hivwholeseq.sequencing.filenames import get_figure_folder, get_quality_along_reads_filename

    fig_folder = get_figure_folder(data_folder, adaID)
    fig_filename = get_quality_along_reads_filename(data_folder, adaID)
    mkdirs(fig_folder)
    fig.savefig(fig_filename)
def plot_quality_along_reads(data_folder, adaID, title, quality, VERBOSE=0, savefig=False):
    '''Draw the quality scores along the reads in a two-panel figure.

    Each item of quality fills one panel: its elements are plotted on the x
    axis against a ramp going from 1 down to 0, with a color depending on the
    index of the element. The figure is saved if savefig is true, else shown.
    VERBOSE is accepted but not used.
    '''
    import matplotlib.pyplot as plt
    from matplotlib import cm

    def draw_panel(ax, qual, read_number):
        '''Draw all curves of one read and decorate its axes'''
        for j, qpos in enumerate(qual):
            ramp = np.linspace(0, 1, len(qpos))[::-1]
            ax.plot(qpos, ramp,
                    color=cm.jet(int(255.0 * j / len(qual))),
                    alpha=0.5, lw=2)
        ax.set_xlabel('Phred quality', fontsize=14)
        ax.set_ylabel('Fraction of bases above quality x', fontsize=14)
        ax.set_title('Read' + str(read_number), fontsize=16)
        ax.text(2, 0.03,
                'blue to red: 0 to ' + str(len(qual)) + ' base',
                fontsize=18)

    fig, axs = plt.subplots(1, 2, figsize=(16, 9))
    for i, (ax, qual) in enumerate(izip(axs, quality)):
        draw_panel(ax, qual, i + 1)

    fig.suptitle(title, fontsize=20)

    if savefig:
        from hivwholeseq.utils.generic import mkdirs
        from hivwholeseq.sequencing.filenames import get_figure_folder, \
                get_quality_along_reads_filename
        folder = get_figure_folder(data_folder, adaID)
        filename = get_quality_along_reads_filename(data_folder, adaID)
        mkdirs(folder)
        fig.savefig(filename)
    else:
        plt.tight_layout()
        plt.ion()
        plt.show()
Ejemplo n.º 8
0
def make_output_folders(data_folder, adaID, VERBOSE=0):
    '''Make output folders'''
    from hivwholeseq.utils.generic import mkdirs
    output_filename = get_divided_filenames(data_folder, adaID, fragments=['F1'])[0]
    dirname = os.path.dirname(output_filename)
    mkdirs(dirname)
    if VERBOSE:
        print 'Folder created:', dirname
Ejemplo n.º 9
0
def copy_folder(patient, dst_fn, foldername):
    '''Copy every entry of a patient subfolder into the destination tree.

    Parameters:
       patient: object whose `folder` attribute is the path prefix of the
          patient data (string, concatenated as is with foldername).
       dst_fn: destination path prefix (string, concatenated as is).
       foldername: name of the subfolder to copy.

    The destination subfolder is created via mkdirs, then each entry listed
    in the source subfolder is passed to copy.
    '''
    src_fn = patient.folder+foldername+os.sep
    map_fn = dst_fn+foldername+os.sep

    mkdirs(map_fn)
    # src_fn is a plain string and has no listdir() method: list the folder
    # content through the os module instead.
    for fn_src in os.listdir(src_fn):
        copy(src_fn+fn_src, map_fn+fn_src)
Ejemplo n.º 10
0
def plot_cuts_quality_along_reads(data_folder,
                                  adaID,
                                  quality,
                                  title='',
                                  VERBOSE=0,
                                  savefig=False):
    '''Plot, per position, the percentage of scores above a few thresholds.

    Parameters:
       data_folder, adaID: used to build the default figure folder/filename.
       quality: iterated together with the two panels; for each item, one
          curve per threshold (10, 20, 30, 35) is drawn, with the value at
          position k equal to 100 - percentileofscore(item[k], threshold).
       title: figure title, only set if non-empty.
       VERBOSE: accepted but not used.
       savefig: False to show the figure interactively, True to save it under
          the default filename, or a string to save it under that filename.

    Raises ValueError if savefig is true-ish but neither True nor a string.
    '''
    from scipy.stats import percentileofscore as pof
    import matplotlib.pyplot as plt
    from matplotlib import cm

    fig, axs = plt.subplots(1, 2, figsize=(14, 8))
    qthreshs = [10, 20, 30, 35]
    n_thresholds = len(qthreshs)
    for read_index, (ax, qual) in enumerate(izip(axs, quality)):
        for thresh_index, qthresh in enumerate(qthreshs):
            positions = np.arange(len(qual))
            percent_above = np.array(
                [100 - pof(qual[k], qthresh) for k in xrange(len(qual))])
            ax.plot(positions,
                    percent_above,
                    color=cm.jet(int(255.0 * thresh_index / n_thresholds)),
                    alpha=0.8,
                    lw=2,
                    label='Q = ' + str(qthresh))

        ax.set_xlabel('Position [bp]', fontsize=14)
        ax.set_ylabel('Percentage of bases above quality x', fontsize=14)
        ax.set_title('Read' + str(read_index + 1), fontsize=16)
        ax.set_ylim(-1, 101)
        ax.set_xlim(-1, len(qual) + 1)
        ax.legend(loc='best')

    if title:
        fig.suptitle(title, fontsize=20)

    if not savefig:
        # Nothing to save: show the figure interactively
        plt.tight_layout()
        plt.ion()
        plt.show()
        return

    from hivwholeseq.utils.generic import mkdirs
    if savefig == True:
        # Default location inside the figure folder of this adapter
        from hivwholeseq.sequencing.filenames import get_figure_folder, \
                get_quality_along_reads_filename
        fig_folder = get_figure_folder(data_folder, adaID)
        fig_filename = get_quality_along_reads_filename(data_folder,
                                                        adaID,
                                                        simple=True)
    elif isinstance(savefig, basestring):
        # Explicit filename chosen by the caller
        import os
        fig_filename = savefig
        fig_folder = os.path.dirname(fig_filename)
    else:
        raise ValueError(
            'savefig must be a bool or a figure filename (string)')

    mkdirs(fig_folder)
    fig.savefig(fig_filename)
Ejemplo n.º 11
0
def make_output_folders(data_folder, adaID, VERBOSE=0, summary=True):
    '''Make output folders'''
    from hivwholeseq.utils.generic import mkdirs
    outfiles = [get_premapped_filename(data_folder, adaID)]
    if summary:
        outfiles.append(get_coverage_figure_filename(data_folder, adaID, 'premapped'))
    for outfile in outfiles:
        dirname = os.path.dirname(outfile)
        mkdirs(dirname)
        if VERBOSE:
            print 'Folder created:', dirname
Ejemplo n.º 12
0
def make_output_folders(data_folder, adaIDs, VERBOSE=0):
    '''Make output folders for symlinking'''
    from hivwholeseq.utils.generic import mkdirs
    mkdirs(data_folder)
    if VERBOSE >= 1:
        print 'Folder created:', data_folder

    for adaID in adaIDs + [-1]:
        mkdirs(data_folder+foldername_adapter(adaID))
        if VERBOSE >= 1:
            print 'Folder created:', data_folder+foldername_adapter(adaID)
Ejemplo n.º 13
0
def report_coverage(data_folder, adaID, VERBOSE=0, summary=True):
    '''Plot a rough coverage of the premapped reads along the reference.

    Reads that are unmapped, not in a proper pair, or without CIGAR are
    counted as unmapped and skipped. For the others, only CIGAR blocks of type
    0 and 2 add to the coverage and advance along the reference, i.e.
    deletions count as covered and the other block types are ignored.

    The plot (coverage + 1, log scale) is saved as the 'premapped' coverage
    figure; with summary true, two lines are appended to the premap summary.
    VERBOSE is accepted but not used.
    '''
    refseq = SeqIO.read(get_reference_premap_filename(data_folder, adaID),
                        'fasta')
    ref_len = len(refseq)
    coverage = np.zeros(ref_len, int)

    # Walk through the BAM file
    # NOTE(review): the counters are incremented once per read, although the
    # summary below calls them "read pairs" -- confirm which one is meant.
    n_unmapped = 0
    n_mapped = 0
    bamfilename = get_premapped_filename(data_folder, adaID, type='bam')
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        for read in bamfile:
            usable = ((not read.is_unmapped) and read.is_proper_pair
                      and len(read.cigar))
            if not usable:
                n_unmapped += 1
                continue

            # Follow the CIGAR blocks along the reference
            pos = read.pos
            for (block_type, block_len) in read.cigar:
                if block_type in (0, 2):
                    coverage[pos:pos + block_len] += 1
                    pos += block_len
            n_mapped += 1

    # Plot and save the figure
    from hivwholeseq.sequencing.filenames import get_coverage_figure_filename
    import matplotlib.pyplot as plt
    fig, ax = plt.subplots(1, 1, figsize=(13, 6))
    ax.plot(np.arange(ref_len), coverage + 1, lw=2, c='b')
    ax.set_xlabel('Position')
    ax.set_ylabel('Coverage')
    ax.set_yscale('log')
    ax.set_title('adaID ' + adaID + ', premapped', fontsize=18)
    ax.set_xlim(-20, ref_len + 20)
    plt.tight_layout()

    from hivwholeseq.utils.generic import mkdirs
    from hivwholeseq.sequencing.filenames import get_figure_folder
    mkdirs(get_figure_folder(data_folder, adaID))
    plt.savefig(get_coverage_figure_filename(data_folder, adaID, 'premapped'))
    plt.close(fig)

    if not summary:
        return

    with open(get_premap_summary_filename(data_folder, adaID), 'a') as f:
        f.write('\nPremapping results: ' + str(n_mapped) +
                ' read pairs mapped, ' + str(n_unmapped) + ' unmapped\n')
        f.write('\nCoverage plotted: ' +
                get_coverage_figure_filename(data_folder, adaID, 'premapped') +
                '\n')
Ejemplo n.º 14
0
def make_output_folders(data_folder, adaID, VERBOSE=0, summary=True):
    '''Make output folders'''
    from hivwholeseq.utils.generic import mkdirs
    outfiles = [get_premapped_filename(data_folder, adaID)]
    if summary:
        outfiles.append(
            get_coverage_figure_filename(data_folder, adaID, 'premapped'))
    for outfile in outfiles:
        dirname = os.path.dirname(outfile)
        mkdirs(dirname)
        if VERBOSE:
            print 'Folder created:', dirname
Ejemplo n.º 15
0
def report_coverage(data_folder, adaID, VERBOSE=0, summary=True):
    '''Report the rough coverage of premapped reads on the reference.

    Every read of the premapped BAM file is either skipped and counted as
    unmapped (unmapped flag, not a proper pair, or no CIGAR) or counted as
    mapped. For mapped reads, CIGAR blocks of type 0 and 2 increase the
    coverage of the stretch they span (deletions are treated as covered);
    all other block types are ignored and do not move the position.

    The coverage figure is always saved; the summary file is appended to only
    if summary is true. VERBOSE is accepted but not used.
    '''
    ref_filename = get_reference_premap_filename(data_folder, adaID)
    refseq = SeqIO.read(ref_filename, 'fasta')

    coverage = np.zeros(len(refseq), int)
    counts = {'mapped': 0, 'unmapped': 0}

    def add_read(read):
        '''Add the reference-consuming CIGAR blocks of one read to coverage'''
        start = read.pos
        for (bt, bl) in read.cigar:
            if bt not in (0, 2):
                continue
            end = start + bl
            coverage[start:end] += 1
            start = end

    bamfilename = get_premapped_filename(data_folder, adaID, type='bam')
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        for read in bamfile:
            if read.is_unmapped or (not read.is_proper_pair) \
               or (not len(read.cigar)):
                counts['unmapped'] += 1
            else:
                add_read(read)
                counts['mapped'] += 1

    # Figure
    from hivwholeseq.sequencing.filenames import get_coverage_figure_filename
    import matplotlib.pyplot as plt
    fig, ax = plt.subplots(1, 1, figsize=(13, 6))
    ax.plot(np.arange(len(refseq)), coverage + 1, lw=2, c='b')
    ax.set_xlabel('Position')
    ax.set_ylabel('Coverage')
    ax.set_yscale('log')
    ax.set_title('adaID ' + adaID + ', premapped', fontsize=18)
    ax.set_xlim(-20, len(refseq) + 20)
    plt.tight_layout()

    from hivwholeseq.utils.generic import mkdirs
    from hivwholeseq.sequencing.filenames import get_figure_folder
    mkdirs(get_figure_folder(data_folder, adaID))
    plt.savefig(get_coverage_figure_filename(data_folder, adaID, 'premapped'))
    plt.close(fig)

    # Summary
    if summary:
        with open(get_premap_summary_filename(data_folder, adaID), 'a') as f:
            mapping_line = ('\nPremapping results: ' +
                            str(counts['mapped']) + ' read pairs mapped, ' +
                            str(counts['unmapped']) + ' unmapped\n')
            f.write(mapping_line)
            figure_line = ('\nCoverage plotted: ' +
                           get_coverage_figure_filename(data_folder, adaID, 'premapped') +
                           '\n')
            f.write(figure_line)
Ejemplo n.º 16
0
def copy_initial_reference(patient, dst_fn):
    '''Copy the patient reference files into dst_fn + 'reference/'.

    One file is copied for each fragment in the module-level `fragments`,
    followed by the 'genomewide' reference in 'gb' format. Files keep their
    base name.
    '''
    ref_fn = dst_fn+'reference/'
    mkdirs(ref_fn)

    def copy_to_reference_folder(source):
        '''Copy one file into the reference folder under the same base name'''
        shutil.copy(source, ref_fn+os.path.basename(source))

    for fragment in fragments:
        copy_to_reference_folder(patient.get_reference_filename(fragment))

    copy_to_reference_folder(
        patient.get_reference_filename('genomewide', format='gb'))
Ejemplo n.º 17
0
def make_output_folders(pname, samplename, PCR=1, VERBOSE=0):
    '''Make the output folders if necessary for hash and map'''
    hash_foldername = os.path.dirname(get_initial_hash_filename(pname, 'F0'))
    map_foldername = get_mapped_to_initial_foldername(pname, samplename, PCR=PCR)

    if not os.path.isdir(hash_foldername):
        mkdirs(hash_foldername)
        if VERBOSE:
            print 'Folder created:', hash_foldername

    mkdirs(map_foldername)
    if VERBOSE:
        print 'Folder created:', map_foldername
def plot_cuts_quality_along_reads(data_folder, adaID, quality, title="", VERBOSE=0, savefig=False):
    """Plot some cuts of the quality along the read, one panel per read.

    For each item of quality and each threshold in (10, 20, 30, 35), the curve
    100 - percentileofscore(item[k], threshold) is drawn against position k.
    savefig may be False (show the figure), True (save under the default
    filename) or a string (save under that filename); any other true-ish
    value raises ValueError. VERBOSE is accepted but not used.
    """
    from scipy.stats import percentileofscore as pof
    import matplotlib.pyplot as plt
    from matplotlib import cm

    def resolve_output(target):
        """Return (folder, filename) where the figure must be saved"""
        if target == True:
            from hivwholeseq.sequencing.filenames import get_figure_folder, get_quality_along_reads_filename

            folder = get_figure_folder(data_folder, adaID)
            filename = get_quality_along_reads_filename(data_folder, adaID, simple=True)
            return folder, filename

        if isinstance(target, basestring):
            import os

            return os.path.dirname(target), target

        raise ValueError("savefig must be a bool or a figure filename (string)")

    fig, axs = plt.subplots(1, 2, figsize=(14, 8))
    qthreshs = [10, 20, 30, 35]
    for i, (ax, qual) in enumerate(izip(axs, quality)):
        for j, qthresh in enumerate(qthreshs):
            cut = np.array([100 - pof(qual[k], qthresh) for k in xrange(len(qual))])
            ax.plot(
                np.arange(len(qual)),
                cut,
                color=cm.jet(int(255.0 * j / len(qthreshs))),
                alpha=0.8,
                lw=2,
                label="Q = " + str(qthresh),
            )
        ax.set_xlabel("Position [bp]", fontsize=14)
        ax.set_ylabel("Percentage of bases above quality x", fontsize=14)
        ax.set_title("Read" + str(i + 1), fontsize=16)
        ax.set_ylim(-1, 101)
        ax.set_xlim(-1, len(qual) + 1)
        ax.legend(loc="best")

    if title:
        fig.suptitle(title, fontsize=20)

    if savefig:
        from hivwholeseq.utils.generic import mkdirs

        fig_folder, fig_filename = resolve_output(savefig)
        mkdirs(fig_folder)
        fig.savefig(fig_filename)
    else:
        plt.tight_layout()
        plt.ion()
        plt.show()
Ejemplo n.º 19
0
def make_output_folders(data_folder, adapters_designed, VERBOSE=0, summary=True):
    '''Make output folders for all adapters and unclassified (e.g. PhiX)'''
    from hivwholeseq.utils.generic import mkdirs

    # Make folders for the samples
    for (adaID, s) in adapters_designed:
            dirname = foldername_adapter(adaID)
            mkdirs(data_folder+dirname)
            if VERBOSE:
                print 'Folder created:', dirname

    # Make a default directory for unclassified reads
    mkdirs(data_folder+'unclassified_reads')
    if VERBOSE:
        print 'Folder created: unclassified reads'

    if summary:
        with open(get_demultiplex_summary_filename(data_folder), 'a') as f:
            f.write('\n')
            f.write('Folders created for samples and unclassified reads (including phix).')
            f.write('\n')
Ejemplo n.º 20
0
def predict_RNA_structure(seq, label='seq', maxstructs=1, VERBOSE=0):
    '''Predict RNA secondary structures using RNAstructure'''
    import os
    import subprocess as sp
    from hivwholeseq.utils.generic import mkdirs
    from Bio import SeqIO
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.Alphabet.IUPAC import ambiguous_dna

    rna_fold_bin = '/ebio/ag-neher/home/fzanini/programs/RNAstructure_cli/exe/Fold'

    # Make tmp input file
    tmp_file_in = '/ebio/ag-neher/home/fzanini/tmp/RNAfold/'+label+'.fasta'
    tmp_file_out = '/ebio/ag-neher/home/fzanini/tmp/RNAfold/'+label+'.ct'
    mkdirs(os.path.dirname(tmp_file_in))
    seqrec = SeqRecord(Seq(seq, ambiguous_dna),
                       id=label,
                       name=label,
                       description='')
    SeqIO.write(seqrec, tmp_file_in, 'fasta')

    # Call RNAStructure with all the crap (env vars, etc)
    rna_tables = '/ebio/ag-neher/home/fzanini/programs/RNAstructure_cli/data_tables/'
    env = os.environ.copy()
    env['DATAPATH'] = rna_tables
    call_list = [rna_fold_bin, '-m', str(maxstructs), tmp_file_in, tmp_file_out]
    if VERBOSE >= 2:
        print ' '.join(call_list)
    output = sp.check_output(call_list, shell=False)
    if VERBOSE >= 3:
        print output

    if 'Writing output ct file...done.' in output:
        structs = parse_ct_file_multiple(tmp_file_out)
    else:
        IOError('RNAstructure had problems predicting the structure')

    return structs
def plot_cuts_quality_along_reads(data_folder, adaID, title, quality, VERBOSE=0, savefig=False):
    '''Plot the percentage of scores above fixed thresholds along the reads.

    quality is iterated together with the two panels. For each threshold in
    (10, 20, 30, 35) a curve with value 100 - percentileofscore(item[k],
    threshold) at position k is drawn. The figure is saved under the default
    "simple" quality filename if savefig is true, else shown interactively.
    VERBOSE is accepted but not used.
    '''
    from scipy.stats import percentileofscore as pof
    import matplotlib.pyplot as plt
    from matplotlib import cm

    fig, axs = plt.subplots(1, 2, figsize=(14, 8))
    qthreshs = [10, 20, 30, 35]
    for read_index, (ax, qual) in enumerate(izip(axs, quality)):
        for thresh_index, qthresh in enumerate(qthreshs):
            positions = np.arange(len(qual))
            percent_above = np.array(
                [100 - pof(qual[k], qthresh) for k in xrange(len(qual))])
            color = cm.jet(int(255.0 * thresh_index / len(qthreshs)))
            ax.plot(positions, percent_above,
                    color=color, alpha=0.8, lw=2,
                    label='Q = '+str(qthresh))

        ax.set_xlabel('Position [bp]', fontsize=14)
        ax.set_ylabel('Percentage of bases above quality x', fontsize=14)
        ax.set_title('Read'+str(read_index + 1), fontsize=16)
        ax.set_ylim(-1, 101)
        ax.set_xlim(-1, len(qual) + 1)
        ax.legend(loc='best')

    fig.suptitle(title, fontsize=20)

    if not savefig:
        # Interactive display only
        plt.tight_layout()
        plt.ion()
        plt.show()
        return

    from hivwholeseq.utils.generic import mkdirs
    from hivwholeseq.sequencing.filenames import get_figure_folder, \
            get_quality_along_reads_filename
    fig_folder = get_figure_folder(data_folder, adaID)
    fig_filename = get_quality_along_reads_filename(data_folder, adaID, simple=True)
    mkdirs(fig_folder)
    fig.savefig(fig_filename)
Ejemplo n.º 22
0
        '--repnumber',
        type=int,
        default=0,
        help='Index of the sequenced sample within that patient sample')

    # Read the command line arguments into plain variables
    args = parser.parse_args()
    pname = args.patient
    fragments = args.fragments
    VERBOSE = args.verbose
    repn = args.repnumber
    samplename = args.sample

    patient = load_patient(pname)
    patient.discard_nonsequenced_samples()

    # Make sure the folder for the initial reference exists
    mkdirs(get_initial_reference_foldername(pname))

    # Default: fragments F1 to F6
    if not fragments:
        fragments = ['F' + str(i) for i in xrange(1, 7)]
    if VERBOSE >= 3:
        print 'fragments', fragments

    # NOTE(review): `samplen` is not assigned anywhere in the visible code, so
    # this branch raises a NameError unless it is defined elsewhere -- confirm.
    if samplename is None:
        sample = SamplePat(patient.samples.iloc[samplen])
    else:
        sample = load_sample_sequenced(samplename)

    for fragment in fragments:
        # Sequenced sample number `repn` of the chosen sample
        sample_seq = SampleSeq(sample.samples_seq.iloc[repn])

        seq_run = sample_seq['seq run']
    parser.add_argument('--sample',
                        help='Use a specific sample (not the first time point) for the reference')
    parser.add_argument('--repnumber', type=int, default=0,
                        help='Index of the sequenced sample within that patient sample')

    # Read the command line arguments into plain variables
    args = parser.parse_args()
    pname = args.patient
    fragments = args.fragments
    VERBOSE = args.verbose
    repn = args.repnumber
    samplename = args.sample

    patient = load_patient(pname)
    patient.discard_nonsequenced_samples()

    # Make sure the folder for the initial reference exists
    mkdirs(get_initial_reference_foldername(pname))

    # Default: fragments F1 to F6
    if not fragments:
        fragments = ['F'+str(i) for i in xrange(1, 7)]
    if VERBOSE >= 3:
        print 'fragments', fragments
    
    # NOTE(review): `samplen` is not assigned anywhere in the visible code, so
    # this branch raises a NameError unless it is defined elsewhere -- confirm.
    if samplename is None:
        sample = SamplePat(patient.samples.iloc[samplen])
    else:
        sample = load_sample_sequenced(samplename)

    for fragment in fragments:
        # Sequenced sample number `repn` of the chosen sample
        sample_seq = SampleSeq(sample.samples_seq.iloc[repn])

        seq_run = sample_seq['seq run']
Ejemplo n.º 24
0

# Script
if __name__ == '__main__':

    # The only option is --redo, which forces the data to be recalculated
    parser = argparse.ArgumentParser(description="make figure")
    parser.add_argument('--redo', action='store_true', help='recalculate data')
    params = parser.parse_args()

    fragment = 'F1'
    VERBOSE = 2
    # Last path component of the HOME environment variable
    username = os.path.split(os.getenv('HOME'))[-1]

    # The data file lives in the 'data/' subfolder of the figure folder
    foldername = get_figure_folder(username, 'first')
    fn_data = foldername+'data/'
    mkdirs(fn_data)
    fn_data = fn_data + 'minor_alleles_example.pickle'

    # Recompute if the data file is missing or --redo was given
    if not os.path.isfile(fn_data) or params.redo:
        # First sample, loaded via lss
        samplename = 'NL4-3'
        sample = lss(samplename)
        counts = sample.get_allele_counts(fragment, merge_read_types=True)
        data = compress_data(counts, samplename, fragment)

        # Second sample, loaded via lssp; the previous data is passed along
        samplename = '27134'
        sample = lssp(samplename)
        counts = sample.get_allele_counts(fragment, merge_read_types=True)
        data = compress_data(counts, samplename, fragment, data=data)


        store_data(data, fn_data)
import sys
import os

from hivwholeseq.utils.generic import mkdirs
from hivwholeseq.patients.samples import itersample
from hivwholeseq.sequencing.samples import load_samples_sequenced as lss
from hivwholeseq.patients.samples import load_samples_sequenced as lssp
from hivwholeseq.sequencing.filenames import get_sample_foldername



# Script
if __name__ == '__main__':

    samples_pat = lssp()
    samples_seq = lss()

    for samplename, sample in itersample(samples_pat):
        root_foldername = sample.get_foldername()+'samples_sequencing/'
        mkdirs(root_foldername)

        for samplenameseq, sampleseq in samples_seq.iterrows():
            if sampleseq['patient sample'] == samplename:
                src_folder = get_sample_foldername(samplenameseq)
                dst_folder = root_foldername+samplenameseq
                if not os.path.islink(dst_folder):
                    os.symlink(src_folder, dst_folder)
                    print 'Symlink:', src_folder, dst_folder
                else:
                    print 'Esists:', dst_folder
Ejemplo n.º 26
0
                    dist_hist = get_distance_histogram(data_folder,
                                                       adaID,
                                                       fragment,
                                                       VERBOSE=VERBOSE)
                except IOError:
                    continue
                dist_hists.append((samplename_seq, fragment, dist_hist))

        dist_hists.sort(key=itemgetter(1))

        fig, ax = plt.subplots()
        for i, (samplename_seq, fragment, h) in enumerate(dist_hists):
            plot_distance_histogram(h,
                                    ax=ax,
                                    color=cm.jet(1.0 * i / len(dist_hists)),
                                    label=', '.join([samplename_seq,
                                                     fragment]))
        ax.set_title(samplename)
        ax.legend(loc=1, fontsize=10)

        if use_save:
            foldername = sample.get_foldername() + 'figures/'
            mkdirs(foldername)
            fn = foldername + 'distance_to_consensus_seqsamples.png'
            fig.savefig(fn)
            plt.close(fig)

    if not use_save:
        plt.ion()
        plt.show()
Ejemplo n.º 27
0
            if not len(datum['ind']):
                win_start += gap
                continue

            datum['times'] = patient.times[datum['ind']]
            datum['pcode'] = patient.code
            datum['window'] = (win_start, win_end)
            data.append(datum)

            if use_save:
                if VERBOSE >= 2:
                    print 'Save to file'

                rname = 'scan_' + str(win_start) + '-' + str(win_end)
                fn_out = patient.get_haplotype_count_trajectory_filename(rname)
                mkdirs(os.path.dirname(fn_out))
                np.savez_compressed(
                    fn_out,
                    hct=datum['hct'],
                    ind=datum['ind'],
                    times=datum['times'],
                    seqs=datum['seqs'],
                    ali=datum['alim'],
                )

            if VERBOSE >= 2:
                print 'Build tree'
            times = datum['times']
            alim = datum['alim']
            hct = datum['hct']
            hft = 1.0 * hct / hct.sum(axis=0)
            adaID = sample_seq.adapter

            for fragment in fragments:
                try:
                    dist_hist = get_distance_histogram(data_folder, adaID, fragment,
                                                       VERBOSE=VERBOSE)
                except IOError:
                    continue
                dist_hists.append((samplename_seq, fragment, dist_hist))

        dist_hists.sort(key=itemgetter(1))

        fig, ax = plt.subplots()
        for i, (samplename_seq, fragment, h) in enumerate(dist_hists):
            plot_distance_histogram(h, ax=ax,
                                    color=cm.jet(1.0 * i / len(dist_hists)),
                                    label=', '.join([samplename_seq, fragment]))
        ax.set_title(samplename)
        ax.legend(loc=1, fontsize=10)

        if use_save:
            foldername = sample.get_foldername()+'figures/'
            mkdirs(foldername)
            fn = foldername+'distance_to_consensus_seqsamples.png'
            fig.savefig(fn)
            plt.close(fig)

    if not use_save:
        plt.ion()
        plt.show()
Ejemplo n.º 29
0
    parser = argparse.ArgumentParser(description='Copy data folder',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)    
    parser.add_argument('destination',
                        help='Destination folder')
    parser.add_argument('--strip-PCR1', action='store_true',
                        help='Strip the ../PCR1/ part of the file tree')

    args = parser.parse_args()
    dst_fn = args.destination.lstrip(os.sep)+os.sep
    stirp_PCR1 = args.strip_PCR1

    patients_fn = dst_fn+'patients/'
    ref_fn = dst_fn+'reference/'

    print 'Make root folders'
    mkdirs(patients_fn)
    mkdirs(ref_fn)

    print 'Reference sequences'
    copy_reference(ref_fn)

    
    patients = load_patients()
    for pname, patient in patients.iterrows():
        print pname
        patient = Patient(patient)

        print 'Make folder'
        pat_fn = patients_fn+pname+os.sep
        mkdirs(pat_fn)
            if not len(datum['ind']):
                win_start += gap
                continue

            datum['times'] = patient.times[datum['ind']]
            datum['pcode'] = patient.code
            datum['window'] = (win_start, win_end)
            data.append(datum)

            if use_save:
                if VERBOSE >= 2:
                    print 'Save to file'
                
                rname = 'scan_'+str(win_start)+'-'+str(win_end)
                fn_out = patient.get_haplotype_count_trajectory_filename(rname)
                mkdirs(os.path.dirname(fn_out))
                np.savez_compressed(fn_out,
                                    hct=datum['hct'],
                                    ind=datum['ind'],
                                    times=datum['times'],
                                    seqs=datum['seqs'],
                                    ali=datum['alim'],
                                   )

            if VERBOSE >= 2:
                print 'Build tree'
            times = datum['times']
            alim = datum['alim']
            hct = datum['hct']
            hft = 1.0 * hct / hct.sum(axis=0)
            ali = expand_annotate_alignment(alim, hft, hct, times,
Ejemplo n.º 31
0

# Script
if __name__ == '__main__':

    parser = argparse.ArgumentParser(description="make figure")
    parser.add_argument('--redo', action='store_true', help='recalculate data')
    params = parser.parse_args()

    fragment = 'F1'
    VERBOSE = 2
    username = os.path.split(os.getenv('HOME'))[-1]

    foldername = get_figure_folder(username, 'first')
    fn_data = foldername + 'data/'
    mkdirs(fn_data)
    fn_data = fn_data + 'minor_alleles_example.pickle'

    if not os.path.isfile(fn_data) or params.redo:
        samplename = 'NL4-3'
        sample = lss(samplename)
        counts = sample.get_allele_counts(fragment, merge_read_types=True)
        data = compress_data(counts, samplename, fragment)

        samplename = '27134'
        sample = lssp(samplename)
        counts = sample.get_allele_counts(fragment, merge_read_types=True)
        data = compress_data(counts, samplename, fragment, data=data)

        store_data(data, fn_data)
    else: