コード例 #1
0
def loadK31(reg, filepath, fromHIV=False):
    '''
    Load allele frequency data for 31 additional patients.

    Input arguments:
    reg: name of genetic region (gag or pol)
    filepath: prefix prepended verbatim to every file name that is read or
        written (for a directory it must therefore end with a path separator)
    fromHIV: download raw data and store them, if True; use stored data, if False

    Returns:
    dict mapping '<patient code>_<sample code>' to a tuple
    (days between the "infect date best" of the patient and the sample date,
    masked array of allele frequencies for that sample)
    '''
    data = {}
    info_fn = filepath + 'K31_info_{}.txt'.format(reg)
    if fromHIV:
        hiv_path = "/scicore/home/neher/neher/HIV/hivwholeseq"
        # Avoid growing sys.path by one duplicate entry on every call
        if hiv_path not in sys.path:
            sys.path.append(hiv_path)
        from hivwholeseq.patients.patients import load_patients, Patient
        pats = load_patients(csv=True)
        fmt = "%d/%m/%Y"
        # The context manager closes the index file even if the loop is
        # interrupted (e.g. by Ctrl-C, which is no longer swallowed below)
        with open(info_fn, 'w') as fhandle:
            for pcode, pat in pats.iterrows():
                try:
                    EDI = datetime.strptime(pat["infect date best"], fmt)
                    P = Patient(pat)
                    aft = P.get_allele_frequency_trajectories(reg, cov_min=500)[0]
                    for si, (scode, sample) in enumerate(P.samples.iterrows()):
                        try:
                            date = datetime.strptime(sample["date"], fmt)
                            af = aft[si]
                            TI = date.toordinal() - EDI.toordinal()
                            stem = filepath + '{}_{}_{}'.format(pcode, scode, reg)
                            # Save the arrays first: the index line is only
                            # written once both files exist, so the cached
                            # branch never meets an entry without data
                            np.save(stem + '_data.npy', af.data)
                            np.save(stem + '_mask.npy', af.mask)
                            fhandle.write('{}\t{}\t{}\n'.format(pcode, scode, TI))
                            data['{}_{}'.format(pcode, scode)] = (TI, af)
                            print(pcode, scode, "WORKED!!!")
                        except Exception as err:
                            # Best effort: a broken sample must not stop the
                            # remaining ones, but report why it was dropped
                            print(scode, "didn't work", repr(err))

                except Exception as err:
                    print("skipping patient ", pcode, repr(err))
    else:
        with open(info_fn, 'r') as fhandle:
            for line in fhandle:
                words = line.split()
                # Tolerate blank (e.g. trailing) lines in the index file
                if not words:
                    continue
                pat_name = '_'.join(words[:2])
                af_data = np.load(filepath +
                                  '{}_{}_data.npy'.format(pat_name, reg))
                af_mask = np.load(filepath +
                                  '{}_{}_mask.npy'.format(pat_name, reg))
                af = np.ma.masked_array(af_data, mask=af_mask)
                data[pat_name] = (int(words[2]), af)
    return data
コード例 #2
0
def get_template_numbers(patients, VERBOSE=0):
    '''Collect template numbers from all patient samples'''
    data = []
    for pname, patient in patients.iterrows():
        patient = Patient(patient)

        if VERBOSE:
            print pname, patient.code

        samples = patient.samples
        n_approx = samples['templates approx']
        dils = [get_dilution(x) for x in samples['dilutions']]
        n_dils = [2 * estimate_ntemplates_Poisson(x) for x in dils]

        # Attach sample date info
        age = np.array((datetime.datetime.now() - samples.date)) / 86400e9

        data.append({
            'n_approx': n_approx,
            'n_dil': n_dils,
            'age': age,
            'pname': patient.code
        })

    return data
コード例 #3
0
def get_divergence_diversity_sliding(aft, block_length, VERBOSE=0):
    '''Get local divergence and diversity in a sliding window

    Parameters:
       aft: masked array of allele frequencies with three axes, indexed below
            as aft[:, allele, position]
            (NOTE(review): the first axis is presumably time/sample - confirm)
       block_length: number of consecutive positions in every window
       VERBOSE: accepted for interface compatibility, not used here

    Returns:
       (x, dg, ds): x is the centre coordinate of every window; dg and ds are
       hard-masked arrays with the window averages of the non-consensus
       frequency and of sum_alleles(f * (1 - f)), respectively. Windows
       containing any masked position are masked.
    '''
    cons_ind = Patient.get_initial_consensus_noinsertions(aft, return_ind=True)
    # Consensus index 5 is special-cased: it is redirected to allele 0 for the
    # lookup and its contribution is zeroed afterwards
    # NOTE(review): presumably index 5 is 'N' in the project alphabet - confirm
    ind_N = cons_ind == 5
    cons_ind[ind_N] = 0
    aft_nonanc = 1.0 - aft[:, cons_ind, np.arange(aft.shape[2])]
    aft_nonanc[:, ind_N] = 0

    aft_var = (aft * (1 - aft)).sum(axis=1)

    struct = np.ones(block_length)

    def sliding_sum(arr):
        '''Sum arr over every full window of block_length along axis 1'''
        return np.apply_along_axis(
            lambda row: np.convolve(row, struct, mode='valid'),
            axis=1,
            arr=arr)

    dg = np.ma.array(sliding_sum(aft_nonanc), hard_mask=True)
    ds = np.ma.array(sliding_sum(aft_var), hard_mask=True)

    # NOTE: normalization happens based on actual coverage
    # Unary minus on a boolean array raises TypeError in current NumPy, so the
    # mask is inverted with ~; getmaskarray also copes with aft having no
    # explicit mask
    covered = ~np.ma.getmaskarray(aft[:, 0])
    norm = sliding_sum(covered.astype(float))

    # A window is only trusted if every position in it is covered
    dg.mask = norm < block_length
    dg /= norm

    ds.mask = norm < block_length
    ds /= norm

    x = np.arange(dg.shape[1]) + (block_length - 1) / 2.0

    return (x, dg, ds)
コード例 #4
0
def get_divergence_diversity_sliding(aft, block_length, VERBOSE=0):
    '''Get local divergence and diversity in a sliding window

    Parameters:
       aft: masked array of allele frequencies with three axes, indexed below
            as aft[:, allele, position]
            (NOTE(review): the first axis is presumably time/sample - confirm)
       block_length: number of consecutive positions in every window
       VERBOSE: accepted for interface compatibility, not used here

    Returns:
       (x, dg, ds): x is the centre coordinate of every window; dg and ds are
       hard-masked arrays with the window averages of the non-consensus
       frequency and of sum_alleles(f * (1 - f)), respectively. Windows
       containing any masked position are masked.
    '''
    cons_ind = Patient.get_initial_consensus_noinsertions(aft, return_ind=True)
    # Consensus index 5 is looked up as allele 0 and then contributes zero
    # NOTE(review): presumably index 5 is 'N' in the project alphabet - confirm
    ind_N = cons_ind == 5
    cons_ind[ind_N] = 0
    aft_nonanc = 1.0 - aft[:, cons_ind, np.arange(aft.shape[2])]
    aft_nonanc[:, ind_N] = 0

    aft_var = (aft * (1 - aft)).sum(axis=1)

    struct = np.ones(block_length)

    def window_sum(arr):
        '''Row-wise sum of arr over all full windows of block_length'''
        return np.apply_along_axis(lambda x: np.convolve(x, struct, mode='valid'),
                                   axis=1, arr=arr)

    dg = np.ma.array(window_sum(aft_nonanc), hard_mask=True)
    ds = np.ma.array(window_sum(aft_var), hard_mask=True)

    # NOTE: normalization happens based on actual coverage
    # `-mask` on a boolean array is rejected by current NumPy (TypeError);
    # `~` is the supported logical negation. getmaskarray always returns a
    # full boolean array, even when aft carries no explicit mask.
    norm = window_sum((~np.ma.getmaskarray(aft[:, 0])).astype(float))

    # Mask every window that is not fully covered before normalizing
    dg.mask = norm < block_length
    dg /= norm

    ds.mask = norm < block_length
    ds /= norm

    x = np.arange(dg.shape[1]) + (block_length - 1) / 2.0

    return (x, dg, ds)
コード例 #5
0
    pnames = args.patients
    regions = args.regions
    VERBOSE = args.verbose
    save_to_file = args.save

    patients = load_patients()
    if pnames is not None:
        patients = patients.loc[pnames]
    if VERBOSE >= 3:
        print 'patients', patients.index
    if not len(patients):
        raise ValueError('No patients found!')

    maps_coord = defaultdict(dict)
    for pname, patient in patients.iterrows():
        patient = Patient(patient)

        # Make maps for all annotations if not explicit
        if regions is None:
            patseqann = patient.get_reference('genomewide', format='gb')
            regionspat = map(attrgetter('id'),
                             patseqann.features) + ['genomewide']
        else:
            regionspat = regions

        for region in regionspat:
            if VERBOSE >= 1:
                print pname, region

            refseq = load_custom_reference(refname, format='gb', region=region)
            patseq = patient.get_reference(region)
コード例 #6
0
    for pname, patient in patients.iterrows():
        if VERBOSE >= 1:
            print patient.code, start, end

        if submit:
            fork_self(patient.code,
                      width,
                      gap,
                      start,
                      end,
                      VERBOSE=VERBOSE,
                      freqmin=freqmin,
                      countmin=countmin)
            continue

        patient = Patient(patient)
        ref = patient.get_reference('genomewide')
        L = len(ref)

        win_start = start
        while win_start + width - gap < min(L, end):
            win_end = min(win_start + width, end, L)

            if VERBOSE >= 1:
                print patient.code, win_start, win_end

            if VERBOSE >= 2:
                print 'Get region haplotypes'
            try:
                datum = patient.get_local_haplotype_count_trajectories(\
                               'genomewide',
コード例 #7
0
    af_bd = [0.05, 0.95]

    data = {}

    patients = load_patients()
    if pnames is not None:
        patients = patients.loc[pnames]

    if not fragments:
        fragments = ['F' + str(i) for i in xrange(1, 7)]
    if VERBOSE >= 2:
        print 'fragments', fragments

    for pname, patient in patients.iterrows():
        print pname
        patient = Patient(patient)
        patient.discard_nonsequenced_samples()

        t_bds = []
        t_loss = []
        t_fixs = []
        n_staypolys = []
        for fragment in fragments:
            if VERBOSE >= 1:
                print fragment

            # Collect allele counts from patient samples, and return only positive hits
            # sns contains sample names and PCR types
            (aft, ind) = patient.get_allele_frequency_trajectories(
                fragment,
                cov_min=cov_min,
コード例 #8
0
    parser.add_argument('--plot', action='store_true',
                        help='Plot phylogenetic tree. Requires --save and --tree')

    args = parser.parse_args()
    pnames = args.patients
    regions = args.regions
    VERBOSE = args.verbose
    use_save = args.save
    use_plot = args.plot

    patients = load_patients()
    if pnames != ['all']:
        patients = patients.iloc[patients.index.isin(pnames)]

    for pname, patient in patients.iterrows():
        patient = Patient(patient)
        patient.discard_nonsequenced_samples()

        if regions is None:
            refseq_gw = patient.get_reference('genomewide', 'gb')
            regionspat = map(attrgetter('id'), refseq_gw.features) + ['genomewide']
        else:
            regionspat = regions

        for region in regionspat:
            if VERBOSE >= 1:
                print pname, region
                if VERBOSE == 1:
                    print ''

コード例 #9
0
                        help='Patient to analyze')
    parser.add_argument('--force', action='store_true',
                        help='Do not stop for errors')

    args = parser.parse_args()
    VERBOSE = args.verbose
    pnames = args.patients
    use_force = args.force

    patients = load_patients()
    if pnames is not None:
        patients = patients.loc[pnames]


    for pname, patient in patients.iterrows():
        patient = Patient(patient)

        if VERBOSE >= 1:
            print 'Patient:', patient.name

        ref = patient.get_reference('genomewide', 'gb')

        for fea in ref.features:
            if fea.type == 'protein':
                if VERBOSE >= 2:
                    print 'Checking', fea.id
                try:
                    check_protein(fea, ref, VERBOSE=VERBOSE)
                except ValueError:
                    if use_force:
                        print 'ERROR!'
コード例 #10
0
ファイル: get_PCA.py プロジェクト: iosonofabio/hivwholeseq
    parser.add_argument('--plot', action='store_true',
                        help='Plot local haplotype trajectories')

    args = parser.parse_args()
    pnames = args.patients
    roi = args.roi
    VERBOSE = args.verbose
    use_plot = args.plot

    patients = load_patients()
    if pnames is not None:
        patients = patients.loc[pnames]

    for pname, patient in patients.iterrows():

        patient = Patient(patient)
        patient.discard_nonsequenced_samples()

        if VERBOSE >= 1:
            print patient.name, roi
    
        if VERBOSE >= 2:
            print 'Get haplotype trajectories'
        try:
            (ht, indt, htseqs) = patient.get_region_count_trajectories(roi[0],
                                                                    VERBOSE=VERBOSE)
        except IOError:
            (ht, indt, htseqs) = patient.get_local_haplotype_count_trajectories(roi,
                                                                    VERBOSE=VERBOSE)
    
        if VERBOSE >= 2:
コード例 #11
0
                        help='Minimal frequency to keep the haplotype')

    args = parser.parse_args()
    pnames = args.patients
    roi = args.roi
    VERBOSE = args.verbose
    maxreads = args.maxreads
    use_plot = args.plot
    freqmin = args.freqmin

    patients = load_patients()
    if pnames is not None:
        patients = patients.loc[pnames]

    for pname, patient in patients.iterrows():
        patient = Patient(patient)

        if VERBOSE >= 1:
            print pname
    
        if os.path.isfile(patient.get_local_tree_filename(roi[0], format='json')):
            if VERBOSE >= 2:
                print 'Get tree'
            region = roi[0]
            tree = patient.get_local_tree(region)

        elif os.path.isfile(patient.get_local_tree_filename(' '.join(map(str, roi)), format='json')):
            if VERBOSE >= 2:
                print 'Get tree'
            region = ' '.join(map(str, roi))
            tree = patient.get_local_tree(region)
コード例 #12
0
        '--force',
        action='store_true',
        help='Go ahead even if annotations differ from existing sequence')

    args = parser.parse_args()
    VERBOSE = args.verbose
    pnames = args.patients
    use_save = args.save
    use_force = args.force

    patients = load_patients()
    if pnames is not None:
        patients = patients.loc[patients.index.isin(pnames)]

    for pname, patient in patients.iterrows():
        patient = Patient(patient)

        if VERBOSE:
            print 'Patient:', patient.name

        fn = patient.get_reference_filename('genomewide')
        refseq = SeqIO.read(fn, 'fasta', alphabet=ambiguous_dna)

        fragment_edges = get_edges_fragments(patient, VERBOSE=VERBOSE)
        annotate_sequence(refseq,
                          VERBOSE=VERBOSE,
                          additional_edges={'fragment': fragment_edges})

        if VERBOSE >= 1:
            for feature in refseq.features:
                if feature.id[0] == 'F':
コード例 #13
0
    af_bd = [0.05, 0.95]

    data = {}

    patients = load_patients()
    if pnames is not None:
        patients = patients.loc[pnames]

    if not fragments:
        fragments = ['F'+str(i) for i in xrange(1, 7)]
    if VERBOSE >= 2:
        print 'fragments', fragments

    for pname, patient in patients.iterrows():
        print pname
        patient = Patient(patient)
        patient.discard_nonsequenced_samples()

        t_bds = []
        t_loss = []
        t_fixs = []
        n_staypolys = []
        for fragment in fragments:
            if VERBOSE >= 1:
                print fragment

            # Collect allele counts from patient samples, and return only positive hits
            # sns contains sample names and PCR types
            (aft, ind) = patient.get_allele_frequency_trajectories(fragment,
                                                               cov_min=cov_min,
                                                               depth_min=depth_min,
コード例 #14
0
    n_binsx = 8
    binsy = [0.,
             0.002,
             0.005, 0.009, 0.013, 0.025,
             0.04136464,  0.08089993,  0.12077255,
             0.16115779,  0.2022444 ,  0.24424043,  0.28738044,  0.33193475,
             0.37822187,  0.42662549,  0.4776187 ,  0.53179937,  0.58994409,
             0.65309361,  0.72269518,  0.80085467,  0.89081905,
             0.95, 0.975, 0.987, 0.991, 0.994,
             0.998,
             1.]
    pp = Propagator(n_binsx, binsy=binsy, use_logit=use_logit)
    hist = pp.histogram

    for pname, patient in patients.iterrows():
        patient = Patient(patient)
        samplenames = patient.samples.index

        if not fragments:
            fragments = ['F'+str(i) for i in xrange(1, 7)]
        if VERBOSE >= 2:
            print 'fragments', fragments
    
        # Iterate over samples and fragments
        for fragment in fragments:
            if VERBOSE >= 1:
                print pname, fragment
    
            aft, ind = patient.get_allele_frequency_trajectories(fragment,
                                                                 cov_min=depth_min)
コード例 #15
0
    fragments = args.fragments
    VERBOSE = args.verbose
    plot = args.plot
    use_PCR1 = args.PCR1

    patients = load_patients()
    if pnames != ['all']:
        patients = patients.iloc[patients.index.isin(pnames)]

    if not fragments:
        fragments = ['F'+str(i) for i in xrange(1, 7)]
    if VERBOSE >= 2:
        print 'fragments', fragments

    for pname, patient in patients.iterrows():
        patient = Patient(patient)
        patient.discard_nonsequenced_samples()

        for fragment in fragments:
            if VERBOSE >= 1:
                print pname, fragment

            covt, ind = patient.get_coverage_trajectories(fragment,
                                                          use_PCR1=use_PCR1)
            samples = patient.samples.iloc[ind]
            times = patient.times[ind]
            ntemplates = samples['n templates']

            if plot is not None:
                import matplotlib.pyplot as plt
                
コード例 #16
0
                        action='store_true',
                        help='Add mouse events to the plot')

    args = parser.parse_args()
    pnames = args.patients
    roi = args.roi
    VERBOSE = args.verbose
    use_plot = args.plot
    use_interactive = args.interactive

    patients = load_patients()
    if pnames is not None:
        patients = patients.loc[pnames]

    for pname, patient in patients.iterrows():
        patient = Patient(patient)

        (fragment, start, end) = patient.get_fragmented_roi(roi,
                                                            VERBOSE=VERBOSE)
        aft, ind = patient.get_allele_frequency_trajectories(fragment)
        aft = aft[:, :, start:end]

        # TODO: also calculate the logos

        ## Get only some time points
        #i = np.arange(len(ind))[::len(ind) // 2]
        #aft = aft[i]
        #ind = ind[i]

        times = patient.times[ind]
コード例 #17
0
    VERBOSE = args.verbose
    pnames = args.patients
    use_force = args.force
    fragments = args.fragments

    if not fragments:
        fragments = ['F'+str(i) for i in xrange(1, 7)]
    if VERBOSE >= 3:
        print 'fragments', fragments

    patients = load_patients()
    if pnames is not None:
        patients = patients.loc[pnames]

    for pname, patient in patients.iterrows():
        patient = Patient(patient)

        if VERBOSE >= 1:
            print 'Patient:', patient.name

        patient.discard_nonsequenced_samples()

        for fragment in fragments:
            if VERBOSE >= 1:
                print fragment

            # Check whether a reference exists at all
            ref_fn = patient.get_reference_filename(fragment)
            if not os.path.isfile(ref_fn):
                print 'ERROR: reference for fragment', fragment, 'not found!'
                continue
コード例 #18
0
ファイル: get_SFS.py プロジェクト: iosonofabio/hivwholeseq
    bins = np.exp(tbins)/(1+np.exp(tbins))
    binsc = np.sqrt(bins[1:] * bins[:-1])
    binw = np.diff(bins)
    hist = np.zeros_like(binsc)

    if not fragments:
        fragments = ['F'+str(i) for i in xrange(1, 7)]
    if VERBOSE >= 2:
        print 'fragments', fragments

    patients = load_patients()
    if pnames is not None:
        patients = patients.loc[pnames]

    for pname, patient in patients.iterrows():
        patient = Patient(patient)

        for fragment in fragments:
            if VERBOSE >= 1:
                print patient.name, fragment

            if VERBOSE >= 2:
                print 'Get initial allele frequencies'
            af0 = patient.get_initial_allele_frequencies(fragment, cov_min=depth_min)

            if VERBOSE >= 2:
                print 'Get allele frequencies'
            aft, ind = patient.get_allele_frequency_trajectories(fragment,
                                                                 depth_min=depth_min)

            if VERBOSE >= 2:
コード例 #19
0
                        help='Regions to analyze (e.g. V3 F6)')
    parser.add_argument('--verbose', type=int, default=0,
                        help='Verbosity level [0-4]')

    args = parser.parse_args()
    pnames = args.patients
    regions = args.regions
    VERBOSE = args.verbose
    use_plot = args.plot

    patients = load_patients()
    if pnames:
        patients = patients.loc[pnames]

    for pname, patient in patients.iterrows():
        patient = Patient(patient)
        patient.discard_nonsequenced_samples()

        for region in regions:
            if VERBOSE >= 1:
                print pname, region

            ali = patient.get_consensi_alignment(region)
            tree = patient.get_consensi_tree(region, format='json')

            if use_plot:
                fig, ax = plt.subplots(figsize=(15, 12))
                Phylo.draw(tree, do_show=False, axes=ax)
                ax.set_title(pname+', '+region)

                x_max = max(tree.depths().itervalues())
コード例 #20
0
        patients = patients.loc[pnames]
        # FIXME: the initial ref of p7 is mislabelled and a mess
    else:
        patients = patients.loc[patients.code != "p7"]

    # Prepare output structures
    n_binsx = 5
    binsy = [0.0, 0.002, 0.01, 0.025, 0.12, 0.33, 0.67, 0.88, 0.975, 0.99, 0.998, 1.0]
    props = {
        (gene, synkey): Propagator(n_binsx, binsy=binsy, use_logit=use_logit)
        for gene in genes
        for synkey in ("syn", "nonsyn")
    }

    for pname, patient in patients.iterrows():
        patient = Patient(patient)
        samplenames = patient.samples.index

        refseq = patient.get_reference("genomewide", format="gb")

        for gene in genes:

            if VERBOSE >= 1:
                print pname, gene,

            # Get the right fragment(s)
            # FIXME: do better than this ;-)
            frags = {"pol": ["F2", "F3"], "gag": ["F1"], "env": ["F5", "F6"]}
            fragments = frags[gene]

            if VERBOSE >= 1:
コード例 #21
0
def get_divergence(aft):
    '''Get divergence from allele frequency trajectories

    Looks up, at every position along the last axis of aft, the frequency of
    the allele given by the initial consensus, averages it along axis 1 of the
    resulting 2D array and returns one minus that average.
    '''
    anc_ind = Patient.get_initial_consensus_noinsertions(aft, return_ind=True)
    positions = np.arange(aft.shape[2])
    # Pair each position with its consensus allele index
    anc_freq = aft[:, anc_ind, positions]
    return 1 - anc_freq.mean(axis=1)
コード例 #22
0
ファイル: copy_data.py プロジェクト: 5l1v3r1/hivwholeseq
    patients_fn = dst_fn+'patients/'
    ref_fn = dst_fn+'reference/'

    print 'Make root folders'
    mkdirs(patients_fn)
    mkdirs(ref_fn)

    print 'Reference sequences'
    copy_reference(ref_fn)

    
    patients = load_patients()
    for pname, patient in patients.iterrows():
        print pname
        patient = Patient(patient)

        print 'Make folder'
        pat_fn = patients_fn+pname+os.sep
        mkdirs(pat_fn)

        print 'Mapping reference'
        copy_initial_reference(patient, pat_fn)

        
        print 'Coordinate maps'
        copy_folder(patient, pat_fn, 'coordinate_maps')


        print 'Alignments'
        copy_folder(patient, pat_fn, 'alignments')
コード例 #23
0
                        action='store_true',
                        help='Plot local haplotype trajectories')

    args = parser.parse_args()
    pnames = args.patients
    roi = args.roi
    VERBOSE = args.verbose
    use_plot = args.plot

    patients = load_patients()
    if pnames is not None:
        patients = patients.loc[pnames]

    for pname, patient in patients.iterrows():

        patient = Patient(patient)
        patient.discard_nonsequenced_samples()

        if VERBOSE >= 1:
            print patient.name, roi

        if VERBOSE >= 2:
            print 'Get haplotype trajectories'
        try:
            (ht, indt,
             htseqs) = patient.get_region_count_trajectories(roi[0],
                                                             VERBOSE=VERBOSE)
        except IOError:
            (ht, indt,
             htseqs) = patient.get_local_haplotype_count_trajectories(
                 roi, VERBOSE=VERBOSE)
コード例 #24
0
    args = parser.parse_args()
    VERBOSE = args.verbose
    fragments = args.fragments

    patients = load_patients()

    for fragment in fragments:
        if VERBOSE >= 1:
            print fragment

        refs = []
        for pname, patient in patients.iterrows():
            if VERBOSE >= 2:
                print pname
            patient = Patient(patient)
            refs.append(patient.get_reference(fragment))

        ali = align_muscle(*refs, sort=True)

        # Check whether all references are complete (using the longest primers)
        if VERBOSE >= 2:
            print 'Check alignment'
        alim = np.array(ali)
        if (alim[:, :4] == '-').any():
            raise ValueError('Gaps at the beginning of fragment found')
        elif (alim[:, -4:] == '-').any():
            raise ValueError('Gaps at the end of fragment found')

        if VERBOSE >= 2:
            print 'Save to file'
コード例 #25
0
    VERBOSE = args.verbose
    pnames = args.patients
    use_force = args.force
    fragments = args.fragments

    if not fragments:
        fragments = ['F' + str(i) for i in xrange(1, 7)]
    if VERBOSE >= 3:
        print 'fragments', fragments

    patients = load_patients()
    if pnames is not None:
        patients = patients.loc[pnames]

    for pname, patient in patients.iterrows():
        patient = Patient(patient)

        if VERBOSE >= 1:
            print 'Patient:', patient.name

        patient.discard_nonsequenced_samples()

        for fragment in fragments:
            if VERBOSE >= 1:
                print fragment

            # Check whether a reference exists at all
            ref_fn = patient.get_reference_filename(fragment)
            if not os.path.isfile(ref_fn):
                print 'ERROR: reference for fragment', fragment, 'not found!'
                continue
コード例 #26
0
    VERBOSE = args.verbose
    plot = args.plot
    block_length = args.block_length
    use_coverage = args.include_cov

    patients = load_patients()
    if pnames is not None:
        patients = patients.iloc[patients.index.isin(pnames)]

    if not fragments:
        fragments = ['F'+str(i) for i in xrange(1, 7)]
    if VERBOSE >= 2:
        print 'fragments', fragments

    for pname, patient in patients.iterrows():
        patient = Patient(patient)
        patient.discard_nonsequenced_samples()

        for ifr, fragment in enumerate(fragments):
            if VERBOSE >= 1:
                print pname, fragment

            dg, ind, block_length, L = \
                    patient.get_divergence_trajectory_local(fragment,
                                                            block_length=block_length)
            ds, ind, block_length, L = \
                    patient.get_diversity_trajectory_local(fragment,
                                                           block_length=block_length)
            patient.dg = dg
            patient.ds = ds
            patient.ind = ind
コード例 #27
0
                        help='Save annotated reference to file')
    parser.add_argument('--force', action='store_true',
                        help='Go ahead even if annotations differ from existing sequence')

    args = parser.parse_args()
    VERBOSE = args.verbose
    pnames = args.patients
    use_save = args.save
    use_force = args.force

    patients = load_patients()
    if pnames is not None:
        patients = patients.loc[patients.index.isin(pnames)]

    for pname, patient in patients.iterrows():
        patient = Patient(patient)

        if VERBOSE:
            print 'Patient:', patient.name

        fn = patient.get_reference_filename('genomewide')
        refseq = SeqIO.read(fn, 'fasta', alphabet=ambiguous_dna)

        fragment_edges = get_edges_fragments(patient, VERBOSE=VERBOSE)
        annotate_sequence(refseq, VERBOSE=VERBOSE,
                          additional_edges={'fragment': fragment_edges})

        if VERBOSE >= 1:
            for feature in refseq.features:
                if feature.id[0] == 'F':
                    continue
コード例 #28
0
    pnames = args.patients
    regions = args.regions
    VERBOSE = args.verbose
    save_to_file = args.save

    patients = load_patients()
    if pnames is not None:
        patients = patients.loc[pnames]
    if VERBOSE >= 3:
        print 'patients', patients.index
    if not len(patients):
        raise ValueError('No patients found!')

    maps_coord = defaultdict(dict)
    for pname, patient in patients.iterrows():
        patient = Patient(patient)

        # Make maps for all annotations if not explicit
        if regions is None:
            patseqann = patient.get_reference('genomewide', format='gb')
            regionspat = map(attrgetter('id'),
                             patseqann.features) + ['genomewide']
        else:
            regionspat = regions

        for region in regionspat:
            if VERBOSE >= 1:
                print pname, region

            coomap = patient.get_map_coordinates_reference(region,
                                                           refname=refname)
コード例 #29
0
ファイル: get_tree_local.py プロジェクト: 5l1v3r1/hivwholeseq
                        help='Minimal frequency to keep the haplotype')

    args = parser.parse_args()
    pnames = args.patients
    roi = args.roi
    VERBOSE = args.verbose
    maxreads = args.maxreads
    use_plot = args.plot
    freqmin = args.freqmin

    patients = load_patients()
    if pnames is not None:
        patients = patients.loc[pnames]

    for pname, patient in patients.iterrows():
        patient = Patient(patient)

        if VERBOSE >= 1:
            print pname

        if os.path.isfile(
                patient.get_local_tree_filename(roi[0], format='json')):
            if VERBOSE >= 2:
                print 'Get tree'
            region = roi[0]
            tree = patient.get_local_tree(region)

        elif os.path.isfile(
                patient.get_local_tree_filename(' '.join(map(str, roi)),
                                                format='json')):
            if VERBOSE >= 2:
コード例 #30
0
    args = parser.parse_args()
    VERBOSE = args.verbose
    fragments = args.fragments

    patients = load_patients()

    for fragment in fragments:
        if VERBOSE >= 1:
            print fragment

        refs = []
        for pname, patient in patients.iterrows():
            if VERBOSE >= 2:
                print pname
            patient = Patient(patient)
            refs.append(patient.get_reference(fragment))

        ali = align_muscle(*refs, sort=True)

        # Check whether all references are complete (using the longest primers)
        if VERBOSE >= 2:
            print 'Check alignment'
        alim = np.array(ali)
        if (alim[:, :4] == '-').any():
            raise ValueError('Gaps at the beginning of fragment found')
        elif (alim[:, -4:] == '-').any():
            raise ValueError('Gaps at the end of fragment found')

        if VERBOSE >= 2:
            print 'Save to file'
コード例 #31
0
                        help='Regions to analyze (e.g. V3 F6)')
    parser.add_argument('--verbose', type=int, default=0,
                        help='Verbosity level [0-4]')

    args = parser.parse_args()
    pnames = args.patients
    regions = args.regions
    VERBOSE = args.verbose

    patients = load_patients()
    if pnames:
        patients = patients.loc[pnames]

    alis = {}
    for pname, patient in patients.iterrows():
        patient = Patient(patient)
        patient.discard_nonsequenced_samples()

        # Guess regions if not specified
        if regions is None:
            refseq_gw = patient.get_reference('genomewide', 'gb')
            regionspat = map(attrgetter('id'), refseq_gw.features) + ['genomewide']
        else:
            regionspat = regions

        for region in regionspat:
            if VERBOSE >= 1:
                print pname, region

            ali = patient.get_consensi_alignment(region)
コード例 #32
0
    # Unpack the remaining command-line options (`args` and `pnames` are set
    # above this excerpt).
    fragments = args.fragments
    VERBOSE = args.verbose
    plot = args.plot
    use_PCR1 = args.PCR1

    patients = load_patients()
    # The literal list ['all'] is the sentinel for "keep every patient".
    # Otherwise keep the rows whose index label is in pnames; labels that are
    # not in the index are silently ignored by isin().
    if pnames != ['all']:
        patients = patients.iloc[patients.index.isin(pnames)]

    # Default to the six fragments F1..F6 when none are specified.
    if not fragments:
        fragments = ['F' + str(i) for i in xrange(1, 7)]
    if VERBOSE >= 2:
        print 'fragments', fragments

    for pname, patient in patients.iterrows():
        # Wrap the table row into a Patient object (shadows the row variable).
        patient = Patient(patient)
        patient.discard_nonsequenced_samples()

        for fragment in fragments:
            if VERBOSE >= 1:
                print pname, fragment

            # `ind` is used below to index both the sample table and the time
            # array, so it selects the samples that `covt` refers to.
            covt, ind = patient.get_coverage_trajectories(fragment,
                                                          use_PCR1=use_PCR1)
            samples = patient.samples.iloc[ind]
            times = patient.times[ind]
            ntemplates = samples['n templates']

            # NOTE(review): whether `plot` can ever be None depends on the
            # argparse definition, which is outside this excerpt -- confirm.
            if plot is not None:
                # Imported lazily so matplotlib is only needed when plotting.
                import matplotlib.pyplot as plt
コード例 #33
0
    parser.add_argument('--plot', action='store_true',
                        help='Plot local haplotype trajectories')

    # Read the command-line options.
    args = parser.parse_args()
    pnames = args.patients
    regions = args.regions
    VERBOSE = args.verbose
    use_plot = args.plot

    # Load the patient table and, if names were given, select those rows by
    # index label.
    patients = load_patients()
    if pnames is not None:
        patients = patients.loc[pnames]

    # NOTE(review): `data` is not appended to within this excerpt; presumably
    # filled further down -- confirm.
    data = []
    for pname, patient in patients.iterrows():
        # Wrap the table row into a Patient object (shadows the row variable).
        patient = Patient(patient)

        # NOTE(review): `regions` is iterated without a None check here, unlike
        # sibling scripts that derive regions from the reference annotation.
        for region in regions:
            if VERBOSE >= 1:
                print patient.name, region

            if VERBOSE >= 2:
                print 'Get haplotype counts'
            # Returns haplotype count trajectories, the indices of the samples
            # used (see `times` below) and the haplotype sequences.
            (hct, ind, seqs) = patient.get_region_count_trajectories(region,
                                                                     VERBOSE=VERBOSE)
            
            # Time points of the samples that were actually used.
            times = patient.times[ind]

            if VERBOSE >= 2:
                print 'Align sequences'
            # Align all haplotype sequences in a single call.
            ali = align_muscle(*seqs, sort=True)
コード例 #34
0
    # Remaining command-line switches (`args`, `pnames`, `fragments` and
    # `VERBOSE` are set above this excerpt).
    use_save = args.save
    use_recover = args.recover

    # Load the patient table and, if names were given, select those rows by
    # index label.
    patients = load_patients()
    if pnames is not None:
        patients = patients.loc[pnames]

    # Default to the six fragments F1..F6 when none are specified.
    if not fragments:
        fragments = ['F'+str(i) for i in xrange(1, 7)]
    if VERBOSE >= 2:
        print 'fragments', fragments

    # Recovery mode: restore each reference file from its '_old' backup copy.
    if use_recover:
        for pname, patient in patients.iterrows():
            print pname
            # Wrap the table row into a Patient object (shadows the row variable).
            patient = Patient(patient)
            patient.discard_nonsequenced_samples()

            for fragment in fragments:
                if VERBOSE >= 1:
                    print fragment

                # The backup is expected next to the reference, with '.fasta'
                # replaced by '_old.fasta' in the file name.
                fn = patient.get_reference_filename(fragment)
                fn_old = fn.replace('.fasta', '_old.fasta')
                if not os.path.isfile(fn_old):
                    print 'Old reference not found, skipping'
                    continue
                # Overwrite the current reference with the backup copy.
                shutil.copy(fn_old, fn)
                # Restored file: readable by user, group and others; writable
                # by the owner only.
                os.chmod(fn, (stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH | \
                              stat.S_IWUSR))
                os.chmod(fn_old, (stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH | \
コード例 #35
0
    # Unpack the remaining command-line options (`args`, `pnames` and
    # `fragments` are set above this excerpt).
    VERBOSE = args.verbose
    plot = args.plot
    block_length = args.block_length
    use_coverage = args.include_cov

    patients = load_patients()
    # Keep the rows whose index label is in pnames; labels that are not in the
    # index are silently ignored by isin().
    if pnames is not None:
        patients = patients.iloc[patients.index.isin(pnames)]

    # Default to the six fragments F1..F6 when none are specified.
    if not fragments:
        fragments = ['F' + str(i) for i in xrange(1, 7)]
    if VERBOSE >= 2:
        print 'fragments', fragments

    for pname, patient in patients.iterrows():
        # Wrap the table row into a Patient object (shadows the row variable).
        patient = Patient(patient)
        patient.discard_nonsequenced_samples()

        for ifr, fragment in enumerate(fragments):
            if VERBOSE >= 1:
                print pname, fragment

            # Local divergence, then local diversity, for this fragment.
            # Both calls rebind `ind`, `block_length` and `L`: the values from
            # the second call win, and the returned block_length is what gets
            # passed in on every later call and iteration.
            # NOTE(review): confirm that reusing the returned block_length
            # across fragments and patients is intended.
            dg, ind, block_length, L = \
                    patient.get_divergence_trajectory_local(fragment,
                                                            block_length=block_length)
            ds, ind, block_length, L = \
                    patient.get_diversity_trajectory_local(fragment,
                                                           block_length=block_length)
            # Attach the results to the Patient object as plain attributes.
            patient.dg = dg
            patient.ds = ds
            patient.ind = ind
コード例 #36
0
    parser.add_argument('--patients', nargs='+', help='Patient to analyze')
    parser.add_argument('--force',
                        action='store_true',
                        help='Do not stop for errors')

    # Read the command-line options.
    args = parser.parse_args()
    VERBOSE = args.verbose
    pnames = args.patients
    use_force = args.force

    # Load the patient table and, if names were given, select those rows by
    # index label.
    patients = load_patients()
    if pnames is not None:
        patients = patients.loc[pnames]

    for pname, patient in patients.iterrows():
        # Wrap the table row into a Patient object (shadows the row variable).
        patient = Patient(patient)

        if VERBOSE >= 1:
            print 'Patient:', patient.name

        # Annotated genomewide reference ('gb' format) of this patient.
        ref = patient.get_reference('genomewide', 'gb')

        # Only features whose type is 'protein' are checked.
        for fea in ref.features:
            if fea.type == 'protein':
                if VERBOSE >= 2:
                    print 'Checking', fea.id
                # check_protein signals a failed check with ValueError; with
                # --force the error is reported by a printed message (the
                # non-force branch lies outside this excerpt).
                try:
                    check_protein(fea, ref, VERBOSE=VERBOSE)
                except ValueError:
                    if use_force:
                        print 'ERROR!'
コード例 #37
0
        Srefind[ind] = i
    # Every entry of Srefind that is still negative is assigned the index
    # len(S_bins) - 2.
    # NOTE(review): negative values presumably mark positions not assigned to
    # any bin by the loop above this excerpt -- confirm.
    Srefind[Srefind < 0] = len(S_bins) - 2
    
    # Default to the six fragments F1..F6 when none are specified.
    if not fragments:
        fragments = ['F'+str(i) for i in xrange(1, 7)]
    if VERBOSE >= 2:
        print 'fragments', fragments

    # Load the patient table and, if names were given, select those rows by
    # index label.
    patients = load_patients()
    if pnames is not None:
        patients = patients.loc[pnames]

    if VERBOSE >= 1:
        print 'Analyze patients'
    for pname, patient in patients.iterrows():
        # Wrap the table row into a Patient object (shadows the row variable).
        patient = Patient(patient)

        for fragment in fragments:
            if VERBOSE >= 1:
                print patient.name, fragment

            # Coordinate map between this patient's fragment and the reference
            # named `refname` (defined outside this excerpt).
            mapco = patient.get_map_coordinates_reference(fragment, refname=refname)

            if VERBOSE >= 2:
                print 'Get initial allele frequencies'
            af0 = patient.get_initial_allele_frequencies(fragment, cov_min=depth_min)

            if VERBOSE >= 2:
                print 'Get allele frequencies'
            # NOTE(review): the threshold is passed as `depth_min=` here but as
            # `cov_min=` in the call just above and in other scripts that call
            # get_allele_frequency_trajectories -- confirm which keyword the
            # current Patient API accepts.
            aft, ind = patient.get_allele_frequency_trajectories(fragment,
                                                                 depth_min=depth_min)
コード例 #38
0
    # Read the command-line options.
    args = parser.parse_args()
    pnames = args.patients
    regions = args.regions
    VERBOSE = args.verbose
    plot = args.plot

    # Load the patient table and, if names were given, select those rows by
    # index label.
    patients = load_patients()
    if pnames is not None:
        patients = patients.loc[pnames]
    # From here on pnames is the list of index labels of the selected patients,
    # also when no names were given on the command line.
    pnames = patients.index.tolist()

    # One dict per (patient, region) pair for which divergence was obtained.
    data = []

    for pname, patient in patients.iterrows():
        # Wrap the table row into a Patient object (shadows the row variable).
        patient = Patient(patient)
        patient.discard_nonsequenced_samples()

        for ifr, region in enumerate(regions):
            if VERBOSE >= 1:
                print pname, region

            # A ValueError from get_divergence skips this region silently.
            try:
                dg, ind = patient.get_divergence(region, cov_min=10)
            except ValueError:
                continue
            # Time points of the samples selected by `ind`.
            times = patient.times[ind]

            data.append({'pname': pname, 'region': region, 'dg': dg, 't': times})

    if VERBOSE >= 1:
コード例 #39
0
                        help='Plot the allele frequency trajectories')

    # Read the command-line options.
    args = parser.parse_args()
    pnames = args.patients
    regions = args.regions
    VERBOSE = args.verbose
    plot = args.plot

    # Load the patient table and, if names were given, select those rows by
    # index label.
    patients = load_patients()
    if pnames is not None:
        patients = patients.loc[pnames]

    # One dict per (patient, region) pair.
    data = []

    for pname, patient in patients.iterrows():
        # Wrap the table row into a Patient object (shadows the row variable).
        patient = Patient(patient)
        patient.discard_nonsequenced_samples()

        for ifr, region in enumerate(regions):
            if VERBOSE >= 1:
                print pname, region

            # Allele frequency trajectories with cov_min=10, plus the indices
            # of the samples used (see `times` below).
            aft, ind = patient.get_allele_frequency_trajectories(region,
                                                                 cov_min=10)
            times = patient.times[ind]

            # Divergence and diversity are both derived from the same
            # trajectories by helpers defined outside this excerpt.
            dg = get_divergence(aft)
            ds = get_diversity(aft)

            data.append({'pname': pname, 'region': region, 'dg': dg, 'ds': ds, 't': times})
コード例 #40
0
    # Unpack the command-line options (`args` is parsed above this excerpt).
    pnames = args.patients
    regions = args.regions
    VERBOSE = args.verbose
    save_to_file = args.save

    # Load the patient table and, if names were given, select those rows by
    # index label.
    patients = load_patients()
    if pnames is not None:
        patients = patients.loc[pnames]
    if VERBOSE >= 3:
        print 'patients', patients.index
    # Fail early instead of silently doing nothing on an empty selection.
    if not len(patients):
        raise ValueError('No patients found!')

    # Nested mapping; inner dicts are created on first access.
    # NOTE(review): not filled within this excerpt; the key layout is set
    # further down -- confirm.
    maps_coord = defaultdict(dict)
    for pname, patient in patients.iterrows():
        # Wrap the table row into a Patient object (shadows the row variable).
        patient = Patient(patient)

        # Make maps for all annotations if not explicit
        if regions is None:
            patseqann = patient.get_reference('genomewide', format='gb')
            # Python 2 only: map() returns a list here, so it can be
            # concatenated with another list.
            regionspat = map(attrgetter('id'), patseqann.features) + ['genomewide']
        else:
            regionspat = regions

        for region in regionspat:
            if VERBOSE >= 1:
                print pname, region

            # Coordinate map between this patient's region and the reference
            # named `refname` (defined outside this excerpt).
            coomap = patient.get_map_coordinates_reference(region,
                                                           refname=refname)
コード例 #41
0
        # FIXME: the initial ref of p7 is mislabelled and a mess
    else:
        patients = patients.loc[patients.code != 'p7']

    # Prepare output structures
    n_binsx = 5
    # Bin edges handed to every Propagator. The values are symmetric around
    # 0.5 and finer towards 0 and 1.
    # NOTE(review): presumably allele-frequency bin edges -- confirm.
    binsy = [
        0., 0.002, 0.01, 0.025, 0.12, 0.33, 0.67, 0.88, 0.975, 0.99, 0.998, 1.
    ]
    # One independent Propagator for each (gene, 'syn' / 'nonsyn') pair.
    props = {(gene, synkey): Propagator(n_binsx,
                                        binsy=binsy,
                                        use_logit=use_logit)
             for gene in genes for synkey in ('syn', 'nonsyn')}

    for pname, patient in patients.iterrows():
        # Wrap the table row into a Patient object (shadows the row variable).
        patient = Patient(patient)
        samplenames = patient.samples.index

        # Annotated genomewide reference ('gb' format) of this patient.
        refseq = patient.get_reference('genomewide', format='gb')

        for gene in genes:

            # Python 2: the trailing comma keeps the output on the same line.
            if VERBOSE >= 1:
                print pname, gene,

            # Get the right fragment(s)
            # FIXME: do better than this ;-)
            # Hard-coded gene -> fragment lookup; any gene other than pol, gag
            # or env raises KeyError here.
            frags = {'pol': ['F2', 'F3'], 'gag': ['F1'], 'env': ['F5', 'F6']}
            fragments = frags[gene]

            if VERBOSE >= 1:
コード例 #42
0
        action='store_true',
        help='Plot phylogenetic tree. Requires --save and --tree')

    # Read the command-line options.
    args = parser.parse_args()
    pnames = args.patients
    regions = args.regions
    VERBOSE = args.verbose
    use_save = args.save
    use_plot = args.plot

    patients = load_patients()
    # The literal list ['all'] is the sentinel for "keep every patient".
    # Otherwise keep the rows whose index label is in pnames; labels that are
    # not in the index are silently ignored by isin().
    if pnames != ['all']:
        patients = patients.iloc[patients.index.isin(pnames)]

    for pname, patient in patients.iterrows():
        # Wrap the table row into a Patient object (shadows the row variable).
        patient = Patient(patient)
        patient.discard_nonsequenced_samples()

        # Without explicit regions, use every feature id of the annotated
        # genomewide reference plus the literal 'genomewide'.
        if regions is None:
            refseq_gw = patient.get_reference('genomewide', 'gb')
            # Python 2 only: map() returns a list here, so it can be
            # concatenated with another list.
            regionspat = map(attrgetter('id'),
                             refseq_gw.features) + ['genomewide']
        else:
            regionspat = regions

        for region in regionspat:
            if VERBOSE >= 1:
                print pname, region
                # At verbosity exactly 1 an extra empty line is printed after
                # the header.
                if VERBOSE == 1:
                    print ''
コード例 #43
0
    # Unpack the command-line options (`args` is parsed above this excerpt).
    pnames = args.patients
    regions = args.regions
    VERBOSE = args.verbose
    save_to_file = args.save

    # Load the patient table and, if names were given, select those rows by
    # index label.
    patients = load_patients()
    if pnames is not None:
        patients = patients.loc[pnames]
    if VERBOSE >= 3:
        print 'patients', patients.index
    # Fail early instead of silently doing nothing on an empty selection.
    if not len(patients):
        raise ValueError('No patients found!')

    # Nested mapping; inner dicts are created on first access.
    # NOTE(review): not filled within this excerpt; the key layout is set
    # further down -- confirm.
    maps_coord = defaultdict(dict)
    for pname, patient in patients.iterrows():
        # Wrap the table row into a Patient object (shadows the row variable).
        patient = Patient(patient)

        # Make maps for all annotations if not explicit
        if regions is None:
            patseqann = patient.get_reference('genomewide', format='gb')
            # Python 2 only: map() returns a list here, so it can be
            # concatenated with another list.
            regionspat = map(attrgetter('id'), patseqann.features) + ['genomewide']
        else:
            regionspat = regions

        for region in regionspat:
            if VERBOSE >= 1:
                print pname, region

            # Load both sides for this region: the custom reference named
            # `refname` (defined outside this excerpt) and the patient's own
            # reference.
            refseq = load_custom_reference(refname, format='gb', region=region)
            patseq = patient.get_reference(region)
コード例 #44
0

# Globals



# Functions



# Script
# Store the genomewide insertion trajectories of every patient as a pickled
# pandas Series.
if __name__ == '__main__':

    patients = load_patients()
    for pname, patient in patients.iterrows():
        # Wrap the table row into a Patient object (shadows the row variable).
        patient = Patient(patient)
        print patient.code, patient.name

        # Allele count trajectories
        (inse, ind) = patient.get_insertion_trajectories('genomewide')
        # Skip patients for which no sample index came back.
        # NOTE(review): this truthiness test only works if `ind` is a list or
        # similar; a numpy array with more than one element would raise
        # ValueError here -- confirm the return type.
        if not ind:
            continue
        inse = pd.Series(inse, name='insertion counts')
        # Three index level names are assigned, so `inse` is assumed to be
        # keyed by (DSI, position, insertion) triples -- TODO confirm.
        inse.index.names = ['DSI', 'position', 'insertion']

        # Write to file
        fn_out = get_fn_out_traj(patient.code, 'genomewide')
        # Make sure the output folder exists before pickling.
        mkdirs(os.path.dirname(fn_out))
        inse.to_pickle(fn_out)

        # Sample by sample
コード例 #45
0
    # Load the patient table and, if names were given, select those rows by
    # index label (`pnames` and the other options are set above this excerpt).
    patients = load_patients()
    if pnames is not None:
        patients = patients.loc[pnames]

    # NOTE(review): `data` is not appended to within this excerpt; presumably
    # filled further down -- confirm.
    data = []
    for pname, patient in patients.iterrows():
        # `patient` is still the raw table row here; `.code` reads its column.
        if VERBOSE >= 1:
            print patient.code, start, end

        # In submit mode this patient is handed to fork_self with the same
        # window parameters and nothing is computed in this process.
        # NOTE(review): fork_self presumably launches a separate job -- confirm.
        if submit:
            fork_self(patient.code, width, gap, start, end, VERBOSE=VERBOSE,
                      freqmin=freqmin, countmin=countmin)
            continue

        # Wrap the table row into a Patient object (shadows the row variable).
        patient = Patient(patient)
        ref = patient.get_reference('genomewide')
        L = len(ref)

        # Walk over windows starting at `start`. Each window end is clipped to
        # both `end` and the reference length L. The update of win_start lies
        # outside this excerpt.
        win_start = start
        while win_start + width - gap < min(L, end):
            win_end = min(win_start + width, end, L)

            if VERBOSE >= 1:
                print patient.code, win_start, win_end
    
            if VERBOSE >= 2:
                print 'Get region haplotypes'
            try:
                datum = patient.get_local_haplotype_count_trajectories(\
                               'genomewide',
コード例 #46
0
    parser.add_argument("--plot", action="store_true", help="Plot the logos")
    parser.add_argument("--interactive", action="store_true", help="Add mouse events to the plot")

    # Read the command-line options.
    args = parser.parse_args()
    pnames = args.patients
    roi = args.roi
    VERBOSE = args.verbose
    use_plot = args.plot
    use_interactive = args.interactive

    # Load the patient table and, if names were given, select those rows by
    # index label.
    patients = load_patients()
    if pnames is not None:
        patients = patients.loc[pnames]

    for pname, patient in patients.iterrows():
        # Wrap the table row into a Patient object (shadows the row variable).
        patient = Patient(patient)

        # Translate the region of interest into a fragment name plus start/end
        # coordinates, then fetch the trajectories for that whole fragment.
        (fragment, start, end) = patient.get_fragmented_roi(roi, VERBOSE=VERBOSE)
        aft, ind = patient.get_allele_frequency_trajectories(fragment)
        # Restrict the third axis to the ROI.
        # Assumes aft is laid out as (time, allele, position) -- TODO confirm.
        aft = aft[:, :, start:end]

        # TODO: also calculate the logos

        ## Get only some time points
        # i = np.arange(len(ind))[::len(ind) // 2]
        # aft = aft[i]
        # ind = ind[i]

        # Time points of the samples selected by `ind`.
        times = patient.times[ind]

        if use_plot: