def loadK31(reg, filepath, fromHIV=False):
    '''Loading data for 31 additional patients

    Input arguments:
    reg: name of genetic region (gag or pol)
    filepath: path to directory where the frequency data are to be
        stored/downloaded. File names are appended to it verbatim, so a
        directory must end with a path separator.
    fromHIV: download raw data and store them, if True; use stored data,
        if False

    Returns:
    dict mapping '<patient code>_<sample code>' to a tuple
    (TI, af), where TI is the difference in days between the sample date
    and the patient's "infect date best" and af is the masked array of
    allele frequencies of that sample.
    '''
    info_fn = filepath + 'K31_info_{}.txt'.format(reg)
    data = {}

    if fromHIV:
        # The hivwholeseq package is only reachable through this path;
        # avoid growing sys.path when the function is called repeatedly.
        hiv_path = "/scicore/home/neher/neher/HIV/hivwholeseq"
        if hiv_path not in sys.path:
            sys.path.append(hiv_path)
        from hivwholeseq.patients.patients import load_patients, Patient

        pats = load_patients(csv=True)
        fmt = "%d/%m/%Y"

        # The context manager closes the info file even on an unexpected exit
        with open(info_fn, 'w') as fhandle:
            for pcode, pat in pats.iterrows():
                # Best effort: a patient/sample that fails is reported and
                # skipped. Only Exception is caught, so Ctrl-C still works.
                try:
                    EDI = datetime.strptime(pat["infect date best"], fmt)
                    P = Patient(pat)
                    aft = P.get_allele_frequency_trajectories(
                        reg, cov_min=500)[0]
                    for si, (scode, sample) in enumerate(P.samples.iterrows()):
                        try:
                            date = datetime.strptime(sample["date"], fmt)
                            af = aft[si]
                            TI = date.toordinal() - EDI.toordinal()

                            prefix = filepath + '{}_{}_{}'.format(
                                pcode, scode, reg)
                            np.save(prefix + '_data.npy', af.data)
                            np.save(prefix + '_mask.npy', af.mask)

                            # Index the sample only after both arrays are
                            # on disk, so the info file never points to
                            # missing data.
                            fhandle.write(
                                '{}\t{}\t{}\n'.format(pcode, scode, TI))
                            data['{}_{}'.format(pcode, scode)] = (TI, af)
                            print(pcode, scode, "WORKED!!!")
                        except Exception:
                            print(scode, "didn't work")
                except Exception:
                    print("skipping patient ", pcode)

    else:
        with open(info_fn, 'r') as fhandle:
            for line in fhandle:
                words = line.split()
                # Skip blank or truncated lines instead of crashing on them
                if len(words) < 3:
                    continue
                pat_name = '_'.join(words[:2])
                prefix = filepath + '{}_{}'.format(pat_name, reg)
                af_data = np.load(prefix + '_data.npy')
                af_mask = np.load(prefix + '_mask.npy')
                af = np.ma.masked_array(af_data, mask=af_mask)
                data[pat_name] = (int(words[2]), af)

    return data
def get_template_numbers(patients, VERBOSE=0): '''Collect template numbers from all patient samples''' data = [] for pname, patient in patients.iterrows(): patient = Patient(patient) if VERBOSE: print pname, patient.code samples = patient.samples n_approx = samples['templates approx'] dils = [get_dilution(x) for x in samples['dilutions']] n_dils = [2 * estimate_ntemplates_Poisson(x) for x in dils] # Attach sample date info age = np.array((datetime.datetime.now() - samples.date)) / 86400e9 data.append({ 'n_approx': n_approx, 'n_dil': n_dils, 'age': age, 'pname': patient.code }) return data
def get_divergence_diversity_sliding(aft, block_length, VERBOSE=0):
    '''Get local divergence and diversity in a sliding window

    Parameters:
       aft: 3-dimensional array of allele frequencies; axis 1 is indexed by
            allele and axis 2 by position (axis 0 presumably runs over
            time points, TODO confirm). May be masked.
       block_length: width of the sliding window, in positions
       VERBOSE: unused, kept for interface compatibility

    Returns:
       (x, dg, ds) where x holds the window centers, dg the windowed
       average frequency of non-consensus alleles and ds the windowed
       average of sum_alleles f * (1 - f). Windows that are not fully
       covered by unmasked positions are masked in dg and ds.
    '''
    struct = np.ones(block_length)

    def window_sum(arr):
        '''Sum arr over every full window of block_length along axis 1'''
        return np.apply_along_axis(
            lambda row: np.convolve(row, struct, mode='valid'),
            axis=1, arr=arr)

    cons_ind = Patient.get_initial_consensus_noinsertions(aft, return_ind=True)

    # Consensus index 5 gets special treatment (presumably 'N', judging by
    # the variable name): point it to a valid allele for the fancy
    # indexing below, then zero its contribution.
    ind_N = cons_ind == 5
    cons_ind[ind_N] = 0
    aft_nonanc = 1.0 - aft[:, cons_ind, np.arange(aft.shape[2])]
    aft_nonanc[:, ind_N] = 0

    aft_var = (aft * (1 - aft)).sum(axis=1)

    dg = np.ma.array(window_sum(aft_nonanc), hard_mask=True)
    ds = np.ma.array(window_sum(aft_var), hard_mask=True)

    # NOTE: normalization happens based on actual coverage
    # Logical NOT of the mask: unary minus on boolean arrays raises a
    # TypeError in recent numpy. getmaskarray always returns a full
    # boolean array, also when nothing is masked.
    covered = ~np.ma.getmaskarray(aft[:, 0])
    norm = window_sum(covered)

    dg.mask = norm < block_length
    dg /= norm
    ds.mask = norm < block_length
    ds /= norm

    x = np.arange(dg.shape[1]) + (block_length - 1) / 2.0

    return (x, dg, ds)
def get_divergence_diversity_sliding(aft, block_length, VERBOSE=0):
    '''Get local divergence and diversity in a sliding window

    Parameters:
       aft: 3-dimensional array of allele frequencies; axis 1 is indexed by
            allele and axis 2 by position (axis 0 presumably runs over
            time points, TODO confirm). May be masked.
       block_length: width of the sliding window, in positions
       VERBOSE: unused, kept for interface compatibility

    Returns:
       (x, dg, ds) where x holds the window centers, dg the windowed
       average frequency of non-consensus alleles and ds the windowed
       average of sum_alleles f * (1 - f). Windows that are not fully
       covered by unmasked positions are masked in dg and ds.
    '''
    struct = np.ones(block_length)

    def window_sum(arr):
        '''Sum arr over every full window of block_length along axis 1'''
        return np.apply_along_axis(
            lambda row: np.convolve(row, struct, mode='valid'),
            axis=1, arr=arr)

    cons_ind = Patient.get_initial_consensus_noinsertions(aft, return_ind=True)

    # Consensus index 5 gets special treatment (presumably 'N', judging by
    # the variable name): point it to a valid allele for the fancy
    # indexing below, then zero its contribution.
    ind_N = cons_ind == 5
    cons_ind[ind_N] = 0
    aft_nonanc = 1.0 - aft[:, cons_ind, np.arange(aft.shape[2])]
    aft_nonanc[:, ind_N] = 0

    aft_var = (aft * (1 - aft)).sum(axis=1)

    dg = np.ma.array(window_sum(aft_nonanc), hard_mask=True)
    ds = np.ma.array(window_sum(aft_var), hard_mask=True)

    # NOTE: normalization happens based on actual coverage
    # Logical NOT of the mask: unary minus on boolean arrays raises a
    # TypeError in recent numpy. getmaskarray always returns a full
    # boolean array, also when nothing is masked.
    covered = ~np.ma.getmaskarray(aft[:, 0])
    norm = window_sum(covered)

    dg.mask = norm < block_length
    dg /= norm
    ds.mask = norm < block_length
    ds /= norm

    x = np.arange(dg.shape[1]) + (block_length - 1) / 2.0

    return (x, dg, ds)
pnames = args.patients regions = args.regions VERBOSE = args.verbose save_to_file = args.save patients = load_patients() if pnames is not None: patients = patients.loc[pnames] if VERBOSE >= 3: print 'patients', patients.index if not len(patients): raise ValueError('No patients found!') maps_coord = defaultdict(dict) for pname, patient in patients.iterrows(): patient = Patient(patient) # Make maps for all annotations if not explicit if regions is None: patseqann = patient.get_reference('genomewide', format='gb') regionspat = map(attrgetter('id'), patseqann.features) + ['genomewide'] else: regionspat = regions for region in regionspat: if VERBOSE >= 1: print pname, region refseq = load_custom_reference(refname, format='gb', region=region) patseq = patient.get_reference(region)
for pname, patient in patients.iterrows(): if VERBOSE >= 1: print patient.code, start, end if submit: fork_self(patient.code, width, gap, start, end, VERBOSE=VERBOSE, freqmin=freqmin, countmin=countmin) continue patient = Patient(patient) ref = patient.get_reference('genomewide') L = len(ref) win_start = start while win_start + width - gap < min(L, end): win_end = min(win_start + width, end, L) if VERBOSE >= 1: print patient.code, win_start, win_end if VERBOSE >= 2: print 'Get region haplotypes' try: datum = patient.get_local_haplotype_count_trajectories(\ 'genomewide',
af_bd = [0.05, 0.95] data = {} patients = load_patients() if pnames is not None: patients = patients.loc[pnames] if not fragments: fragments = ['F' + str(i) for i in xrange(1, 7)] if VERBOSE >= 2: print 'fragments', fragments for pname, patient in patients.iterrows(): print pname patient = Patient(patient) patient.discard_nonsequenced_samples() t_bds = [] t_loss = [] t_fixs = [] n_staypolys = [] for fragment in fragments: if VERBOSE >= 1: print fragment # Collect allele counts from patient samples, and return only positive hits # sns contains sample names and PCR types (aft, ind) = patient.get_allele_frequency_trajectories( fragment, cov_min=cov_min,
parser.add_argument('--plot', action='store_true', help='Plot phylogenetic tree. Requires --save and --tree') args = parser.parse_args() pnames = args.patients regions = args.regions VERBOSE = args.verbose use_save = args.save use_plot = args.plot patients = load_patients() if pnames != ['all']: patients = patients.iloc[patients.index.isin(pnames)] for pname, patient in patients.iterrows(): patient = Patient(patient) patient.discard_nonsequenced_samples() if regions is None: refseq_gw = patient.get_reference('genomewide', 'gb') regionspat = map(attrgetter('id'), refseq_gw.features) + ['genomewide'] else: regionspat = regions for region in regionspat: if VERBOSE >= 1: print pname, region if VERBOSE == 1: print ''
help='Patient to analyze') parser.add_argument('--force', action='store_true', help='Do not stop for errors') args = parser.parse_args() VERBOSE = args.verbose pnames = args.patients use_force = args.force patients = load_patients() if pnames is not None: patients = patients.loc[pnames] for pname, patient in patients.iterrows(): patient = Patient(patient) if VERBOSE >= 1: print 'Patient:', patient.name ref = patient.get_reference('genomewide', 'gb') for fea in ref.features: if fea.type == 'protein': if VERBOSE >= 2: print 'Checking', fea.id try: check_protein(fea, ref, VERBOSE=VERBOSE) except ValueError: if use_force: print 'ERROR!'
parser.add_argument('--plot', action='store_true', help='Plot local haplotype trajectories') args = parser.parse_args() pnames = args.patients roi = args.roi VERBOSE = args.verbose use_plot = args.plot patients = load_patients() if pnames is not None: patients = patients.loc[pnames] for pname, patient in patients.iterrows(): patient = Patient(patient) patient.discard_nonsequenced_samples() if VERBOSE >= 1: print patient.name, roi if VERBOSE >= 2: print 'Get haplotype trajectories' try: (ht, indt, htseqs) = patient.get_region_count_trajectories(roi[0], VERBOSE=VERBOSE) except IOError: (ht, indt, htseqs) = patient.get_local_haplotype_count_trajectories(roi, VERBOSE=VERBOSE) if VERBOSE >= 2:
help='Minimal frequency to keep the haplotype') args = parser.parse_args() pnames = args.patients roi = args.roi VERBOSE = args.verbose maxreads = args.maxreads use_plot = args.plot freqmin = args.freqmin patients = load_patients() if pnames is not None: patients = patients.loc[pnames] for pname, patient in patients.iterrows(): patient = Patient(patient) if VERBOSE >= 1: print pname if os.path.isfile(patient.get_local_tree_filename(roi[0], format='json')): if VERBOSE >= 2: print 'Get tree' region = roi[0] tree = patient.get_local_tree(region) elif os.path.isfile(patient.get_local_tree_filename(' '.join(map(str, roi)), format='json')): if VERBOSE >= 2: print 'Get tree' region = ' '.join(map(str, roi)) tree = patient.get_local_tree(region)
'--force', action='store_true', help='Go ahead even if annotations differ from existing sequence') args = parser.parse_args() VERBOSE = args.verbose pnames = args.patients use_save = args.save use_force = args.force patients = load_patients() if pnames is not None: patients = patients.loc[patients.index.isin(pnames)] for pname, patient in patients.iterrows(): patient = Patient(patient) if VERBOSE: print 'Patient:', patient.name fn = patient.get_reference_filename('genomewide') refseq = SeqIO.read(fn, 'fasta', alphabet=ambiguous_dna) fragment_edges = get_edges_fragments(patient, VERBOSE=VERBOSE) annotate_sequence(refseq, VERBOSE=VERBOSE, additional_edges={'fragment': fragment_edges}) if VERBOSE >= 1: for feature in refseq.features: if feature.id[0] == 'F':
af_bd = [0.05, 0.95] data = {} patients = load_patients() if pnames is not None: patients = patients.loc[pnames] if not fragments: fragments = ['F'+str(i) for i in xrange(1, 7)] if VERBOSE >= 2: print 'fragments', fragments for pname, patient in patients.iterrows(): print pname patient = Patient(patient) patient.discard_nonsequenced_samples() t_bds = [] t_loss = [] t_fixs = [] n_staypolys = [] for fragment in fragments: if VERBOSE >= 1: print fragment # Collect allele counts from patient samples, and return only positive hits # sns contains sample names and PCR types (aft, ind) = patient.get_allele_frequency_trajectories(fragment, cov_min=cov_min, depth_min=depth_min,
n_binsx = 8 binsy = [0., 0.002, 0.005, 0.009, 0.013, 0.025, 0.04136464, 0.08089993, 0.12077255, 0.16115779, 0.2022444 , 0.24424043, 0.28738044, 0.33193475, 0.37822187, 0.42662549, 0.4776187 , 0.53179937, 0.58994409, 0.65309361, 0.72269518, 0.80085467, 0.89081905, 0.95, 0.975, 0.987, 0.991, 0.994, 0.998, 1.] pp = Propagator(n_binsx, binsy=binsy, use_logit=use_logit) hist = pp.histogram for pname, patient in patients.iterrows(): patient = Patient(patient) samplenames = patient.samples.index if not fragments: fragments = ['F'+str(i) for i in xrange(1, 7)] if VERBOSE >= 2: print 'fragments', fragments # Iterate over samples and fragments for fragment in fragments: if VERBOSE >= 1: print pname, fragment aft, ind = patient.get_allele_frequency_trajectories(fragment, cov_min=depth_min)
fragments = args.fragments VERBOSE = args.verbose plot = args.plot use_PCR1 = args.PCR1 patients = load_patients() if pnames != ['all']: patients = patients.iloc[patients.index.isin(pnames)] if not fragments: fragments = ['F'+str(i) for i in xrange(1, 7)] if VERBOSE >= 2: print 'fragments', fragments for pname, patient in patients.iterrows(): patient = Patient(patient) patient.discard_nonsequenced_samples() for fragment in fragments: if VERBOSE >= 1: print pname, fragment covt, ind = patient.get_coverage_trajectories(fragment, use_PCR1=use_PCR1) samples = patient.samples.iloc[ind] times = patient.times[ind] ntemplates = samples['n templates'] if plot is not None: import matplotlib.pyplot as plt
action='store_true', help='Add mouse events to the plot') args = parser.parse_args() pnames = args.patients roi = args.roi VERBOSE = args.verbose use_plot = args.plot use_interactive = args.interactive patients = load_patients() if pnames is not None: patients = patients.loc[pnames] for pname, patient in patients.iterrows(): patient = Patient(patient) (fragment, start, end) = patient.get_fragmented_roi(roi, VERBOSE=VERBOSE) aft, ind = patient.get_allele_frequency_trajectories(fragment) aft = aft[:, :, start:end] # TODO: also calculate the logos ## Get only some time points #i = np.arange(len(ind))[::len(ind) // 2] #aft = aft[i] #ind = ind[i] times = patient.times[ind]
VERBOSE = args.verbose pnames = args.patients use_force = args.force fragments = args.fragments if not fragments: fragments = ['F'+str(i) for i in xrange(1, 7)] if VERBOSE >= 3: print 'fragments', fragments patients = load_patients() if pnames is not None: patients = patients.loc[pnames] for pname, patient in patients.iterrows(): patient = Patient(patient) if VERBOSE >= 1: print 'Patient:', patient.name patient.discard_nonsequenced_samples() for fragment in fragments: if VERBOSE >= 1: print fragment # Check whether a reference exists at all ref_fn = patient.get_reference_filename(fragment) if not os.path.isfile(ref_fn): print 'ERROR: reference for fragment', fragment, 'not found!' continue
bins = np.exp(tbins)/(1+np.exp(tbins)) binsc = np.sqrt(bins[1:] * bins[:-1]) binw = np.diff(bins) hist = np.zeros_like(binsc) if not fragments: fragments = ['F'+str(i) for i in xrange(1, 7)] if VERBOSE >= 2: print 'fragments', fragments patients = load_patients() if pnames is not None: patients = patients.loc[pnames] for pname, patient in patients.iterrows(): patient = Patient(patient) for fragment in fragments: if VERBOSE >= 1: print patient.name, fragment if VERBOSE >= 2: print 'Get initial allele frequencies' af0 = patient.get_initial_allele_frequencies(fragment, cov_min=depth_min) if VERBOSE >= 2: print 'Get allele frequencies' aft, ind = patient.get_allele_frequency_trajectories(fragment, depth_min=depth_min) if VERBOSE >= 2:
help='Regions to analyze (e.g. V3 F6)') parser.add_argument('--verbose', type=int, default=0, help='Verbosity level [0-4]') args = parser.parse_args() pnames = args.patients regions = args.regions VERBOSE = args.verbose use_plot = args.plot patients = load_patients() if pnames: patients = patients.loc[pnames] for pname, patient in patients.iterrows(): patient = Patient(patient) patient.discard_nonsequenced_samples() for region in regions: if VERBOSE >= 1: print pname, region ali = patient.get_consensi_alignment(region) tree = patient.get_consensi_tree(region, format='json') if use_plot: fig, ax = plt.subplots(figsize=(15, 12)) Phylo.draw(tree, do_show=False, axes=ax) ax.set_title(pname+', '+region) x_max = max(tree.depths().itervalues())
patients = patients.loc[pnames] # FIXME: the initial ref of p7 is mislabelled and a mess else: patients = patients.loc[patients.code != "p7"] # Prepare output structures n_binsx = 5 binsy = [0.0, 0.002, 0.01, 0.025, 0.12, 0.33, 0.67, 0.88, 0.975, 0.99, 0.998, 1.0] props = { (gene, synkey): Propagator(n_binsx, binsy=binsy, use_logit=use_logit) for gene in genes for synkey in ("syn", "nonsyn") } for pname, patient in patients.iterrows(): patient = Patient(patient) samplenames = patient.samples.index refseq = patient.get_reference("genomewide", format="gb") for gene in genes: if VERBOSE >= 1: print pname, gene, # Get the right fragment(s) # FIXME: do better than this ;-) frags = {"pol": ["F2", "F3"], "gag": ["F1"], "env": ["F5", "F6"]} fragments = frags[gene] if VERBOSE >= 1:
def get_divergence(aft):
    '''Get divergence from allele frequency trajectories

    Parameters:
       aft: 3-dimensional array of allele frequencies; axis 1 is indexed by
            allele and axis 2 by position

    Returns:
       one minus the frequency of the initial consensus allele, averaged
       over axis 1 of the per-position selection (i.e. over positions)
    '''
    anc_ind = Patient.get_initial_consensus_noinsertions(aft, return_ind=True)

    # Pick, at every position, the frequency of the consensus allele
    positions = np.arange(aft.shape[2])
    anc_freq = aft[:, anc_ind, positions]

    return 1 - anc_freq.mean(axis=1)
patients_fn = dst_fn+'patients/' ref_fn = dst_fn+'reference/' print 'Make root folders' mkdirs(patients_fn) mkdirs(ref_fn) print 'Reference sequences' copy_reference(ref_fn) patients = load_patients() for pname, patient in patients.iterrows(): print pname patient = Patient(patient) print 'Make folder' pat_fn = patients_fn+pname+os.sep mkdirs(pat_fn) print 'Mapping reference' copy_initial_reference(patient, pat_fn) print 'Coordinate maps' copy_folder(patient, pat_fn, 'coordinate_maps') print 'Alignments' copy_folder(patient, pat_fn, 'alignments')
action='store_true', help='Plot local haplotype trajectories') args = parser.parse_args() pnames = args.patients roi = args.roi VERBOSE = args.verbose use_plot = args.plot patients = load_patients() if pnames is not None: patients = patients.loc[pnames] for pname, patient in patients.iterrows(): patient = Patient(patient) patient.discard_nonsequenced_samples() if VERBOSE >= 1: print patient.name, roi if VERBOSE >= 2: print 'Get haplotype trajectories' try: (ht, indt, htseqs) = patient.get_region_count_trajectories(roi[0], VERBOSE=VERBOSE) except IOError: (ht, indt, htseqs) = patient.get_local_haplotype_count_trajectories( roi, VERBOSE=VERBOSE)
args = parser.parse_args() VERBOSE = args.verbose fragments = args.fragments patients = load_patients() for fragment in fragments: if VERBOSE >= 1: print fragment refs = [] for pname, patient in patients.iterrows(): if VERBOSE >= 2: print pname patient = Patient(patient) refs.append(patient.get_reference(fragment)) ali = align_muscle(*refs, sort=True) # Check whether all references are complete (using the longest primers) if VERBOSE >= 2: print 'Check alignment' alim = np.array(ali) if (alim[:, :4] == '-').any(): raise ValueError('Gaps at the beginning of fragment found') elif (alim[:, -4:] == '-').any(): raise ValueError('Gaps at the end of fragment found') if VERBOSE >= 2: print 'Save to file'
VERBOSE = args.verbose pnames = args.patients use_force = args.force fragments = args.fragments if not fragments: fragments = ['F' + str(i) for i in xrange(1, 7)] if VERBOSE >= 3: print 'fragments', fragments patients = load_patients() if pnames is not None: patients = patients.loc[pnames] for pname, patient in patients.iterrows(): patient = Patient(patient) if VERBOSE >= 1: print 'Patient:', patient.name patient.discard_nonsequenced_samples() for fragment in fragments: if VERBOSE >= 1: print fragment # Check whether a reference exists at all ref_fn = patient.get_reference_filename(fragment) if not os.path.isfile(ref_fn): print 'ERROR: reference for fragment', fragment, 'not found!' continue
VERBOSE = args.verbose plot = args.plot block_length = args.block_length use_coverage = args.include_cov patients = load_patients() if pnames is not None: patients = patients.iloc[patients.index.isin(pnames)] if not fragments: fragments = ['F'+str(i) for i in xrange(1, 7)] if VERBOSE >= 2: print 'fragments', fragments for pname, patient in patients.iterrows(): patient = Patient(patient) patient.discard_nonsequenced_samples() for ifr, fragment in enumerate(fragments): if VERBOSE >= 1: print pname, fragment dg, ind, block_length, L = \ patient.get_divergence_trajectory_local(fragment, block_length=block_length) ds, ind, block_length, L = \ patient.get_diversity_trajectory_local(fragment, block_length=block_length) patient.dg = dg patient.ds = ds patient.ind = ind
help='Save annotated reference to file') parser.add_argument('--force', action='store_true', help='Go ahead even if annotations differ from existing sequence') args = parser.parse_args() VERBOSE = args.verbose pnames = args.patients use_save = args.save use_force = args.force patients = load_patients() if pnames is not None: patients = patients.loc[patients.index.isin(pnames)] for pname, patient in patients.iterrows(): patient = Patient(patient) if VERBOSE: print 'Patient:', patient.name fn = patient.get_reference_filename('genomewide') refseq = SeqIO.read(fn, 'fasta', alphabet=ambiguous_dna) fragment_edges = get_edges_fragments(patient, VERBOSE=VERBOSE) annotate_sequence(refseq, VERBOSE=VERBOSE, additional_edges={'fragment': fragment_edges}) if VERBOSE >= 1: for feature in refseq.features: if feature.id[0] == 'F': continue
pnames = args.patients regions = args.regions VERBOSE = args.verbose save_to_file = args.save patients = load_patients() if pnames is not None: patients = patients.loc[pnames] if VERBOSE >= 3: print 'patients', patients.index if not len(patients): raise ValueError('No patients found!') maps_coord = defaultdict(dict) for pname, patient in patients.iterrows(): patient = Patient(patient) # Make maps for all annotations if not explicit if regions is None: patseqann = patient.get_reference('genomewide', format='gb') regionspat = map(attrgetter('id'), patseqann.features) + ['genomewide'] else: regionspat = regions for region in regionspat: if VERBOSE >= 1: print pname, region coomap = patient.get_map_coordinates_reference(region, refname=refname)
help='Minimal frequency to keep the haplotype') args = parser.parse_args() pnames = args.patients roi = args.roi VERBOSE = args.verbose maxreads = args.maxreads use_plot = args.plot freqmin = args.freqmin patients = load_patients() if pnames is not None: patients = patients.loc[pnames] for pname, patient in patients.iterrows(): patient = Patient(patient) if VERBOSE >= 1: print pname if os.path.isfile( patient.get_local_tree_filename(roi[0], format='json')): if VERBOSE >= 2: print 'Get tree' region = roi[0] tree = patient.get_local_tree(region) elif os.path.isfile( patient.get_local_tree_filename(' '.join(map(str, roi)), format='json')): if VERBOSE >= 2:
help='Regions to analyze (e.g. V3 F6)') parser.add_argument('--verbose', type=int, default=0, help='Verbosity level [0-4]') args = parser.parse_args() pnames = args.patients regions = args.regions VERBOSE = args.verbose patients = load_patients() if pnames: patients = patients.loc[pnames] alis = {} for pname, patient in patients.iterrows(): patient = Patient(patient) patient.discard_nonsequenced_samples() # Guess regions if not specified if regions is None: refseq_gw = patient.get_reference('genomewide', 'gb') regionspat = map(attrgetter('id'), refseq_gw.features) + ['genomewide'] else: regionspat = regions for region in regionspat: if VERBOSE >= 1: print pname, region ali = patient.get_consensi_alignment(region)
fragments = args.fragments VERBOSE = args.verbose plot = args.plot use_PCR1 = args.PCR1 patients = load_patients() if pnames != ['all']: patients = patients.iloc[patients.index.isin(pnames)] if not fragments: fragments = ['F' + str(i) for i in xrange(1, 7)] if VERBOSE >= 2: print 'fragments', fragments for pname, patient in patients.iterrows(): patient = Patient(patient) patient.discard_nonsequenced_samples() for fragment in fragments: if VERBOSE >= 1: print pname, fragment covt, ind = patient.get_coverage_trajectories(fragment, use_PCR1=use_PCR1) samples = patient.samples.iloc[ind] times = patient.times[ind] ntemplates = samples['n templates'] if plot is not None: import matplotlib.pyplot as plt
parser.add_argument('--plot', action='store_true', help='Plot local haplotype trajectories') args = parser.parse_args() pnames = args.patients regions = args.regions VERBOSE = args.verbose use_plot = args.plot patients = load_patients() if pnames is not None: patients = patients.loc[pnames] data = [] for pname, patient in patients.iterrows(): patient = Patient(patient) for region in regions: if VERBOSE >= 1: print patient.name, region if VERBOSE >= 2: print 'Get haplotype counts' (hct, ind, seqs) = patient.get_region_count_trajectories(region, VERBOSE=VERBOSE) times = patient.times[ind] if VERBOSE >= 2: print 'Align sequences' ali = align_muscle(*seqs, sort=True)
use_save = args.save use_recover = args.recover patients = load_patients() if pnames is not None: patients = patients.loc[pnames] if not fragments: fragments = ['F'+str(i) for i in xrange(1, 7)] if VERBOSE >= 2: print 'fragments', fragments if use_recover: for pname, patient in patients.iterrows(): print pname patient = Patient(patient) patient.discard_nonsequenced_samples() for fragment in fragments: if VERBOSE >= 1: print fragment fn = patient.get_reference_filename(fragment) fn_old = fn.replace('.fasta', '_old.fasta') if not os.path.isfile(fn_old): print 'Old reference not found, skipping' continue shutil.copy(fn_old, fn) os.chmod(fn, (stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH | \ stat.S_IWUSR)) os.chmod(fn_old, (stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH | \
VERBOSE = args.verbose plot = args.plot block_length = args.block_length use_coverage = args.include_cov patients = load_patients() if pnames is not None: patients = patients.iloc[patients.index.isin(pnames)] if not fragments: fragments = ['F' + str(i) for i in xrange(1, 7)] if VERBOSE >= 2: print 'fragments', fragments for pname, patient in patients.iterrows(): patient = Patient(patient) patient.discard_nonsequenced_samples() for ifr, fragment in enumerate(fragments): if VERBOSE >= 1: print pname, fragment dg, ind, block_length, L = \ patient.get_divergence_trajectory_local(fragment, block_length=block_length) ds, ind, block_length, L = \ patient.get_diversity_trajectory_local(fragment, block_length=block_length) patient.dg = dg patient.ds = ds patient.ind = ind
parser.add_argument('--patients', nargs='+', help='Patient to analyze') parser.add_argument('--force', action='store_true', help='Do not stop for errors') args = parser.parse_args() VERBOSE = args.verbose pnames = args.patients use_force = args.force patients = load_patients() if pnames is not None: patients = patients.loc[pnames] for pname, patient in patients.iterrows(): patient = Patient(patient) if VERBOSE >= 1: print 'Patient:', patient.name ref = patient.get_reference('genomewide', 'gb') for fea in ref.features: if fea.type == 'protein': if VERBOSE >= 2: print 'Checking', fea.id try: check_protein(fea, ref, VERBOSE=VERBOSE) except ValueError: if use_force: print 'ERROR!'
Srefind[ind] = i Srefind[Srefind < 0] = len(S_bins) - 2 if not fragments: fragments = ['F'+str(i) for i in xrange(1, 7)] if VERBOSE >= 2: print 'fragments', fragments patients = load_patients() if pnames is not None: patients = patients.loc[pnames] if VERBOSE >= 1: print 'Analyze patients' for pname, patient in patients.iterrows(): patient = Patient(patient) for fragment in fragments: if VERBOSE >= 1: print patient.name, fragment mapco = patient.get_map_coordinates_reference(fragment, refname=refname) if VERBOSE >= 2: print 'Get initial allele frequencies' af0 = patient.get_initial_allele_frequencies(fragment, cov_min=depth_min) if VERBOSE >= 2: print 'Get allele frequencies' aft, ind = patient.get_allele_frequency_trajectories(fragment, depth_min=depth_min)
args = parser.parse_args() pnames = args.patients regions = args.regions VERBOSE = args.verbose plot = args.plot patients = load_patients() if pnames is not None: patients = patients.loc[pnames] pnames = patients.index.tolist() data = [] for pname, patient in patients.iterrows(): patient = Patient(patient) patient.discard_nonsequenced_samples() for ifr, region in enumerate(regions): if VERBOSE >= 1: print pname, region try: dg, ind = patient.get_divergence(region, cov_min=10) except ValueError: continue times = patient.times[ind] data.append({'pname': pname, 'region': region, 'dg': dg, 't': times}) if VERBOSE >= 1:
help='Plot the allele frequency trajectories') args = parser.parse_args() pnames = args.patients regions = args.regions VERBOSE = args.verbose plot = args.plot patients = load_patients() if pnames is not None: patients = patients.loc[pnames] data = [] for pname, patient in patients.iterrows(): patient = Patient(patient) patient.discard_nonsequenced_samples() for ifr, region in enumerate(regions): if VERBOSE >= 1: print pname, region aft, ind = patient.get_allele_frequency_trajectories(region, cov_min=10) times = patient.times[ind] dg = get_divergence(aft) ds = get_diversity(aft) data.append({'pname': pname, 'region': region, 'dg': dg, 'ds': ds, 't': times})
# FIXME: the initial ref of p7 is mislabelled and a mess else: patients = patients.loc[patients.code != 'p7'] # Prepare output structures n_binsx = 5 binsy = [ 0., 0.002, 0.01, 0.025, 0.12, 0.33, 0.67, 0.88, 0.975, 0.99, 0.998, 1. ] props = {(gene, synkey): Propagator(n_binsx, binsy=binsy, use_logit=use_logit) for gene in genes for synkey in ('syn', 'nonsyn')} for pname, patient in patients.iterrows(): patient = Patient(patient) samplenames = patient.samples.index refseq = patient.get_reference('genomewide', format='gb') for gene in genes: if VERBOSE >= 1: print pname, gene, # Get the right fragment(s) # FIXME: do better than this ;-) frags = {'pol': ['F2', 'F3'], 'gag': ['F1'], 'env': ['F5', 'F6']} fragments = frags[gene] if VERBOSE >= 1:
action='store_true', help='Plot phylogenetic tree. Requires --save and --tree') args = parser.parse_args() pnames = args.patients regions = args.regions VERBOSE = args.verbose use_save = args.save use_plot = args.plot patients = load_patients() if pnames != ['all']: patients = patients.iloc[patients.index.isin(pnames)] for pname, patient in patients.iterrows(): patient = Patient(patient) patient.discard_nonsequenced_samples() if regions is None: refseq_gw = patient.get_reference('genomewide', 'gb') regionspat = map(attrgetter('id'), refseq_gw.features) + ['genomewide'] else: regionspat = regions for region in regionspat: if VERBOSE >= 1: print pname, region if VERBOSE == 1: print ''
# Globals # Functions # Script if __name__ == '__main__': patients = load_patients() for pname, patient in patients.iterrows(): patient = Patient(patient) print patient.code, patient.name # Allele count trajectories (inse, ind) = patient.get_insertion_trajectories('genomewide') if not ind: continue inse = pd.Series(inse, name='insertion counts') inse.index.names = ['DSI', 'position', 'insertion'] # Write to file fn_out = get_fn_out_traj(patient.code, 'genomewide') mkdirs(os.path.dirname(fn_out)) inse.to_pickle(fn_out) # Sample by sample
patients = load_patients() if pnames is not None: patients = patients.loc[pnames] data = [] for pname, patient in patients.iterrows(): if VERBOSE >= 1: print patient.code, start, end if submit: fork_self(patient.code, width, gap, start, end, VERBOSE=VERBOSE, freqmin=freqmin, countmin=countmin) continue patient = Patient(patient) ref = patient.get_reference('genomewide') L = len(ref) win_start = start while win_start + width - gap < min(L, end): win_end = min(win_start + width, end, L) if VERBOSE >= 1: print patient.code, win_start, win_end if VERBOSE >= 2: print 'Get region haplotypes' try: datum = patient.get_local_haplotype_count_trajectories(\ 'genomewide',
parser.add_argument("--plot", action="store_true", help="Plot the logos") parser.add_argument("--interactive", action="store_true", help="Add mouse events to the plot") args = parser.parse_args() pnames = args.patients roi = args.roi VERBOSE = args.verbose use_plot = args.plot use_interactive = args.interactive patients = load_patients() if pnames is not None: patients = patients.loc[pnames] for pname, patient in patients.iterrows(): patient = Patient(patient) (fragment, start, end) = patient.get_fragmented_roi(roi, VERBOSE=VERBOSE) aft, ind = patient.get_allele_frequency_trajectories(fragment) aft = aft[:, :, start:end] # TODO: also calculate the logos ## Get only some time points # i = np.arange(len(ind))[::len(ind) // 2] # aft = aft[i] # ind = ind[i] times = patient.times[ind] if use_plot: