def loadK31(reg, filepath, fromHIV=False):
    '''
    Load allele frequency data for the additional (K31) patients.

    Input arguments:
    reg: name of genetic region (gag or pol)
    filepath: path prefix of the directory where the frequency data are
        stored/downloaded. It is concatenated directly with the file names,
        so it must end with a path separator.
    fromHIV: if True, fetch the raw data through hivwholeseq and store them
        under filepath; if False, read the previously stored data.

    Returns:
    dict mapping '<patient code>_<sample code>' to a tuple
    (days between the sample date and the best infection date estimate,
    masked array of allele frequencies).
    '''
    data = {}
    info_name = filepath + 'K31_info_{}.txt'.format(reg)

    if fromHIV:
        hiv_path = "/scicore/home/neher/neher/HIV/hivwholeseq"
        # Avoid growing sys.path with duplicates on repeated calls.
        if hiv_path not in sys.path:
            sys.path.append(hiv_path)
        from hivwholeseq.patients.patients import load_patients, Patient

        pats = load_patients(csv=True)
        fmt = "%d/%m/%Y"
        # The context manager closes the info file even if an unexpected
        # error (e.g. KeyboardInterrupt) aborts the loop.
        with open(info_name, 'w') as fhandle:
            for pcode, pat in pats.iterrows():
                try:
                    EDI = datetime.strptime(pat["infect date best"], fmt)
                    P = Patient(pat)
                    aft = P.get_allele_frequency_trajectories(
                        reg, cov_min=500)[0]
                except Exception as err:
                    # Best effort: a patient that cannot be processed is
                    # reported and skipped, the others are still stored.
                    print("skipping patient ", pcode, repr(err))
                    continue

                for si, (scode, sample) in enumerate(P.samples.iterrows()):
                    try:
                        date = datetime.strptime(sample["date"], fmt)
                        af = aft[si]
                        TI = date.toordinal() - EDI.toordinal()
                        stem = filepath + '{}_{}_{}'.format(pcode, scode, reg)
                        np.save(stem + '_data.npy', af.data)
                        np.save(stem + '_mask.npy', af.mask)
                        # Record the sample in the index only after both
                        # arrays were written, so that every listed sample
                        # can be loaded again with fromHIV=False.
                        fhandle.write(
                            '{}\t{}\t{}\n'.format(pcode, scode, TI))
                        data['{}_{}'.format(pcode, scode)] = (TI, af)
                        print(pcode, scode, "WORKED!!!")
                    except Exception as err:
                        print(scode, "didn't work", repr(err))
    else:
        with open(info_name, 'r') as fhandle:
            for line in fhandle:
                words = line.split()
                if not words:
                    # Tolerate blank lines (e.g. a trailing newline added
                    # by an editor) instead of failing on an empty name.
                    continue
                pat_name = '_'.join(words[:2])
                stem = filepath + '{}_{}'.format(pat_name, reg)
                af_data = np.load(stem + '_data.npy')
                af_mask = np.load(stem + '_mask.npy')
                af = np.ma.masked_array(af_data, mask=af_mask)
                data[pat_name] = (int(words[2]), af)
    return data
help='Save alignment to file') args = parser.parse_args() pnames = args.patients width = args.width gap = args.gap start = args.start end = args.end VERBOSE = args.verbose freqmin = args.freqmin countmin = args.countmin submit = args.submit use_plot = args.plot use_save = args.save patients = load_patients() if pnames is not None: patients = patients.loc[pnames] data = [] for pname, patient in patients.iterrows(): if VERBOSE >= 1: print patient.code, start, end if submit: fork_self(patient.code, width, gap, start, end, VERBOSE=VERBOSE, freqmin=freqmin, countmin=countmin) continue patient = Patient(patient) ref = patient.get_reference('genomewide')
parser.add_argument('--patients', nargs='+', help='Patients to analyze') parser.add_argument('--regions', nargs='+', required=True, help='Regions to analyze (e.g. F1 V3)') parser.add_argument('--verbose', type=int, default=0, help='Verbosity level [0-4]') parser.add_argument('--plot', nargs='?', default=None, const='2D', help='Plot the allele frequency trajectories') args = parser.parse_args() pnames = args.patients regions = args.regions VERBOSE = args.verbose plot = args.plot patients = load_patients() if pnames is not None: patients = patients.loc[pnames] pnames = patients.index.tolist() data = [] for pname, patient in patients.iterrows(): patient = Patient(patient) patient.discard_nonsequenced_samples() for ifr, region in enumerate(regions): if VERBOSE >= 1: print pname, region try: