def get_baseline_rmss(mol, niters=1e4, baselines=baselines, remake_all=False):
    """Iterate through a range of baseline cutoffs and compare the results.

    Args:
        mol (str): the name of the core data file that this is pulling.
        baselines (list of ints): the baselines to check over.
    """
    # Set up the symlink
    run_dir = './baselines/baseline_' + mol + str(int(niters)) + '/'
    scratch_dir = '/scratch/jonas/' + run_dir
    orig_vis = './data/' + mol + '/' + mol
    new_vis = run_dir + mol

    if remake_all is True or already_exists(new_vis) is False:
        remove(scratch_dir)
        # :-1 because a symlink with a deleted root isn't a directory anymore
        remove(run_dir[:-1])

        sp.call(['mkdir {}'.format(scratch_dir)], shell=True)
        sp.call(['ln', '-s', scratch_dir, './baselines/'])
        sp.call(['cp', '-r', '{}.vis'.format(orig_vis), '{}/'.format(run_dir)])
        print("Made symlinked directory, copied core .vis over.\n\n")

    data_list = []
    for b in baselines:
        print('\n\n\n NEW ITERATION\nBaseline: ', b, '\n')
        name = run_dir + mol + str(b) if b != 0 else run_dir + mol

        # Check if we've already icr'ed this one.
        if already_exists(name + '.cm') is True:
            print("File already exists; going straight to imstat")
            mean, rms = imstat(name, ext='.cm')
        else:
            icr(new_vis, mol=mol, min_baseline=b, niters=niters)
            mean, rms = imstat(name, ext='.cm')

        step_output = {'RMS': rms, 'Mean': mean, 'Baseline': b}
        data_list.append(step_output)
        print(step_output)

    data_pd = pd.DataFrame(data_list)
    return data_pd
def get_baseline_rmss(mol, niters=1e4, baselines=baselines, remake_all=False):
    """Iterate through a range of baseline cutoffs and compare the results.

    Args:
        mol (str): the name of the core data file that this is pulling.
        baselines (list of ints): the baselines to check over.
    """
    # Set up the working directory
    run_dir = './data/' + mol + '/baseline_testing/'
    orig_vis = './data/' + mol + '/' + mol
    new_vis = run_dir + mol

    if remake_all is True or already_exists(new_vis) is False:
        sp.call(['mkdir {}'.format(run_dir)], shell=True)
        sp.call(['cp', '-r', '{}.vis'.format(orig_vis), '{}'.format(run_dir)])

    data_list = []
    for b in baselines:
        print('\n\n\n NEW ITERATION\nBaseline: ', b, '\n')
        name = run_dir + mol + str(b) if b != 0 else run_dir + mol

        # Check if we've already icr'ed this one.
        if already_exists(name + '.cm') is True:
            print("File already exists; going straight to imstat")
            mean, rms = imstat(name, ext='.cm')
        else:
            icr(new_vis, mol=mol, min_baseline=b, niters=niters)
            mean, rms = imstat(name, ext='.cm')

        step_output = {'RMS': rms, 'Mean': mean, 'Baseline': b}
        data_list.append(step_output)
        print(step_output)

    data_pd = pd.DataFrame(data_list)
    return data_pd
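# --- Usage sketch (not part of the pipeline) --------------------------------
# The DataFrame returned above has 'Baseline', 'RMS', and 'Mean' columns, so a
# short driver like this can compare cutoffs. The candidate baseline list, the
# matplotlib plotting, and the plot_baseline_rmss name are illustrative
# assumptions, not existing pipeline pieces.
def plot_baseline_rmss(mol, candidate_baselines=(0, 10, 20, 30, 50, 70, 100)):
    """Sketch: run get_baseline_rmss over a few cutoffs and plot the noise."""
    import matplotlib.pyplot as plt

    df = get_baseline_rmss(mol, niters=1e4, baselines=list(candidate_baselines))

    # Lower RMS at a roughly constant mean suggests a better cutoff.
    best = df.loc[df['RMS'].idxmin()]
    print("Lowest-noise cutoff:", best['Baseline'], "with RMS", best['RMS'])

    plt.plot(df['Baseline'], df['RMS'], 'o-')
    plt.xlabel('Minimum baseline cutoff')
    plt.ylabel('Image RMS')
    plt.savefig('baseline_rms_comparison_{}.png'.format(mol))
    return best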
def run_full_pipeline():
    """Run the whole thing.

    Note that this no longer produces both cut and uncut output; since the
    cut happens much earlier, it now only produces one or the other
    (depending on whether or not cut_baselines is true.)

    The Process:
        - casa_sequence():
            - cvel the cont-sub'ed dataset from jonas/raw_data to here.
            - split out the 50 channels around restfreq
            - convert that .ms to a .uvf
        - var_vis(): pull in that .uvf, add variances, resulting in another uvf
        - convert that to a vis
        - icr that vis to get a cm
        - cm to fits; now we have mol.{uvf, vis, fits, cm}
        - delete the clutter files: _split, _cvel, _exportuvfits, bm, cl, mp
    """
    t0 = time.time()
    mol = input('Which line (HCN, HCO, CS, or CO)?\n').lower()
    cut = input('Cut baselines for better signal (y/n)?\n').lower()
    cut_baselines = True if cut == 'y' else False
    remake = input('Remake everything (y/n)?\n')
    remake_all = True if remake.lower() == 'y' else False

    # Paths to the data
    jonas = '/Volumes/disks/jonas/'
    raw_data_path = jonas + 'raw_data/'
    final_data_path = jonas + 'modeling/data/' + mol + '/'
    name = mol
    if cut_baselines is True:
        name += '-short' + str(lines[mol]['baseline_cutoff'])

    # Establish a string for the log file to be made at the end
    log = 'Files created on ' + today + '\n\n'

    if remake_all is True:
        # This doesn't work yet.
        print("Remaking everything; emptied line dir and remaking.")
        remove(final_data_path + '*')
        log += "Full remake occurred; all files are fresh.\n\n"
    else:
        log += "Some files already existed and so were not remade.\n"
        log += "Careful for inconsistencies.\n\n"

    print("Now processing data....")
    casa_sequence(mol, raw_data_path, final_data_path + name, cut_baselines)

    print("Running varvis....\n\n")
    if already_exists(final_data_path + name + '.uvf') is False:
        # Note that var_vis takes in mol_exportuvfits, returns mol.uvf
        var_vis(final_data_path + name)
    print("Finished varvis; converting uvf to vis now....\n\n")

    # Note that this is different than lines[mol][chan0_freq] bc
    # it's dealing with the chopped vis set
    restfreq = lines[mol]['restfreq']
    f = fits.getheader(final_data_path + name + '.uvf')
    # chan0_freq = (f['CRVAL4'] - (f['CRPIX4'] - 1) * f['CDELT4']) * 1e-9
    # Using the same math as in lines 130-135
    # chan0_vel = c * (chan0_freq - restfreq) / restfreq

    data, header = fits.getdata(final_data_path + name + '.uvf', header=True)
    header['RESTFREQ'] = restfreq * 1e9
    fits.writeto(final_data_path + name + '.uvf', data, header, overwrite=True)

    if already_exists(final_data_path + name + '.vis') is False:
        sp.Popen(
            ['fits', 'op=uvin',
             'in={}.uvf'.format(name),
             # DON'T PUT THIS BACK IN
             # Or if you do, flip the sign of chan0_vel to pos
             # 'velocity=lsr,{},1'.format(chan0_vel),
             'out={}.vis'.format(name)],
            cwd=final_data_path).wait()

    print("Convolving data to get image, converting output to .fits\n\n")
    if already_exists(final_data_path + name + '.cm') is False:
        icr(final_data_path + name, mol=mol)

    print("Deleting the junk process files...\n\n")
    fpath = final_data_path + name
    files_to_remove = [fpath + '.bm', fpath + '_split.*',
                       fpath + '.cl', fpath + '_cvel.*',
                       fpath + '.mp', fpath + '_exportuvfits.*',
                       'casa*.log', '*.last']
    remove(files_to_remove)

    tf = time.time()
    t_total = (tf - t0) / 60
    log += '\nThis processing took ' + str(t_total) + ' minutes.'
    with open(final_data_path + 'file_log.txt', 'w') as f:
        f.write(log)

    print("All done! This processing took " + str(t_total) + " minutes.")
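# --- Sanity-check sketch (assumption, not part of the pipeline) -------------
# run_full_pipeline() patches RESTFREQ into the .uvf header in Hz while
# lines[mol]['restfreq'] is stored in GHz, which is easy to get wrong. This
# hypothetical helper just re-reads the header and repeats the commented-out
# chan0 math above; check_restfreq and the c default (km/s) are illustrative
# names, not existing pipeline pieces.
def check_restfreq(uvf_path, mol, c=2.99792458e5):
    """Sketch: verify the RESTFREQ patch and report the channel-0 velocity."""
    hdr = fits.getheader(uvf_path)
    restfreq = lines[mol]['restfreq']  # GHz, from the module-level dict
    if abs(hdr['RESTFREQ'] - restfreq * 1e9) > 1:
        print("Warning: header RESTFREQ doesn't match lines[mol]['restfreq'].")

    # Frequency of the first channel (GHz), then its velocity offset (km/s),
    # following the same sign convention as the commented-out lines above.
    chan0_freq = (hdr['CRVAL4'] - (hdr['CRPIX4'] - 1) * hdr['CDELT4']) * 1e-9
    chan0_vel = c * (chan0_freq - restfreq) / restfreq
    print("chan0 freq: {:.4f} GHz; chan0 vel: {:.2f} km/s".format(chan0_freq,
                                                                  chan0_vel))
    return chan0_vel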
def fullRun(diskAParams, diskBParams, mol,
            use_a_previous_result=False, cut_central_chans=False):
    """Run it all.

    diskXParams are fed in from full_run.py, where the parameter
    selections are made.
    """
    t0 = time.time()

    # Calculate the number of steps and consequent runtime
    na = 1
    for a in diskAParams:
        na *= len(diskAParams[a])
    nb = 1
    for b in diskBParams:
        nb *= len(diskBParams[b])

    n, dt = na + nb, 2.1
    t = n * dt
    if t <= 60:
        t = str(round(n * dt, 2)) + " minutes."
    elif 60 < t <= 1440:
        t = str(round(n * dt / 60, 2)) + " hours."
    else:
        t = str(round(n * dt / 1440, 2)) + " days."

    # Update the chi2 containers to be the right sizes.
    diskA_shape = [len(diskAParams[p]) for p in param_names]
    diskB_shape = [len(diskBParams[p]) for p in param_names]
    global diskARawX2, diskARedX2, diskBRawX2, diskBRedX2
    diskARawX2 = np.zeros(diskA_shape)
    diskARedX2 = np.zeros(diskA_shape)
    diskBRawX2 = np.zeros(diskB_shape)
    diskBRedX2 = np.zeros(diskB_shape)

    # Begin setting up symlink and get directory paths lined up
    this_run_basename = today + '_' + mol
    this_run = this_run_basename
    modelPath = './gridsearch_runs/' + this_run
    run_counter = 2
    # while already_exists_old(modelPath) is True:
    # while already_exists('/'.join(modelPath.split('/')[:-1])) is True:
    while already_exists(modelPath) is True:
        this_run = this_run_basename + '-' + str(run_counter)
        modelPath = './gridsearch_runs/' + this_run
        run_counter += 1

    # Add on the file base name to the path.
    modelPath += '/' + this_run

    # Parameter Check:
    print("\nThis run will fit for", mol.upper())
    print("It will iterate through these parameters for Disk A:")
    for p in diskAParams:
        print(p, ': ', diskAParams[p])
    print("\nAnd these values for Disk B:")
    for p in diskBParams:
        print(p, ': ', diskBParams[p])
    print("\nThis run will take", n, "steps, spanning about", t)
    print("Output will be in", modelPath, '\n')

    response = input('Sound good? (Enter to begin, anything else to stop)\n')
    if response != "":
        return "\nGo fix whatever you don't like and try again.\n\n"
    else:
        print("Sounds good!\n")

    new_dir = '/Volumes/disks/jonas/modeling/gridsearch_runs/' + this_run
    sp.call(['mkdir', 'gridsearch_runs/' + this_run])

    # CHECK FOR REUSE
    # This is a little bit janky looking but makes sense. Since we are
    # treating the two disks as independent, if, in one run, we find good
    # fits (no edge values), then it doesn't make sense to run that grid
    # again; it's better to just grab the relevant information from that
    # run and only fit the disk that needs fitting. That's what this is for.
    to_skip = ''
    if use_a_previous_result is True:
        response2 = input(
            'Please enter the path to the .fits file to use from a previous '
            'run (should be ./models/date/run_date/datefitted_[A/B].fits)\n')
        if 'A' in response2:
            to_skip = 'fitted_A'
        elif 'B' in response2:
            to_skip = 'fitted_B'
        else:
            print("Bad path; must have 'fitted_A' or 'fitted_B' in it. Try again")
            return

    # STARTING THE RUN #
    # Make the initial static model (B), just with the first parameter values
    dBInit = {}
    for p in diskBParams:
        dBInit[p] = diskBParams[p][0]

    # Grid search over Disk A, retrieve the resulting pd.DataFrame
    if to_skip != 'A':
        df_A_fit = gridSearch(diskAParams, dBInit, mol, 0, modelPath, n,
                              cut_central_chans=cut_central_chans)

    # Find where the chi2 is minimized and save it
    idx_of_BF_A = df_A_fit.index[df_A_fit['Reduced Chi2'] == np.min(
        df_A_fit['Reduced Chi2'])][0]
    print("Index of Best Fit, A is ", idx_of_BF_A)

    # Make a list of those parameters to pass to the next round of grid searching.
    fit_A_params = {}
    for param in df_A_fit.columns:
        fit_A_params[param] = df_A_fit[param][idx_of_BF_A]
    print("First disk has been fit\n")

    # Now search over the other disk
    df_B_fit = gridSearch(diskBParams, fit_A_params, mol, 1, modelPath, n,
                          steps_so_far=na, cut_central_chans=cut_central_chans)
    idx_of_BF_B = df_B_fit.index[df_B_fit['Reduced Chi2'] == np.min(
        df_B_fit['Reduced Chi2'])][0]

    fit_B_params = {}
    for param in df_B_fit.columns:
        fit_B_params[param] = df_B_fit[param][idx_of_BF_B]

    # Bind the data frames, output them.
    # Reiterated in tools.py/depickler(), but we can unwrap these vals with:
    #   full_log.loc['A', :] to get all the columns for disk A, or
    #   full_log['Incl.'] to see which inclinations both disks tried.
    full_log = pd.concat([df_A_fit, df_B_fit], keys=['A', 'B'], names=['Disk'])

    # Pickle the step log df.
    pickle.dump(full_log, open('{}_step-log.pickle'.format(modelPath), "wb"))
    # To read the pickle:
    # f = pickle.load(open('{}_step-log.pickle'.format(modelPath), "rb"))

    # Finally, create the final best-fit model and residuals
    print("\n\nCreating best fit model now")
    sample_model_in_uvplane(modelPath + '_bestFit', mol=mol)
    sample_model_in_uvplane(modelPath + '_bestFit', option='subtract', mol=mol)
    icr(modelPath + '_bestFit', mol=mol)
    icr(modelPath + '_bestFit_resid', mol=mol)
    print("Best-fit model created: " + modelPath + "_bestFit.im\n\n")

    # Calculate and present the final X2 values.
    finalX2s = chiSq(modelPath + '_bestFit', mol)
    print("Final Raw Chi-Squared Value: ", finalX2s[0])
    print("Final Reduced Chi-Squared Value: ", finalX2s[1])

    # Clock out
    t1 = time.time()
    t_total = (t1 - t0) / 60
    # n+4 to account for best-fit model making and static disks in grid search
    t_per = str(t_total / (n + 4))

    with open(modelPath + '_stepDurations.csv', 'w') as f:
        wr = csv.writer(f)
        wr.writerows(times)

    print("\n\nFinal run duration was", t_total / 60, ' hours')
    print('with each step taking on average', t_per, ' minutes')

    # Log file w/ best fit vals, range queried, indices of best vals, best chi2
    with open(modelPath + '_summary.log', 'w') as f:
        s0 = '\nLOG FOR RUN ON ' + today + ' FOR THE ' + mol + ' LINE'
        s1 = '\nBest Chi-Squared values [raw, reduced]:\n' + str(finalX2s)
        s2 = '\n\n\nParameter ranges queried:\n'
        s3 = '\nDisk A:\n'
        for i, ps in enumerate(diskAParams):
            s3 = s3 + param_names[i] + str(ps) + '\n'
        s4 = '\nDisk B:\n'
        for i, ps in enumerate(diskBParams):
            s4 = s4 + param_names[i] + str(ps) + '\n'
        s5 = '\n\n\nBest-fit values (Tatm, Tqq, Xmol, outerR, PA, Incl):'
        s6 = '\nDisk A:\n' + str(fit_A_params)
        s7 = '\nDisk B:\n' + str(fit_B_params)
        s8 = '\n\n\nFinal run duration was ' + str(t_total / 60) + ' hours'
        s9 = '\nwith each step taking on average ' + t_per + ' minutes'
        s10 = '\n\nData file used was ' + dataPath
        s = s0 + s1 + s2 + s3 + s4 + s5 + s6 + s7 + s8 + s9 + s10
        f.write(s)

    run = GridSearch_Run(modelPath, save_all_plots=True)
    print("Successfully finished everything.")
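# --- Inspection sketch (assumption, not part of the pipeline) ----------------
# As the comments above note, the pickled step log is a DataFrame whose index
# has a 'Disk' level; tools.py/depickler() is the pipeline's own reader. This
# hypothetical helper shows one way to pull out the best-fit row for each
# disk; summarize_step_log and its model_path argument are placeholders for a
# finished run's modelPath, not existing pipeline pieces.
def summarize_step_log(model_path):
    """Sketch: load a grid-search step log and print the best fit per disk."""
    full_log = pickle.load(open('{}_step-log.pickle'.format(model_path), 'rb'))

    # All of disk A's steps (every column), as in the comment above.
    print(full_log.loc['A', :].head())

    # Best-fit row for each disk, by minimum reduced chi-squared.
    best_idx = full_log.groupby(level='Disk')['Reduced Chi2'].idxmin()
    best_fits = full_log.loc[list(best_idx)]
    print(best_fits)
    return best_fits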
def run_emcee(mol, lnprob, pool, resume_run=None):
    """Make an actual MCMC run.

    Other than in setting up param_info, this is actually line-agnostic;
    the line-specificity is created in the lnprob function.

    Args:
        mol (str): which line we're running.
        lnprob (func): the lnprob function to feed emcee.
        pool: a pool object with a map() method (e.g. multiprocessing or
            schwimmbad) used to parallelize the likelihood calls.
        resume_run (str): if we want to restart a dead run, give that run's
            name here (i.e. 'nov1-multi'). Assumes runs are located in
            /Volumes/disks/jonas/modeling/mcmc_runs/

    Note:
        param_info, built below, is a list of
        [param name, initial_position_center, initial_position_sigma,
         (prior low bound, prior high bound)]
        for each parameter. The second two values set the position & size
        for a random Gaussian ball of initial positions.
    """
    if resume_run:
        run_name = resume_run
        run_path = './mcmc_runs/{}/'.format(run_name)
        print("Resuming old run at " + run_path)
    else:
        # Set up a run naming convention:
        run_name = today + '-' + mol
        run_name_basename = run_name
        run_path = './mcmc_runs/' + run_name_basename + '/'
        counter = 2
        while already_exists(run_path) is True:
            run_name = run_name_basename + '-' + str(counter)
            run_path = './mcmc_runs/' + run_name + '/'
            counter += 1

        print('Run path is {}'.format(run_path))
        print("Setting up directories for new run")
        remove(run_path)
        sp.call(['mkdir', run_path])
        sp.call(['mkdir', run_path + '/model_files'])

        # Make a copy of the initial parameter dict so we can modify it
        if mol == 'multi':
            sp.call(['cp', 'params-hco.json',
                     '{}params-hco.json'.format(run_path)])
            sp.call(['cp', 'params-hcn.json',
                     '{}params-hcn.json'.format(run_path)])
        else:
            sp.call(['cp', 'params-' + mol + '.json',
                     '{}params-{}.json'.format(run_path, mol)])

    # Note that this is what is fed to MCMC to dictate how the walkers move,
    # not the actual set of vars that make_fits pulls from.
    # ORDER MATTERS here (for comparing in lnprob).
    # Values that are commented out default to the starting positions in
    # run_driver/param_dict.
    # param_info is of the form:
    #   [param name, init_pos_center, init_pos_sigma, (prior lower, prior upper)]
    if mol == 'multi':
        # There are more params to fit here.
        param_info = [('r_out_A_hco', 500, 300, (10, 700)),
                      ('r_out_A_hcn', 500, 300, (10, 700)),
                      ('atms_temp_A', 200, 150, (0, 1000)),
                      ('mol_abundance_A_hco', -8, 3, (-13, -3)),
                      ('mol_abundance_A_hcn', -8, 3, (-13, -3)),
                      # ('mol_abundance_A_cs', -8, 3, (-13, -3)),
                      ('temp_struct_A', -0., 1., (-3., 3.)),
                      # ('incl_A', 65., 30., (0, 90.)),
                      ('pos_angle_A', 70, 45, (0, 360)),
                      ('r_out_B_hco', 500, 300, (10, 400)),
                      ('r_out_B_hcn', 500, 300, (10, 400)),
                      ('atms_temp_B', 200, 150, (0, 1000)),
                      ('mol_abundance_B_hco', -8, 3, (-13, -3)),
                      ('mol_abundance_B_hcn', -8, 3, (-13, -3)),
                      # ('mol_abundance_B_cs', -8, 3, (-13, -3)),
                      # ('temp_struct_B', 0., 1, (-3., 3.)),
                      # ('incl_B', 45., 30, (0, 90.)),
                      ('pos_angle_B', 136.0, 45, (0, 180))]

    # HCO+, HCN, or CS
    elif mol != 'co':
        param_info = [('r_out_A', 500, 300, (10, 700)),
                      ('atms_temp_A', 300, 150, (0, 1000)),
                      ('mol_abundance_A', -8, 3, (-13, -3)),
                      ('temp_struct_A', -0., 1., (-3., 3.)),
                      # ('incl_A', 65., 30., (0, 90.)),
                      ('pos_angle_A', 70, 45, (0, 360)),
                      ('r_out_B', 500, 300, (10, 400)),
                      ('atms_temp_B', 200, 150, (0, 1000)),
                      ('mol_abundance_B', -8, 3, (-13, -3)),
                      # ('temp_struct_B', 0., 1, (-3., 3.)),
                      # ('incl_B', 45., 30, (0, 90.)),
                      ('pos_angle_B', 136.0, 45, (0, 180))]

    else:
        param_info = [('r_out_A', 500, 300, (10, 700)),
                      ('atms_temp_A', 300, 150, (0, 1000)),
                      ('m_disk_A', -1., 1., (-4.5, 0)),
                      ('temp_struct_A', -0., 1., (-3., 3.)),
                      # ('incl_A', 65., 30., (0, 90.)),
                      ('pos_angle_A', 70, 45, (0, 180)),
                      ('r_out_B', 500, 300, (10, 400)),
                      ('atms_temp_B', 200, 150, (0, 1000)),
                      ('m_disk_B', -4., 1., (-6., 0)),
                      # ('temp_struct_B', 0., 1, (-3., 3.)),
                      # ('incl_B', 45., 30, (0, 90.)),
                      # ('pos_angle_B', 136.0, 45, (0, 180))
                      ]

    m = 'hco' if mol == 'multi' else mol
    with open('{}params-{}.json'.format(run_path, m), 'r') as f_base:
        # f = yaml.load(f_base, Loader=CLoader)
        f = json.load(f_base)
        nwalkers, nsteps = f['nwalkers'], f['nsteps']

    # Set up initial positions
    if resume_run:
        chain_filename = ('/Volumes/disks/jonas/modeling/mcmc_runs/'
                          '{}/{}_chain.csv'.format(resume_run, resume_run))
        last_step = pd.read_csv(chain_filename).iloc[-nwalkers:]
        # .tolist() makes this into a list in the correct order
        # This might be backwards? Maybe need .iloc[-i]
        pos = [last_step.iloc[i].tolist() for i in range(nwalkers)]
    else:
        # Start a new file for the chain; set up a header line
        chain_filename = run_path + run_name + '_chain.csv'
        with open(chain_filename, 'w') as f:
            param_names = [param[0] for param in param_info]
            np.savetxt(f, (np.append(param_names, 'lnprob'), ),
                       delimiter=',', fmt='%s')

        # randn randomly samples a normal distribution
        pos = []
        for i in range(nwalkers):
            pos_walker = []
            for param in param_info:
                pos_i = float(param[1] + param[2] * np.random.randn())
                # Make sure we're starting within priors
                lower_bound, upper_bound = param[-1]
                while not lower_bound < pos_i < upper_bound:
                    pos_i = float(param[1] + param[2] * np.random.randn())
                pos_walker.append(pos_i)
            pos.append(pos_walker)
        # print("Positions: {}\n\n".format(pos))

    # Initialize sampler chain
    # Recall that param_info is a list of length len(d1_params) + len(d2_params)
    print("Initializing sampler.")
    ndim = len(param_info)

    # Emcee v3 seems cool. Should upgrade:
    #   https://emcee.readthedocs.io/en/stable/user/upgrade/
    # Most notable upgrade is backends:
    #   https://emcee.readthedocs.io/en/stable/tutorials/monitor/
    # Have some useful implementation in old_run_driver.py, incl. for schwimmbad.
    # Initialize a generator to provide the data. They changed the arg
    # storechain -> store sometime between v2.2.1 (iorek) and v3.0rc2 (cluster).
    from emcee import __version__ as emcee_version

    # iorek is on v2; cluster and kazul are on v3.
    if emcee_version[0] == '2':
        # Initialize the sampler
        sampler = emcee.EnsembleSampler(nwalkers, ndim, lnprob,
                                        args=(run_path, param_info, mol),
                                        pool=pool)
        run = sampler.sample(pos, iterations=nsteps, storechain=False)

        # No backend here, so gotta do it manually.
        lnprobs = []
        for i, result in enumerate(run):
            pos, lnprobs, blob = result

            # Log out the new positions
            with open(chain_filename, 'a') as f:
                new_step = [np.append(pos[k], lnprobs[k])
                            for k in range(nwalkers)]
                from datetime import datetime
                now = datetime.now().strftime('%H:%M, %m/%d')
                print("[{}] Adding a new step to the chain".format(now))
                np.savetxt(f, new_step, delimiter=',')

    else:  # for cluster and kazul
        # Can now tell walkers to move in different (not just stretch) ways:
        #   https://emcee.readthedocs.io/en/stable/user/moves/#moves-user
        # TODO: Look into using other moves.
        move = emcee.moves.StretchMove()

        # There is also now a default backend built in.
        filename = "tutorial.h5"  # TODO: Update this
        backend = emcee.backends.HDFBackend(filename)
        backend.reset(nwalkers, ndim)

        sampler = emcee.EnsembleSampler(nwalkers, ndim, lnprob,
                                        args=(run_path, param_info, mol),
                                        pool=pool, moves=move, backend=backend)

        # Note that nsteps should be huge, since ideally we converge
        # before hitting it.
        run = sampler.sample(pos, iterations=nsteps, progress=True)

        # Pulled from https://emcee.readthedocs.io/en/stable/tutorials/monitor/
        # index = 0
        # autocorr = np.empty(nsteps)
        autocorr = []
        old_tau = np.inf
        for sample in run:
            # Only check convergence every 100 steps
            if sampler.iteration % 100:
                continue

            # Compute the autocorrelation time so far. Using tol=0 means
            # we'll always get an estimate, even if it isn't trustworthy.
            tau = sampler.get_autocorr_time(tol=0)
            # autocorr[index] = np.mean(tau)
            autocorr.append(np.mean(tau))
            # index += 1

            # Check convergence
            converged = np.all(tau * 100 < sampler.iteration)
            converged &= np.all(np.abs(old_tau - tau) / tau < 0.01)
            if converged:
                break
            old_tau = tau

    print("Ended run")
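# --- Post-run sketch for the v3 branch (assumption, not part of run_emcee) --
# On emcee v3 the chain lives in the HDF backend rather than the CSV, so it
# has to be read back through emcee's reader API (see the monitoring tutorial
# linked above). "tutorial.h5" is the same placeholder filename used in
# run_emcee, which per the TODO should really point somewhere inside run_path;
# read_v3_chain is an illustrative name, not an existing pipeline function.
def read_v3_chain(filename="tutorial.h5"):
    """Sketch: read back a finished (or running) emcee v3 chain."""
    reader = emcee.backends.HDFBackend(filename, read_only=True)

    # Estimate burn-in and thinning from the autocorrelation time; tol=0
    # gives an estimate even if the chain is too short to trust it.
    tau = reader.get_autocorr_time(tol=0)
    burnin = int(2 * np.max(tau))
    thin = max(1, int(0.5 * np.min(tau)))

    flat_samples = reader.get_chain(discard=burnin, thin=thin, flat=True)
    flat_lnprob = reader.get_log_prob(discard=burnin, thin=thin, flat=True)

    print("Chain shape (nsamples, ndim):", flat_samples.shape)
    print("Highest-lnprob sample:", flat_samples[np.argmax(flat_lnprob)])
    return flat_samples, flat_lnprob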