def main(cat, cat_name):
    """Train ssfr regressors for several environment-proxy sets and dump
    clustering statistics (rwp, xcf, mcf, radial conformity) for each.

    cat      -- catalog dict with 'dat', 'box_size' and 'red_cut' entries
    cat_name -- catalog name used to locate data/<cat_name>/ proxy files
    """
    data = cat['dat']
    # 25 radial scales between 0.1 and 20 (rbins is unused below).
    r, rbins = c.make_r_scale(.1, 20, 25)
    # one 'c<r>' pair-count proxy column per radial scale
    pair_proxies = ['c%.2f' % _ for _ in r]
    names = ['rhillmass', 'dm5e12', 's5', 'd1', 'Pall']
    proxy_list = [['rhillmass'], ['d5e12', 'm5e12'], ['s5'], ['d1'], pair_proxies]
    predicted_ssfrs = []
    # Pass 1: train one regressor per proxy set, keeping only its predictions.
    for proxies, name in zip(proxy_list, names):
        data = util.load_proxies(data, 'data/' + cat_name + '/', proxies, proxies)
        features = proxies + ['mstar']
        dtrain, dtest, regressor = model.trainRegressor(data, features)
        predicted_ssfrs.append(dtest['pred'])
    log_dir = util.get_logging_dir(cat_name)
    # Pass 2: recompute statistics per proxy set.
    # NOTE(review): dtest here is whatever the LAST iteration of pass 1
    # produced; overwriting its 'pred' column assumes every trainRegressor
    # call yields the same test split in the same row order -- confirm.
    for proxies, pred, name in zip(proxy_list, predicted_ssfrs, names):
        dtest['pred'] = pred
        util.train_and_dump_rwp(data, proxies + ['mstar'], name + '.dat', '', cat['box_size'], cat['red_cut'], logging=False)
        util.train_and_dump_rwp_bins(data, proxies + ['mstar'], name + '.dat', '', cat['box_size'], num_splits=1, red_cut=cat['red_cut'], logging=False)
        xcf = c.cross_correlation_function(dtest, cat['red_cut'], box_size=cat['box_size'])
        util.dump_data(xcf, name + '_xcf.dat', log_dir)
        mcf = c.jackknife_mcf(dtest, box_size=cat['box_size'])
        util.dump_data(mcf, name + '_mcf.dat', log_dir)
        # stellar-mass windows for the conformity measurement
        mlims = [(9.9, 10.1), (10.1, 10.3), (10.5, 10.7)]  # change for illustris
        fnames = [''.join([name, '_conformity_', str(num), '.dat']) for num in [10.0, 10.2, 10.6]]
        for mlim, fname in zip(mlims, fnames):
            res = c.radial_conformity_wrapper(dtest, cat['box_size'], mlim[0], mlim[1])
            util.dump_data(res, fname, log_dir)
def download_list():
    """Scrape the cnhm.net plant index pages (1..801), cache each page's
    HTML locally, and write one CSV of
    (url, chinese name, latin name, section, genus) per page.
    (Python 2 script: print statements.)
    """
    proxies = util.load_proxies()
    for i in range(1,802): #802
        url = 'http://www.cnhm.net/plant/index/page/%d' % (i)
        lfile = '%scnhm/list_%d.html' % (config.local_dir,i)
        # NOTE(review): a proxy is chosen but never passed to the download
        # call below -- confirm whether downad_and_save should take it.
        proxy = random.choice(proxies).strip()
        print i,url,proxy
        # NOTE(review): 'downad_and_save' -- looks like a typo for
        # 'download_and_save'; confirm against the browser helper module.
        respHtml = browser.downad_and_save(url,lfile)
        soup = BeautifulSoup(respHtml, fromEncoding=htmlCharset)
        table = soup.find('div',attrs = {'class': 'main'})
        if not table:
            # blocked/bad page: drop the cached file so a rerun retries it
            os.system('rm -f %s'%(lfile))
            print 'err'
            continue
        xitems = []
        tr_list = table.findAll('li')
        for tr in tr_list:
            plant_name = tr.find('div',attrs = {'class': 'PlantName'})
            namea = plant_name.find('a')
            # resolve the relative detail-page href against the index url
            uid = util.completeInnerUrl(url,namea['href'])
            cn_name = namea['title']
            lating_name = tr.find('div',attrs = {'class': 'LatinName'})['title']
            section = tr.find('div',attrs = {'class': 'section'}).contents[0]
            genera = tr.find('div',attrs = {'class': 'genera'}).contents[0]
            xitems.append(','.join([uid,cn_name,lating_name,section,genera ]))
        with open('%scnhm/xlist_%s.csv'%(config.local_dir,i),'w') as f:
            f.write('\r\n'.join(xitems)+'\r\n')
            # redundant: the with-block already closes f
            f.close()
def mtheater_all(): proxies = load_proxies() for i in range(1,6): try: # proxy = random.choice(proxies).strip() download_mtheater_list(i) except Exception,e: print i,'err',e
def download1():
    """Download the detail page for every plant row in the DB through a
    random proxy; failures are logged and skipped.  (Python 2 script.)
    """
    proxies = util.load_proxies()
    rows = db.select('plants',what="pk_id,name_cn,source_uri" ) # ,offset=0,limit=1
    for r in rows:
        proxy = random.choice(proxies).strip()
        print r.pk_id,r.source_uri,proxy
        try:
            download_detail(r.source_uri,proxy)
        except Exception,ex:
            # best-effort crawl: record the failing row and keep going
            print r.pk_id,r.source_uri,proxy,'err',ex
def load_words():
    """Iterate the cnhm.csv id/name list and download each entry through a
    random proxy; failures are logged and skipped.
    (Python 2 script: uses the removed file() builtin.)
    """
    proxies = util.load_proxies()
    reader = csv.reader(file('%scnhm.csv'%(config.local_dir), 'rb'))
    for line in reader:
        proxy = random.choice(proxies).strip()
        # first column is the id; kept as a string (int() left disabled)
        pid = line[0].strip() #int()
        try:
            print pid,line[1],proxy
            download(line[0].strip(),line[1],proxy)
        except Exception,ex:
            print pid,'err',ex
import model, calc, util
import plotting as p
from collections import defaultdict
import numpy as np

# Compare per-host satellite "richness" (number of quenched satellites)
# between the catalog's true colors and regressor-predicted colors.
cat = util.get_catalog('HW')
dat = cat['dat']
halo_proxies = ['rhillmass', 'd5e12', 'm5e12']
dat = util.load_proxies(dat, 'data/HW/', halo_proxies, halo_proxies)
# One regressor on rhillmass alone, one on the (d5e12, m5e12) pair;
# trainRegressor returns (train, test, regressor) -- keep the test split.
_, rhill_df, _ = model.trainRegressor(dat, cat['box_size'], halo_proxies[:1])
_, dm5e12_df, _ = model.trainRegressor(dat, cat['box_size'], halo_proxies[1:])
mbins = [7e13, 1e14, 3e14]  # host-mass bin edges (presumably mvir units -- confirm)


def get_richness_count(df, mbins, color='ssfr', red_cut=-11):
    """Count quenched (color < red_cut) satellites per host, grouped by host mass.

    Returns a list of len(mbins) - 1 lists; entry j holds one quenched-satellite
    count per host whose mvir falls in [mbins[j], mbins[j+1]].
    """
    hosts = df[df['upid'] == -1]  # upid == -1 marks host (central) halos
    count_bins = [[] for _ in range(len(mbins) - 1)]
    for hid, hmass in zip(hosts['id'], hosts['mvir']):
        # skip hosts outside the binning range entirely
        if hmass < mbins[0] or hmass > mbins[-1]:
            continue
        subs = df[df['upid'] == hid]
        # Fixed: original took len() of the boolean mask, which counts ALL
        # satellites; count only those below the red cut.
        red_subs = int(np.count_nonzero(subs[color] < red_cut))
        # Fixed: digitize on a scalar returns a 0-d result (not indexable);
        # shift its 1-based index to 0-based and clamp the hmass == mbins[-1]
        # edge case into the last bin.
        mbin_idx = min(int(np.digitize(hmass, mbins)) - 1, len(mbins) - 2)
        # Fixed: append into the selected mass bin, not the outer list.
        count_bins[mbin_idx].append(red_subs)
    return count_bins


# NOTE(review): both calls use rhill_df; dm5e12_df is computed but unused --
# confirm whether a dm5e12 comparison was intended here.
hw_richness = get_richness_count(rhill_df, mbins)             # true ssfr colors
rhill_richness = get_richness_count(rhill_df, mbins, 'pred')  # predicted colors
from sklearn import preprocessing
from sklearn.gaussian_process import GaussianProcess
import util
import calc as c
from sklearn.linear_model import Lasso
import plotting as p

# Exploratory script: fit a Gaussian process to predict ssfr from the full
# environment-proxy set (plus mstar) on the HW catalog.
# NOTE(review): sklearn.gaussian_process.GaussianProcess is the legacy
# (<0.18) estimator; newer sklearn provides GaussianProcessRegressor.
cat = util.get_catalog("HW")
dat = cat['dat']
proxies = ['rhillmass', 'd1', 'd2', 'd5', 'd10', 's1', 's2', 's5', 's10', 'd5e12', 'm5e12']
# TODO: make a log flag so that m5e12 gets log scaled as well
dat = util.load_proxies(dat, 'data/HW/', proxies, proxies)
features = proxies + ['mstar']
d_train, d_test = util.split_test_train(dat)
# scaled design matrices / targets plus their fitted scalers
Xtr, ytr, xtrsc, ytrsc = util.select_features(features, d_train, scaled=True)
Xts, yts, xtssc, ytssc = util.select_features(features, d_test, scaled=True)
#poly = preprocessing.PolynomialFeatures(degree=2)
#Xtr_new = poly.fit_transform(Xtr)
#Xts_new = poly.fit_transform(Xts)
gp = GaussianProcess()
gp.fit(Xtr, ytr)
y_hat = gp.predict(Xts)
#clf = Lasso(alpha=0.2)
#clf.fit(Xtr_new, ytr)
import calc as c import util names = ['HW', 'Becker', 'Lu', 'Henriques', 'Illustris', 'EAGLE', 'MB-II'] data_dirs = [''.join(['data/', name, '/']) for name in names] cats = [util.get_catalog(name) for name in names] for name, cat, ddir in zip(names, cats, data_dirs): df = cat['dat'] print name print df.dtype.names proxies = ['d5e12', 'm5e12'] proxyname = 'dm5e12' df = util.load_proxies(df, ddir, proxies, proxies) print df.dtype.names features = proxies + ['mstar'] fname = 'dm5e12_all_2.dat' util.train_and_dump_rwp_bins(df, features, fname, proxyname, name, cat['box_size'], red_cut=cat['red_cut'], logging=True, num_splits=1)
import calc as c import util cat_name = 'HW' cat = util.get_catalog('HW') dat = cat['dat'] proxy_names = [ 'rhm3e12', 'rhm4e12', 'rhm5e12', 'rhm6e12', 'rhm7e12', 'rhm8e12', 'rhm9e12', 'rhm1e13', 'rhm2e13', 'rhm3e13'] names = [proxy + '.dat' for proxy in proxy_names] all_features = [util.load_feature_list(proxy, dat, cat) for proxy in proxy_names] for f_list in all_features: dat = util.load_proxies(dat, cat['dir'], f_list, f_list) for features, name, proxy in zip(all_features, names, proxy_names): print "training on: ", features + ['mstar've Oak Avenue, Apt. 4] util.train_and_dump_rwp_bins(dat, features + ['mstar'], name, proxy, cat_name, cat['box_size'], num_splits=3, logging=True)
from collections import defaultdict
import calc, util, model
import plotting as p
# Fixed: np and plt are used below but were never imported.
import numpy as np
import matplotlib.pyplot as plt

# Bin regressor test sets by stellar mass for every catalog and look at the
# rhillmass distribution in each bin.
p.set_plotting_context()
names = ['HW', 'Becker', 'Lu', 'Henriques', 'EAGLE', 'Illustris', 'MB-II']
catalogs = [util.get_catalog(name) for name in names]
data_dirs = ['data/' + name + '/' for name in names]
dfs = [cat['dat'] for cat in catalogs]
dfs = [util.load_proxies(df, ddir, ['rhillmass'], ['rhillmass'])
       for df, ddir in zip(dfs, data_dirs)]
# trainRegressor returns (train, test, regressor); keep the test split,
# which carries the 'pred' column.
pred_dfs = [model.trainRegressor(df, ['rhillmass', 'mstar'])[1] for df in dfs]

# stellar-mass windows: lo / mid / hi
mlims = [(9.9, 10.1), (10.1, 10.3), (10.5, 10.7)]
msbin_catalog_dict = defaultdict(dict)
for mlim, desc in zip(mlims, ['lo', 'mid', 'hi']):
    for name, df in zip(names, pred_dfs):
        msmin, msmax = mlim
        # density match(df, mlim)
        sel = np.where((df['mstar'] > msmin) & (df['mstar'] < msmax))[0]
        msbin_catalog_dict[name][desc] = df[sel]

nrow = len(names)
ncol = 3
i = 1
# NOTE(review): i is never incremented, so every panel targets subplot 1,
# and nothing is drawn after computing the median -- the script looks
# truncated; confirm the plotting tail (and an i += 1) against the original.
for name in names:
    for mbin, df in msbin_catalog_dict[name].items():
        plt.subplot(nrow, ncol, i)
        rhillmass_values = df['rhillmass']
        med = np.median(rhillmass_values)
import util
import calc
import model
import matplotlib.pyplot as plt

# Quick look: train the s5 regressor on the Henriques catalog and compare
# predicted vs. true ssfr with a hexbin plot.
# NOTE(review): sibling scripts pass a trailing slash ('data/HW/'); confirm
# load_proxies tolerates 'data/Henriques' without one.
data_dir = 'data/Henriques'
cat = util.get_catalog("Henriques")
dat = cat['dat']
dat = util.load_proxies(dat, data_dir, ['s5'], ['s5'])
# trainRegressor returns (train, test, regressor); the test split carries 'pred'
df_train, df_test, _ = model.trainRegressor(dat, cat['box_size'], ['s5'])
plt.hexbin(df_test['ssfr'], df_test['pred'])