def _report(self, ref_set, sb):
    self._to_csv(os.path.join(self._root_dir, 'ref_set.csv'), ref_set)
    CSVFileWriter(os.path.join(self._root_dir, 'score_table.csv'),
                  self._score_table)
    self._to_csv(os.path.join(self._root_dir, 'database.csv'),
                 self._database)
    # delete the function reference from the dict and save the info to file
    func = deepcopy(sb._cm_functions)
    for f in func:
        del f['function']
    CSVFileWriter(os.path.join(self._root_dir, 'cm_functions.csv'), func)
    # report memory usage
    mem_report = [{'date': datetime.now().strftime("%H:%M:%S %d/%m/%Y"),
                   'mem': memory() / float(1024 ** 2),
                   'res_memory': res_memory(),
                   'total_memory': total_memory()}]
    memfile = os.path.join(self._root_dir, 'memfile.csv')
    if not os.path.exists(memfile):
        CSVFileWriter(memfile, mem_report)
    else:
        CSVFileWriter(memfile, mem_report, write_mode='a')
def _report(self, population, generation):
    best = population[0]
    rep = "[%s]Generation %d\n" % (self._max_cost_str, generation)
    rep += "[%s]Best solution:\n" % self._max_cost_str
    rep += "[%s]Pheno cost: %f\n" % (self._max_cost_str,
                                     best._pheno_scenario.get_cost())
    rep += "[%s]%s" % (self._max_cost_str,
                       str([x.get_value() for x in population]))
    logging.info(rep)
    entry = {
        'generation': generation,
        'pheno_cost': best._pheno_scenario.get_cost()
    }
    for model_name in self._model_names:
        models = []
        for best_rep in best.get_reps():
            models += [x for x in best_rep if x.get_method() == model_name]
        entry[model_name] = sum([x.get_rmse() for x in models]) / len(models)
    entry['total'] = best.get_value()
    CSVFileWriter(self._report_file, [entry], self._report_header, 'a')
    meta = []
    for idx, individual in enumerate(population):
        fname = os.path.join(self._root_dir, "%d.csv" % idx)
        individual.get_pheno_scenario().to_csv(fname)
        for rep_idx, model_rep in enumerate(individual.get_reps()):
            meta_entry = {
                'name': fname,
                'value': individual.calc_value(),
                'cost': individual.get_pheno_scenario().get_cost(),
                'rep': rep_idx + 1
            }
            for model in model_rep:
                meta_entry[model.get_method()] = model.get_rmse()
            meta.append(meta_entry)
    CSVFileWriter(os.path.join(self._root_dir, "meta.csv"), meta)
def _cache_save(self):
    import os
    from framework.util.csv_io.csv_filewriter import CSVFileWriter
    cache_fname = os.path.join(os.environ.get("HOME"), ".data_reader.cache")
    CSVFileWriter(cache_fname, self._combined_data)
def _train_process(self, data, instructions, model_cache, pname,
                   meta_header, meta_location, lock, year):
    logging.info("%s starting to train %d submodels"
                 % (pname, len(instructions)))
    for i, instruction in enumerate(instructions):
        instruction['year'] = year
        model = MLModel(data, instruction)
        # close the OS-level handle from mkstemp and re-open the file in
        # binary mode, which pickle requires
        fd, fname = mkstemp(dir=model_cache)
        os.close(fd)
        with open(fname, 'wb') as f:
            pickle.dump(model, f)
        logging.info("%s: %d out of %d" % (pname, i + 1, len(instructions)))
        instruction['location'] = fname
    logging.info("%s locking before writing meta data" % pname)
    lock.acquire()
    logging.info("%s writing data" % pname)
    CSVFileWriter(meta_location, instructions,
                  custom_header=meta_header, write_mode='a')
    logging.info("%s releasing lock" % pname)
    lock.release()
    logging.info("%s released lock" % pname)
    logging.info("%s finished" % pname)
def run(self, phenos, block_length, bootstrap_n):
    plots = set([x['plot'] for x in self._data])
    result = []
    for plot in plots:
        print("Plot %d" % plot)
        for pheno in phenos:
            print("Pheno %s" % pheno)
            all_measurements = [x for x in self._data
                                if x['plot'] == plot
                                and x[pheno] is not None]
            # take means per date
            means = self.means_per_date(all_measurements, pheno, plot)
            # detrend the data
            detrended = self.detrend_data(means, pheno)
            # create the overlapping bootstrap blocks
            blocks = []
            for i in range(len(detrended) - block_length + 1):
                blocks.append(detrended[i:i + block_length])
            # run bootstraps with different numbers of blocks
            for n in range(5, len(detrended) - block_length + 1):
                std_errs = []
                # generate bootstrap_n bootstrap samples of n blocks each
                for i in range(bootstrap_n):
                    bootstrap = []
                    for j in range(n):
                        bootstrap += random.choice(blocks)
                    std_errs.append(stats.sem([x[pheno] for x in bootstrap]))
                std_err = sum(std_errs) / len(std_errs)
                result.append({'plot': plot,
                               'pheno': pheno,
                               'n_blocks': n,
                               'standard_error': std_err,
                               formats.GENOTYPE:
                                   detrended[0][formats.GENOTYPE]})
    CSVFileWriter(self._f_location, result)
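
# The loop above is a moving-block bootstrap: overlapping blocks of
# consecutive (detrended) observations are resampled with replacement to
# estimate the standard error while preserving short-range autocorrelation.
# A minimal, self-contained sketch of the same idea; `series`,
# `n_blocks` and `n_boot` are illustrative names, not part of the class
# above.
import random

from scipy import stats


def block_bootstrap_sem(series, block_length, n_blocks, n_boot):
    """Mean standard error of the mean over moving-block bootstrap samples."""
    # all overlapping blocks of consecutive observations
    blocks = [series[i:i + block_length]
              for i in range(len(series) - block_length + 1)]
    sems = []
    for _ in range(n_boot):
        sample = []
        for _ in range(n_blocks):
            sample += random.choice(blocks)  # resample blocks, not points
        sems.append(stats.sem(sample))
    return sum(sems) / len(sems)


# e.g. block_bootstrap_sem([1.2, 0.9, 1.1, 1.4, 0.8, 1.0, 1.3, 0.7], 3, 2, 100)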
def calc_k(self, LAI_transmiss):
    fd, temp_file = tempfile.mkstemp()
    os.close(fd)
    CSVFileWriter(temp_file, LAI_transmiss)
    robjects.r('''
        calc_k_sep_r <- function(fname, genotype){
            data <- read.csv(fname)
            data$Date <- as.Date(data$Date, "%Y-%m-%d %H:%M:%S")
            geno_sub <- subset(data, Genotype == genotype)
            years <- levels(factor(geno_sub$Year))
            year_subs <- lapply(years,
                                function(year) subset(geno_sub, Year == year))
            k <- sapply(year_subs,
                        function(year_sub)
                            summary(nls(Transmission ~ exp(-k * LAI),
                                        data = year_sub,
                                        start = list(k = 1)))$parameters[1])
            return(data.frame(as.integer(years), k))
        }
    ''')
    robjects.r('''
        calc_k_r <- function(fname, genotype){
            data <- read.csv(fname)
            data$Date <- as.Date(data$Date, "%Y-%m-%d %H:%M:%S")
            geno_sub <- subset(data, Genotype == genotype)
            k <- summary(nls(Transmission ~ exp(-k * LAI),
                             data = geno_sub,
                             start = list(k = 1)))$parameters[1]
            return(k)
        }
    ''')
    calc_k_sep_r = robjects.r("calc_k_sep_r")
    calc_k_r = robjects.r("calc_k_r")
    genotypes = set([x['Genotype'] for x in LAI_transmiss])
    result = []
    for genotype in genotypes:
        df = calc_k_sep_r(temp_file, genotype)
        diff = math.fabs(df[1][0] - df[1][1])
        if diff > 0.3:
            # per-year k values differ too much, so keep them separate
            for year, k in zip(df[0], df[1]):
                result.append({'Genotype': genotype, 'Year': year, 'k': k})
        else:
            # bulk the two years together
            k = calc_k_r(temp_file, genotype)[0]
            result.append({'Genotype': genotype, 'Year': '', 'k': k})
    return result
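
# calc_k fits the Beer-Lambert light-interception model
# Transmission = exp(-k * LAI) with R's nls() via rpy2. A hedged,
# pure-Python sketch of the same fit using scipy's curve_fit; `lai` and
# `transmission` are illustrative arrays, not names from this module.
import numpy as np
from scipy.optimize import curve_fit


def fit_extinction_coefficient(lai, transmission):
    """Least-squares estimate of the canopy extinction coefficient k."""
    model = lambda x, k: np.exp(-k * x)
    popt, _ = curve_fit(model, np.asarray(lai), np.asarray(transmission),
                        p0=[1.0])  # same start value as the R code
    return popt[0]


# e.g. fit_extinction_coefficient([0.5, 1.0, 2.0, 4.0],
#                                 [0.78, 0.61, 0.37, 0.14])  # k ~ 0.5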
def _check_memory(self):
    p = psutil.Process(os.getpid())
    # USS ("unique set size") - the memory that would be freed if the
    # process exited right now
    ram = p.memory_full_info().uss
    logging.info("RAM: %s" % self.bytes2human(ram))
    entry = {
        'date': datetime.now().strftime("%H:%M:%S %d/%m/%Y"),
        'ram': ram
    }
    CSVFileWriter(self._ram_file, [entry], ['date', 'ram'], write_mode='a')
    # flag when usage grows beyond 20 GiB
    return ram >= 21474836480
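
# bytes2human is referenced above but not defined in this section; a
# plausible sketch of such a helper, following the classic psutil recipe
# (the actual implementation may differ).
def bytes2human(n):
    """Render a byte count as a human-readable string, e.g. 20.0G."""
    symbols = ('K', 'M', 'G', 'T', 'P')
    prefix = {s: 1 << (i + 1) * 10 for i, s in enumerate(symbols)}
    for s in reversed(symbols):
        if n >= prefix[s]:
            return "%.1f%s" % (float(n) / prefix[s], s)
    return "%sB" % n


# e.g. bytes2human(21474836480) -> '20.0G'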
def run_scenario(self, phenos):
    plots = set([x[formats.UID] / 10 for x in self._data])
    results = []
    queue = Queue()
    for plot in plots:
        print("plot - %d" % plot)
        for pheno in phenos:
            print("%s" % pheno)
            genotype = misc.assign_geno_bsbec(plot)
            self.test_se(plot, genotype, pheno, queue)
    while not queue.empty():
        results += queue.get()
    CSVFileWriter(self._f_location, results)
def _calc_coefficients(self, data):
    f = tempfile.NamedTemporaryFile()
    CSVFileWriter(f.name, data)
    cmd = [os.path.join(os.environ['R_SCRIPTS'], 'LER.R'), '-i', f.name]
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    out = proc.communicate()[0]
    result = dict()
    for entry in out.strip().split("\n"):
        content = entry.split(":")
        result[content[0]] = float(content[1])
    # determine the type of model from the coefficients it reports
    if set(self.NLS_KEYS) == set(result.keys()):
        self._type = "NLS"
    elif set(self.LM_KEYS) == set(result.keys()):
        self._type = "LM"
    else:
        raise Exception("Unknown model type")
    return result
def calc_RUE(self, LER_dict, k_dict, location, LAI):
    met_data = MetDataReaderCSV(location).get_met_data()
    fll_reader = FLLReader(location)
    genotypes = set([x['Genotype'] for x in LER_dict])
    destructive_phenos = CSVFileReader(
        location.get_destr_phenos()).get_content()
    for entry in destructive_phenos:
        entry['Date'] = datetime.strptime(entry['Date'],
                                          "%Y-%m-%d %H:%M:%S UTC")
        try:
            entry['fresh'] = float(
                entry['Fresh weight above ground material(g)'])
            entry['fresh_sub'] = float(
                entry['Fresh weight above ground sub-sample(g)'])
            entry['dry_sub'] = float(
                entry['dry weight above ground sub-sample(g)'])
        except ValueError:
            # fall back to the dry sub-sample weight alone, if available
            try:
                entry['dry_weight'] = float(
                    entry['dry weight above ground sub-sample(g)'])
            except ValueError:
                pass
            continue
        if entry['fresh_sub'] == 0.0:
            entry['dry_weight'] = entry['dry_sub']
            continue
        entry['dry_weight'] = entry['fresh'] * (entry['dry_sub'] /
                                                entry['fresh_sub'])
    destructive_phenos = [x for x in destructive_phenos if 'dry_weight' in x]
    # run the simulation per genotype
    RUE = []
    for genotype in genotypes:
        geno_sub = [x for x in destructive_phenos
                    if x['Genotype'] == genotype]
        dates = list(set([x['Date'] for x in geno_sub]))
        dates.sort()
        # create data point groups by dates that are close
        # to each other or the same
        groups = []
        group_id = 0
        for date in dates:
            for group in groups:
                delta = group['Dates'][0] - date
                days = math.fabs(delta.days)
                if days < 20:
                    group['Dates'].append(date)
                    break
            else:
                # no nearby group found - create a new one
                group = {'id': group_id, 'Dates': [date]}
                groups.append(group)
                group_id += 1
        # get the mean dry weight per group
        mean_DW = []
        # add an entry for the fll day
        fll_date = fll_reader.get_genotype_fll(genotype)
        mean_DW.append({'Date': fll_date, 'Yield': 0.0})
        for group in groups:
            group_phenos = [x for x in geno_sub
                            if x['Date'] in group['Dates']]
            total_dw = 0.0
            for entry in group_phenos:
                total_dw += entry['dry_weight']
            total_dw /= float(len(group_phenos))
            # correct the group date to the first one in the group
            mean_DW.append({'Date': sorted(group['Dates'])[0],
                            'Yield': total_dw})
        # obtain genotype specific coefficients
        LER = [x for x in LER_dict if x['Genotype'] == genotype]
        LER.sort(key=lambda x: x['stage'])
        k = [x for x in k_dict if x['Genotype'] == genotype]
        if len(k) > 1:
            k = next(x['k'] for x in k if x['Year'] == location.get_year())
        else:
            k = k[0]['k']
        # simulate PAR and record values for days of destructive harvests
        real_LAI = [x for x in LAI if x['Genotype'] == genotype]
        mean_DW = self.simulate_PAR(k, LER, met_data, fll_date, mean_DW,
                                    real_LAI)
        # finally work out what the RUE is from the real DMY
        # and the simulated PAR values
        fd, base = tempfile.mkstemp()
        os.close(fd)
        temp_file = base + genotype.split("-")[0]
        CSVFileWriter(temp_file, mean_DW)
        robjects.r('''
            calc_RUE_r <- function(fname){
                data <- read.csv(fname)
                data$Yield <- data$Yield * 2
                fit <- lm(Yield ~ PAR + 0, data = data)
                return(summary(fit)$coefficients[1])
            }
        ''')
        calc_RUE_r = robjects.r("calc_RUE_r")
        RUE_val = calc_RUE_r(temp_file)[0]
        RUE.append({'Genotype': genotype, 'RUE': RUE_val})
    return RUE
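
# calc_RUE estimates radiation-use efficiency as the slope of a
# zero-intercept regression of dry-matter yield on simulated intercepted
# PAR (the R call lm(Yield ~ PAR + 0)). For a slope-through-origin fit the
# closed form is sum(x*y) / sum(x*x); a minimal sketch with illustrative
# variable names:
def rue_slope(par, yield_dw):
    """Zero-intercept least-squares slope: RUE in g DM per MJ PAR."""
    sxy = sum(x * y for x, y in zip(par, yield_dw))
    sxx = sum(x * x for x in par)
    return sxy / sxx


# e.g. rue_slope([100.0, 250.0, 400.0], [210.0, 530.0, 790.0])  # ~2.0 g/MJ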
def __init__(self, data):
    self._f = tempfile.NamedTemporaryFile()
    CSVFileWriter(self._f.name, data)
def _generate_submodels(self, model_cache, processes):
    """Generates R submodels"""
    init_vars = NaiveMLProcessModel._INIT_DATA
    instructions = []
    variables = NaiveMLProcessModel._STAGE_ONE +\
        NaiveMLProcessModel._STAGE_TWO
    for variable in variables:
        for method in Gene._METHOD_VALUES:
            other_vars = [x for x in NaiveMLProcessModel._STAGE_ONE
                          if x != variable]
            for i in range(len(other_vars) + 1):
                choices = combinations(other_vars, i)
                for choice in choices:
                    train_vars = init_vars + list(choice)
                    instruction = {'name': variable,
                                   'method': method,
                                   'variables': train_vars,
                                   'stage': -999}
                    instructions.append(instruction)
    process_instructions = self._process_split(instructions, processes)
    meta_header = ['name', 'method', 'variables', 'stage', 'location',
                   'year']
    meta_location = os.path.join(model_cache, 'meta.csv')
    CSVFileWriter(meta_location, [], custom_header=meta_header)
    lock = Lock()
    for year in self._years:
        logging.info("Starting year %s" % year)
        new_data = split_dataset(self._data, year)['ml_data']
        # each process_instruction is the list of instructions for one
        # worker process
        workers = []
        for idx, process_instruction in enumerate(process_instructions):
            process_name = "Process %d-%d" % (year, idx)
            process = Process(target=self._train_process,
                              name=process_name,
                              args=(new_data, process_instruction,
                                    model_cache, process_name, meta_header,
                                    meta_location, lock, year))
            process.start()
            workers.append(process)
        for process in workers:
            process.join()
        logging.info("Finished year %s" % year)
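
# The nest of loops above enumerates, for every target variable and ML
# method, every subset of the remaining stage-one variables (all sizes
# from 0 to len(other_vars)), i.e. 2 ** len(other_vars) variable sets per
# (variable, method) pair. A quick sanity-check sketch; the counts here
# are illustrative, not the project's actual configuration sizes.
from itertools import combinations


def count_variable_sets(n_other_vars):
    """Number of subsets enumerated for one (variable, method) pair."""
    total = sum(1
                for i in range(n_other_vars + 1)
                for _ in combinations(range(n_other_vars), i))
    assert total == 2 ** n_other_vars  # all subsets of every size
    return total


# e.g. with 6 other stage-one variables: count_variable_sets(6) -> 64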
def _run_generation(self, population_size, population, process_count):
    if population == []:
        # first generation - generate new individuals
        logging.info("Generating initial population")
        required_count = population_size - len(population)
        population = self._generate_individuals(required_count, [])
    logging.info("Calculating RMSE")
    population = self._calculate_rmse_mp(population, process_count)
    population.sort(key=lambda x: x.get_rmse())
    logging.info(population[0].get_rmse())
    logging.info([x.get_rmse() for x in population])
    logging.info("Saving population")
    # dump the whole population
    pop_meta = []
    for i, entry in enumerate(population, start=1):
        CSVFileWriter(os.path.join(self._root_dir, "population",
                                   "%d.csv" % i),
                      entry.get_instructions())
        pop_meta.append({'name': '%d.csv' % i, 'rmse': entry.get_rmse()})
    CSVFileWriter(os.path.join(self._root_dir, "population", "meta.csv"),
                  pop_meta)
    # append the best RMSE to the RMSE history
    self._RMSE_history.append(population[0].get_rmse())
    # keep the top individuals (keep_rate fraction of the population)
    keep = population[:int(self._keep_rate * len(population))]
    # assign rank-based selection probabilities, then cross individuals
    for i in range(len(population)):
        population[i].set_probability(self._probabilities[i])
    logging.info("Performing crosses")
    crosses_count = int(self._cross_rate * len(population))
    crosses = self._perform_crosses(population, crosses_count, keep)
    logging.info("Generating immigrants")
    # bring in some immigrants as well
    immigrant_count = int(self._immigration_rate * len(population))
    immigrants = self._generate_individuals(immigrant_count, keep + crosses)
    # merge populations, dropping any duplicate individuals
    all_population = keep + crosses + immigrants
    final_population = []
    for entry in all_population:
        match = [x for x in final_population if x.same(entry)]
        if len(match) == 0:
            final_population.append(entry)
        else:
            logging.info("Duplicate individual found, removing...")
    # fill in gaps from dropped individuals with newly generated ones
    while len(final_population) < population_size:
        missing = population_size - len(final_population)
        logging.info("Generating %d individuals to fill in population"
                     % missing)
        individuals = self._generate_individuals(missing, final_population)
        final_population += individuals
    return final_population
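
# _perform_crosses is not shown here, but the per-rank probabilities set
# above suggest fitness-proportionate (roulette-wheel) parent selection.
# A hedged sketch of that selection step, assuming `cumulative` holds
# cumulative probabilities over the rank-sorted population (illustrative
# only, not the actual implementation):
import bisect
import random


def select_parent(population, cumulative):
    """Pick one individual, favouring the better (earlier) ranks."""
    r = random.random() * cumulative[-1]
    idx = bisect.bisect_left(cumulative, r)
    return population[idx]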
def _to_csv(self, fname, container):
    data = [x.to_dict() for x in container.get_all()]
    CSVFileWriter(fname, data)
def __init__(self, data, variables, population_size, model_names,
             root_dir, max_cost, scenario):
    """
    data - full dataset
    variables - variables allowed to be used/dropped from models
    population_size - size of the population
    model_names - models to be used for evaluating each solution
    root_dir - directory for the report files (best individual from each
               generation, RAM usage)
    max_cost - maximum allowed phenotyping cost
    scenario - modelling scenario, 'simple_ml' or 'process_ml'
    """
    random.seed()
    # only keep growing-season data (April to October)
    for key in data:
        data[key] = [x for x in data[key]
                     if x[formats.DATE].month > 3
                     and x[formats.DATE].month < 11]
    self._data = data
    self._variables = variables
    # drop entries with missing data points
    ml_data = []
    desired_vars = copy.deepcopy(self._variables)
    if scenario == 'simple_ml':
        desired_vars.append(formats.DW_PLANT)
    elif scenario == 'process_ml':
        pass
    else:
        raise Exception("Unknown scenario %s" % scenario)
    for entry in self._data['ml_data']:
        if all(entry[key] is not None for key in desired_vars):
            ml_data.append(entry)
    self._data['ml_data'] = ml_data
    self._population = []
    self._population_size = population_size
    self._model_names = model_names
    self._max_cost = max_cost
    self._max_cost_str = "{:.2f}".format(self._max_cost)
    self._stop_file = os.path.join(os.environ['HOME'], '.ga_signals',
                                   self._max_cost_str)
    self._best_models = []  # to be populated on each generation
    # model parameters
    self._keep_rate = 0.2
    self._cross_rate = 0.6
    self._immigration_rate = 0.2
    self._mutation_rate = 0.2
    # rank-based selection probabilities, hand-tuned per population size
    if self._population_size == 10:
        self._probabilities = cdf(0.2, 1.2, population_size)
    elif self._population_size == 25:
        self._probabilities = cdf(0.2, 1.2485, population_size)
    elif self._population_size == 50:
        self._probabilities = cdf(0.2, 1.249995, population_size)
    else:
        raise ValueError("No selection probabilities tuned for "
                         "population size %d" % self._population_size)
    # set up the report files
    self._root_dir = root_dir
    self._report_file = os.path.join(root_dir, 'report.csv')
    self._report_header = ['generation', 'pheno_cost']
    self._report_header += model_names
    self._report_header.append('total')
    self._ram_file = os.path.join(root_dir, 'ram.csv')
    CSVFileWriter(self._ram_file, [], ['date', 'ram'])
    CSVFileWriter(self._report_file, [], self._report_header)
    logging.info("Calculating max RMSEs")
    self._max_rmse = self._get_max_rmse()
    self.run_model()
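
# cdf(first, ratio, n) is defined elsewhere; its hand-tuned second argument
# (1.2, 1.2485, 1.249995 for n = 10, 25, 50) is consistent with a geometric
# rank-probability scheme in which the best individual gets probability
# `first` and each subsequent rank shrinks by a factor 1/ratio, with ratio
# tuned so the n probabilities sum to ~1. A hedged reconstruction, for
# illustration only - the project's actual cdf() may differ.
def cdf_sketch(first, ratio, n):
    """Cumulative rank-selection probabilities, best individual first."""
    probs = [first / ratio ** i for i in range(n)]
    cumulative, total = [], 0.0
    for p in probs:
        total += p
        cumulative.append(total)
    return cumulative  # cumulative[-1] ~ 1.0 for the tuned ratios above


# e.g. cdf_sketch(0.2, 1.2, 10)[-1] -> ~1.006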