Example 1
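Writes the reference set, score table and database out as CSV files, strips the unpicklable function references from a copy of the cost-model dicts before saving them, and appends a timestamped memory-usage row to memfile.csv, creating the file on first use.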
    def _report(self, ref_set, sb):
        self._to_csv(os.path.join(self._root_dir, 'ref_set.csv'), ref_set)
        CSVFileWriter(os.path.join(self._root_dir, 'score_table.csv'),
                      self._score_table)
        self._to_csv(os.path.join(self._root_dir, 'database.csv'),
                     self._database)

        # delete the function reference from the dict and save the info to file
        func = deepcopy(sb._cm_functions)
        for f in func:
            del f['function']

        CSVFileWriter(os.path.join(self._root_dir, 'cm_functions.csv'), func)

        # report memory usage
        mem_report = [{'date': datetime.now().strftime("%H:%M:%S %d/%m/%Y"),
                       'mem': memory()/float(1024**2),
                       'res_memory': res_memory(),
                       'total_memory': total_memory()}]

        memfile = os.path.join(self._root_dir, 'memfile.csv')
        if not os.path.exists(memfile):
            CSVFileWriter(memfile, mem_report)
        else:
            CSVFileWriter(memfile, mem_report, write_mode='a')
Example 2
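Per-generation report for a genetic algorithm: logs the best solution, appends its phenotyping cost and per-model mean RMSE to the report file, and writes one scenario CSV per individual plus a meta.csv row for every replicate.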
    def _report(self, population, generation):
        best = population[0]
        rep = "[%s]Generation %d\n" % (self._max_cost_str, generation)
        rep += "[%s]Best solution:\n" % self._max_cost_str
        rep += "[%s]Pheno cost: %f\n" % (self._max_cost_str,
                                         best._pheno_scenario.get_cost())

        rep += "[%s]%s" % (self._max_cost_str,
                           str([x.get_value() for x in population]))

        logging.info(rep)

        entry = {
            'generation': generation,
            'pheno_cost': best._pheno_scenario.get_cost()
        }

        for model_name in self._model_names:
            models = []
            for replicate in best.get_reps():
                models += [x for x in replicate
                           if x.get_method() == model_name]

            val = sum([x.get_rmse() for x in models]) / len(models)
            entry[model_name] = val

        entry['total'] = best.get_value()

        CSVFileWriter(self._report_file, [entry], self._report_header, 'a')

        meta = []
        for i, individual in enumerate(population):
            fname = os.path.join(self._root_dir, "%d.csv" % i)

            individual.get_pheno_scenario().to_csv(fname)
            for rep_index, rep in enumerate(individual.get_reps(), 1):
                meta_entry = {
                    'name': fname,
                    'value': individual.calc_value(),
                    'cost': individual.get_pheno_scenario().get_cost(),
                    'rep': rep_index
                }

                for model in rep:
                    meta_entry[model.get_method()] = model.get_rmse()

                meta.append(meta_entry)

        CSVFileWriter(os.path.join(self._root_dir, "meta.csv"), meta)
Example 3
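Caches the combined dataset to a CSV file in the user's home directory.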
    def _cache_save(self):
        import os
        from framework.util.csv_io.csv_filewriter import CSVFileWriter

        cache_fname = os.path.join(os.environ.get("HOME"),
                                   ".data_reader.cache")
        CSVFileWriter(cache_fname, self._combined_data)
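For reference, reading this cache back into memory might look like the sketch below. CSVFileReader and its get_content() method are used in Example 10; the csv_filereader module path here is an assumption mirroring the writer's.

    import os
    # assumed module path, mirroring framework.util.csv_io.csv_filewriter
    from framework.util.csv_io.csv_filereader import CSVFileReader

    cache_fname = os.path.join(os.environ.get("HOME"), ".data_reader.cache")
    cached = CSVFileReader(cache_fname).get_content()  # a list of row dicts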
Example 4
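Worker routine that trains a list of submodels, pickles each model into the shared cache directory, and appends the instruction metadata to a common meta file while holding a lock.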
    def _train_process(self, data, instructions, model_cache, pname,
                       meta_header, meta_location, lock, year):
        logging.info("%s starting to train %d submodels" %
                     (pname, len(instructions)))

        for i, instruction in enumerate(instructions, 1):
            instruction['year'] = year
            model = MLModel(data, instruction)
            _, fname = mkstemp(dir=model_cache)
            # write the pickle in binary mode to avoid stream corruption
            with open(fname, 'wb') as f:
                pickle.dump(model, f)
            logging.info("%s: %d out of %d" %
                         (pname, i, len(instructions)))

            instruction['location'] = fname

        logging.info("%s locking before writing meta data" % pname)
        lock.acquire()
        logging.info("%s writing data" % pname)

        CSVFileWriter(meta_location,
                      instructions,
                      custom_header=meta_header,
                      write_mode='a')

        logging.info("%s releasing lock" % pname)
        lock.release()
        logging.info("%s released lock" % pname)
        logging.info("%s finished" % pname)
Example 5
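Block bootstrap of per-plot phenotype measurements: the detrended daily means are cut into overlapping blocks, resampled, and the mean standard error for each block count is written to a single result CSV.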
    def run(self, phenos, block_length, bootstrap_n):
        plots = set([x['plot'] for x in self._data])
        result = []
        for plot in plots:
            print "Plot %d" % plot
            for pheno in phenos:
                print "Pheno %s" % pheno
                all_measurements = [
                    x for x in self._data
                    if x['plot'] == plot and x[pheno] is not None
                ]

                #take means per date
                means = self.means_per_date(all_measurements, pheno, plot)

                #detrend the data
                detrended = self.detrend_data(means, pheno)

                #create the bootstrap blocks of consecutive measurements
                blocks = []
                for i in range(len(detrended) - block_length + 1):
                    blocks.append(detrended[i:i + block_length])

                #run bootstraps with different number of blocks
                for n in range(5, len(detrended) - block_length + 1):
                    std_errs = []
                    #generate n number of bootstraps
                    for i in range(bootstrap_n):
                        bootstrap = []
                        for j in range(n):
                            bootstrap += random.choice(blocks)

                        std_errs.append(
                            stats.sem([x[pheno] for x in bootstrap]))

                    std_err = sum(std_errs) / len(std_errs)

                    result.append({
                        'plot': plot,
                        'pheno': pheno,
                        'n_blocks': n,
                        'standard_error': std_err,
                        formats.GENOTYPE: detrended[0][formats.GENOTYPE]
                    })

        CSVFileWriter(self._f_location, result)
Example 6
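Writes LAI/transmission data to a temporary CSV so the embedded R functions can fit the extinction coefficient k in Transmission ~ exp(-k * LAI), either per year or with both years pooled when the yearly estimates are close.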
    def calc_k(self, LAI_transmiss):
        temp_file = tempfile.mkstemp()[1]
        CSVFileWriter(temp_file, LAI_transmiss)

        robjects.r('''
			calc_k_sep_r <- function(fname, genotype){
				data <- read.csv(fname)
				data$Date <- as.Date(data$Date, "%Y-%m-%d %H:%M:%S")

				geno_sub <- subset(data, Genotype == genotype)
				years <- levels(factor(geno_sub$Year))
				year_subs <- lapply(years, function(year) subset(geno_sub, Year == year))
				k <- sapply(year_subs, function(year_sub)
								summary(nls(Transmission ~ exp(-k * LAI), 
										data = year_sub, 
										start = list(k = 1)))$parameters[1])

				return(data.frame(as.integer(years), k))
			}
			''')

        robjects.r('''
			calc_k_r <- function(fname, genotype){
				data <- read.csv(fname)
				data$Date <- as.Date(data$Date, "%Y-%m-%d %H:%M:%S")

				geno_sub <- subset(data, Genotype == genotype)
				k <- summary(nls(Transmission ~ exp(-k * LAI), 
										data = geno_sub, 
										start = list(k = 1)))$parameters[1]

				return(k)
			}
			''')

        calc_k_sep_r = robjects.r("calc_k_sep_r")
        calc_k_r = robjects.r("calc_k_r")

        genotypes = set([x['Genotype'] for x in LAI_transmiss])
        result = []
        for genotype in genotypes:
            df = calc_k_sep_r(temp_file, genotype)
            diff = math.fabs(df[1][0] - df[1][1])
            if diff > 0.3:
                for year, k in zip(df[0], df[1]):
                    result.append({'Genotype': genotype, 'Year': year, 'k': k})

            else:
                #bulk the two years together
                k = calc_k_r(temp_file, genotype)[0]
                result.append({'Genotype': genotype, 'Year': '', 'k': k})

        return result
Example 7
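Appends a timestamped RAM reading to a CSV file and reports whether memory usage has crossed the 20 GiB threshold.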
    def _check_memory(self):
        p = psutil.Process(os.getpid())
        # unique set size (USS); index 7 of memory_full_info() on Linux
        ram = p.memory_full_info().uss
        logging.info("RAM: %s" % self.bytes2human(ram))

        entry = {
            'date': datetime.now().strftime("%H:%M:%S %d/%m/%Y"),
            'ram': ram
        }
        CSVFileWriter(self._ram_file, [entry], ['date', 'ram'])

        # report whether usage has crossed 20 GiB (20 * 1024 ** 3 bytes)
        return ram >= 20 * 1024 ** 3
Example 8
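Runs the standard-error test for every plot/phenotype combination, drains the results from a queue and writes them to a single CSV.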
    def run_scenario(self, phenos):
        plots = set([x[formats.UID] / 10 for x in self._data])
        results = []
        queue = Queue()

        for plot in plots:
            print "plot - %d" % plot
            for pheno in phenos:
                print "%s" % pheno
                genotype = misc.assign_geno_bsbec(plot)
                self.test_se(plot, genotype, pheno, queue)

        while not queue.empty():
            results += queue.get()

        CSVFileWriter(self._f_location, results)
Example 9
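Writes the data to a temporary CSV, runs the external LER.R script over it, parses the coefficients from stdout and infers the model type (NLS or LM) from the returned keys.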
    def _calc_coefficients(self, data):
        f = tempfile.NamedTemporaryFile()
        CSVFileWriter(f.name, data)

        cmd = [os.path.join(os.environ['R_SCRIPTS'], 'LER.R'), '-i', f.name]
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
        out = proc.communicate()[0]

        result = dict()
        for entry in out.strip().split("\n"):
            content = entry.split(":")
            result[content[0]] = float(content[1])

        # determine type of model
        if set(self.NLS_KEYS) == set(result.keys()):
            self._type = "NLS"
        elif set(self.LM_KEYS) == set(result.keys()):
            self._type = "LM"
        else:
            raise Exception("Unkown model type")

        return result
Example 10
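Calculates per-genotype RUE: derives dry weights from the destructive harvest data, groups harvests by date, simulates PAR, and fits Yield ~ PAR (through the origin) in R on a temporary CSV written for each genotype.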
    def calc_RUE(self, LER_dict, k_dict, location, LAI):
        met_data = MetDataReaderCSV(location).get_met_data()
        fll_reader = FLLReader(location)
        genotypes = set([x['Genotype'] for x in LER_dict])

        destructive_phenos = CSVFileReader(
            location.get_destr_phenos()).get_content()
        for entry in destructive_phenos:
            entry['Date'] = datetime.strptime(entry['Date'],
                                              "%Y-%m-%d %H:%M:%S UTC")

            try:
                entry['fresh'] = float(
                    entry['Fresh weight above ground material(g)'])
                entry['fresh_sub'] = float(
                    entry['Fresh weight above ground  sub-sample(g)'])
                entry['dry_sub'] = float(
                    entry['dry weight above ground sub-sample(g)'])
            except ValueError:
                try:
                    entry['dry_weight'] = float(
                        entry['dry weight above ground sub-sample(g)'])
                except ValueError:
                    pass
                continue

            if entry['fresh_sub'] == 0.0:
                entry['dry_weight'] = entry['dry_sub']
                continue
            entry['dry_weight'] = entry['fresh'] * (entry['dry_sub'] /
                                                    entry['fresh_sub'])

        destructive_phenos = [
            x for x in destructive_phenos if 'dry_weight' in x
        ]

        #run the simulation per genotype
        RUE = []
        for genotype in genotypes:
            geno_sub = [
                x for x in destructive_phenos if x['Genotype'] == genotype
            ]
            dates = list(set([x['Date'] for x in geno_sub]))
            dates.sort()

            #create data point groups by dates that are close
            #to each other or the same
            groups = []
            group_id = 0
            for date in dates:
                for group in groups:
                    delta = group['Dates'][0] - date
                    days = math.fabs(delta.days)
                    if days and days < 20:
                        group['Dates'].append(date)
                        break
                else:
                    #create new group
                    group = {'id': group_id, 'Dates': [date]}
                    groups.append(group)
                    group_id += 1

            #get the mean dry weight per group
            mean_DW = []
            #add entry for fll day
            fll_date = fll_reader.get_genotype_fll(genotype)
            mean_DW.append({'Date': fll_date, 'Yield': 0.0})

            for group in groups:
                group_phenos = [
                    x for x in geno_sub if x['Date'] in group['Dates']
                ]
                total_dw = 0.0
                for entry in group_phenos:
                    total_dw += entry['dry_weight']

                total_dw /= float(len(group_phenos))

                #correct the group date to the first one in the group
                mean_DW.append({
                    'Date': sorted(group['Dates'])[0],
                    'Yield': total_dw
                })

            #obtain genotype specific coefficients
            LER = [x for x in LER_dict if x['Genotype'] == genotype]
            LER.sort(key=lambda x: x['stage'])
            k = [x for x in k_dict if x['Genotype'] == genotype]
            if len(k) > 1:
                k = next(x['k'] for x in k if x['Year'] == location.get_year())
            else:
                k = sorted(k, key=lambda x: x['Year'])[0]['k']

            #simulate PAR and record values for days of destructive harvests
            real_LAI = [x for x in LAI if x['Genotype'] == genotype]
            mean_DW = self.simulate_PAR(k, LER, met_data, fll_date, mean_DW,
                                        real_LAI)

            #finally work out what the RUE is from
            #the real DMY and simulated PAR values
            temp_file = tempfile.mkstemp()[1] + genotype.split("-")[0]
            CSVFileWriter(temp_file, mean_DW)
            robjects.r('''
				calc_RUE_r <- function(fname){
					data <- read.csv(fname)
					data$Yield <- data$Yield * 2
					fit <- lm(Yield ~ PAR + 0, data = data)
					return(summary(fit)$coefficients[1])
				}
				''')
            calc_RUE_r = robjects.r("calc_RUE_r")
            RUE_val = calc_RUE_r(temp_file)[0]
            RUE.append({'Genotype': genotype, 'RUE': RUE_val})

        return RUE
Example 11
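Persists the constructor's data to a temporary CSV file kept for the lifetime of the object.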
    def __init__(self, data):
        self._f = tempfile.NamedTemporaryFile()
        CSVFileWriter(self._f.name, data)
Example 12
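Builds the full grid of submodel training instructions (every method and variable combination), writes a header-only meta.csv, and trains the instructions in parallel processes for each year.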
    def _generate_submodels(self, model_cache, processes):
        """Generates R submodels"""
        init_vars = NaiveMLProcessModel._INIT_DATA
        instructions = []
        variables = NaiveMLProcessModel._STAGE_ONE +\
            NaiveMLProcessModel._STAGE_TWO

        for variable in variables:
            for method in Gene._METHOD_VALUES:
                other_vars = [x for x in NaiveMLProcessModel._STAGE_ONE if
                              x != variable]
                for i in range(len(other_vars) + 1):
                    choices = combinations(other_vars, i)

                    for choice in choices:
                        train_vars = init_vars + list(choice)
                        instruction = {'name': variable,
                                       'method': method,
                                       'variables': train_vars,
                                       'stage': -999}

                        instructions.append(instruction)

        process_instructions = self._process_split(instructions, processes)

        meta_header = ['name', 'method', 'variables',
                       'stage', 'location', 'year']

        meta_location = os.path.join(model_cache, 'meta.csv')
        CSVFileWriter(meta_location, [], custom_header=meta_header)
        lock = Lock()

        for year in self._years:
            logging.info("Starting year %s" % year)
            new_data = split_dataset(self._data, year)['ml_data']
            # each process_instruction is a list of instructions for that
            # process
            # 'workers' avoids shadowing the 'processes' count parameter
            workers = []
            for i, process_instruction in enumerate(process_instructions):
                process_name = "Process %d-%d" % (year, i)

                process = Process(target=self._train_process,
                                  name=process_name,
                                  args=(new_data,
                                        process_instruction,
                                        model_cache,
                                        process_name,
                                        meta_header,
                                        meta_location,
                                        lock, year))
                process.start()
                workers.append(process)

            for process in workers:
                process.join()

            logging.info("Finished year %s" % year)
Example 13
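Runs one generation of a genetic algorithm: scores the population, dumps every individual and a meta.csv, then assembles the next generation from the top individuals, crosses and immigrants, replacing any duplicates with newly generated individuals.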
    def _run_generation(self, population_size, population, process_count):
        if not population:
            # first generation - generate new individuals
            logging.info("Generating initial population")
            required_count = population_size - len(population)
            population = self._generate_individuals(required_count, [])

        logging.info("Calculating RMSE")
        population = self._calculate_rmse_mp(population, process_count)

        population.sort(key=lambda x: x.get_rmse())
        logging.info(population[0].get_rmse())
        logging.info([x.get_rmse() for x in population])

        logging.info("Saving population")
        # dump the whole population
        pop_meta = []
        for i, entry in enumerate(population, 1):
            CSVFileWriter(os.path.join(self._root_dir,
                                       "population",
                                       "%d.csv" % i),
                          entry.get_instructions())

            pop_meta.append({'name': '%d.csv' % i, 'rmse': entry.get_rmse()})

        CSVFileWriter(os.path.join(self._root_dir, "population", "meta.csv"),
                      pop_meta)

        # append best RMSE to RMSE_history
        self._RMSE_history.append(population[0].get_rmse())

        # keep top 20%
        keep = population[:int(self._keep_rate * len(population))]

        # cross between individuals
        for i in range(len(population)):
            population[i].set_probability(self._probabilities[i])

        logging.info("Performing crosses")
        crosses_count = int(self._cross_rate * len(population))
        crosses = self._perform_crosses(population, crosses_count, keep)

        logging.info("Generating immigrants")
        # bring some immigrants as well
        immigrant_count = int(self._immigration_rate * len(population))
        immigrants = self._generate_individuals(immigrant_count,
                                                keep + crosses)

        # merge populations
        all_population = keep + crosses + immigrants
        final_population = []

        # drop any duplicate individuals
        for entry in all_population:
            match = [x for x in final_population if x.same(entry)]
            if len(match) == 0:
                final_population.append(entry)
            else:
                logging.info("Duplicate individual found, removing...")

        # fill in gaps from dropped individuals with newly generated
        # individuals
        while len(final_population) < population_size:
            missing = population_size - len(final_population)
            logging.info("Generating %d individuals to fill in population" %
                         missing)
            individuals = self._generate_individuals(missing, final_population)
            final_population += individuals

        return final_population
Example 14
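Serialises every item in a container to CSV via its to_dict() representation.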
    def _to_csv(self, fname, container):
        data = [x.to_dict() for x in container.get_all()]
        CSVFileWriter(fname, data)
Example 15
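Constructor of the GA driver: restricts the data to the growing season, drops incomplete rows, configures the GA rates and selection probabilities, and initialises the report and RAM CSV files with their headers.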
    def __init__(self, data, variables, population_size, model_names, root_dir,
                 max_cost, scenario):
        """
        data - full dataset
        variables - variables allowed to be used/dropped from models
        population_size - size of the population
        model_names - models to be used for evaluating each solution
        report_file - file to report best individual from each generation
        """
        random.seed()

        # keep only growing-season measurements (April to October)
        for key in data:
            temp_data = [
                x for x in data[key]
                if x[formats.DATE].month > 3 and x[formats.DATE].month < 11
            ]
            data[key] = temp_data

        self._data = data
        self._variables = variables

        # fix missing data points
        ml_data = []

        desired_vars = copy.deepcopy(self._variables)
        if scenario == 'simple_ml':
            desired_vars.append(formats.DW_PLANT)
        elif scenario == 'process_ml':
            pass
        else:
            raise Exception("Scenario %s is magnifico" % scenario)

        for entry in self._data['ml_data']:
            add = True
            for key in desired_vars:
                if entry[key] is None:
                    add = False

            if add:
                ml_data.append(entry)

        self._data['ml_data'] = ml_data

        self._population = []
        self._population_size = population_size
        self._model_names = model_names
        self._max_cost = max_cost
        self._max_cost_str = "{:.2f}".format(self._max_cost)
        self._stop_file = os.path.join(os.environ['HOME'], '.ga_signals',
                                       self._max_cost_str)

        self._best_models = []  # to be populated on each generation

        # model parameters
        self._keep_rate = 0.2
        self._cross_rate = 0.6
        self._immigration_rate = 0.2
        self._mutation_rate = 0.2

        if self._population_size == 10:
            self._probabilities = cdf(0.2, 1.2, population_size)  # for 10
        elif self._population_size == 25:
            self._probabilities = cdf(0.2, 1.2485, population_size)
        elif self._population_size == 50:
            self._probabilities = cdf(0.2, 1.249995, population_size)  # for 50
        else:
            # unsupported population size - drop into the debugger
            import ipdb
            ipdb.set_trace()

        # set up report file
        self._root_dir = root_dir
        self._report_file = os.path.join(root_dir, 'report.csv')
        self._report_header = ['generation', 'pheno_cost']
        self._report_header += model_names
        self._report_header.append('total')

        self._ram_file = os.path.join(root_dir, 'ram.csv')
        CSVFileWriter(self._ram_file, [], ['date', 'ram'])

        CSVFileWriter(self._report_file, [], self._report_header)

        logging.info("Calculating max RMSEs")
        self._max_rmse = self._get_max_rmse()
        self.run_model()