def generate_data(params, saveFileDescription=True):
    # Select contacts within the requested genomic interval and distance range
    contacts = params.contacts_reader.get_contacts(params.interval,
                                                   mindist=params.mindist,
                                                   maxdist=params.maxdist)
    sample_size = min(params.sample_size, len(contacts))
    logging.getLogger(__name__).info("Using sample size " + str(sample_size))

    # Draw a random sample of contacts and write it to the output file
    contacts_sample = contacts.sample(n=sample_size)
    assert len(contacts_sample) == sample_size
    generator = DataGenerator()
    result = generator.contacts2file(contacts_sample, params)

    # Optionally save an XML description of the generated file
    if saveFileDescription:
        XML_report = generator.toXMLDict()
        write_XML(XML_report, header=params.out_file, fname=params.out_file + ".xml")
    return result
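# Usage sketch (hypothetical): generate_data expects a params object exposing
# contacts_reader, interval, mindist, maxdist, sample_size and out_file.
# The constructor and method names below are illustrative assumptions, not
# necessarily the ones defined elsewhere in this repository.
#
#   params = Parameters()
#   params.contacts_reader = ContactsReader()
#   params.contacts_reader.read_files(["input/contacts.txt"])
#   params.interval = Interval("chr1", 0, 1000000)
#   params.mindist, params.maxdist = 50000, 1500000
#   params.sample_size = 500000
#   params.out_file = "out/training_data.txt"
#   generate_data(params, saveFileDescription=True)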
def train(self, alg=xgboost.XGBRegressor(n_estimators=100), shortcut="model",
          apply_log=True, dump=True, out_dir="out/models/", weightsFunc=ones_like,
          classes_ratio=None, show_plot=True, *args, **kwargs):
    # Check that the data file and predictors have been set
    try:
        self.input_file
        self.predictors
    except AttributeError:
        raise Exception("Please read the data and set predictors first")

    # Save parameters so the model name (hash) can be derived from them
    self.predictors = sorted(self.predictors)
    self.alg = alg
    self.shortcut = shortcut
    self.apply_log = apply_log
    self.weightsFunc = weightsFunc
    self.out_dir = out_dir
    self.classes_ratio = classes_ratio

    # Remove validation data since we are going to dump this instance and
    # do not want the file to be large
    try:
        del self.validation_file
        del self.validation_data
    except AttributeError:
        pass

    # First try to load an existing model dump
    self.representation = str(self)
    dump_path = os.path.join(out_dir, self.representation)
    if os.path.exists(dump_path):
        logging.getLogger(__name__).info("Found dump for model " + dump_path)
        with open(dump_path, "rb") as f:
            return pickle.load(f)
    else:
        # Read the training data
        self.input_data = self.read_file(self.input_file)
        if self.classes_ratio is not None:
            self.input_data = self.equalize_classes(self.input_data)
        self.input_data.fillna(value=0, inplace=True)
        self.contacts = np.array(self.input_data["contact_count"].values)

        # Fit a new model
        if apply_log:
            self.contacts = np.log(self.contacts)
        logging.getLogger(__name__).info("Fitting model")
        alg.fit(self.input_data[self.predictors], self.contacts,
                sample_weight=self.weightsFunc(self.contacts, self.input_data))
        self.trained_model = alg

        if dump:
            logging.getLogger(__name__).info("Saving to file " + dump_path)
            # Remove large variables before dumping the instance
            del self.contacts
            del self.input_data
            with open(dump_path, "wb") as f:
                pickle.dump(self, f)
            write_XML(self.toXMLDict(), str(self), dump_path + ".xml")
        logging.getLogger(__name__).info("Done")
        return self
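# Usage sketch (hypothetical): train is a method of the predictor class this
# module defines. The class name and the data-loading call below are assumptions
# for illustration only; the actual API may differ.
#
#   model = Predictor()
#   model.read_data_predictors("out/training_data.txt")  # assumed to set input_file and predictors
#   trained = model.train(alg=xgboost.XGBRegressor(n_estimators=100),
#                         shortcut="baseline", apply_log=True,
#                         dump=True, out_dir="out/models/")
#   # On a second call with identical parameters, the pickled dump found in
#   # out/models/ is loaded instead of refitting the model.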