Example #1
import logging

# DataGenerator and write_XML are assumed to be provided by the surrounding
# module; they are not part of the standard library.

def generate_data(params, saveFileDescription=True):
    # Fetch contacts within the requested interval and distance bounds
    contacts = params.contacts_reader.get_contacts(params.interval,
                                                   mindist=params.mindist,
                                                   maxdist=params.maxdist)
    # Never sample more contacts than are actually available
    sample_size = min(params.sample_size, len(contacts))
    logging.getLogger(__name__).info("Using sample size " + str(sample_size))
    contacts_sample = contacts.sample(n=sample_size)
    assert len(contacts_sample) == sample_size
    # Convert the sampled contacts into the output file format
    generator = DataGenerator()
    result = generator.contacts2file(contacts_sample, params)
    if saveFileDescription:
        # Write an XML description of the generated file next to it
        XML_report = generator.toXMLDict()
        write_XML(XML_report,
                  header=params.out_file,
                  fname=params.out_file + ".xml")
    return result
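
For context, here is a minimal sketch of how generate_data might be called. The params container and every field on it (contacts_reader, interval, mindist, maxdist, sample_size, out_file) are inferred from the attribute accesses above rather than taken from the project's actual API, so treat all names and values below as assumptions.

from types import SimpleNamespace

# reader and interval would come from the project's own data-access layer;
# they are placeholders here.
reader = ...    # assumed: object exposing get_contacts(interval, mindist=..., maxdist=...)
interval = ...  # assumed: genomic interval understood by the reader

params = SimpleNamespace(
    contacts_reader=reader,
    interval=interval,
    mindist=25000,             # assumed minimal contact distance
    maxdist=1500000,           # assumed maximal contact distance
    sample_size=100000,        # upper bound on the number of sampled contacts
    out_file="out/train.txt",  # the XML report goes to out_file + ".xml"
)

result = generate_data(params, saveFileDescription=True)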
Example #2
    # NOTE: relies on os, pickle, logging, numpy (as np) and xgboost being
    # imported at module level, and on write_XML from the surrounding project.
    def train(self,
              alg=xgboost.XGBRegressor(n_estimators=100),
              shortcut="model",
              apply_log=True,
              dump=True,
              out_dir="out/models/",
              weightsFunc=ones_like,
              classes_ratio=None,
              show_plot=True,
              *args,
              **kwargs):

        # Check that the data file and predictors have been set
        if not hasattr(self, "input_file") or not hasattr(self, "predictors"):
            raise Exception("Please read the data and set predictors first")

        # Save parameters so that the model name can be hashed
        self.predictors = sorted(self.predictors)
        self.alg = alg
        self.shortcut = shortcut
        self.apply_log = apply_log
        self.weightsFunc = weightsFunc
        self.classes_ratio = classes_ratio  # read back below as self.classes_ratio
        self.out_dir = out_dir

        # Remove validation data before dumping the instance, so the
        # pickled file does not get large
        try:
            del self.validation_file
            del self.validation_data
        except AttributeError:
            pass

        # First try to load an existing model dump
        self.representation = str(self)
        dump_path = os.path.join(out_dir, self.representation)
        if os.path.exists(dump_path):
            logging.getLogger(__name__).info("Found dump for model " + dump_path)
            with open(dump_path, "rb") as f:
                return pickle.load(f)
        else:
            # Read the data, optionally rebalance classes, and extract the target
            self.input_data = self.read_file(self.input_file)
            if self.classes_ratio is not None:
                self.input_data = self.equalize_classes(self.input_data)
            self.input_data.fillna(value=0, inplace=True)
            self.contacts = np.array(self.input_data["contact_count"].values)

            # Fit a new model, optionally on log-transformed contact counts
            if apply_log:
                self.contacts = np.log(self.contacts)
            logging.getLogger(__name__).info("Fitting model")
            alg.fit(self.input_data[self.predictors],
                    self.contacts,
                    sample_weight=self.weightsFunc(self.contacts,
                                                   self.input_data))
            self.trained_model = alg
            if dump:
                logging.getLogger(__name__).info("Saving to file " + dump_path)
                # Remove large variables before dumping the instance
                del self.contacts
                del self.input_data
                with open(dump_path, "wb") as f:
                    pickle.dump(self, f)
                write_XML(self.toXMLDict(), str(self), dump_path + ".xml")

        logging.getLogger(__name__).info("Done")
        return self
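
A minimal usage sketch for train follows. Predictor is a hypothetical name for the class owning this method; the sketch assumes that class provides read_file, equalize_classes and toXMLDict, and that input_file and predictors are set before training, as the check at the top of train requires. The file paths and predictor column names are illustrative only.

import xgboost

model = Predictor()                      # hypothetical name for the owning class
model.input_file = "input/contacts.txt"  # assumed path to the training table
model.predictors = ["CTCF_L", "CTCF_R", "distance"]  # assumed predictor columns

# str(model) is used as the dump file name, so a second call with the same
# parameters finds the pickle under out_dir and returns it without refitting.
trained = model.train(alg=xgboost.XGBRegressor(n_estimators=50),
                      apply_log=True,
                      dump=True,
                      out_dir="out/models/")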