Ejemplo n.º 1
0
    def test_make_results_for_models(self, model_dir, name):
        """Regenerate the plots and printed summary for one stored model.

        Args:
            model_dir: Directory holding ``trained_model.pkl`` and
                ``cv_results.pkl``; it is also where the plots are saved.
            name: Label used in the printed header and passed to the plots.
        """
        print(f'Model: {name}')
        trained = cache.load_obj(f'{model_dir}/trained_model.pkl')
        plot_logloss_and_error(trained, name=name, save_dir=model_dir)

        features, targets, feature_names = load_data()
        cv_results = cache.load_obj(f'{model_dir}/cv_results.pkl')
        plot_fi(trained, feature_names, scale_name, sort=True,
                save_dir=model_dir)
        plot_cross_validation(cv_results, name=name, save_dir=model_dir)
        summary = results_to_print(cv_results)
        print(summary)

        _test_prediction(trained, features, targets)

        # Reaching this point without an exception is the actual check.
        self.assertTrue(True)
def extract_features(csv_object):
    """Extract per-entry features of a CsvData object into an Excel file.

    Args:
        csv_object: Expected to be a ``CsvData`` instance; anything else is
            rejected.

    Returns:
        True when the features were already extracted or have just been
        written to ``csv_object.features_data_path``; False when
        ``csv_object`` is not a ``CsvData`` or the cached data contain
        something other than ``DataEntry`` items.
    """
    started_at = datetime.now()

    if not isinstance(csv_object, CsvData):
        return False
    if csv_object.features_extracted:
        return True

    rows = []
    timestamps = []
    for entry in load_obj(csv_object.cached_data_path):
        if not isinstance(entry, DataEntry):
            logger.error(
                f'Preprocessed data for {csv_object} on path {csv_object.cached_data_path} are broken!'
            )
            return False
        # The first entry without accelerometer data ends the extraction.
        if not entry.accelerometer:
            break
        row = entry.get_features()
        if csv_object.training_data:
            row['SLEEP'] = entry.sleep
        rows.append(row)
        timestamps.append(entry.time)

    frame = DataFrame(rows, index=timestamps)
    frame.to_excel(csv_object.features_data_path)
    csv_object.features_extracted = True
    csv_object.save()

    elapsed = datetime.now() - started_at
    logger.info(
        f'Features for {csv_object.filename} extracted in {elapsed}')
    return True
Ejemplo n.º 3
0
def create_structure():
    """Pair every sleep diary day with the CSV data file that covers it.

    For each subject with sleep diary days, the subject's CSV data files are
    looked up. When there is exactly one file it is used directly; otherwise
    the cached prediction of each file is sliced by the diary day's
    ``t1``..``t4`` interval and the first file with a non-empty slice wins.
    Days without a matching file are skipped.

    Returns:
        A list of ``(subject, matching_data, sleep_day)`` tuples.
    """
    structure = []
    for subject in Subject.objects.all():
        sleep_days = SleepDiaryDay.objects.filter(subject=subject)
        if not sleep_days.exists():
            continue

        # The CSV files depend only on the subject, so they are queried once
        # per subject instead of once per diary day.
        data = CsvData.objects.filter(subject=subject)
        if not data.exists():  # no CSV data
            logger.warning(
                f'Missing csv data for subject {subject} with {len(sleep_days)} sleep diary days'
            )
            continue

        for sleep_day in sleep_days:
            assert isinstance(sleep_day, SleepDiaryDay)

            # Reset for every day so that a match found for an earlier day
            # (or subject) can never leak into this one.
            matching_data = None
            if len(data) == 1:  # single CSV data file
                matching_data = data.first()
            else:  # data need to be found
                s = sleep_day.t1
                e = sleep_day.t4
                for d in data:
                    assert isinstance(d, CsvData)
                    pred = cache.load_obj(d.cached_prediction_path)
                    if len(pred[s:e]) > 0:  # matching data found
                        matching_data = d
                        break

            if matching_data is None:
                continue

            structure.append((subject, matching_data, sleep_day))
            # Report the file that was actually matched, not just the first one.
            logger.debug(
                f'{subject.code} - {matching_data.filename} - {sleep_day.date} added to validation structure '
            )
    return structure
Ejemplo n.º 4
0
 def test_save(self):
     """Export the trained booster and verify a re-imported model predicts.

     The model read through ``cache.load_obj`` is exported with XGBoost's
     ``save_model`` when it is an ``XGBClassifier``; a fresh classifier is
     then loaded from that exported file and run through
     ``_test_prediction``.
     """
     model = cache.load_obj(TRAINED_MODEL_PATH)
     x, y, _names = load_data()
     if isinstance(model, xgb.sklearn.XGBClassifier):
         model._Booster.save_model(TRAINED_MODEL_EXPORT_PATH)
         _test_prediction(model, x, y)

     # Load the file written by save_model above. TRAINED_MODEL_PATH is the
     # object read through cache.load_obj, so loading it here would not
     # exercise the export at all.
     import_model = xgb.sklearn.XGBClassifier()
     import_model.load_model(TRAINED_MODEL_EXPORT_PATH)
     _test_prediction(import_model, x, y)
Ejemplo n.º 5
0
 def excel_prediction_url(self):
     """Return the storage URL of the Excel prediction file.

     When the Excel file is missing but the cached prediction exists, the
     Excel file is generated from the cache first.

     Returns:
         The URL produced by ``self.data.storage.url`` for
         ``self.excel_prediction_path``, or an empty string when neither the
         Excel file nor the cached prediction exists.
     """
     if not path.exists(self.excel_prediction_path):
         if not path.exists(self.cached_prediction_path):
             return ''
         df = cache.load_obj(self.cached_prediction_path)
         df.to_excel(self.excel_prediction_path)
     # Also reached right after the file has been generated, so the first
     # request gets the URL instead of an implicit None.
     return self.data.storage.url(self.excel_prediction_path)
Ejemplo n.º 6
0
 def test_model_on_unknown_data(self, data_path):
     """Check that the model predicts both classes on a cached data frame.

     Args:
         data_path: Path of the object loaded through ``cache.load_obj``;
             every column except ``scale_name`` is fed to the model.
     """
     frame = cache.load_obj(data_path)
     feature_columns = [col for col in frame.columns if col != scale_name]
     predictions = self.model.predict(frame[feature_columns].values)

     total = len(predictions)
     sleep_count = sum(1 for label in predictions if label == 1)
     wake_count = sum(1 for label in predictions if label == 0)
     r_sleep = sleep_count / total
     r_wake = wake_count / total

     print("Sleep: %.2f%%" % (r_sleep * 100.0))
     print("Wake: %.2f%%" % (r_wake * 100.0))

     # Both ratios must lie strictly between 0 and 1.
     self.assertTrue(0 < r_sleep < 1)
     self.assertTrue(0 < r_wake < 1)
Ejemplo n.º 7
0
def predict(csv_data, force=False):
    """Return the prediction data frame for a CsvData object.

    Args:
        csv_data: Expected to be a ``CsvData`` instance.
        force: When true, an existing cached prediction is ignored and the
            whole pipeline is run again.

    Returns:
        The data frame with the ``prediction_name`` column, either loaded
        from the cache or freshly computed; ``None`` when ``csv_data`` is not
        a ``CsvData`` or when preprocessing / feature extraction fails.
    """
    if not isinstance(csv_data, CsvData):
        return None

    started_at = datetime.now()

    if os.path.exists(csv_data.cached_prediction_path) and not force:
        logger.info(
            f'Prediction features data for {csv_data.filename} will be loaded from cache'
        )
        return cache.load_obj(csv_data.cached_prediction_path)

    logger.info(f'Data {csv_data.filename} need to be preprocessed')
    if not preprocess_data(csv_data):
        logger.warning(
            f'Data {csv_data.filename} cannot be preprocessed')
        return None

    logger.info(
        f'Features for {csv_data.filename} need to be extracted')
    if not extract_features(csv_data):
        logger.warning(
            f'Features cannot be extracted for {csv_data.filename}')
        return None

    frame = pd.read_excel(csv_data.features_data_path, index_col=0)
    logger.info(f'Prediction need to be done for {csv_data.filename}')
    frame[prediction_name] = _predict(frame)

    # Persist the result both as a cached object and as an Excel sheet.
    cache.save_obj(frame, csv_data.cached_prediction_path)
    frame.to_excel(csv_data.excel_prediction_path)
    csv_data.prediction_cached = True
    csv_data.save()

    elapsed = datetime.now() - started_at
    logger.info(
        f'Prediction for {csv_data.filename} made in {elapsed}')
    return frame
Ejemplo n.º 8
0
 def setUpClass(cls):
     """Load the trained model once and store it on the class as ``model``."""
     trained_model = cache.load_obj(TRAINED_MODEL_PATH)
     cls.model = trained_model
Ejemplo n.º 9
0
def hilev():
    """Compute and store high-level sleep metrics for every validation night.

    For each ``(subject, data, day)`` triple from ``create_structure`` the
    cached prediction frame is loaded, restricted to the diary interval
    widened by 30 minutes on both sides, and smoothed with a 300-second
    rolling sum. Sleep onset/end, TST, WASO, SE, SF and SOL are derived from
    the smoothed series and saved on a ``SleepNight`` (an existing one is
    reused when present). The per-night prediction is also written to Excel.

    Returns:
        True when every triple was processed; False when at least one triple
        was skipped (invalid input, missing cache, or no sleep detected).
    """
    structure = create_structure()
    prague = pytz.timezone("Europe/Prague")
    res = True
    for subject, data, day in structure:
        # Skip unless ``data`` is a CsvData whose cached prediction exists.
        # The whole conjunction must be negated: otherwise a CsvData without
        # a cache file slips through to load_obj below.
        if not (isinstance(data, CsvData)
                and path.exists(data.cached_prediction_path)):
            res = False
            continue
        if not isinstance(day, SleepDiaryDay):
            res = False
            continue

        df = cache.load_obj(data.cached_prediction_path)
        if not isinstance(df, DataFrame):
            res = False
            continue

        # Reuse the stored night for this day/data/subject or start a new one.
        nights = SleepNight.objects.filter(diary_day=day).filter(
            data=data).filter(subject=subject)
        if not nights.exists():
            night = SleepNight()
            night.diary_day = day
            night.data = data
            night.subject = subject
        else:
            night = nights.first()

        # Diary interval widened by 30 minutes on each side.
        s = day.t1 - timedelta(minutes=30)
        e = day.t4 + timedelta(minutes=30)
        interval = df.loc[s:e, [prediction_name]]

        # Rolling sum of the prediction column over a 300-second window.
        rolling_10 = interval.rolling('300s').sum()
        # Strict labelling: 'S' only where the window sum exceeds 5.
        rolling_10['strict'] = numpy.where(rolling_10[prediction_name] <= 5,
                                           'W', 'S')
        sleep = rolling_10.index[rolling_10['strict'] == 'S'].tolist()
        if not sleep:
            logger.warning(
                f'No sleep found for {night.subject.code} {night.diary_day.date} {night.data.filename}'
            )
            res = False
            continue

        # First and last strictly-sleeping timestamps delimit the night.
        night.sleep_onset = prague.localize(sleep[0])
        night.sleep_end = prague.localize(sleep[-1])

        # Looser labelling used for the metrics: 'S' where the sum exceeds 2.
        rolling_10[hilev_prediction] = numpy.where(
            rolling_10[prediction_name] <= 2, 'W', 'S')
        pred = rolling_10.loc[sleep[0]:sleep[-1], [hilev_prediction]]
        if not isinstance(pred, DataFrame):
            res = False
            continue
        pred.to_excel(night.name)
        wake = pred.index[pred[hilev_prediction] == 'W'].tolist()

        # NOTE(review): if onset and end coincide, tst is 0 and the divisions
        # below raise ZeroDivisionError - confirm whether that can happen.
        night.tst = (night.sleep_end - night.sleep_onset).seconds
        # NOTE(review): the factor 30 presumably is the sample period in
        # seconds - confirm against the data.
        night.waso = len(wake) * 30
        night.se = ((night.tst - night.waso) / night.tst) * 100

        # A 1 -> 0 step in the numeric labels marks one sleep-to-wake change.
        pred["number_prediction"] = numpy.where(pred[hilev_prediction] == 'S',
                                                1, 0)
        wakes_counts = (pred["number_prediction"].diff() == -1).sum()
        night.sf = wakes_counts / (night.convert(night.tst).seconds / 3600)

        # Latency is clamped to zero when sleep starts before diary time t1.
        onset_latency = sleep[0] - day.t1 if sleep[0] > day.t1 else timedelta(
            seconds=0)
        night.sol = onset_latency.seconds
        logger.info(night)
        night.save()
    return res
Ejemplo n.º 10
0
def _log_classification_scores(title, y_true, y_pred):
    """Log ``title`` followed by accuracy, F1 and the confusion matrix."""
    logger.info(title)
    logger.info(
        f'ACC: {accuracy_score(y_true, y_pred):.2f} | F1: {f1_score(y_true, y_pred):.2f}'
    )
    logger.info('Confusion matrix: ')
    logger.info(confusion_matrix(y_true, y_pred))


def learn():
    """Train the XGBoost classifier, save it and log its scores.

    The data are split 60/40 into train and test parts. Missing values in
    the training part are imputed with ``KNNImputer`` and the classes are
    balanced with SMOTE. The model is loaded from ``MODEL_PATH`` when that
    file exists; otherwise it is built from stored or freshly searched
    hyper-parameters and cross-validated. The model is then trained, saved
    to ``TRAINED_MODEL_PATH``, plotted and scored on the test part and on
    the whole data set.

    Returns:
        The trained model.
    """
    logger.info('Load the data')
    x, y, names = load_data()
    y = y.reshape((len(y), ))

    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.4,
                                                        random_state=17)
    y_train = y_train.reshape((len(y_train), ))

    logger.info('Original train data:')
    log_data_info(y_train)

    # Fill in missing (NaN) values from the 4 nearest neighbours
    imputer = KNNImputer(n_neighbors=4, weights="uniform")
    x_train = imputer.fit_transform(x_train)

    # Add synthetic values to balance dataset.
    # fit_resample is the supported name; fit_sample was a deprecated alias
    # that newer imbalanced-learn releases no longer provide.
    sm = SMOTE(random_state=27)
    x_train, y_train = sm.fit_resample(x_train, y_train)

    logger.info('Data after SMOTE synthesis:')
    log_data_info(y_train)

    if os.path.exists(MODEL_PATH):
        logger.info('Load model')
        model = load_obj(MODEL_PATH)
    else:
        if os.path.exists(HYPER_PARAMS_PATH):
            params = load_obj(HYPER_PARAMS_PATH)
        else:
            logger.info('Hyper-parameters tuning')
            params = _search_best_hyper_parameters(x_train, y_train)

        logger.info('Cross-validation of params')
        y_train = y_train.ravel()
        model = xgb.sklearn.XGBClassifier(**params)
        cv_results = evaluate_cross_validation(model=model,
                                               x_train=x_train,
                                               y_train=y_train,
                                               save_path=CV_RESULTS_PATH)
        logger.info(results_to_print(cv_results))

        plot_cross_validation(cv_results, 'Model binary:logistic')

    # Training runs for a loaded model as well as for a freshly built one.
    _train_model(model, x_test, x_train, y_test, y_train)

    save_obj(model, TRAINED_MODEL_PATH)

    # Plot the feature importances
    plot_fi(model, names, scale_name, sort=True, save_dir=ML_DIR)
    plot_logloss_and_error(model, model_name)

    _log_classification_scores('After training results on test data: ',
                               y_test, model.predict(x_test))
    _log_classification_scores('After training results on whole dataset: ',
                               y, model.predict(x))

    return model
Ejemplo n.º 11
0
def _predict(df):
    """Run the stored trained model on every column of ``df`` except ``scale_name``.

    Args:
        df: Data frame whose columns (minus ``scale_name``) are the model input.

    Returns:
        Whatever the loaded model's ``predict`` returns for those values.
    """
    feature_columns = [column for column in df.columns if column != scale_name]
    feature_values = df[feature_columns].values
    trained_model = cache.load_obj(TRAINED_MODEL_PATH)
    return trained_model.predict(feature_values)