Example #1
def is_update_time():
    global last_update
    now_time = int(time.time())
    if (now_time - last_update) >= 60:
        logger.debug("update record time comes!")
        last_update = now_time
        return True
    return False
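The helper above assumes a module-level logger and a last_update timestamp that are not part of the snippet. A minimal sketch of that surrounding setup (a hedged assumption, not the original module) could look like this:

import time
import logging

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

# Module-level timestamp that is_update_time() mutates via `global`;
# starting at 0 makes the first call return True immediately.
last_update = 0

# Typical polling usage (do_update is hypothetical):
#     while True:
#         if is_update_time():
#             do_update()
#         time.sleep(1)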
Example #2
def is_mail_time(mail_flag=[True]):
    now_date = datetime.datetime.utcnow()
    if mail_flag[0] and now_date.hour == 2 and now_date.minute == 0:
        logger.debug("is mail time!")
        mail_flag[0] = False
        return True
    elif now_date.hour != 2 or now_date.minute != 0:
        mail_flag[0] = True

    return False
Example #3
def is_submit_time(submit_flag=[True]):
    now_date = datetime.datetime.now()
    if submit_flag[0] and now_date.hour == 0 and now_date.minute == 0:
        logger.debug("is submit time!")
        submit_flag[0] = False
        return True
    elif now_date.hour != 0 or now_date.minute != 0:
        submit_flag[0] = True

    return False
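Examples #2 and #3 rely on the same trick: the mutable default list (mail_flag=[True] / submit_flag=[True]) is created once at function definition time, so it persists across calls and acts as a one-shot latch that fires only once per matching minute and re-arms once the clock moves past it. A minimal, self-contained illustration of that behaviour:

def one_shot(flag=[True]):
    # The default list is evaluated only once, so mutations survive between calls.
    if flag[0]:
        flag[0] = False
        return True
    return False

print(one_shot())  # True  -- first call consumes the latch
print(one_shot())  # False -- stays consumed until flag[0] is reset elsewhere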
Example #4
def main():

    predict_start_date = datetime.datetime.strptime(PREDICT_START_DATE,
                                                    '%Y-%m-%d')
    predict_lag_start_date = predict_start_date - \
        datetime.timedelta(days=max(LAG_LENGTH, SLIDING_WINDOW_LENGTH))

    # raw data
    logger.debug('fetch data')
    df = fetch_data()

    # fill missing values
    df = filling_missing_value(df)

    # lag features
    df = lagging(df, lag_len=LAG_LENGTH, lag_cols=LAG_COLS)
    # sliding features
    df = sliding_window(df,
                        sliding_window_len=SLIDING_WINDOW_LENGTH,
                        sliding_cols=SLIDING_COLS)

    logger.debug('sort and clean')
    # sort and clean
    df = df.dropna()
    df = df.sort_values(by=['storeId', 'dateTime'])
    df = df.drop_duplicates(subset=['storeId', 'dateTime'], keep='first')
    df = df.reset_index(drop=True)

    # record intermediate file
    df.to_csv(PREDICT_DATASET_DIR + 'predict_dataset.csv', index=False)

    # load pre-trained models
    model_list = []
    for i in range(PREDICT_LENGTH):
        model_list.append(
            pickle.load(
                open(PREDICT_MODEL_DIR + 'best_model_' + str(i) + '.pkl',
                     'rb')))

    # predict
    store_list = future_frame.storeId.unique()
    for i in range(PREDICT_LENGTH):
        features = get_feature(df)
        label = model_list[i].predict(features)[0]
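The loading loop above expects one pickled model per forecast step under PREDICT_MODEL_DIR, named best_model_<i>.pkl. The training side is not shown here; a hedged sketch of how such files could be written (the models argument is a hypothetical list of fitted estimators) is:

import pickle

def save_models(models, model_dir, predict_length):
    # Persist one fitted model per forecast step, matching the
    # best_model_<i>.pkl naming that the prediction loop reads back.
    for i in range(predict_length):
        with open(model_dir + 'best_model_' + str(i) + '.pkl', 'wb') as f:
            pickle.dump(models[i], f)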
Example #5
    def check(self):
        if not self._file_path or not self._log_name:
            logger.error("File path %s of %s was not initialed correctlly!"
                         % (self._file_path, self._log_name))
            return -1
        logger.debug("start parse %s in %s" %
                     (self._file_path, self._log_name))
        if not os.path.exists(self._file_path):
            logger.error("no exists file %s" % self._file_path)
            return -2

        try:
            file_stat = os.stat(self._file_path)
            if not stat.S_ISREG(file_stat.st_mode):
                logger.error("%s is not regular file" % self._file_path)
                return -2

            logger.debug("last inode:%d, last pos:%d" %
                         (self._inode, self._last_pos))
            if self._inode <= 0:
                self._inode = file_stat.st_ino
                self._last_pos = 0
                if self._fp:
                    self._fp.close()
                    self._fp = None
            elif self._inode != file_stat.st_ino:
                logger.info("File(%s)'s inode has been changed from %d to %d!"
                            % (self._file_path, self._inode, file_stat.st_ino))
                self._inode = file_stat.st_ino
                # Here we could check whether the system archived the log or someone
                # removed it, and react accordingly; for now we simply assume it was
                # system archiving.
                self._last_pos = 0
                if self._fp:
                    self._fp.close()
                    self._fp = None
            if self._last_pos > file_stat.st_size:
                logger.info("File(%s)'s size has been changed from %d to %d!"
                            % (self._file_path, self._last_pos, file_stat.st_size))
                # The system may have archived the log or someone truncated it,
                # so handle it the same way as above.
                self._last_pos = 0
            elif self._last_pos < file_stat.st_size:
                # the normal case: new content was appended
                logger.debug("File(%s) size increased from %d to %d"
                             % (self._file_path, self._last_pos, file_stat.st_size))
                self.__read_content()

            return 0

        except Exception as e:
            logger.error("Exception occurred while checking file %s! (%s)" %
                         (self._file_path, e))
            return -3
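check() detects log rotation by comparing the stored inode with the file's current inode, and truncation by comparing the stored read offset with the current size, then re-reads any newly appended content. A condensed, standalone illustration of the same detection logic (function and variable names are hypothetical):

import os

def detect_rotation(path, last_inode, last_pos):
    """Return (rotated, truncated) for a log file that is being tailed."""
    st = os.stat(path)
    rotated = last_inode > 0 and st.st_ino != last_inode   # file was replaced
    truncated = not rotated and last_pos > st.st_size      # file was cut short
    return rotated, truncated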
Example #6
def main():

    # Validation period: use the most recent PREDICT_LENGTH days before the prediction start to tune models.  [VALID_START_DATE, VALID_END_DATE)
    VALID_END_DATE = datetime.datetime.strptime(PREDICT_START_DATE, '%Y-%m-%d')
    VALID_START_DATE = VALID_END_DATE - datetime.timedelta(days=PREDICT_LENGTH)

    logger.debug('Get data')
    # fetch raw data
    df = fetch_data()
    df.to_csv(PREDICT_DATASET_DIR + 'raw_data.csv', index=False)

    logger.debug('Filling missing values, lag, sliding window')

    # fill missing values
    df = filling_missing_value(df)

    # lag features
    df = lagging(df, lag_len=LAG_LENGTH, lag_cols=LAG_COLS)
    # sliding features
    df = sliding_window(df,
                        sliding_window_len=SLIDING_WINDOW_LENGTH,
                        sliding_cols=SLIDING_COLS)

    logger.debug('filter peak periods')
    # filter peak periods
    df = filter_peak_period(df, PREDICT_START_DATE)

    logger.debug('sort and clean')
    # sort and clean
    df = df.dropna()
    df = df.sort_values(by=['storeId', 'dateTime'])
    df = df.drop_duplicates(subset=['storeId', 'dateTime'], keep='first')
    df = df.reset_index(drop=True)

    # record intermediate file
    df.to_csv(PREDICT_DATASET_DIR + 'datasets.csv', index=False)

    logger.debug('gen train datasets')
    # gen train datasets
    train_frame_dict = gen_datasets(
        df,
        PREDICT_DATASET_DIR,
        PREDICT_LENGTH,
        changing_cols=[])

    logger.debug('train valid split')
    # train valid split
    train_valid_dict = train_valid_split(
        train_frame_dict,
        VALID_START_DATE,
        PREDICT_TMP_DIR,
        PREDICT_LENGTH)

    logger.debug('grid search training')

    # grid search
    grid_search(
        VALID_START_DATE,
        PREDICT_LENGTH,
        PREDICT_TMP_DIR,
        PREDICT_MODEL_DIR,
        train_valid_dict,
        model_list=MODEL_LIST,
        verbose=2
    )
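train_valid_split() itself is not shown in this example; assuming it simply splits each per-step frame on dateTime at VALID_START_DATE, a minimal sketch (the helper name and dict layout are assumptions) would be:

def split_by_date(frame_dict, valid_start_date):
    # Hypothetical helper: rows before valid_start_date form the training set,
    # the rest the validation set, one (train, valid) pair per forecast step.
    out = {}
    for step, frame in frame_dict.items():
        train = frame[frame['dateTime'] < valid_start_date]
        valid = frame[frame['dateTime'] >= valid_start_date]
        out[step] = (train, valid)
    return out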