def is_update_time():
    global last_update
    now_time = int(time.time())
    if (now_time - last_update) >= 60:
        logger.debug("update record time comes!")
        last_update = now_time
        return True
    return False
def is_mail_time(mail_flag=[True]):
    # mail_flag is a mutable default argument used on purpose: it persists across
    # calls, so this returns True only once per day, at 02:00 UTC.
    now_date = datetime.datetime.utcnow()
    if mail_flag[0] and now_date.hour == 2 and now_date.minute == 0:
        logger.debug("is mail time!")
        mail_flag[0] = False
        return True
    elif now_date.hour != 2 or now_date.minute != 0:
        mail_flag[0] = True
    return False
def is_submit_time(submit_flag=[True]):
    # Same persistent-flag trick as is_mail_time, but keyed on local midnight.
    now_date = datetime.datetime.now()
    if submit_flag[0] and now_date.hour == 0 and now_date.minute == 0:
        logger.debug("is submit time!")
        submit_flag[0] = False
        return True
    elif now_date.hour != 0 or now_date.minute != 0:
        submit_flag[0] = True
    return False
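# Hedged usage sketch (not part of the original module): the three helpers above
# read as predicates meant to be polled from a long-running loop, roughly as below.
# run_update, send_daily_mail and submit_records are hypothetical names standing in
# for whatever actions this scheduler actually triggers.
import time


def run_loop():
    while True:
        if is_update_time():
            run_update()          # hypothetical: refresh records, at most once a minute
        if is_mail_time():
            send_daily_mail()     # hypothetical: fires once per day at 02:00 UTC
        if is_submit_time():
            submit_records()      # hypothetical: fires once per day at local midnight
        time.sleep(1)             # poll faster than one minute so no window is missed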
def main():
    predict_start_date = datetime.datetime.strptime(PREDICT_START_DATE, '%Y-%m-%d')
    predict_lag_start_date = predict_start_date - \
        datetime.timedelta(days=max(LAG_LENGTH, SLIDING_WINDOW_LENGTH))

    # raw data
    logger.debug('fetch data')
    df = fetch_data()

    # filling missing value
    df = filling_missing_value(df)

    # lag features
    df = lagging(df, lag_len=LAG_LENGTH, lag_cols=LAG_COLS)

    # sliding features
    df = sliding_window(df,
                        sliding_window_len=SLIDING_WINDOW_LENGTH,
                        sliding_cols=SLIDING_COLS)

    logger.debug('sort and clean')
    # sort and clean
    df = df.dropna()
    df = df.sort_values(by=['storeId', 'dateTime'])
    df = df.drop_duplicates(subset=['storeId', 'dateTime'], keep='first')
    df = df.reset_index(drop=True)

    # record intermediate file
    df.to_csv(PREDICT_DATASET_DIR + 'predict_dataset.csv', index=False)

    # load pre-trained models, one per prediction step
    model_list = []
    for i in range(PREDICT_LENGTH):
        model_list.append(
            pickle.load(
                open(PREDICT_MODEL_DIR + 'best_model_' + str(i) + '.pkl', 'rb')))

    # predict; future_frame (the frame of future dates per store) is assumed to be
    # built elsewhere in this module
    store_list = future_frame.storeId.unique()
    for i in range(PREDICT_LENGTH):
        features = get_feature(df)
        label = model_list[i].predict(features)[0]
def check(self):
    if not self._file_path or not self._log_name:
        logger.error("File path %s of %s was not initialized correctly!"
                     % (self._file_path, self._log_name))
        return -1
    logger.debug("start parsing %s in %s" % (self._file_path, self._log_name))
    if not os.path.exists(self._file_path):
        logger.error("file %s does not exist" % self._file_path)
        return -2
    try:
        file_stat = os.stat(self._file_path)
        if not stat.S_ISREG(file_stat.st_mode):
            logger.error("%s is not a regular file" % self._file_path)
            return -2
        logger.debug("last inode:%d, last pos:%d" % (self._inode, self._last_pos))
        if self._inode <= 0:
            # first run: remember the inode and start reading from the beginning
            self._inode = file_stat.st_ino
            self._last_pos = 0
            if self._fp:
                self._fp.close()
                self._fp = None
        elif self._inode != file_stat.st_ino:
            logger.info("File(%s)'s inode has been changed from %d to %d!"
                        % (self._file_path, self._inode, file_stat.st_ino))
            self._inode = file_stat.st_ino
            # Here we could distinguish log rotation (archiving) from someone
            # removing the log and react accordingly; for now we simply treat it
            # as rotation and restart from the beginning.
            self._last_pos = 0
            if self._fp:
                self._fp.close()
                self._fp = None
        if self._last_pos > file_stat.st_size:
            logger.info("File(%s)'s size has been changed from %d to %d!"
                        % (self._file_path, self._last_pos, file_stat.st_size))
            # The file shrank: rotation happened or someone truncated the log,
            # so restart from the beginning, same as above.
            self._last_pos = 0
        elif self._last_pos < file_stat.st_size:
            # normal case: the file has grown, read the new content
            logger.debug("File(%s) size increased from %d to %d"
                         % (self._file_path, self._last_pos, file_stat.st_size))
            self.__read_content()
        return 0
    except Exception as e:
        logger.error("Exception occurred while checking file %s! (%s)"
                     % (self._file_path, e))
        return -3
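# Hedged usage sketch (assumption, not from the original source): check() looks like
# the per-iteration step of a tail-style log watcher, so a caller might poll it as
# below. The checker object and the watch() wrapper are hypothetical; only the return
# codes (0 ok, -1 bad init, -2 file problem, -3 exception) come from the method above.
import time


def watch(checker, interval=5):
    """Poll a checker object; stop only on an unrecoverable configuration error."""
    while True:
        ret = checker.check()
        if ret == -1:
            break              # misconfigured path/name; polling again will not help
        # -2 and -3 may be transient (file rotated away, temporary I/O error), so keep going
        time.sleep(interval)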
def main():
    # Validation period: use the most recent PREDICT_LENGTH days before the
    # prediction start date to tune the models. Interval: [VALID_START_DATE, VALID_END_DATE)
    VALID_END_DATE = datetime.datetime.strptime(PREDICT_START_DATE, '%Y-%m-%d')
    VALID_START_DATE = VALID_END_DATE - datetime.timedelta(days=PREDICT_LENGTH)

    logger.debug('Get data')
    # fetch raw data
    df = fetch_data()
    df.to_csv(PREDICT_DATASET_DIR + 'raw_data.csv', index=False)

    logger.debug('Filling missing value, lag, sliding window')
    # filling missing value
    df = filling_missing_value(df)
    # lag features
    df = lagging(df, lag_len=LAG_LENGTH, lag_cols=LAG_COLS)
    # sliding features
    df = sliding_window(df,
                        sliding_window_len=SLIDING_WINDOW_LENGTH,
                        sliding_cols=SLIDING_COLS)

    logger.debug('filter peak periods')
    # filter peak periods
    df = filter_peak_period(df, PREDICT_START_DATE)

    logger.debug('sort and clean')
    # sort and clean
    df = df.dropna()
    df = df.sort_values(by=['storeId', 'dateTime'])
    df = df.drop_duplicates(subset=['storeId', 'dateTime'], keep='first')
    df = df.reset_index(drop=True)

    # record intermediate file
    df.to_csv(PREDICT_DATASET_DIR + 'datasets.csv', index=False)

    logger.debug('gen train datasets')
    # generate train datasets
    train_frame_dict = gen_datasets(
        df, PREDICT_DATASET_DIR, PREDICT_LENGTH, changing_cols=[])

    logger.debug('train valid split')
    # train / validation split
    train_valid_dict = train_valid_split(
        train_frame_dict, VALID_START_DATE, PREDICT_TMP_DIR, PREDICT_LENGTH)

    logger.debug('grid search training')
    # grid search over the candidate models
    grid_search(
        VALID_START_DATE,
        PREDICT_LENGTH,
        PREDICT_TMP_DIR,
        PREDICT_MODEL_DIR,
        train_valid_dict,
        model_list=MODEL_LIST,
        verbose=2
    )