Ejemplo n.º 1
0
def run():
    """Configure and start the Thrift process-pool server.

    Builds the handler/processor pair, binds a server socket on the port
    configured under ``THRIFT_PORT``, sizes the thread and worker pools from
    configuration, installs the signal handlers and then blocks in
    ``server.serve()``.
    """
    handler = EstimateTaskDurationsHandler()
    # handler.settimeout = ConfManage.getInt("THRIFT_TIMEOUT")
    processor = RProcessor(handler)
    # Read the port once so the socket and the start-up message agree.
    port = ConfManage.getInt("THRIFT_PORT")
    transport = TSocket.TServerSocket(port=port)
    tfactory = TTransport.TBufferedTransportFactory()
    pfactory = TBinaryProtocol.TBinaryProtocolFactory()
    # Start Server
    server = RTProcessPoolServer(processor, transport, tfactory, pfactory)
    print('Starting Thrift {} at port: {}'.format(server.__class__.__name__, port))
    server.setClient()
    server.setNumThreads(ConfManage.getInt("THRIFT_THREAD_COUNT"))
    server.setNumWorkers(ConfManage.getInt("PROCESS_NUM"))

    def clean_shutdown(signum, frame):
        """Signal handler: terminate every worker, then ask the server to stop."""
        for worker in server.workers:
            requirements_logger.info('Terminating worker: %s' % worker)
            worker.terminate()
        requirements_logger.info('Requesting server to stop()')
        try:
            server.stop()
        except Exception as err:
            # Stopping stays best-effort, but the failure is no longer
            # swallowed without a trace.
            requirements_logger.warning('server.stop() failed: %s' % err)

    def set_alarm():
        """Install ``clean_shutdown`` for SIGALRM and schedule the alarm."""
        signal.signal(signal.SIGALRM, clean_shutdown)
        # NOTE(review): this delivers SIGALRM 4 seconds from now, i.e. the
        # shutdown handler fires shortly after start-up -- confirm intended.
        signal.alarm(4)

    set_alarm()
    # Zombie child processes: ignore SIGCHLD so exited children are not
    # left waiting for the parent to collect them.
    signal.signal(signal.SIGCHLD, signal.SIG_IGN)
    server.serve()
Ejemplo n.º 2
0
 def __init__(self, client_class, host=None, port=None, timeout=None):
     """Build a Thrift client over a buffered socket transport.

     Args:
         client_class: Generated Thrift service module/class exposing
             ``Client``.
         host: Server host; falls back to the ``THRIFT_HOST`` setting.
         port: Server port; falls back to the ``THRIFT_PORT`` setting.
         timeout: Value handed to ``TSocket.setTimeout``; falls back to
             the ``THRIFT_TIMEOUT`` setting.
     """
     # Only consult the configuration for arguments the caller left out.
     if host is None:
         host = ConfManage.getString("THRIFT_HOST")
     if port is None:
         port = ConfManage.getInt("THRIFT_PORT")
     if timeout is None:
         timeout = ConfManage.getInt("THRIFT_TIMEOUT")

     sock = TSocket.TSocket(host, port)
     sock.setTimeout(timeout)
     self.transport = TTransport.TBufferedTransport(sock)
     # The client speaks the binary protocol over the buffered transport.
     self.client = client_class.Client(
         TBinaryProtocol.TBinaryProtocol(self.transport))
Ejemplo n.º 3
0
def get_run_time(args_date, shift_days=0, floored=True):
    """Return the run time as an Arrow object in the configured timezone.

    Args:
        args_date: Value parsed with ``arrow.get``; when ``None`` or empty
            the current time is used instead.
        shift_days: Number of days to shift the result by (0 = no shift).
        floored: When true, truncate the time to the start of its day
            before the shift is applied.

    Returns:
        The resulting Arrow instance.
    """
    timezone = ConfManage.getString("ARROW_TIMEZONE")
    if args_date is not None and len(args_date) > 0:
        # replace(tzinfo=...) re-labels the parsed value with the configured
        # timezone.
        run_time = arrow.get(args_date).replace(tzinfo=timezone)
    else:
        run_time = arrow.now(tz=timezone)
    if floored:
        # floor('day') already zeroes the hour, minute and second, so the
        # extra floor() calls the code used to chain were no-ops.
        run_time = run_time.floor('day')
    if shift_days != 0:
        run_time = run_time.shift(days=shift_days)
    return run_time
Ejemplo n.º 4
0
 def __init__(self,
              host=ConfManage.getString("HBASE_HOST"),
              port=ConfManage.getInt("HBASE_PORT")):
     """Remember the HBase endpoint and create the connection pool.

     The ``host``/``port`` defaults are read from configuration when the
     function is defined, not on each call.

     Args:
         host: HBase host handed to the pool.
         port: HBase port handed to the pool.
     """
     self.host = host
     self.port = port
     self.timezone = ConfManage.getString("ARROW_TIMEZONE")

     # Pool size and table prefix both come from configuration.
     pool_size = ConfManage.getInt("HBASE_CONN_SIZE")
     table_prefix = ConfManage.getString("HBASE_PREFIX")
     self.connPool = RConnectionPool(size=pool_size,
                                     host=self.host,
                                     port=self.port,
                                     table_prefix=table_prefix)
Ejemplo n.º 5
0
def preprocess(date, pickle, estimator, predict_target, holdout, mode,
               shift_days):
    """Load the pickled data set and run an estimator's preprocessing on it.

    The estimator class is resolved dynamically: module
    ``tools.eta.<estimator>_<predict_target>``, class
    ``<Estimator><Predict_target>`` (both parts capitalised).

    Args:
        date: Date passed to ``get_run_time``.
        pickle: Name passed to ``load_pickle``.
        estimator: Estimator name, first half of the module/class name.
        predict_target: Target name, second half of the module/class name.
        holdout: Forwarded to the estimator's ``preprocess``.
        mode: Forwarded to the estimator's ``preprocess``.
        shift_days: Days by which the run time is shifted.

    Returns:
        Always 0.

    Raises:
        Exception: If the filtered data is ``None`` or has no ``time``
            column. ``AttributeError``/``ValueError`` and
            ``KeyboardInterrupt`` are logged instead of propagated.
    """
    data = load_pickle(pickle)
    try:
        run_time = get_run_time(date)
        logger.info('Run-Time: %s' % run_time.format(loggable))
        # Read the interval once; it is needed for the window and two logs.
        training_interval = ConfManage.getInt("TRAINING_INTERVAL")
        # ceil('day') / floor('day') already pin every smaller unit, so the
        # hour/minute/second calls that used to be chained were no-ops.
        run_time = run_time.shift(days=shift_days).ceil('day')
        start_time = run_time.shift(days=-training_interval).floor('day')
        logger.info('Targeted Training Interval %d [%s - %s]' % (
            training_interval, start_time.format(loggable),
            run_time.format(loggable)))
        logger.info('Preprocessing with Estimator %s (%s)' % (estimator, mode))
        # Import the ETA estimator class.
        module_tmp = importlib.import_module('tools.eta.{}_{}'.format(
            estimator, predict_target))
        class_tmp = getattr(
            module_tmp, '{}{}'.format(estimator.capitalize(),
                                      predict_target.capitalize()))
        estimator_obj = class_tmp()

        # Data processing.
        data = estimator_obj.etl(data)
        # Remove outliers.
        data = estimator_obj.filter_data(data)
        if data is None or 'time' not in data.columns:
            # Not caught below: only AttributeError/ValueError are handled.
            raise Exception(
                "Data not yet obtained. Please run `python collect.py` first!")

        # Keep only the rows inside the training window.
        data = data.loc[(data.order_time > start_time)
                        & (data.order_time < run_time)]
        order_times = data.order_time
        # Number of distinct calendar days present in the window.
        interval_count = len(
            order_times.apply(lambda order_time: order_time.date()).unique())
        logger.info('Available Training Interval %d/%d [%s - %s]' % (
            interval_count, training_interval,
            order_times.min().format(loggable),
            order_times.max().format(loggable)))
        # Model training.
        estimator_obj.preprocess(data, mode, holdout)
        Logger.resource_checkpoint('post-preprocess')
    except (AttributeError, ValueError) as err:
        logger.error(err)
        logger.error('Trace: {}'.format(traceback.format_exc()))
    except KeyboardInterrupt:
        logger.info('Process manually interupted at %s' % arrow.now(
            tz=ConfManage.getString("ARROW_TIMEZONE")).format(loggable))
    logger.info('Releasing Logger...')
    # Logger.release_instance()
    return 0
Ejemplo n.º 6
0
def init_pickle_cache():
    """Call ``reload_pickle_cache`` for the key configured as RELOAD_PICKLE_CACHE_KEY.

    Nothing is reloaded when the setting resolves to ``None``.
    """
    cache_key = ConfManage.getString("RELOAD_PICKLE_CACHE_KEY")
    logger.info(
        'start init_pickle_cache RELOAD_PICKLE_CACHE_KEY={}'.format(cache_key))
    if cache_key is None:
        return
    outcome = reload_pickle_cache(cache_key)
    logger.info('init_pickle_cache result={}'.format(outcome))
Ejemplo n.º 7
0
 def __init__(self, end_point, use_ssl=False):
     """Create a ``requests`` session with a pooled, retrying adapter.

     Args:
         end_point: Stored on the instance as ``end_point``.
         use_ssl: Mount the adapter for ``https://`` when true, otherwise
             for ``http://``.
     """
     self.end_point = end_point
     self.conn = requests.Session()
     # os.cpu_count() may return None, and on a single-core host
     # "cpu_count() - 1" is 0; always keep at least one pool.
     pool_connections = max((os.cpu_count() or 1) - 1, 1)
     adapter = HTTPAdapter(
         pool_connections=pool_connections,
         pool_maxsize=ConfManage.getInt("HTTP_MAX_CONNECTIONS"),
         max_retries=3)
     # The adapter is configured identically for both schemes; only the
     # mount prefix depends on use_ssl.
     self.conn.mount("https://" if use_ssl else "http://", adapter)
Ejemplo n.º 8
0
 def set(self, key, data, age=ConfManage.getInt("CACHE_AGE")):
     """Store ``data`` under ``key`` and record its expiry time.

     An ``age`` of -1 records -1 as the expiry marker. Any other value
     records ``time.time() + age`` plus a random 0-9 offset.

     Returns:
         Always ``True``.
     """
     with self.lock:
         self.mem[key] = data
         if age != -1:
             expires_at = time.time() + age + randrange(
                 start=0, stop=10, step=1)
         else:
             expires_at = -1
         self.time[key] = expires_at
     return True
Ejemplo n.º 9
0
    def __init__(self):
        """Create the pooled SQLAlchemy engine for this class's ``db_name``.

        Host, port, user and password come from ``conf``; the pool size comes
        from the ``MYSQL_DB_CONNECTIONS`` setting.
        """
        url_template = "mysql+pymysql://{user}:{pwd}@{host}:{port}/{dbname}?charset=utf8"
        engine_url = url_template.format(
            host=conf.MYSQL_HOST,
            port=conf.MYSQL_PORT,
            user=conf.MYSQL_USER,
            pwd=conf.MYSQL_PASS,
            dbname=self.__class__.db_name)

        # Fixed-size queue pool: no overflow connections, recycle after 450.
        self.sql_pool = create_engine(
            engine_url,
            max_overflow=0,
            pool_size=ConfManage.getInt("MYSQL_DB_CONNECTIONS"),
            poolclass=pool.QueuePool,
            pool_recycle=450)
Ejemplo n.º 10
0
def trim_outdated(logger, run_time, pickle_name):
    """Drop rows older than the collection interval from a pickled frame.

    Args:
        logger: Logger used for the success message.
        run_time: Arrow time the interval is measured back from.
        pickle_name: Name of the pickle to load, trim and save again.
    """
    frame = load_pickle(pickle_name)
    if frame is None:
        return

    interval_days = ConfManage.getInt("COLLECTION_INTERVAL")
    interval_begin = run_time.shift(days=-interval_days)
    # Keep only rows whose order_time lies after the end of the first day.
    cutoff = interval_begin.ceil('day')
    frame = frame.loc[frame['order_time'] > cutoff]

    if not save_pickle(frame, pickle_name):
        return
    first_kept = interval_begin.shift(days=1).floor('day').format(loggable)
    logger.info('Successfully Trimmed outdated! [ {} - {} ]'.format(
        first_kept, run_time.format(loggable)))
Ejemplo n.º 11
0
    def toCache(self, cacheKey=None, age=ConfManage.getInt("CACHE_AGE")):
        """Decorator factory that caches a function's result as JSON.

        The wrapped function must return an object with ``to_json()``; the
        wrapper always returns ``pandas.read_json`` of the cached payload.

        Args:
            cacheKey: Default cache key. A truthy ``cache_key`` keyword
                passed to the decorated function overrides it (the keyword
                is still forwarded to the function).
            age: Forwarded to ``self.client.set`` as its third argument.

        Raises:
            Exception: "1302" when the value is missing and the mutex could
                not be obtained in 4 attempts, 0.5 s apart.
        """
        import functools

        def getData(func):
            @functools.wraps(func)
            def save(*args, **kwargs):
                actucal_key = kwargs.get("cache_key") or cacheKey

                def refresh():
                    """Recompute and store the payload; always drop the mutex."""
                    try:
                        payload = func(*args, **kwargs).to_json()
                        self.client.set(actucal_key, payload, age)
                        return payload
                    finally:
                        self.client.delete(actucal_key + "_mutex")

                retry = 4
                while True:
                    data = self.client.get(actucal_key)
                    if data is None:
                        # Nothing cached: whoever gets the mutex computes it.
                        # NOTE(review): the 2 passed to set_mutex is
                        # presumably the mutex lifetime -- confirm.
                        if self.client.set_mutex(actucal_key, 2):
                            data = refresh()
                            break
                        time.sleep(0.5)
                        retry -= 1
                        if retry == 0:
                            logger.error(
                                "Cache msg: Get cache data fail while retry 4 times"
                            )
                            raise Exception(
                                "1302:Get cache data fail while retry 4 times"
                            )
                        continue

                    # A value is cached. When its ttl is at most 8, try to
                    # refresh it. If the mutex is held elsewhere, serve the
                    # cached value instead of re-polling in a tight loop.
                    if (self.client.ttl(actucal_key) <= 8
                            and self.client.set_mutex(actucal_key, 2)):
                        try:
                            data = refresh()
                        except Exception:
                            # Keep the old payload. It is still returned as
                            # a DataFrame below, not as the raw JSON string.
                            # kwargs.get avoids a KeyError in this handler.
                            logger.error(
                                "Cache msg:get {} failed, return old date"
                                .format(kwargs.get("topic")))
                    break
                return pandas.read_json(data)

            return save

        return getData
Ejemplo n.º 12
0
 def save_mode(self, realtime=None, postfix=None):
     """Persist the trained model under a key derived from its name and target.

     The key is ``<estimator_name>_<realtime>_<target>`` when ``realtime``
     is given, else ``<estimator_name>_<target>``. The model is written to
     ``pickles/<APP_MODE>-<ZONE>-<key>`` with ``save`` (estimator "tf") or
     ``save_model`` (all others). If that raises ``AttributeError`` the
     model is pickled with joblib instead.

     Args:
         realtime: Optional middle part of the key.
         postfix: Optional suffix for the key in the pickle fallback.

     Raises:
         Exception: "1602" when no model has been trained yet.
     """
     if self.model is None:
         raise Exception("1602:Not model in Model, please use train member to produce model")

     if realtime is not None:
         estmator_key = '%s_%s_%s' % (self.estimator_name, realtime, self.target)
     else:
         estmator_key = '%s_%s' % (self.estimator_name, self.target)
     try:
         path = "pickles/{app_mode}-{zone}-{estmator_key}".format(
             app_mode=ConfManage.getString("APP_MODE"),
             zone=ConfManage.getString("ZONE"),
             estmator_key=estmator_key)
         if self.estimator_name == "tf":
             self.model.save(path)
         else:
             self.model.save_model(path)
     except AttributeError:  # non-xgboost models are saved as a pkl file
         # postfix defaults to None; treat that as "no suffix" rather than
         # failing on str + None.
         save_pickle(self.model, estmator_key + (postfix or ''), using_joblib=True)
     logger.info('Estmator Key: {}'.format(estmator_key))
Ejemplo n.º 13
0
def main():
    """Parse the command line and run ``process``, logging known failures.

    ``TestDataEmpty``, ``AttributeError``/``ValueError`` and
    ``KeyboardInterrupt`` are logged rather than propagated.
    """
    logger = Logger.get_instance(ConfManage.getString("LOG_CRON_NAME"))

    # (flags, keyword options) for each argument, in registration order.
    argument_specs = [
        (('-d', '--date'), dict(help='日期', type=str)),
        (('-p', '--pickle'), dict(type=str, help='数据集', default='data')),
        (('estimator',), dict(help='算法选择', nargs='?', type=str, default='xgb')),
        (('predict_target',), dict(help='目标值', nargs='?', type=str, default='accept')),
        (('-f', '--feature-selected'), dict(help='特征值选择', action='store_true')),
        (('-w', '--withhold'), dict(help='是否保存数据到bi数据库', action='store_true')),
        (("-s", "--shift_days"), dict(help="The last few days", type=int, default=-1)),
    ]
    parser = argparse.ArgumentParser()
    for flags, options in argument_specs:
        parser.add_argument(*flags, **options)
    args = parser.parse_args()

    summary = (args.estimator, args.predict_target, args.feature_selected,
               args.withhold)
    logger.info('Arguments: estimator=%s, predict-target=%s, feature-selected=%r, withhold-bi-insertion=%r' % summary)
    try:
        process(logger, args.pickle, args.estimator, args.predict_target,
                args.withhold, args.date, args.shift_days)
    except TestDataEmpty:
        logger.error('Test Data Empty!')
    except (AttributeError, ValueError) as err:
        logger.error(err)
        logger.error('Trace: {}'.format(traceback.format_exc()))
    except KeyboardInterrupt:
        now = arrow.now(tz=ConfManage.getString("ARROW_TIMEZONE"))
        logger.info('Process manually interupted at %s' % now.format(loggable))
    logger.info('Releasing Logger...')
Ejemplo n.º 14
0
    def cv(self, x_train, y_train, model=None, cv_round=5):
        """Cross-validate the configured estimator.

        Args:
            x_train (DataFrame): training features.
            y_train (DataFrame): values passed alongside ``x_train`` to
                ``xgb.DMatrix`` / ``GridSearchCV.fit``.
            model (str): not used by this method. The original note describes
                it as the path of an existing model for incremental training.
            cv_round (int): number of folds.

        Returns:
            ``xgb.cv`` output for the "xgb" estimator, otherwise the
            ``cv_results_`` of the fitted ``GridSearchCV``.
        """
        jobs = ConfManage.getInt("PARALLEL_NUM")

        if self.estimator_name != "xgb":
            # Grid search is verbose only when the level equals DEBUG.
            log_level = ConfManage.getInt("LOG_LEVEL")
            verbose_level = log_level if log_level == logging.DEBUG else 0
            # Alternative scoring: neg_mean_absolute_error.
            search = GridSearchCV(estimator=self.estimator,
                                  param_grid=self.parameters,
                                  scoring='neg_mean_squared_error',
                                  verbose=verbose_level,
                                  n_jobs=jobs,
                                  cv=cv_round)
            search.fit(x_train, y_train)
            return search.cv_results_

        import xgboost as xgb
        # n_jobs is written into the shared parameter dict.
        self.parameters["n_jobs"] = jobs
        dtrain = xgb.DMatrix(x_train, y_train)
        return xgb.cv(self.parameters, dtrain, nfold=cv_round, metrics="rmse")
Ejemplo n.º 15
0
 def lnglat_to_cellid(self, longitude, latitude):
     """Convert a longitude/latitude pair to an S2 cell id.

     The cell is the parent, at the level configured as ``S2_LEVEL``, of the
     cell containing the point.

     Args:
         longitude: Longitude in degrees, within [-180, 180].
         latitude: Latitude in degrees, within [-90, 90].

     Returns:
         The value of ``id()`` on the parent cell.

     Raises:
         ValueError: If a coordinate is out of range, or the configured
             level is greater than 30.
     """
     # Validate the caller's input before touching configuration.
     if latitude > 90 or latitude < -90:
         raise ValueError('4002:latitude out of range (-90,90)')
     if longitude > 180 or longitude < -180:
         # The message used to say "latitude" for this longitude check.
         raise ValueError('4002:longitude out of range (-180,180)')
     LEVEL = ConfManage.getInt("S2_LEVEL")
     if LEVEL > 30:
         raise ValueError('4009:level must be litter than 30')
     latlng = LatLng.from_degrees(latitude, longitude)
     cell_id = CellId.from_lat_lng(latlng)
     level_cell_id = cell_id.parent(LEVEL)
     return level_cell_id.id()
Ejemplo n.º 16
0
    def load_model_cache(self, name='undefined', using_joblib=False):
        """Return the model called ``name``, using ``self.cache`` when possible.

        On a cache hit the cached object is returned as-is. On a miss the
        model is loaded by name prefix: "xgb" as an ``xgb.Booster``, "tf" as a
        Keras model compiled here, anything else through ``load_pickle``. A
        non-``None`` result is stored in the cache.

        Previously a mis-nested ``else`` sent cache hits to ``load_pickle``
        and re-set the cache on every call.

        Args:
            name: Model name; also the suffix of the cache key.
            using_joblib: Forwarded to ``load_pickle``.

        Returns:
            The cached or freshly loaded model; may be ``None`` when
            ``load_pickle`` returns ``None``.
        """
        cache_key = 'pickle_cache_{}'.format(name)
        ret = self.cache.get(cache_key)
        if ret is not None:
            logger.debug('load_pickle_cache, fetch from cache, cache_key={}'.format(cache_key))
            return ret

        logger.debug('load_pickle_cache, fetch from raw pickle')
        path = "pickles/{app_mode}-{zone}-{estmator_key}".format(app_mode=ConfManage.getString("APP_MODE"),
                                                                 zone=ConfManage.getString("ZONE"),
                                                                 estmator_key=name)
        if name[:3] == "xgb":
            ret = xgb.Booster(model_file=path)
        elif name[:2] == "tf":
            ret = tf.keras.models.load_model(path, compile=False)
            # TODO: self.estimator() is not initialised/configured here.
            ret.compile(optimizer=self.estimator().get_optimizer(), loss=self.estimator().loss_class,
                        metrics=['mae', 'mse'])
        else:
            ret = load_pickle(name, using_joblib)

        if ret is not None:
            cached = self.cache.set(cache_key, ret, ConfManage.getInt("PICKLE_CACHE_EXPIRE"))
            logger.debug('load_pickle_cache, set cache, cache_key={}, status={}'.format(cache_key, cached))
        return ret
Ejemplo n.º 17
0
def multi_process(functions, args=(), kwds=(), processnum=None):
    """Run each callable in ``functions`` asynchronously on a process pool.

    Args:
        functions: Callables to submit, one task each.
        args: Either empty (every call gets no positional arguments) or a
            sequence with one argument tuple per function.
        kwds: Either empty (no keyword arguments) or a sequence with one
            dict per function.
        processnum: Pool size; falls back to the ``PROCESS_NUM`` setting.

    Returns:
        The ``AsyncResult`` objects in submission order. The pool has been
        closed and joined by the time they are returned.
    """
    if processnum is None:
        processnum = ConfManage.getInt("PROCESS_NUM")
    pool = Pool(processnum)
    try:
        results = [
            pool.apply_async(
                function,
                args=args[index] if len(args) else (),
                kwds=kwds[index] if len(kwds) else {})
            for index, function in enumerate(functions)
        ]
    finally:
        # Close the pool even if a submission fails (e.g. args is shorter
        # than functions) so worker processes are not leaked.
        pool.close()
        pool.join()
    return results
Ejemplo n.º 18
0
def multi_thread(functions, args=(), kwds=(), threadnum=None):
    """Run each callable in ``functions`` asynchronously on a thread pool.

    Args:
        functions: Callables to submit, one task each.
        args: Either empty (every call gets no positional arguments) or a
            sequence with one argument tuple per function.
        kwds: Either empty (no keyword arguments) or a sequence with one
            dict per function.
        threadnum: Pool size; falls back to the ``PARALLEL_NUM`` setting.

    Returns:
        The ``AsyncResult`` objects in submission order. The pool has been
        closed and joined by the time they are returned.
    """
    if threadnum is None:
        threadnum = ConfManage.getInt("PARALLEL_NUM")
    logger.debug('mulit_thread processes={}'.format(threadnum))
    pool = ThreadPool(threadnum)
    try:
        results = [
            pool.apply_async(
                function,
                args=args[index] if len(args) else (),
                kwds=kwds[index] if len(kwds) else {})
            for index, function in enumerate(functions)
        ]
    finally:
        # Close the pool even if a submission fails so worker threads are
        # not leaked.
        pool.close()
        pool.join()
    return results
Ejemplo n.º 19
0
 def data_url(self,
              table,
              topic,
              start_time=None,
              end_time=None,
              columns=None,
              record_path=None,
              meta=None,
              timeout=10.,
              **kwargs):
     """Query the data API and return the result as a DataFrame.

     The route is ``<DATA_API_PREFIX>/<table>/<topic>`` followed by one path
     segment per keyword argument whose value is a list (comma-joined) or an
     int. Other value types are ignored.

     :param table: table part of the route
     :param topic: topic part of the route
     :param start_time: query start; only sent when ``end_time`` is set too
     :param end_time: query end; only sent when ``start_time`` is set too
     :param columns: optional column selection; if selecting raises
         ``KeyError`` the unfiltered frame is returned
     :param record_path: forwarded to ``json_normalize``
     :param meta: forwarded to ``json_normalize``
     :param timeout: forwarded to ``self.get``
     :param kwargs: extra id values appended to the route
     :return: pd.DataFrame
     :raises DataApiException: when the response's ``error`` field is not 0
     """
     segments = [ConfManage.getString("DATA_API_PREFIX"), table, topic]
     for value in kwargs.values():
         if isinstance(value, list):
             segments.append(','.join(str(item) for item in value))
         elif isinstance(value, int):
             segments.append(str(value))
     route = '/'.join(segments)

     params = None
     if start_time is not None and end_time is not None:
         params = remove_none({
             'time_start': dataApiTimeFmt(start_time),
             'time_end': dataApiTimeFmt(end_time),
         })

     response = self.get(route=route, queries=params, timeout=timeout)
     if isinstance(response, (str, bytes)):
         response = json.loads(response)
     if response['error'] != 0:
         raise DataApiException(response['err_msg'])

     j2df = json_normalize(response['data'],
                           record_path=record_path,
                           meta=meta)
     if not columns:
         return j2df
     try:
         return j2df.loc[:, columns]
     except KeyError:
         return j2df
Ejemplo n.º 20
0
 def get_distance(self,
                  traffic,
                  starting_coordinates,
                  destination_coordinates,
                  timeout=None,
                  zone="",
                  **kwargs):
     """Return the distance of the first route found between two coordinates.

     Args:
         traffic: Selects the ``OSRM_API_<TRAFFIC>_ROUTE`` setting.
         starting_coordinates: 2-tuple of numbers.
         destination_coordinates: 2-tuple of numbers.
         timeout: Forwarded to the client's ``get``.
         zone: Prefix put in front of the configured route.
         **kwargs: Sent as query parameters when non-empty.

     Returns:
         ``result['routes'][0]['distance']``, or -1 when the request raised.

     Raises:
         ConnectionNotEstablished: If ``self.client`` is ``None``.
         ValueError: If either coordinate is not a 2-tuple.
     """
     if self.client is None:
         raise ConnectionNotEstablished(
             'OsrmApiClient is not yet initiated.')
     if not (isinstance(starting_coordinates, tuple)
             and len(starting_coordinates) == 2):
         raise ValueError(
             'Given param: `starting_coordinates` is of wrong type or not paired.'
         )
     if not (isinstance(destination_coordinates, tuple)
             and len(destination_coordinates) == 2):
         raise ValueError(
             'Given param: `destination_coordinates` is of wrong type or not paired.'
         )

     start_a, start_b = starting_coordinates
     dest_a, dest_b = destination_coordinates
     prefix = ConfManage.getString("OSRM_API_%s_ROUTE" % traffic.upper())
     route = zone + prefix + '%.6f,%.6f;%.6f,%.6f' % (start_a, start_b,
                                                      dest_a, dest_b)
     params = kwargs if kwargs else None
     try:
         result = self.client.get(route, params, timeout=timeout)
     except Exception as err:
         logger.error('OsrmapiError link={}, Msg:{}'.format(
             ConfManage.getString("OSRM_API_ENDPOINT") + route, err))
         # Same value as reading the distance from a placeholder result.
         return -1
     return result['routes'][0]["distance"]
Ejemplo n.º 21
0
def load_pickle_cache(name='undefined', using_joblib=False):
    """Return the pickle called ``name``, going through the cache first.

    On a miss the object is loaded with ``load_pickle`` and, when not
    ``None``, stored under ``pickle_cache_<name>`` with the lifetime
    configured as ``PICKLE_CACHE_EXPIRE``.

    Args:
        name: Pickle name; also the suffix of the cache key.
        using_joblib: Forwarded to ``load_pickle``.

    Returns:
        The cached or freshly loaded object (possibly ``None``).
    """
    cache_key = 'pickle_cache_{}'.format(name)
    hit = cache.get(cache_key)
    if hit is not None:
        logger.debug(
            'load_pickle_cache, fetch from cache, cache_key={}'.format(
                cache_key))
        return hit

    logger.debug('load_pickle_cache, fetch from raw pickle')
    loaded = load_pickle(name, using_joblib)
    if loaded is None:
        return loaded
    status = cache.set(cache_key, loaded,
                       ConfManage.getInt("PICKLE_CACHE_EXPIRE"))
    logger.debug(
        'load_pickle_cache, set cache, cache_key={}, status={}'.format(
            cache_key, status))
    return loaded
Ejemplo n.º 22
0
    def train(self, x_train, y_train, cv_round=5, model=None):
        """Fit the configured estimator and store it on ``self.model``.

        Args:
            x_train (DataFrame): training features.
            y_train (DataFrame): values passed alongside ``x_train`` to ``fit``.
            cv_round (int): not used by this method.
            model (str): for the "xgb" estimator, passed to ``fit`` as
                ``xgb_model``. When given, ``n_estimators`` is forced to 50.
                The original note calls this the path of an existing model
                for incremental training, and warns to use it carefully on
                small data sets because added trees affect the model.
        """
        jobs = ConfManage.getInt("PARALLEL_NUM")

        fit_options = {}
        if self.estimator_name == "xgb":
            # Both settings are written into the shared parameter dict.
            self.parameters["n_jobs"] = jobs
            if model is not None:
                self.parameters["n_estimators"] = 50
            fit_options["xgb_model"] = model
        fitted = self.estimator(**self.parameters).fit(x_train, y_train, **fit_options)
        self.model = fitted
Ejemplo n.º 23
0
    def __init__(self):
        """Create the Redis connection pool, a client on it, and the logger.

        Host, port, database, password and pool size are read from
        configuration. Responses are decoded to ``str`` and TCP keepalive is
        enabled.
        """
        self.pool = redis.ConnectionPool(
            host=ConfManage.getString("REDIS_HOST"),
            port=ConfManage.getInt("REDIS_PORT"),
            db=ConfManage.getInt("REDIS_DB"),
            password=ConfManage.getString("REDIS_PASSWORD"),
            max_connections=ConfManage.getInt("REDIS_MAX_CONNECTIONS"),
            decode_responses=True,
            socket_keepalive=True,
            # The connect timeout has to be set on the pool. Given to the
            # client together with connection_pool it is ignored, because
            # the client then takes its connections from the pool unchanged.
            socket_connect_timeout=5)

        self.conn = redis.StrictRedis(connection_pool=self.pool)
        self.logger = Logger.get_instance(ConfManage.getString("LOG_REQ_NAME"))
Ejemplo n.º 24
0
class EtaMetricAll(Base):
    """Object Relational Model class for the ``eta_metrics_all`` table."""
    # Database name comes from configuration when the class is defined.
    db_name = ConfManage.getString("BI_MYSQL_DBNAME")
    table_name = 'eta_metrics_all'
    # Column name -> [SQL type, constraints].
    columns_msg = {
        'id': ['integer unsigned', 'auto_increment primary key'],
        'prediction_date': ['date', 'not null'],
        'model': ['varchar(30)', 'not null'],
        'mae': ['float unsigned', 'not null'],
        'mse': ['float unsigned', 'not null'],
        'r2': ['float unsigned', 'not null'],
        'limit_N_percent': ['float unsigned', 'not null'],
        'valid_count': ['integer unsigned', 'not null'],
        'total_count': ['integer unsigned', 'not null']
    }

    def checkBidata(self, model, prediction_date):
        """Return True when a row exists for ``model`` on ``prediction_date``."""
        # NOTE(review): the values are interpolated straight into the SQL
        # text; callers must pass trusted values only.
        query = "SELECT * FROM {table_name} WHERE prediction_date='{prediction_date}' AND model='{model}'".format(
            table_name=EtaMetricAll.table_name,
            prediction_date=prediction_date,
            model=model)
        rows = read_sql(sql=query, con=self.sql_conn)
        return len(rows) != 0
Ejemplo n.º 25
0
def timepiece(timeout=timeout, run=ConfManage.getBool("TIMEPIECE_RUN"), msg=0):
    """Decorator factory that logs how long the wrapped function took.

    Both ``timeout`` and ``run`` take their defaults when this function is
    defined (from the module-level ``timeout`` and the ``TIMEPIECE_RUN``
    setting).

    Args:
        timeout: Threshold in seconds. Calls taking at least this long are
            logged as warnings, faster ones at debug level.
        run: When falsy the wrapped function is called without any timing.
        msg: When truthy the call's keyword arguments are put in the log
            line, otherwise ``None`` is logged in their place.

    Returns:
        A decorator. The wrapper returns whatever the wrapped function does.
    """
    import functools

    def starttest(fun):
        # wraps() keeps the wrapped function's name and docstring visible on
        # the wrapper, e.g. for other decorators that log fun.__name__.
        @functools.wraps(fun)
        def fun_run(*args, **kwargs):
            if not run:
                return fun(*args, **kwargs)
            starttime = time.time()
            res = fun(*args, **kwargs)
            totaltime = round(time.time() - starttime, 4)
            detail = kwargs if msg else None
            if totaltime >= timeout:
                logger.warning("FunTimeout({}s) funtion={}, msg:{}".format(
                    timeout, fun.__name__, detail))
            else:
                logger.debug(
                    "FunctionTime funtion={}, time: {}s, msg:{}".format(
                        fun.__name__, totaltime, detail))
            return res

        return fun_run

    return starttest
Ejemplo n.º 26
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Collect real-time data
import argparse
import arrow
import pandas as pd
import traceback
from client.api_client import ApiClient
from configs.ConfManage import ConfManage
from tools.logger import Logger
from tools.pickler import delete_pickle, load_pickle, save_pickle
from tools.timer import get_run_time, LOGGABLE_ARROW_FORMAT as loggable

# API client instance created once, when this module is imported.
client = ApiClient()

# Settings read from configuration once, when this module is imported.
ENV_ARROW_TIMEZONE = ConfManage.getString("ARROW_TIMEZONE")
ENV_ARROW_TZSHIFT = ConfManage.getInt("ARROW_TZSHIFT")
ENV_DATA_API_TIMERANGE = ConfManage.getInt("DATA_API_TIMERANGE")
ENV_ZONE = ConfManage.getString("ZONE")


def collect_batch_data(start_time, end_time, table, topic, columns=None):
    st = start_time
    et = end_time
    data_df = pd.DataFrame()
    hours_interval = int(
        24 /
        ENV_DATA_API_TIMERANGE) if 24 % ENV_DATA_API_TIMERANGE == 0 else 12
    while st < et:
        snt = st.shift(hours=hours_interval)
        data_df = data_df.append(
Ejemplo n.º 27
0
# coding=utf-8

from thrift import Thrift
from thrift.protocol import TBinaryProtocol
from thrift.Thrift import TApplicationException
from thrift.transport import TSocket, TTransport
from configs.ConfManage import ConfManage
from tools.logger import Logger

# Module-level logger, looked up under the name configured as LOG_BASE_NAME.
logger = Logger.get_instance(ConfManage.getString("LOG_BASE_NAME"))


class ThriftClient:
    '''Thrift client over a buffered socket transport and the binary protocol.'''
    def __init__(self, client_class, host=None, port=None, timeout=None):
        """Build the transport and the service client.

        Args:
            client_class: Generated Thrift service module/class exposing
                ``Client``.
            host: Server host; falls back to the ``THRIFT_HOST`` setting.
            port: Server port; falls back to the ``THRIFT_PORT`` setting.
            timeout: Value handed to ``TSocket.setTimeout``; falls back to
                the ``THRIFT_TIMEOUT`` setting.
        """
        # Only consult the configuration for arguments the caller left out.
        if host is None:
            host = ConfManage.getString("THRIFT_HOST")
        if port is None:
            port = ConfManage.getInt("THRIFT_PORT")
        if timeout is None:
            timeout = ConfManage.getInt("THRIFT_TIMEOUT")

        sock = TSocket.TSocket(host, port)
        sock.setTimeout(timeout)
        self.transport = TTransport.TBufferedTransport(sock)
        # Create a client
        self.client = client_class.Client(
            TBinaryProtocol.TBinaryProtocol(self.transport))

    def close(self):
        """Close the transport if it is currently open."""
        if not self.transport.isOpen():
            return
        self.transport.close()
        # logger.info('thrift transport IS CLOSED!')
Ejemplo n.º 28
0
def main():
    """Obtain Information from Data-API and MySQL Database.

    Command-line entry point with three modes:

    * ``--updata``: load the pickle and hand it to ``update_data`` (needs
      both ``--funtion`` and ``--merge_on``), then exit.
    * ``--clear``: call ``clear_pickles`` and exit.
    * default: work out which days are missing from the pickled data, call
      ``collect`` once per missing day, then ``trim_outdated``.

    Returns:
        0 after the collection loop has run (or was interrupted); ``None``
        on every early exit.
    """

    def _parse_bool(value):
        """Interpret a command-line string as a boolean."""
        # argparse's type=bool maps every non-empty string, including
        # "False", to True. Recognise the usual "false" spellings and keep
        # bool() semantics for everything else.
        if value.strip().lower() in ('false', '0', 'no', 'n', 'off', 'f'):
            return False
        return bool(value)

    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--clear',
                        help='Clear previously saved pickles.', action='store_true')
    # NOTE(review): --reverse is parsed but not read anywhere in this function.
    parser.add_argument('-r', '--reverse',
                        help='whether clear previously data.', action='store_true')
    parser.add_argument('-d', '--date',
                        help='Date used for calculation.', type=str)
    parser.add_argument('-p', '--pickle', type=str,
                        help='Pickle name for saving latest data-collection.', default='data')
    parser.add_argument('-u', '--updata', type=_parse_bool,
                        help='Merge data with new feature to data.pkl.', default=False)
    parser.add_argument('-f', '--funtion',
                        help='Update new feature from funtion.')
    parser.add_argument('-m', '--merge_on',
                        help='Field names to join on. Must be found in both DataFrames.')
    args = parser.parse_args()

    # Update mode.
    if args.updata:
        data = load_pickle(args.pickle)
        if args.funtion is not None and args.merge_on is not None:
            update_data(logger, args.funtion, data, args.merge_on, args.pickle)
        else:
            logger.error(
                'Funtion and Merge_on is None, Please provide corresponding parameters'
            )
        return

    # Clear all child pickles.
    if args.clear:
        clear_pickles(logger)
        return

    run_time = get_run_time(None, 0, False)
    logger.info('Run-Time: %s' % run_time)
    collect_date = None if args.date is None else get_run_time(args.date)
    logger.info('Collect-Date: %s' % collect_date)
    # Last moment data may be collected for: the end of yesterday.
    end_time = run_time.shift(days=-1).ceil('day')
    logger.info('End-Time: %s' % end_time)

    pickled = load_pickle(args.pickle)
    collected_count = 0
    if pickled is not None and isinstance(
            pickled, pd.DataFrame) and 'time' in pickled.columns:
        times = pickled['time']
        del pickled  # only the time column is needed from here on
        collected_count = len(
            times.apply(lambda order_time: order_time.date()).unique())
        collected_start_time = times.min()
        logger.info('Min collected order_time Date: %s' %
                    collected_start_time.format(loggable))
        collected_end_time = times.max()
        logger.info('Max collected order_time Date: %s' %
                    collected_end_time.format(loggable))

        if collect_date is not None:
            if collect_date > end_time:
                logger.warning(
                    'collect_date can not greater then end_time {} > {}'.
                    format(collect_date.format(loggable),
                           end_time.format(loggable)))
                return
            if collect_date < collected_start_time.floor('day'):
                # Requested date precedes the data: fill up to its start.
                start_time = collect_date.floor('day')
                end_time = collected_start_time.shift(days=-1).ceil('day')
            elif collect_date > collected_end_time.ceil('day'):
                # Requested date follows the data: fill from its end.
                start_time = collected_end_time.shift(days=1).floor('day')
                end_time = collect_date.ceil('day')
            else:
                logger.warning('collect_data invalid. {}'.format(collect_date))
                # Nothing to collect. Without this return, start_time would
                # be unbound in the gap computation below.
                return
        else:
            if collected_end_time >= end_time:
                logger.info('Targeted Run-Time already in Collection-Interval')
                return
            else:
                start_time = collected_end_time.shift(days=1).floor('day')

        gap = start_time.shift(days=-1).date() - end_time.date()
        gap_days = gap.days

    else:
        logger.info('Data empty!')
        gap_days = -ConfManage.getInt("COLLECTION_GAP")
        start_time = end_time.shift(days=gap_days + 1).floor('day')

    logger.info('Total Collection Interval: %d/%d [%s - %s]' %
                (collected_count, ConfManage.getInt("COLLECTION_INTERVAL"),
                 start_time.format(loggable), end_time.format(loggable)))

    if gap_days >= 0:
        logger.info('Targeted Run-Time already in Collection-Interval')
        return

    logger.info('Gap: %d' % (gap_days))
    logger.info(
        'Gap Interval: %d [%s - %s]' %
        (gap_days, start_time.format(loggable), end_time.format(loggable)))
    try:
        # Collect day by day across the missing range (gap_days is negative
        # here, so this runs -gap_days times).
        for _ in range(-gap_days, 0, -1):
            end_time = start_time.ceil('day')
            logger.info('Collecting data in [{} - {}]'.format(
                start_time.format(loggable), end_time.format(loggable)))
            collect(logger, start_time, end_time, args.pickle)
            logger.info('Success collect data in [{} - {}] \n\n'.format(
                start_time.format(loggable), end_time.format(loggable)))
            start_time = start_time.shift(days=1)
        # Original note: without the environment variable, the most recent
        # 30 days are kept by default (not verifiable from this function).
        trim_outdated(logger, run_time, args.pickle)
    except (AttributeError, ValueError) as err:
        logger.error(err)
        logger.error('Trace: {}'.format(traceback.format_exc()))
    except KeyboardInterrupt:
        logger.info('Process manually interupted at {}'.format(arrow.utcnow()))
    logger.info('Releasing Logger...')
    return 0
Ejemplo n.º 29
0
from tools.caster import chunks
from tools.pickler import delete_pickle, load_pickle, save_pickle
from tools.modeler import get_model
from tools.timer import get_run_time, LOGGABLE_ARROW_FORMAT as loggable
from tools.parallel import multi_thread

# Interpreter-specific start-up: on Python 2.6/2.7 reload ``sys`` and set the
# default encoding to UTF-8; on Python 3.6/3.7 only reload ``sys``. Any other
# version does neither.
if sys.version_info[:2] in [(2, 6), (2, 7)]:
    reload(sys)
    sys.setdefaultencoding('utf-8')
elif sys.version_info[:2] in [(3, 6), (3, 7)]:
    # pylint: disable=E0401, E0611, E1101
    import importlib

    importlib.reload(sys)

# Settings, logger and API clients created once, when this module is imported.
ENV_ARROW_TIMEZONE = ConfManage.getString("ARROW_TIMEZONE")
ENV_ARROW_TZSHIFT = ConfManage.getInt("ARROW_TZSHIFT")
ENV_DATA_API_TIMERANGE = ConfManage.getInt("DATA_API_TIMERANGE")
ENV_ZONE = ConfManage.getString("ZONE")
logger = Logger.get_instance(ConfManage.getString("LOG_CRON_NAME"))
client = ApiClient()
osrm_api_client = OsrmApi()


def fetch_model_info(id_list, model_name='order', col=None, chunk_size=500):
    model = get_model(model_name)
    result = None
    id_chunks = chunks(id_list, chunk_size)
    for id_chunk in id_chunks:
        if result is None:
            result = model.fetch_in(
Ejemplo n.º 30
0
# VERSION: 	 1.0
# DESCRIPTION:
#   Model algorithm training module.
# *************************************************************
import argparse
import sys
import traceback
import arrow
from configs.ConfManage import ConfManage
from tools.logger import Logger
from tools.pickler import load_pickle
from tools.timer import LOGGABLE_ARROW_FORMAT as loggable
from tools.timer import get_run_time
import importlib
# Reload ``sys`` at import time.
importlib.reload(sys)
# Module-level logger, looked up under the name configured as LOG_CRON_NAME.
logger = Logger.get_instance(ConfManage.getString("LOG_CRON_NAME"))


def preprocess(date, pickle, estimator, predict_target, holdout, mode,
               shift_days):
    data = load_pickle(pickle)
    try:
        run_time = get_run_time(date)
        logger.info('Run-Time: %s' % run_time.format(loggable))
        run_time = run_time.shift(days=shift_days).ceil('day').ceil(
            'hour').ceil('minute').ceil('second')
        start_time = run_time.shift(
            days=-ConfManage.getInt("TRAINING_INTERVAL")).floor('day').floor(
                'hour').floor('minute').floor('second')
        logger.info('Targeted Training Interval %d [%s - %s]' % \
                    (ConfManage.getInt("TRAINING_INTERVAL"), start_time.format(loggable), run_time.format(loggable)))