Example #1
def get_or_create_session():
    """
    Retrieve the current active global session.

    If no active session exists, attempt to load config and create a new
    session.

    If an active session exists, return the session without loading new
    config.

    Returns
    -------
    session : Session
        The global active session
    """
    global _session
    if _session is not None and _session.is_active():
        _getLogger(__name__).debug(
            "Active session found, ignoring session kwargs")
    else:
        config = load_config()
        if config is None:
            print("WARN: Missing config")
            writer = WriterConfig(type="local",
                                  output_path="output",
                                  formats=["all"])
            config = SessionConfig("default-project", "default-pipeline",
                                   [writer], False)
        _session = session_from_config(config)
    return _session
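A minimal usage sketch (relying only on names shown in this snippet): repeated calls reuse the cached global session while it stays active.

session = get_or_create_session()             # loads config, or falls back to the defaults above
assert session.is_active()                    # is_active() is the check used by the snippet itself
assert get_or_create_session() is session     # a second call returns the same global _session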
Example #2
def get_or_create_session(path_to_config: Optional[str] = None, report_progress: Optional[bool] = False):
    """
    Retrieve the current active global session.

    If no active session exists, attempt to load config and create a new
    session.

    If an active session exists, return the session without loading new
    config.

    :param path_to_config: optional path to a session config file; defaults are used when None
    :type path_to_config: str
    :param report_progress: if provided, overrides the loaded config's report_progress flag
    :type report_progress: bool
    :return: The global active session
    :rtype: Session
    """
    global _session
    if _session is not None and _session.is_active():
        _getLogger(__name__).debug("Active session found, ignoring session kwargs")
    else:
        config = load_config(path_to_config)
        if config is None:
            print("WARN: Missing config")

            config = SessionConfig(
                "default-project",
                "default-pipeline",
                [WriterConfig(type="local", output_path="output", formats=["all"])],
                MetadataConfig(type="local", output_path="output", input_path=""),
                False,
            )
        if report_progress is not None:
            config.report_progress = report_progress

        _session = session_from_config(config)
    return _session
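A hedged sketch of the keyword variant above; the config path is purely illustrative, and report_progress overrides whatever the loaded config contains.

session = get_or_create_session(path_to_config="whylogs.yaml",   # hypothetical path
                                report_progress=True)            # copied onto config.report_progress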
def get_lang_for_preparetext(text, main_language="en") -> str:
    try:
        lang = detect(text)
        if lang not in _lang_names:
            lang = main_language
        return lang
    except LangDetectException:
        _getLogger(__name__).warning('no features in text')
        return main_language
def get_lang(text, main_language="en") -> str:
    try:
        lang = detect(text)
        if lang not in _mongo_langs:
            lang = main_language
        return lang
    except LangDetectException:
        _getLogger(__name__).warning('no features in text')
        return main_language
def stemme_text(text: str, returnList=True, lang=None):
    try:
        lang = lang if lang != "none" else detect(text)
    except LangDetectException as e:
        _getLogger(__name__).warning(str(e))
    words = tokenize(text)
    words = stemme_words(words, lang)
    if returnList:
        return words
    else:
        return " ".join(words)
def SweepingTest(metrics=None, endpoint=None, batchSize=1, printVolume=10000):
    logger = _getLogger(__name__)
    #logger.info("in SweepingTest")#
    st = _time.time()
    allR = []
    printFactor = 0
    for queryIndex in range(0, len(metrics), batchSize):
        printFactor += batchSize
        allR.append(
            _requests.get(
                '%s/api/query?start=2018/04/25-00:00:00&end=2018/06/12-00:00:00&%s'
                % (endpoint, "&".join(
                    metrics[queryIndex:queryIndex + batchSize]))))
        if printFactor > printVolume:
            logger.info(queryIndex)
            logger.info(
                '%s/api/query?start=2018/04/25-00:00:00&end=2018/06/12-00:00:00&%s'
                % (endpoint, "&".join(
                    metrics[queryIndex:queryIndex + batchSize])))
            et = _time.time()
            logger.info(et - st)
            logger.info("total throughput %f" % ((et - st) /
                                                 (queryIndex + batchSize)))
            logger.debug(allR[-1].text)
            printFactor = 0
    et = _time.time()
    throughput = (et - st) / (len(metrics))
    logger.info("throughput=%f" % (throughput))
    return throughput, allR[-1]
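An illustrative call; the endpoint URL and metric query strings are hypothetical. Each batch of batchSize metrics becomes one GET against the OpenTSDB /api/query endpoint.

metrics = ["m=none:sys.cpu.user", "m=none:sys.cpu.system"]     # hypothetical metric queries
throughput, last_response = SweepingTest(metrics=metrics,
                                         endpoint="http://tsdb.example.com:4242",
                                         batchSize=1,
                                         printVolume=10000)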
Example #7
def get_or_create_session():
    """
    Retrieve the current active session.  If no active session is found,
    create the session.
    """
    global _session
    if _session is not None and _session.is_active():
        _getLogger(__name__).debug(
            'Active session found, ignoring session kwargs')
    else:
        config = load_config()
        if config is None:
            writer = WriterConfig(type='local', output_path='output', formats=['all'])
            config = SessionConfig('default-project', 'default-pipeline', False, [writer])
        _session = session_from_config(config)
    return _session
Example #8
    def __init__(
        self,
        project: str,
        pipeline: str,
        writers: List[Writer],
        metadata_writer: Optional[MetadataWriter] = None,
        verbose: bool = False,
        with_rotation_time: Optional[str] = None,
        cache_size: Optional[int] = None,
        report_progress: bool = False,
    ):
        self._py_logger = _getLogger(__name__)
        if writers is None:
            writers = []
        self.project = project
        self.pipeline = pipeline
        self.writers = writers
        self.metadata_writer = metadata_writer
        self.verbose = verbose
        self._active = True
        self._loggers = {}
        self._session_time = datetime.datetime.now()
        self._session_id = str(uuid4())
        self._config = SessionConfig(project, pipeline, writers, metadata_writer, verbose)
        self.with_rotation_time = with_rotation_time
        self.cache_size = cache_size
        self.report_progress = report_progress

        # enable special logic when starting/closing a Session if we're using whylabs client to save dataset profiles
        whylabs_writer_is_present = any(isinstance(w, WhyLabsWriter) for w in self.writers)
        self.use_whylabs_writer = _use_whylabs_client or whylabs_writer_is_present

        # add WhyLabs writer if it's not already present (which can happen if it's not specified in the config)
        if _use_whylabs_client and whylabs_writer_is_present is False:
            self.writers.append(WhyLabsWriter(output_path=None, formats=["protobuf"]))
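A hedged construction sketch, assuming this __init__ belongs to the package's Session class (the snippet shows only the constructor). An empty writer list is fine because __init__ normalizes missing writers itself.

session = Session(project="demo-project",
                  pipeline="demo-pipeline",
                  writers=[],              # no writers configured; a WhyLabs writer may still be appended when _use_whylabs_client is set
                  verbose=True,
                  report_progress=True)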
Example #9
    def _getLogger(name=None):
        logger = logging.getLogger(name)
        if not logger.handlers:
            handler = StreamHandler()
            logger.addHandler(handler)

        return logger
Example #10
def configure_logger_level():
    """
    When called this will set the root logger level based
    on the ``agent_global_logger_level`` configuration
    variable.
    """
    # Import here to prevent circular imports and because we
    # don't want CONFIGURATION in the namespace of this module.
    from pyfarm.agent.logger.twistd import CONFIGURATION

    root_level = config["agent_global_logger_level"]

    if isinstance(root_level, STRING_TYPES):
        root_level = _levelNames[root_level.upper()]

    assert isinstance(root_level, int)

    levels = CONFIGURATION["levels"]
    for index, (name, level) in enumerate(levels):
        if name == "":
            levels[index] = ("", root_level)
            break
    else:
        levels.insert(0, ("", root_level))

    # Just to be safe, we also set pf's root level
    pf = _getLogger("pf")
    pf.setLevel(root_level)
def writeDataFrameToOpenTsdb(df=None,
                             valueColumns=None,
                             groupColumns=None,
                             apiEntryPoint='cviadqat07.office.comscore.com',
                             putApiEndPoint=None,
                             assignApiEndPoint=None,
                             port=None,
                             metric=None,
                             host_tag=False,
                             check_tsdb_alive=True,
                             send_metrics_batch_limit=50,
                             tagsToKeep=None,
                             max_queue_size=50000,
                             compressTags=False,
                             overrideMillisecond=False):
    """store dataframe into tsdb via client"""
    logger = _getLogger(__name__)

    if not (metric and metric.strip()):
        raise Exception('Metric must not be empty')

    result = putAPIOpentsdb(df=df,
                            valueColumns=valueColumns,
                            groupColumns=groupColumns,
                            metric=metric,
                            putApiEndPoint=putApiEndPoint,
                            tagsToKeep=tagsToKeep,
                            compressTags=compressTags,
                            overrideMillisecond=overrideMillisecond)
    return result
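A hedged sketch of a call; the endpoint URL is illustrative and, as the rest of this module assumes, the frame carries a 'timestamp' column in epoch seconds.

import pandas as pd

df = pd.DataFrame({"region": ["us", "eu"],
                   "hits": [10, 12],
                   "timestamp": [1528761600, 1528765200]})
status_codes = writeDataFrameToOpenTsdb(df=df,
                                        valueColumns=["hits"],
                                        groupColumns=["region"],
                                        putApiEndPoint="http://tsdb.example.com:4242/api/put/",  # hypothetical
                                        metric="demo.hits")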
    def ad_processor(self,
                     input_data,
                     parameter_dict=None,
                     agg_col_name=None,
                     default_result=None):
        logger = _getLogger(__name__)

        nan_map = input_data.isnull()
        result = {}
        default_result['timestamp'] = input_data.index.max()
        _STATUS_CODE = AlphaResultObject.STATUS_CODE

        if nan_map.agg('sum') == 0:
            if input_data.shape[0] > 1:
                result = self.apply_prediction(data=input_data,
                                               agg_col_name=agg_col_name,
                                               param_dict=parameter_dict)
                if not isinstance(result, dict):
                    raise TypeError(
                        'Return type of apply_prediction should be a dict() object'
                    )
                else:
                    if not set(AlphaResultObject().metric_column_list).issubset(
                            result.keys()):
                        raise ValueError(
                            'Result dictionary key set should contain all elements of AlphaResultObject().metric_column_list'
                        )
            else:
                result[_STATUS_CODE] = 2
        else:
            result[_STATUS_CODE] = 1

        result_dict = {**default_result, **result}

        return result_dict
def readData(filename):
    """open filename and return binary data"""
    logger = _getLogger(__name__)
    logger.info("in readData")
    data = None
    with open(filename, 'rb') as fid:
        data = fid.read()

    return data
def prepare_text(comment: str, lang=None) -> list:
    print('prepare_text called')
    # comment["lang"] is none if it's not supported by mongodb
    # we have to detect the language to remove stop words and stem
    try:
        lang = get_lang_for_preparetext(comment)
    except LangDetectException as e:
        _getLogger(__name__).warning(str(e))
    # tokenize comment text
    words = tokenize(comment)
    # remove numbers and punctuation
    filtred_words = remove_punctuation(words)
    # remove stop words
    filtred_words = remove_stop_words(filtred_words, lang)
    # stem words (stemming is skipped for Arabic)
    if lang != 'ar':
        filtred_words = stemme_words(filtred_words, lang)
    return filtred_words
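A usage sketch; the pipeline above depends on langdetect plus the module's own tokenize/stop-word/stemming helpers, so the exact tokens returned will vary.

tokens = prepare_text("Customers reported that checkout was failing repeatedly.")
print(tokens)        # stop-word-free stems, e.g. something like ['custom', 'report', 'checkout', 'fail', ...]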
def findStartTime(timeColumnsDict=None, numberPoints=0, timePeriod='D'):
    """given time columns, end time and numberPoints return the python date time object"""
    logger = _getLogger(__name__)
    logger.info("in findStartTime")
    if 'month_id' in timeColumnsDict:
        logger.info("found month_id")
        timestamp = _CSTimeToDatetime(month_id=timeColumnsDict['month_id'] -
                                      numberPoints)
    elif 'week_id' in timeColumnsDict:
        logger.info("found week_id")
        timestamp = _CSTimeToDatetime(week_id=timeColumnsDict['week_id'] -
                                      numberPoints)
    elif 'hour_id' in timeColumnsDict and 'time_id' in timeColumnsDict:
        logger.info("found time_id and hour_id")
        timestamp = _CSTimeToDatetime(time_id=timeColumnsDict['time_id'],
                                      hour_id=timeColumnsDict['hour_id'] -
                                      numberPoints)
    elif 'time_id' in timeColumnsDict:
        logger.info("found time_id ")
        timestamp = _CSTimeToDatetime(time_id=timeColumnsDict['time_id'] -
                                      numberPoints)
    elif len(timeColumnsDict) == 1:  # assume it is already a timestamp
        #for single column it is assumed that the time column is seconds since epoch

        secondsSinceEpoch = timeColumnsDict[next(iter(timeColumnsDict))]
        utcDateTime = _datetime.fromtimestamp(secondsSinceEpoch, _timezone.utc)
        time_id = 0
        hour_id = 0
        week_id = 0
        month_id = 0
        #set the offset that goes into the delta based on the timePeriod parameter
        needDelta = True
        if timePeriod == 'daily':
            time_id = -1 * numberPoints
        elif timePeriod == 'hourly':
            hour_id = -1 * numberPoints
        elif timePeriod == 'monthly':
            month_id = -1 * numberPoints
        elif timePeriod == 'weekly':
            week_id = -1 * numberPoints
        elif timePeriod == 'unknown':
            needDelta = False
            logger.warn("unknown time type defaulting to start time as epoch")
            timestamp = _datetime(1970, 1, 1, 0, 0, 0, 0, _pytz.UTC)
        else:
            needDelta = False
            logger.warning(
                "unsupported time type, defaulting to start time as epoch")
            timestamp = _datetime(1970, 1, 1, 0, 0, 0, 0, _pytz.UTC)
        if needDelta:
            timestamp = utcDateTime + _timedelta(
                days=time_id,
                hours=hour_id,
            ) + _relativedelta(months=month_id, weeks=week_id)
    return timestamp
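A sketch of the single-column branch, where the lone value is treated as seconds since the epoch and numberPoints is subtracted in the chosen period (values are illustrative).

start = findStartTime(timeColumnsDict={"ts": 1528761600},   # 2018-06-12 00:00:00 UTC
                      numberPoints=7,
                      timePeriod="daily")
print(start)                                                # 2018-06-05 00:00:00+00:00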
def SweepingMetaTest(metaDF=None,
                     totalRows=None,
                     endpointsMap=None,
                     batchSize=1,
                     printVolume=10000,
                     testNumber=2,
                     additionalSearchParameters='',
                     tagsToKeep=None):
    logger = _getLogger(__name__)
    metaInfo = []
    for endpoint in endpointsMap:
        logger.info('using %s' % (endpoint[0]))
        try:
            _requests.get('%s/api/dropcaches' % (endpoint[0]))
            df = metaDF['df']
            timeColumns = metaDF['time_columns']
            valueColumns = metaDF['value_columns']
            groupColumns = metaDF['group_columns']
            pj = [
                generateOpentsdbJsonPayloadAsMetrics(
                    fields=dict(row[valueColumns]),
                    metric=endpoint[testNumber],
                    tags=dict(row[groupColumns]),
                    time=row['timestamp'],
                    tagsToKeep=tagsToKeep)
                for index, row in df[:totalRows].iterrows()
            ]
            if tagsToKeep is None:
                metrics = [
                    'm=none:%s%s' %
                    (additionalSearchParameters, elem['metric'])
                    for elem in _itertools.chain.from_iterable(pj)
                ]
            else:
                metrics = [
                    'm=none:%s%s%s' %
                    (additionalSearchParameters, elem['metric'],
                     buildTagSearchFromKV(elem['tags'], ))
                    for elem in _itertools.chain.from_iterable(pj)
                ]

            logger.info(pj[0][0])
            logger.info(metrics[0])
            results = SweepingTest(metrics=metrics,
                                   endpoint=endpoint[0],
                                   batchSize=batchSize,
                                   printVolume=printVolume)
            metaInfo.append(results)
        except _requests.exceptions.ConnectionError as e:
            logger.info("Catching Connection Error: %s: process next record" %
                        (str(e)))
            metaInfo.append([e])
            continue
    return metaInfo
def decodeData(inputData, encoding):
    """decode data using encoding to return string data"""
    logger = _getLogger(__name__)
    logger.info("in decodeData")
    data = None
    try:
        data = inputData.decode(encoding).strip()
    except Exception as e:
        logger.error("Exception in decode:%s" % (e))
        raise

    return data
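The two helpers above are typically chained; a quick sketch with an illustrative filename.

raw = readData("export.tsv")        # bytes read from disk
text = decodeData(raw, "utf-8")     # stripped str, or the exception is logged and re-raised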
def putAPIOpentsdb(df=None,
                   valueColumns=None,
                   groupColumns=None,
                   metric=None,
                   tagsToKeep=None,
                   putApiEndPoint=None,
                   compressTags=False,
                   overrideMillisecond=False,
                   timestampCol='timestamp',
                   tolerance=0.1):

    logger = _getLogger(__name__)
    pj = get_metric_names(dataframe=df,
                          group_columns=groupColumns,
                          value_columns=valueColumns,
                          metric_prefix=metric,
                          tagsToKeep=tagsToKeep)

    pjAll = list(_itertools.chain.from_iterable(pj))

    result = list()

    if len(pjAll) > 0:
        if logger.isEnabledFor(_DEBUG):
            logger.debug(len(pjAll))
            logger.debug(len(pjAll[0]))
            logger.debug(type(pjAll[0]))
            logger.debug(pjAll[0])
        else:
            if _randint(1, 10001) < 10:

                logger.info(len(pjAll))
                logger.info(len(pjAll[0]))
                logger.info(type(pjAll[0]))
                logger.info(pjAll[0])

    else:
        logger.info('number of metrics generated are 0')
        logger.info('Not calling put API, returning empty list')
        return result

    session = retry_session(retries=5)
    r = session.post(url=putApiEndPoint, data=_json.dumps(pjAll))  #, timeout=8
    result.append(r.status_code)
    successful_puts = sum([elem < 300 for elem in result])
    logger.info('number 204:%d' % (successful_puts))

    if successful_puts == 0:
        raise ValueError('put api call was unsuccessful')
    return result
def json_parser(json_file=None, business_id=None, group_id=None, database_id=None, **kwargs):

    master_json_tmp = master_json_file + '.tmp'
    master_json_bak = master_json_file + '.bak'
    remove_bak_file = True
    remove_tmp_file = False

    shutil.copy2(master_json_file, master_json_bak)
    logger = _getLogger(__name__)

    if business_id is not None and group_id is not None and database_id is not None:
        meta_info = {}
        meta_info[business_id] = {}
        meta_info[business_id][group_id] = {}

    else:
        raise ValueError('business_id, group_id or database_id cannot be null')

    if json_file is not None:
        with open(json_file, 'r') as f_h:
            parameter_dict = _json.load(f_h)
            meta_info[business_id][group_id][database_id] = parameter_dict
    else:
        meta_info[business_id][group_id][database_id] = {
            'influxAPI': None, 'influxPort': None,
            'opentsdbAPI': None, 'opentsdbPort': None,
            'tableName': None,
            'anomaly_package': None, 'anomaly_class': None,
            'apply_AD_cols': None, 'time_range': 45,
            'threshold_dict': None, 'parameter_dict': None}

    with open(master_json_file, 'r') as f_h:
        existing_config = _json.load(f_h)
        new_config = {**existing_config, **meta_info}

    with open(master_json_tmp, 'w') as f_h:
        _json.dump(new_config, f_h)

    try:
        shutil.move(master_json_tmp, master_json_file)
        logger.info('New Json file created successfully')

    except Exception as e:
        logger.debug('The following error occurred when creating the new json file: %s' % e)
        remove_bak_file = False
        remove_tmp_file = True

    finally:
        if remove_tmp_file:
            os.remove(master_json_tmp)
        if remove_bak_file:
            os.remove(master_json_bak)
Example #21
def getLogger(name, parent=None):
    if isinstance(parent, LoggerAdapter):
        klass = type(parent)
        extra = parent.extra
        parent = parent.logger
    else:
        klass = None
        extra = None

    if parent:
        name = parent.name + '.' + name
    logger = _getLogger(name)
    logger.settings = settings

    if extra:
        logger = klass(logger, extra)
        logger.settings = settings
    return logger
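A usage sketch for the parent-aware variant above (the settings object it attaches comes from the enclosing module).

root = getLogger("app")                    # plain logger named "app"
child = getLogger("worker", parent=root)   # dotted child named "app.worker"
child.info("started")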
def writeDataFrameToInfluxDB(df=None,
                             valueColumns=None,
                             groupColumns=None,
                             apiEntryPoint='cviadqat07.office.comscore.com',
                             port=8086,
                             database='Panel_Only_Mobile_Data',
                             measurement=None,
                             username='',
                             password='',
                             timestampCol=['timestamp']):
    """store dataframe into tsdb via client"""
    logger = _getLogger(__name__)
    logger.info("in writeDataFrameToInfluxDB")

    restructured_df = df[groupColumns + valueColumns + timestampCol]
    num_group_cols = len(groupColumns)
    num_value_cols = len(valueColumns)
    value_col_start_index = num_group_cols + 1
    value_col_end_index = num_group_cols + num_value_cols

    #create a client connection
    client = _InfluxDBClient(apiEntryPoint, port, username, password, database)
    #create the db if needed
    client.create_database(database)

    #verify that we have a valid measurement name
    if not (measurement and measurement.strip()):
        raise Exception('Measurement must not be empty')

    #add the dataframe row by row, store the result of each load
    result = []
    # Using itertuples as it gives a speed improvement over iterrows.
    # The gain depends on the size of the input df: roughly 2x for ~1e3 rows and up to 100x for ~1e5 rows.
    for tup in restructured_df.itertuples():
        pj = generateInfluxJsonPayload(fields=(dict(
            zip(valueColumns,
                tup[value_col_start_index:value_col_end_index + 1]))),
                                       measurement=measurement,
                                       tags=dict(
                                           zip(groupColumns,
                                               tup[1:num_group_cols + 1])),
                                       time=int(tup[-1]))
        result.append(client.write_points(pj, time_precision='s'))
    return result
Example #24
async def load(manifest_path: Union[_Path, None]) -> Config:
    """Configuration loader function

    This architecture balances readiness for async-requiring configuration fetches (e.g. a cloud credential store)
    with the convenience of having synchronously constructed configuration objects. This way our configuration model
    classes can be nicely statically typed, at the cost of needing to make sure we have all the necessary data
    loaded & ready in time for synchronous cascading __init__ calls.
    """
    raw = {}
    if manifest_path:
        print("Loading configuration manifest %s" % manifest_path)
        with manifest_path.open() as manifest_file:
            raw = _safe_load(manifest_file)
    else:
        print("No configuration manifest supplied - using environment variables only")
    raw["env"] = _environ
    APPCONFIG = Config(raw)
    LOGGER = _getLogger(__name__)
    LOGGER.info(APPCONFIG)
    return APPCONFIG
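Because the loader is declared async (leaving room for awaited credential fetches), callers need an event loop; a hedged sketch with a hypothetical manifest path.

import asyncio
from pathlib import Path

config = asyncio.run(load(Path("config/manifest.yaml")))   # or load(None) to use environment variables only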
Example #25
def syslog_handlers(logger_name,
                    address=('127.0.0.1', 514),
                    facility=0,
                    level='DEBUG'):
    global _root_logger_name, root_logger, _stdout
    logger_names = []
    loggers = []
    if isinstance(logger_name, str):
        logger_names = [logger_name]
    elif isinstance(logger_name, (list, tuple)):
        logger_names = logger_name
    else:
        return loggers
    for name in logger_names:
        logger = _getLogger(name)
        _level = logger_filters.get(name) or level
        _level = _level.upper()
        if _level == 'OFF':
            handler = NullHandler()
            logger.addHandler(handler)
        elif _level in ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']:
            del logger.handlers[:]
            logger.setLevel(_level)
            handler = SysLogHandler(address=tuple(address),
                                    facility=SysLogHandler.LOG_LOCAL0 +
                                    facility)
            handler.setLevel(_level)
            logger.addHandler(handler)
            if _stdout:
                handler = StdoutHandler()
                handler.setLevel(_level)
                logger.addHandler(handler)
        logger.propagate = 0
        if name == _root_logger_name:
            root_logger = logger
        loggers.append(logger)
    return loggers
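A usage sketch; the logger names and local syslog address are illustrative, and per-logger levels can still be overridden via the module's logger_filters mapping.

loggers = syslog_handlers(["app", "app.db"],
                          address=("127.0.0.1", 514),
                          facility=0,
                          level="INFO")
for lg in loggers:
    lg.info("syslog handler attached")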
def normalizeTimeStamps(df=None, time_columns=None):
    """take in comscore time ids or assume time column is a timestamp and return a timestamp series"""
    logger = _getLogger(__name__)
    logger.info("in normalizeTimeStamps")
    timestamps = None
    # overall policy is to check for month id and if found use it alone
    # otherwise if time_id and hour_id present use those
    # otherwise use time_id else ensure time column is len 1, and assume it is already a timestamp
    if 'month_id' in time_columns:
        logger.info("found month_id")
        timestamps = df.month_id.apply(
            lambda z: _CSTimeToDatetime(month_id=z).timestamp())
    elif 'week_id' in time_columns:
        logger.info("found week_id")
        timestamps = df.week_id.apply(
            lambda z: _CSTimeToDatetime(week_id=z).timestamp())
    elif 'hour_id' in time_columns and 'time_id' in time_columns:
        logger.info("found time_id and hour_id")
        timestamps = df[['time_id', 'hour_id']].apply(
            lambda row: _CSTimeToDatetime(time_id=row['time_id'],
                                          hour_id=row['hour_id']).timestamp(),
            axis=1)
    elif 'time_id' in time_columns:
        logger.info("found time_id ")
        timestamps = df[['time_id']].apply(lambda row: _CSTimeToDatetime(
            time_id=row['time_id'], hour_id=0).timestamp(),
                                           axis=1)
    elif len(time_columns) == 1:  # assume it is already a timestamp
        timestamps = df[time_columns[0]].astype(int)
    else:
        raise Exception('Unrecognized time format')

    # log diagnostic type info
    if len(timestamps) > 0:
        logger.info("type info:%s" % (type(timestamps[0])))
        logger.info("%s" % (timestamps[0]))
    return timestamps
Example #27
def getLogger(name, parent=None):
    if parent:
        name = parent.name + '.' + name
    return _getLogger(name)
Example #28
Core ML Tools is a Python package for creating, examining, and testing models in the .mlmodel
format. In particular, it can be used to:

* Convert existing models to .mlmodel format from popular machine learning tools including:
     Keras, scikit-learn, libsvm, and XGBoost.
* Express models in .mlmodel format through a simple API.
* Make predictions with an .mlmodel (on select platforms for testing purposes).

For more information: http://developer.apple.com/documentation/coreml
"""
from enum import Enum as _Enum
from logging import getLogger as _getLogger

# Backup root logger handlers
_root_logger = _getLogger()
_root_logger_handlers_backup = _root_logger.handlers.copy()

from .version import __version__

# This is the basic Core ML specification format understood by iOS 11.0
SPECIFICATION_VERSION = 1

# New versions for iOS 11.2 features. Models which use these features should have these
# versions, but models created from this coremltools which do not use the features can
# still have the basic version.
_MINIMUM_CUSTOM_LAYER_SPEC_VERSION = 2
_MINIMUM_FP16_SPEC_VERSION = 2

# New versions for iOS 12.0 features. Models which use these features should have these
# versions, but models created from this coremltools which do not use the features can
Example #29
"""

# publicly visible to plugins
from stolos import argparse_shared as at
from stolos.configuration_backend import TasksConfigBaseMapping, TasksConfigBaseSequence

TasksConfigBaseSequence, TasksConfigBaseMapping
from stolos import api

at, api


# imports hidden from plugins
from logging import getLogger as _getLogger

log = _getLogger("stolos.plugins")
from stolos.exceptions import CodeError as _CodeError


def log_and_raise(err, log_details):
    """The less unexpected way for plugins to fail.

    A helper function that logs the given exception
    and then raises an exception.  Stolos will see this error, mark the
    job as failed and quit.  Plugin exceptions not handled by this function
    will cause Stolos to complain that you have unexpected errors in your
    plugin code.
    """
    log.exception(err, extra=log_details)
    raise _CodeError("Task failed. This particular error message will never appear in logs.")
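Typical plugin-side usage as the docstring describes; risky_step and the extra dict are hypothetical stand-ins for real plugin work and context.

try:
    risky_step()                                  # hypothetical plugin work
except ValueError as err:
    log_and_raise(err, {"job_id": "demo-job"})    # logs with context, then raises CodeError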
def processInputData(inputa,
                     fieldSeparator=None,
                     recordSeparator=None,
                     otsdbExcludeCharRegex=r'[^a-zA-Z\d\-_%s/]+',
                     otsdbExcludeCharReplacer='_'):
    logger = _getLogger(__name__)
    logger.info("in processInputData")

    output = {}
    if fieldSeparator is None or recordSeparator is None:
        return output
    engine = 'c'
    if len(recordSeparator) > 1:
        engine = 'python'
    lines = inputa.strip().split('\n')

    partsAll = [elem.strip().split(recordSeparator) for elem in lines]
    subfields = []
    for selector in range(3):
        subfields.append(
            [elem[selector].split(fieldSeparator) for elem in partsAll])

    for i in range(len(subfields[0])):

        clean_row_part(subfields[0][i],
                       otsdbExcludeCharRegex,
                       otsdbExcludeCharReplacer=otsdbExcludeCharReplacer,
                       excludeChars='')
        clean_row_part(subfields[1][i],
                       otsdbExcludeCharRegex,
                       otsdbExcludeCharReplacer=otsdbExcludeCharReplacer,
                       excludeChars='')
        clean_row_part(subfields[2][i],
                       otsdbExcludeCharRegex,
                       otsdbExcludeCharReplacer=otsdbExcludeCharReplacer,
                       excludeChars='.')

    columns = [elem.split('=')[0] for elem in subfields[0][0]]
    times = [elem.split('=')[0] for elem in subfields[1][0]]
    values = [elem.split('=')[0] for elem in subfields[2][0]]
    dfAll = []
    for currentFieldIndex, currentField in enumerate(subfields):
        d1 = []
        logger.info(currentFieldIndex)
        for indexa, elema in enumerate(currentField):
            try:
                d1.append(
                    {elem.split('=')[0]: elem.split('=')[1]
                     for elem in elema})
            except Exception as e:
                logger.info(e)
                logger.debug(elema)
                logger.debug((d1[0].keys()))
                raise
        dfAll.append(_DataFrame(d1))
    dfMaster = _concat(dfAll, axis=1)

    logger.info('before timestamp normalization')
    timestamp = normalizeTimeStamps(dfMaster, time_columns=times)
    dfMaster['timestamp'] = timestamp
    if logger.isEnabledFor(_DEBUG):
        logger.debug("type debug:%s" % (type(timestamp[0])))
        logger.debug("%s" % (timestamp[0]))
        logger.debug("type debug:%s" % (type(dfMaster['timestamp'][0])))
        logger.debug("%s" % (dfMaster['timestamp'][0]))
    output = {}
    output['df'] = dfMaster
    output['time_columns'] = times
    output['value_columns'] = values
    output['group_columns'] = columns
    logger.info("times=%s, values=%s, columns=%s" % (times, values, columns))
    logger.info("leave processInputData")
    return output
Example #31
def getLogger(name, parent=None):
    if parent:
        name = parent.name + '.' + name
    logger = _getLogger(name)
    logger.settings = settings
    return logger
Example #32
                      of segments per control point
    res.export: format string to use when exporting floating point vertices
    """
    def __init__(self, **kwargs):
        self.seg_frac = .05
        self.seg_angle = .08
        self.max_sections = 10
        self.min_sections = 5
        self.export = '.5f'


tol_path = NumericalTolerancePath()
res_path = NumericalResolutionPath()

# logging
log = _getLogger('trimesh')
log.addHandler(_NullHandler())


def _log_time(method):
    def timed(*args, **kwargs):
        tic = time_function()
        result = method(*args, **kwargs)
        log.debug('%s executed in %.4f seconds.', method.__name__,
                  time_function() - tic)
        return result

    timed.__name__ = method.__name__
    timed.__doc__ = method.__doc__
    return timed
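The decorator above simply wraps a callable and logs its wall-clock duration at DEBUG level on the trimesh logger; a quick sketch with a hypothetical function.

@_log_time
def _load_mesh(path):
    ...                                           # hypothetical expensive work
# calling _load_mesh(...) now logs "_load_mesh executed in x.xxxx seconds." at DEBUG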
Example #33
This module contains general utility functions and shared constants used by other TritonScraper modules.

:copyright: (c) 2010 by Christopher Rebert.
:license: MIT, see :file:`LICENSE.txt` for more details.
"""

from itertools import izip_longest as _izip_longest
from decimal import Decimal as _Decimal
from logging import getLogger as _getLogger

from triton_scraper.config import LOGGER_NAME as _LOGGER_NAME

from lxml.etree import XPath

#: Floating-point Not-a-Number (NaN) value.
NaN = _Decimal('NaN')
#: Floating-point infinity value; i.e. float('infinity')
INFINITY = float('infinity')

# Common XPath component
RELATIVE_PREFIX = "descendant-or-self::node()"

# TritonScraper's logger
LOGGER = _getLogger(_LOGGER_NAME)

# From the itertools cookbook: http://docs.python.org/library/itertools.html#recipes
def grouper(n, iterable, fillvalue=None):
    "grouper(3, 'ABCDEFG', 'x') --> ABC DEF Gxx"
    args = [iter(iterable)] * n
    return _izip_longest(fillvalue=fillvalue, *args)
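The recipe above targets Python 2 (izip_longest); under Python 3 the same grouping is written with itertools.zip_longest, for example:

from itertools import zip_longest

def grouper3(n, iterable, fillvalue=None):
    "grouper3(3, 'ABCDEFG', 'x') --> ABC DEF Gxx"
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)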
from graphlab.connect.aws._ec2 import LicenseValidationException


from graphlab.connect.aws._ec2 import get_credentials as _get_credentials
import graphlab as _gl
import graphlab.connect as _mt

# since _predictive_service_environment imports these, need to have them defined first
_MAX_CREATE_TIMEOUT_SECS = 600 # 10m

from _predictive_service._predictive_service_environment import Ec2PredictiveServiceEnvironment as _Ec2PredictiveServiceEnvironment
from _predictive_service._predictive_service_environment import LocalPredictiveServiceEnvironment as _LocalPredictiveServiceEnvironment
from _predictive_service._file_util import parse_s3_path as _parse_s3_path, s3_recursive_delete as _s3_recursive_delete, s3_delete_key as _s3_delete_key
from _predictive_service._predictive_service import PredictiveService as _PredictiveService

_logger = _getLogger(__name__)
_name_checker = _compile('^[a-zA-Z-]+$')

def create(name, environment, state_path, description = None, api_key = None, admin_key = None,
           ssl_credentials = None):
    '''
    Launch a Predictive Services cluster. This cluster can currently be launched
    on EC2 by specifying an EC2 environment.

    Parameters
    ----------
    name : str
        The name of the Predictive Service that will be launched.

        This string can only contain: a-z, A-Z and hyphens.
Example #36
    res.export: format string to use when exporting floating point vertices
    """

    def __init__(self, **kwargs):
        self.seg_frac = 0.05
        self.seg_angle = 0.08
        self.max_sections = 10
        self.min_sections = 5
        self.export = ".5f"


tol_path = NumericalTolerancePath()
res_path = NumericalResolutionPath()

### logging
log = _getLogger("trimesh")
log.addHandler(_NullHandler())


def _log_time(method):
    def timed(*args, **kwargs):
        tic = time_function()
        result = method(*args, **kwargs)
        log.debug("%s executed in %.4f seconds.", method.__name__, time_function() - tic)
        return result

    timed.__name__ = method.__name__
    timed.__doc__ = method.__doc__
    return timed

def mainProcessor(input_file_glob=None,
                  apiEntryPointList=[
                      'cviadqat07.office.comscore.com',
                  ],
                  processes=None,
                  metric='bID.gID.daySession.daily.input_dataExploded',
                  tagsToKeep=None,
                  compressTags=False,
                  overrideMillisecond=False,
                  useInflux=False,
                  encoding=None,
                  recordSeparator=None,
                  fieldSeparator=None):

    logger = _getLogger(__name__)
    logger.info("in main")

    startTimeMain = _time.time()
    inputa = ""
    """Read the file as string"""

    allFiles = sorted(_glob.glob(input_file_glob))
    logger.info('processing %s' % (str(allFiles)))
    counter = 0
    for file_ in allFiles:
        logger.info(f"Processing file: {file_}")
        inputa = readData(file_)

        inputData = decodeData(inputa, encoding)
        logger.info(
            f"Time taken to read the file: {getTimeTaken(startTimeMain, _time.time())}"
        )

        data = processInputData(inputData,
                                recordSeparator=recordSeparator,
                                fieldSeparator=fieldSeparator)
        if overrideMillisecond:
            #increment timestamp field
            df = data['df']
            df['timestamp'] *= 1000
            milliVector = _np.arange(1, df.shape[0] + 1)
            df['timestamp'] += milliVector

        splitedDF = split(data['df'], chunkSize=100)
        valueColumns = data['value_columns']
        groupColumns = data['group_columns']
        timeColumns = data['time_columns']

        logger.info(f"Starting put API call at: {_time.ctime()}")
        if useInflux:
            logger.info('before import into influx tsdb')
            #influx client needs leading http decorators removed
            r1 = _compile(r"^https?://(www\.)?")
            firstEntry = apiEntryPointList[0]
            firstEntry = r1.sub('', firstEntry)
            influxEntry = firstEntry.split(':')[0]
            influxPort = firstEntry.split(':')[1]
            cleanDataframe = data['df'].apply(_to_numeric, errors='ignore')
            logger.info('before import into influx tsdb %s' % (metric))
            result = writeDataFrameToInfluxDB(df=cleanDataframe,
                                              valueColumns=valueColumns,
                                              groupColumns=groupColumns,
                                              apiEntryPoint=influxEntry,
                                              port=influxPort,
                                              database=metric,
                                              measurement='autoload')
            logger.info(
                'successfully imported %d of %d records into influxdb' %
                (sum(result), len(result)))

        else:
            host_tag = False
            check_tsdb_alive = False

            putApiEndPoint = [
                f"{apiEntryPoint}/api/put/"
                for apiEntryPoint in apiEntryPointList
            ]
            assignApiEndPoint = [
                f"{apiEntryPoint}/api/uid/assign/"
                for apiEntryPoint in apiEntryPointList
            ]
            logger.info(putApiEndPoint)
            logger.info(assignApiEndPoint)

            # result = [
            #	 writeDataFrameToOpenTsdb(
            #		 sdf.reset_index(drop=True),
            #		 valueColumns,
            #		 groupColumns,
            #		 apiEntryPoint,
            #		 putApiEndPoint,
            #		 assignApiEndPoint,
            #		 port=4248,
            #		 metric=metric,
            #		 host_tag=False,
            #		 check_tsdb_alive=False) for sdf in splitedDF
            # ]
            loadBalancerCount = len(putApiEndPoint)
            logger.info('before import into opentsdb %s' % (metric))
            with _multiprocessing.Pool(processes=processes *
                                       loadBalancerCount) as pool:
                result = pool.starmap_async(
                    writeDataFrameToOpenTsdb,
                    [(sdf.reset_index(drop=True), valueColumns, groupColumns,
                      None, putApiEndPoint[indexa % loadBalancerCount],
                      assignApiEndPoint[indexa % loadBalancerCount], None,
                      metric, host_tag, check_tsdb_alive, 50, tagsToKeep,
                      50000, compressTags, overrideMillisecond)
                     for indexa, sdf in enumerate(splitedDF)],
                    chunksize=None,
                    callback=None,
                    error_callback=None)
                result.get(timeout=None)

        logger.info(f"End put API call at: {_time.ctime()}")
        logger.info(
            f"Time taken for main: {getTimeTaken(startTimeMain, _time.time())}"
        )
Example #38
                                   planar      = 1e-5,
                                   seg_frac    = .05,
                                   seg_angle   = .25,
                                   aspect_frac = .1,
                                   radius_frac = 1e-2,
                                   radius_min  = 1e-2,
                                   radius_max  = 50,
                                   tangent     = .0175)
res_path = _NumericalResolutionPath(seg_frac     = .04,
                                    seg_angle    = .18,
                                    max_sections = 10,
                                    min_sections = 5,
                                    export       = '.5f')

### logging
log = _getLogger('trimesh')
log.addHandler(_NullHandler())
def _log_time(method):
    def timed(*args, **kwargs):
        tic    = time_function()
        result = method(*args, **kwargs)
        log.debug('%s executed in %.4f seconds.',
                   method.__name__,
                   time_function()-tic)
        return result
    timed.__name__ = method.__name__
    timed.__doc__  = method.__doc__
    return timed

### exceptions
class MeshError(Exception):
    pass
def openTSDB_data_processor(metric_names=None,
                            query_string=None,
                            query_offset=20,
                            tolerance=0.0):

    logger = _getLogger(__name__)
    logger.info('read failure tolerance is %s' % tolerance)
    logger.info('number of metric names: %d' % (len(metric_names)))
    metrics = list(
        set([
            'm=none:' + elem['metric']
            for elem in _itertools.chain.from_iterable(metric_names)
        ]))
    logger.info('number of metrics as set: %d' % (len(metrics)))
    response_df_list = [(_requests.get('%s%s' % (query_string, "&".join(
        metrics[queryIndex:queryIndex + query_offset]))).json())
                        for queryIndex in range(0, len(metrics), query_offset)]

    flattened_list = list(_itertools.chain(*response_df_list))
    logger.info('number of elements in response list: %d' %
                (len(flattened_list)))
    if len(metrics) > 0:
        logger.info(str(metrics[:1]))
    super_dict = {'metric': [], 'dps': [], 'tags': [], 'aggregateTags': []}
    num_defective_records = 0
    num_correct_records = 0
    for ite in flattened_list:
        if ite == 'error' or type(ite) is not dict:
            num_defective_records += 1
        else:
            merge_dicts(super_dict, ite)
            num_correct_records += 1

    max_defective_records = num_defective_records * query_offset
    total_num_queries = max_defective_records + num_correct_records
    logger.info('number of read request errors are: %s' %
                num_defective_records)
    logger.info('max number of failed reads are: %s' % max_defective_records)
    logger.info('number of total queries sent are %s' % total_num_queries)
    if total_num_queries > 0:
        read_error_perc = max_defective_records / total_num_queries
    else:
        #in state where metrics were sent to tsdb for query, but no time series were returned.
        logger.warning('total_num_queries is zero. no input data found in tsdb')
        raise ValueError("NoRecordsFound")

    if read_error_perc > tolerance:
        raise ValueError(
            'Stopping execution as read errors exceeded tolerance for num metrics queried that do not exist in OTSDB. Read error percent is %s and tolerance is %s'
            % (read_error_perc, tolerance))

    causal_data_frame = _DataFrame.from_dict(super_dict)

    new_df = causal_data_frame.drop(
        ['dps', 'aggregateTags', 'tags'],
        axis=1).assign(**_DataFrame(causal_data_frame.dps.values.tolist()))
    transposed_df = new_df.T
    cleaned_causal_df = transposed_df.rename(columns=dict(
        zip(transposed_df.columns.tolist(), transposed_df.iloc[0].astype(
            str)))).drop(transposed_df.index[0])
    numeric_causal_df = cleaned_causal_df.apply(_to_numeric, errors='ignore')
    return numeric_causal_df
Example #40
def get_logger(name):
    logger = _getLogger(name)
    logger.addFilter(context_filter)
    return logger
    def __init__(self, module=__name__):
        self.logger = _noLogger()
        if DEBUG is True:
            self.logger = _getLogger('%s-%s(%x)' % (module, self.__class__, id(self)))
from logging import getLogger as _getLogger

### logging
log = _getLogger('truenorth-training')
Example #43
else:
    VERSION_INFO = tuple(_metadata.VERSION.split('.'))

# 2. Imports
__all__ = [
        'board',
        'config',
        'game',
        'logger',
        'mainloop',
        'metadata',
        'piece',
        'square',
        'utils']

# 3. Configure settings
from pyrochess.config import SETTINGS

# 4. Configure logging
from logging import getLogger as _getLogger
from pyrochess.logger import init_logging as _init_logging
_init_logging(SETTINGS)
_log = _getLogger(_metadata.PROGRAM)
_log.debug("=== {} v{} begun at: {} ===".format(__package__,
                                                VERSION,
                                                _DATE))
# 5. Expose main and other chained imports
from pyrochess.cli import main