Ejemplo n.º 1
0
  def __GetTimeOutForTask(cls, task):
    """Returns the timeout for the task.

    Looks for a '<task>.timeout' file first, then a 'timeout' file inside
    the task's directory, and finally falls back to the global --timeout
    flag. The file may contain a number with an optional unit suffix
    (d/h/m/ms/us); a bare number is interpreted as seconds.

    Args:
      task: string: The task for which the timeout should be prepared.

    Returns:
      int: The timeout in seconds.
    """
    timeout = FileUtils.FileContents(task + '.timeout')
    if not timeout:
      timeout = FileUtils.FileContents(
          os.path.join(PipelineUtils.TaskDirName(task), 'timeout'))

    if not timeout: return Flags.ARGS.timeout

    # Raw strings for the regex patterns: '\s' and '\d' in plain string
    # literals are invalid escape sequences (a SyntaxWarning in Python 3.12+).
    timeout = re.sub(r'\s*', '', timeout)
    # Splitting on the captured digits yields ['', '<digits>', '<suffix>']
    # for any valid spec, so fewer than 3 parts means no number was present.
    timeout_parts = re.split(r'(\d+)', timeout)
    if len(timeout_parts) < 3:
      TermColor.Warning('Ignoring invalid timeout [%s] for task: %s' % (timeout, task))
      return Flags.ARGS.timeout

    timeout = float(timeout_parts[1])
    annotation = timeout_parts[2]
    # A bare number (no unit suffix) is already in seconds.
    if not annotation: return timeout
    elif annotation == 'd': timeout *= 86400
    elif annotation == 'h': timeout *= 3600
    elif annotation == 'm': timeout *= 60
    elif annotation == 'ms': timeout *= 0.001
    elif annotation == 'us': timeout *= 0.000001
    return timeout
Ejemplo n.º 2
0
def read_js_version(javascript_version_file=None):
    '''Read the JS version stamp attached when building the production JS
    bundles to improve frontend error reporting. If no version file was added,
    we do not support versioning for this deployment.

    Args:
        javascript_version_file: Optional explicit path to the version file.
            Defaults to the standard build output location.

    Returns:
        str: The version stamp, or an empty string when versioning is not
        supported for this deployment.
    '''
    # Only production has JS versions; skip the path lookup entirely when we
    # are not in production.
    if not IS_PRODUCTION:
        return ''

    # If we are in production but the version file does not exist, FileUtils
    # will not resolve the absolute path and will return None. Some
    # deployments do not support versioning.
    javascript_version_file = javascript_version_file or FileUtils.GetAbsPathForFile(
        'web/public/build/version.txt')
    if not javascript_version_file:
        return ''

    # Guard against an unreadable/empty file so .strip() is never called on
    # None (FileContents returns None on failure).
    contents = FileUtils.FileContents(javascript_version_file)
    return contents.strip() if contents else ''
Ejemplo n.º 3
0
def build_indexing_task(version):
    """Assemble a DruidIndexingTaskBuilder from the command line flags.

    Expands the --data_files patterns into absolute paths, reads any
    optional task definition overrides, and picks (or generates) the
    datasource name before constructing the builder.
    """
    # Expand every input pattern into one deduplicated set of absolute paths.
    working_dir = os.getcwd()
    full_paths = set()
    for input_path in Flags.ARGS.data_files:
        full_paths.update(build_absolute_paths(input_path, working_dir))

    assert len(full_paths) > 0, 'No matching paths found for indexing!'

    # Optional task definition overrides; each is None when the flag was not
    # provided.
    task_template_json = FileUtils.FileContents(Flags.ARGS.task_template_file)
    metrics_spec_json = FileUtils.FileContents(Flags.ARGS.metrics_spec_file)
    tuning_config_json = FileUtils.FileContents(Flags.ARGS.tuning_config_file)

    # If no datasource name is specified, generate a valid site datasource
    # and use its name.
    datasource_name = Flags.ARGS.datasource_name
    if not datasource_name:
        datasource_name = SiteDruidDatasource(DEPLOYMENT_NAME, TODAY).name

    def parse_flag_date(raw_date):
        # Both date flags use the canonical Druid date format.
        return datetime.datetime.strptime(raw_date, DRUID_DATE_FORMAT)

    return DruidIndexingTaskBuilder(
        datasource_name,
        DIMENSIONS,
        BaseRowType.DATE_FIELD,
        full_paths,
        parse_flag_date(Flags.ARGS.min_data_date),
        parse_flag_date(Flags.ARGS.max_data_date),
        task_template_json,
        metrics_spec_json,
        tuning_config_json,
        version,
    )
Ejemplo n.º 4
0
def task_contains_new_data(indexing_task, cur_datasource, cur_version):
    """Return True when the indexing task covers data not already indexed.

    Compares the stored file hashes for the current datasource/version with
    the hashes of the files the new task would index.
    """
    # Locate the stored indexing hash for the current datasource so we have
    # something to compare against.
    cur_hash_file = get_hash_storage_path(cur_datasource, cur_version)
    if not os.path.isdir(Flags.ARGS.task_hash_dir):
        raise RuntimeError('You need to create the task hash dir, %s' %
                           Flags.ARGS.task_hash_dir)

    # No stored hash means we have never indexed this datasource/version, so
    # the task must contain new data.
    if not os.path.isfile(cur_hash_file):
        return True

    # Each line of the hash file contains a separate file hash. Compare
    # the current file hashes with the new file hashes to see if there is a
    # difference.
    # NOTE(stephen): Intentionally not using a set here since it's possible
    # for an indexing job to index the same file twice on purpose.
    stored_hashes = FileUtils.FileContents(cur_hash_file).split('\n')
    return sorted(stored_hashes) != sorted(indexing_task.get_file_hashes())
Ejemplo n.º 5
0
class DruidIndexingTaskBuilder(object):
    """Builds a Druid indexing task definition from a JSON template.

    The template contains `{{PLACEHOLDER}}` tokens that are substituted with
    the datasource name, date range, dimensions, metrics spec, and tuning
    config before the result is parsed as JSON.
    """

    # Default JSON fragments, read once at class-definition time.
    _DEFAULT_METRICS_SPEC = FileUtils.FileContents(DEFAULT_METRICS_SPEC_FILE)
    _DEFAULT_TASK_TEMPLATE = FileUtils.FileContents(DEFAULT_TASK_TEMPLATE_FILE)
    _DEFAULT_TUNING_CONFIG = FileUtils.FileContents(DEFAULT_TUNING_CONFIG_FILE)

    def __init__(
        self,
        datasource_name,
        dimensions,
        date_column,
        paths,
        start_date,
        end_date,
        task_template_json=None,
        metrics_spec_json=None,
        tuning_config_json=None,
        version=None,
    ):
        """Initialize the builder.

        Args:
            datasource_name: Name of the Druid datasource to index into.
            dimensions: List of dimension names for the datasource.
            date_column: Name of the column holding the row date.
            paths: Iterable of input file paths to index.
            start_date: datetime for the start of the data interval.
            end_date: datetime for the end of the data interval.
            task_template_json: Optional task template override (JSON string).
            metrics_spec_json: Optional metrics spec override (JSON string).
            tuning_config_json: Optional tuning config override (JSON string).
            version: Optional explicit version to attach to the task.
        """
        self._datasource = datasource_name
        self._dimensions = dimensions
        self._date_column = date_column
        self._start_date = start_date.strftime(DRUID_DATE_FORMAT)
        self._end_date = end_date.strftime(DRUID_DATE_FORMAT)
        # Fall back to the class defaults when no override was provided.
        self._task_template_json = (
            task_template_json or self.default_task_template()).strip()
        self._metrics_spec_json = (
            metrics_spec_json or self.default_metrics_spec()).strip()
        self._tuning_config_json = (
            tuning_config_json or self.default_tuning_config()).strip()
        self._version = version
        # If an explicit version has been set, add it to the tuning config.
        if version:
            tuning_config = json.loads(self._tuning_config_json)
            tuning_config['version'] = version
            tuning_config['useExplicitVersion'] = True
            self._tuning_config_json = json.dumps(tuning_config)

        # Validate the input file paths before building the path input spec.
        for path in paths:
            _validate_file_path(path)
        self._paths = paths
        self._input_spec = build_input_spec(paths)
        self._task_dict = self._build_task()

    def _build_task(self):
        """Substitute all template placeholders and parse the task JSON."""
        substitutions = (
            ('{{INPUT_SPEC_JSON}}', self._input_spec),
            ('{{DATASOURCE_NAME}}', self.datasource),
            ('{{DATA_START_DATE}}', self._start_date),
            ('{{DATA_END_DATE}}', self._end_date),
            ('{{DIMENSIONS_JSON}}', json.dumps(self._dimensions)),
            ('{{DATE_COLUMN_NAME}}', self._date_column),
            ('{{METRICS_SPEC_JSON}}', self._metrics_spec_json),
            ('{{TUNING_CONFIG_JSON}}', self._tuning_config_json),
        )
        raw_json = self._task_template_json
        for placeholder, value in substitutions:
            raw_json = raw_json.replace(placeholder, value)
        return json.loads(raw_json.strip())

    @property
    def datasource(self):
        return self._datasource

    @property
    def version(self):
        return self._version

    @property
    def task_definition(self):
        return self._task_dict

    # Compute a reproducible representation of the files designated for
    # ingestion.
    def get_task_hash(self):
        return '\n'.join(sorted(self.get_file_hashes()))

    # Create a list of file hashes for the set of files to be indexed
    def get_file_hashes(self):
        return [compute_file_hash(p) for p in self._paths]

    # Print a human readable overview of what this indexing task will do
    def print_overview(self):
        print('Indexing Task Overview')
        print('Datasource: %s' % self.datasource)
        print('Version: %s' % self.version)
        print('Dimensions: %s' %
              json.dumps(sorted(self._dimensions), indent=2))
        print('Date column: %s' % self._date_column)
        print('Start date: %s' % self._start_date)
        print('End date: %s' % self._end_date)
        print('Paths: %s' % json.dumps(sorted(self._paths), indent=2))

    @classmethod
    def default_metrics_spec(cls):
        return cls._DEFAULT_METRICS_SPEC

    @classmethod
    def default_task_template(cls):
        return cls._DEFAULT_TASK_TEMPLATE

    @classmethod
    def default_tuning_config(cls):
        return cls._DEFAULT_TUNING_CONFIG
Ejemplo n.º 6
0
import os

from pylib.file.file_utils import FileUtils

# Detect if the current code is running within EC2.
# The DMI BIOS version string on EC2 instances contains 'amazon'.
_BIOS_VERSION_FILE = '/sys/devices/virtual/dmi/id/bios_version'
# `or ''` guards against FileContents returning None for an unreadable file,
# which would make the `in` test raise a TypeError.
RUNNING_IN_EC2 = os.path.isfile(_BIOS_VERSION_FILE) and 'amazon' in (
    FileUtils.FileContents(_BIOS_VERSION_FILE) or ''
)