Example #1
    def get_pipeline_status(cache_id, name):
        # Look up this pipeline in the dpp status store kept under the tmp folder
        status = status_mgr(f'{FILE_PATH}/tmp')
        status.initialize()
        pipeline_status = status.get(f'./{cache_id}/{name}')

        # Default values returned when the pipeline has never been executed
        start_time = None
        finish_time = None
        pipeline_id = None
        status = None
        success = None
        error_log = None

        # Pull details from the most recent execution, if there is one
        if pipeline_status and pipeline_status.last_execution:
            last_execution = pipeline_status.last_execution
            start_time = last_execution.start_time
            pipeline_id = last_execution.pipeline_id
            finish_time = last_execution.finish_time
            success = last_execution.success
            error_log = last_execution.error_log
            status = pipeline_status.state()

        return {
            'start_time': start_time,
            'finish_time': finish_time,
            'pipeline_id': pipeline_id,
            'status': status,
            'error_log': error_log,
            'success': success,
        }
Example #2
    def specs(self, argument, root_dir, ignore_missing_deps=False):
        # Collect every pipeline spec under root_dir whose id matches the given argument
        status_manager = status_mgr(root_dir)
        specs = []
        for spec in pipelines(ignore_missing_deps=ignore_missing_deps,
                              root_dir=root_dir,
                              status_manager=status_manager):
            if match_pipeline_id(argument, spec.pipeline_id):
                specs.append(spec)

        return specs
Example #3
def pipelines(prefixes=None,
              ignore_missing_deps=False,
              root_dir='.',
              status_manager=None):

    # Discover every pipeline-spec file under root_dir
    specs: Iterator[PipelineSpec] = find_specs(root_dir)
    hasher = HashCalculator()
    if status_manager is None:
        status_manager = status_mgr()
    if prefixes is None:
        prefixes = ('', )
    # Keep iterating over the specs, deferring any whose dependencies have not been
    # hashed yet, until a full pass makes no further progress
    while specs is not None:
        deferred = []
        found = False

        for spec_ in specs:
            spec: PipelineSpec = spec_

            if not any(
                    spec.pipeline_id.startswith(prefix)
                    for prefix in prefixes):
                continue

            if (spec.pipeline_details is not None and validate_pipeline(
                    spec.pipeline_details, spec.validation_errors)):

                resolve_processors(spec)
                process_schedules(spec)

                try:
                    hasher.calculate_hash(spec, status_manager,
                                          ignore_missing_deps)
                    found = True
                except DependencyMissingException as e_:
                    e: DependencyMissingException = e_
                    deferred.append((e.spec, e.missing))
                    continue

            yield spec

        if found and len(deferred) > 0:
            # At least one spec was hashed this pass, so retry the deferred ones
            specs = iter((x[0] for x in deferred))
        else:
            # No further progress was made: report each missing dependency as a spec error
            for spec, missing in deferred:
                spec.validation_errors.append(
                    SpecError(
                        'Missing dependency',
                        'Failed to find a dependency: {}'.format(missing)))
                yield spec
            specs = None
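
A minimal sketch of consuming the generator above, assuming the datapackage_pipelines import shown in the next example; the print statements are illustrative only and use the spec attributes visible in these snippets.

# Sketch only: iterate the generator and report each spec. Specs whose
# dependencies could not be resolved are yielded last with a
# 'Missing dependency' SpecError attached to validation_errors.
from datapackage_pipelines.specs.specs import pipelines

for spec in pipelines(root_dir='.'):
    if spec.validation_errors:
        print(spec.pipeline_id, spec.validation_errors)
    else:
        print(spec.pipeline_id, spec.cache_hash)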
Example #4
# -*- coding: utf-8 -*-
import json
import threading
import time
from http.server import HTTPServer, BaseHTTPRequestHandler

from datapackage_pipelines.manager import execute_pipeline, run_pipelines
from datapackage_pipelines.specs.specs import pipelines
from datapackage_pipelines.utilities.execution_id import gen_execution_id
from datapackage_pipelines.status import status_mgr


called_hooks = []
progresses = 0
status = status_mgr()

class SaveHooks(BaseHTTPRequestHandler):

    def do_POST(self):
        global progresses
        content_len = int(self.headers.get('content-length', 0))
        post_body = self.rfile.read(content_len)
        hook = json.loads(post_body)
        if hook['event'] != 'progress':
            called_hooks.append(hook)
        else:
            progresses += 1
        self.send_response(200)
        self.end_headers()
        return
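
The rest of the test module is not shown above. A rough sketch of how the SaveHooks handler might be served while a pipeline runs; the port number and thread setup are assumptions, not taken from the source.

# Assumption: serve SaveHooks on a background thread so a running pipeline
# can POST its progress hooks to it; the port is arbitrary.
server = HTTPServer(('localhost', 9000), SaveHooks)
threading.Thread(target=server.serve_forever, daemon=True).start()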
Example #5
    def run_pipeline(self, cache_id=None, verbose=False, num_rows=-1, background=False):
        '''
        Start a thread that runs the datapackage pipelines for this pipeline.

        - On failure, return an error message
        - On success, return the datapackage.json contents and the resulting csv

        - On both failure and success, return a status code and a unique id
          that can be passed back into this function to reuse the cache

        - If run in the background, use the static functions get_pipeline_status
          and get_pipeline_data to access the results.
        '''
        if not cache_id:
            cache_id = str(uuid.uuid1())

        # We have to check the cache_id value since it's
        # potentially being passed in from the outside
        pattern = re.compile(
            r'^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}'
            r'-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$'
        )
        if not pattern.match(cache_id):
            raise Exception('The unique ID that was provided was not in uuid format')

        ''' IMPORTANT '''
        # If the file structure between this file and the tmp folder
        # ever changes, this code must change
        cache_dir = f'{ROOT_DIR}/{cache_id}'
        results_folder = f'{cache_dir}/results'
        # Create the directory and file
        if not os.path.exists(cache_dir):
            start = time.time()
            os.makedirs(cache_dir)
            BcodmoPipeline.log_slow_compute(start, cache_id, 'creating the directories')
        try:
            start = time.time()
            self.save_to_file(f'{cache_dir}/pipeline-spec.yaml.original', steps=self._steps)
            BcodmoPipeline.log_slow_compute(start, cache_id, 'creating the pipeline-spec.yaml.original file')
            # Create a new save step so we can access the data here
            new_save_step = {
                'run': 'dump_to_path',
                'parameters': {
                    'out-path': results_folder,
                    'temporal_format_property': 'outputFormat',
                }
            }
            new_steps = self._steps + [new_save_step]
            start = time.time()
            self.save_to_file(f'{cache_dir}/pipeline-spec.yaml', steps=new_steps)
            BcodmoPipeline.log_slow_compute(start, cache_id, 'creating the pipeline-spec.yaml file')

            # Remove the results folder
            start = time.time()
            shutil.rmtree(results_folder, ignore_errors=True)
            BcodmoPipeline.log_slow_compute(start, cache_id, 'removing the results folder')

            start = time.time()
            pipeline_id = f'./{cache_id}/{self.name}'
            status = status_mgr(ROOT_DIR)
            status.initialize()
            pipeline_status = status.get(pipeline_id)
            last_execution = pipeline_status.last_execution
            BcodmoPipeline.log_slow_compute(start, cache_id, 'checking the status before creating a thread')
            old_start_time = None
            if last_execution:
                old_start_time = last_execution.start_time

            start = time.time()
            x = threading.Thread(target=self.run_pipeline_thread, args=(cache_id, verbose,), daemon=True)
            BcodmoPipeline.log_slow_compute(start, cache_id, 'creating the thread')
            start = time.time()
            x.start()
            BcodmoPipeline.log_slow_compute(start, cache_id, 'starting the thread')

            if background:
                while True:
                    # Loop until the next pipeline has started
                    start = time.time()
                    status = status_mgr(ROOT_DIR)
                    status.initialize()
                    pipeline_status = status.get(pipeline_id)
                    last_execution = pipeline_status.last_execution
                    BcodmoPipeline.log_slow_compute(start, cache_id, 'checking the status after creating the thread')
                    if last_execution and last_execution.start_time != old_start_time:
                        break
                    if x.is_alive():
                        time.sleep(0.1)
                    else:
                        return {
                            'status_code': 1,
                            'cache_id': cache_id,
                            'yaml': self.get_yaml(),
                            'error_text': 'There was an unknown error in starting the pipeline',
                        }

                return {
                    'status_code': 0,
                    'cache_id': cache_id,
                    'yaml': self.get_yaml(),
                }

            else:
                # Join the thread
                x.join()
                status_dict = BcodmoPipeline.get_pipeline_status(cache_id, self.name)
                if status_dict['success']:
                    pipeline_data = BcodmoPipeline.get_pipeline_data(cache_id, num_rows)
                    return {
                        'status_code': 0,
                        'cache_id': cache_id,
                        'yaml': self.get_yaml(),
                        'datapackage': pipeline_data['datapackage'],
                        'resources': pipeline_data['resources'],
                    }
                else:
                    return {
                        'status_code': 1,
                        'cache_id': cache_id,
                        'yaml': self.get_yaml(),
                        'error_text': status_dict['error_log'],
                    }

        finally:
            try:
                start = time.time()
                # Clean up the directory, deleting old folders
                cur_time = time.time()
                dirs = [
                    folder_name for folder_name in os.listdir(f'{FILE_PATH}/tmp')
                    if not folder_name.startswith('.')
                ]
                for folder_name in dirs:
                    folder = f'{FILE_PATH}/tmp/{folder_name}'
                    st = os.stat(folder)
                    modified_time = st.st_mtime
                    age = cur_time - modified_time

                    if age > DAY * 30:
                        shutil.rmtree(folder)
                BcodmoPipeline.log_slow_compute(start, cache_id, 'checking age status of folders after complete')
            except Exception as e:
                logger.info(f'There was an error trying to clean up folder: {str(e)}')
                logger.error(vars(e))
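
A rough usage sketch for the background path, assuming a variable named pipeline holds an already constructed BcodmoPipeline instance; the polling loop and interval are illustrative, not part of the source.

# Sketch only: start a background run, then poll the static helper from
# Example #1 until the execution has finished.
result = pipeline.run_pipeline(background=True)
if result['status_code'] == 0:
    cache_id = result['cache_id']
    while True:
        status_dict = BcodmoPipeline.get_pipeline_status(cache_id, pipeline.name)
        if status_dict['finish_time'] is not None:
            break
        time.sleep(1)
    if not status_dict['success']:
        print(status_dict['error_log'])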
Example #6
    def run_pipeline_thread(self, cache_id, verbose):
        cache_dir = f'{ROOT_DIR}/{cache_id}'
        pipeline_spec_path = f'{cache_dir}/pipeline-spec.yaml'
        pipeline_id = f'./{cache_id}/{self.name}'

        dpp_command_path, processor_path = self._get_version_paths(self.version)
        os.environ['DPP_PROCESSOR_PATH'] = processor_path
        try:
            # Activate the correct virtual environment
            start = time.time()
            self._activate_virtualenv(self.version)
            BcodmoPipeline.log_slow_compute(start, cache_id, 'activating the virtualenv')

            # Add the --verbose flag to the command if requested
            if verbose:
                command_list = [dpp_command_path, 'run', '--verbose', pipeline_id]
            else:
                command_list = [dpp_command_path, 'run', pipeline_id]

            # Start the dpp process
            start = time.time()
            p = subprocess.Popen(
                command_list,
                stderr=subprocess.DEVNULL,
                stdout=subprocess.DEVNULL,
                cwd=ROOT_DIR,
            )
            BcodmoPipeline.log_slow_compute(start, cache_id, 'creating the process')

            sleep_timer = 1
            start = time.time()
            while p.poll() is None:
                BcodmoPipeline.log_slow_compute(start, cache_id, 'polling the process')
                # Back off gradually, up to 5 seconds between polls
                time.sleep(sleep_timer)
                if sleep_timer != 5:
                    sleep_timer += 1

                # The pipeline-spec.yaml was deleted, need to end the process now
                if not os.path.exists(pipeline_spec_path):
                    # Get the children of the dpp process (the dpp slave process)
                    children = [child.pid for child in psutil.Process(p.pid).children()]

                    # Terminate the parent process
                    p.terminate()
                    # Terminate all of the children processes
                    for child in children:
                        os.kill(child, signal.SIGTERM)

                    # Invalidate the pipeline in the dpp backend
                    status = status_mgr(ROOT_DIR)
                    status.initialize()
                    pipeline_status = status.get(pipeline_id)
                    if pipeline_status:
                        last_execution = pipeline_status.last_execution
                        if last_execution:
                            last_execution.finish_execution(
                                False,
                                {},
                                ['This pipeline was stopped by laminar'],
                            )

                    # One last try
                    if p.poll() is None:
                        p.kill()
                        break
                start = time.time()
        finally:
            # Deactivate the virtualenv - not sure if this is necessary since it is a thread
            start = time.time()
            self._deactivate_virtualenv()
            BcodmoPipeline.log_slow_compute(start, cache_id, 'deactivating the virtualenv')

        # If the pipeline-spec.yaml file has been deleted since this thread started, the
        # whole cache_id folder should be deleted
        if not os.path.exists(pipeline_spec_path) and os.path.exists(cache_dir):
            shutil.rmtree(cache_dir)
Example #7
def register_all_pipelines(root_dir='.'):
    # Register every discovered spec with the status manager so its state can be tracked
    for spec in pipelines(root_dir=root_dir):
        ps = status_mgr().get(spec.pipeline_id)
        ps.init(spec.pipeline_details, spec.source_details,
                spec.validation_errors, spec.cache_hash)
        ps.save()