def __run_sample(self, sample, analysis_folder, machine_run_folder):
    """Launch the configured pipeline for a single sample.

    The two read files are resolved with ``self.__fetch_reads`` and handed to
    the pipeline together with the sample name and the output folder. The new
    run is registered as a child of the run named by the ``RUN_ID``
    environment variable.

    :param sample: value of the ``SAMPLE`` pipeline parameter.
    :param analysis_folder: value of the ``OUTPUT_FOLDER`` output parameter.
    :param machine_run_folder: forwarded to ``self.__fetch_reads``.
    :return: the ``id`` field of the launched run.
    :raises KeyError: if ``RUN_ID`` is missing from the environment.
    """
    launch_message = 'Launching analytical pipeline "%s" with version "%s" for sample %s.' % (
        self.pipeline['name'], self.version, sample)
    Logger.info(launch_message, task_name=self.task)

    read1, read2 = self.__fetch_reads(sample, analysis_folder, machine_run_folder)

    pipeline_params = {}
    pipeline_params['SAMPLE'] = {'value': sample}
    pipeline_params['READ1'] = {'value': read1, 'type': 'input'}
    pipeline_params['READ2'] = {'value': read2, 'type': 'input'}
    pipeline_params['OUTPUT_FOLDER'] = {'value': analysis_folder, 'type': 'output'}

    launched = self.api.launch_pipeline(
        self.pipeline['id'],
        self.version,
        pipeline_params,
        instance=self.instance_type,
        disk=self.instance_disk,
        parent_run_id=os.environ['RUN_ID'])
    return launched['id']
def check_or_install_fuse(self):
    """Install the preferred S3 FUSE implementation and report which one is usable.

    The preferred type is read from the ``CP_S3_FUSE_TYPE`` environment
    variable and defaults to ``FUSE_GOOFYS_ID``. When s3fs is preferred but
    its install command fails, goofys is attempted as a fallback.

    :return: ``FUSE_GOOFYS_ID`` or ``FUSE_S3FS_ID`` for the implementation
        whose install command succeeded, ``FUSE_NA_ID`` when nothing could be
        installed or the requested type is unknown.
    """
    fuse_type = os.getenv('CP_S3_FUSE_TYPE', FUSE_GOOFYS_ID)

    if fuse_type not in (FUSE_GOOFYS_ID, FUSE_S3FS_ID):
        Logger.warn(
            "FUSE {fuse_type} type is not defined for S3 fuse".format(fuse_type=fuse_type),
            task_name=self.task_name)
        return FUSE_NA_ID

    # Goofys is checked first so that it wins if both identifiers ever compare equal,
    # exactly as the original if/elif chain did.
    if fuse_type != FUSE_GOOFYS_ID:
        if self.execute_and_check_command('install_s3_fuse_s3fs'):
            return FUSE_S3FS_ID
        Logger.warn(
            "FUSE {fuse_type} was preferred, but failed to install, will try to setup default goofys"
            .format(fuse_type=fuse_type),
            task_name=self.task_name)

    # Shared path: goofys requested explicitly, or used as the s3fs fallback.
    fuse_installed = self.execute_and_check_command('install_s3_fuse_goofys')
    return FUSE_GOOFYS_ID if fuse_installed else FUSE_NA_ID
def launch(self, instance_size, instance_disk, docker_image, cmd, wait_finish=False):
    """Schedule one child pipeline per entry of ``self.run_dirs``.

    :param instance_size: forwarded to ``self.launch_pipeline``.
    :param instance_disk: forwarded to ``self.launch_pipeline``.
    :param docker_image: forwarded to ``self.launch_pipeline``.
    :param cmd: forwarded to ``self.launch_pipeline``.
    :param wait_finish: when true, ``self.wait_all_samples_finish`` is called
        after scheduling.
    """
    task = self.TASK_NAME
    Logger.info('Starting {} sample(s) scheduling.'.format(len(self.run_dirs)), task_name=task)

    scheduled = 0
    for scheduled, folder in enumerate(self.run_dirs, 1):
        self.launch_pipeline(folder, self.param_names, instance_size, instance_disk,
                             docker_image, cmd)
        Logger.info('Processing {} sample(s).'.format(scheduled), task_name=task)

    Logger.info('Successfully scheduled {} sample(s).'.format(scheduled), task_name=task)

    if wait_finish:
        Logger.info('Waiting for all runs to finish.', task_name=task)
        self.wait_all_samples_finish()
    Logger.success('All child pipeline successfully finished.', task_name=task)
def perform_transfer(self, path, source, destination, cluster, upload, rules=None):
    """Copy ``source`` to ``destination``, either directly or split across a worker pool.

    A direct ``S3Bucket`` copy is used when the path type is
    ``PathType.HTTP_OR_FTP``, when no cluster is given, or when
    ``self.is_file(source)`` is true. Otherwise the source is split into
    chunks and each chunk is handed to ``transfer_async`` in its own process.

    :param path: object whose ``type`` attribute is compared with ``PathType.HTTP_OR_FTP``.
    :param source: location to copy from.
    :param destination: location to copy to.
    :param cluster: forwarded to ``self.split_source_into_chunks``; ``None`` forces a direct copy.
    :param upload: when true, no rules are applied.
    :param rules: rules applied to the chunked transfer when ``upload`` is false.
    :raises KeyError: if ``SHARED_WORK_FOLDER`` is not set for a chunked transfer.
    """
    Logger.info('Uploading files from {} to {}'.format(source, destination), self.task_name)

    if path.type == PathType.HTTP_OR_FTP or cluster is None or self.is_file(source):
        # NOTE(review): this branch consults self.rules, not the `rules` argument used by
        # the chunked branch below - kept as is, confirm which one is intended.
        if upload or self.rules is None:
            S3Bucket().pipe_copy(source, destination, TRANSFER_ATTEMPTS)
        else:
            S3Bucket().pipe_copy_with_rules(source, destination, TRANSFER_ATTEMPTS, self.rules)
        return

    common_folder = os.path.join(os.environ['SHARED_WORK_FOLDER'], 'transfer')
    applied_rules = None if upload else rules
    try:
        chunks = self.split_source_into_chunks(cluster, source, destination, common_folder,
                                               applied_rules)
        # Pool(0) raises ValueError, so an empty chunk list simply means nothing to transfer.
        if chunks:
            transfer_pool = Pool(len(chunks))
            try:
                transfer_pool.map(transfer_async, chunks)
            finally:
                # Release the worker processes even when a chunk transfer fails.
                transfer_pool.close()
                transfer_pool.join()
    finally:
        # The shared folder is scratch space for this transfer; remove it on every exit path.
        shutil.rmtree(common_folder, ignore_errors=True)
def build_dts_path(self, path, dts_registry, input_type):
    """Build a ``LocalizedPath`` for a remote path served by a DTS prefix.

    The first prefix from ``dts_registry`` that ``path`` starts with is used.
    The remainder of the path is appended to ``self.bucket`` to form the
    bucket path. For output parameters the local path is
    ``self.analysis_dir``; otherwise it is the remainder joined onto
    ``self.get_local_dir(input_type)``.

    :param path: remote path to localize.
    :param dts_registry: iterable of DTS prefixes.
    :param input_type: parameter type, compared with ``ParameterType.OUTPUT_PARAMETER``.
    :return: ``LocalizedPath`` of type ``PathType.DTS``.
    :raises RuntimeError: if a prefix matches but ``self.bucket`` is not set,
        or if no prefix matches ``path``.
    """
    for prefix in dts_registry:
        if not path.startswith(prefix):
            continue
        if not self.bucket:
            raise RuntimeError('Transfer bucket shall be set for DTS locations')

        # Strip only the leading prefix; str.replace would also remove any
        # later occurrence of the same text inside the path.
        relative_path = path[len(prefix):]
        s3_path = self.join_paths(self.bucket, relative_path)

        is_output = input_type == ParameterType.OUTPUT_PARAMETER
        if is_output:
            local_path = self.analysis_dir
        else:
            local_dir = self.get_local_dir(input_type)
            local_path = self.join_paths(local_dir, relative_path)

        Logger.info(
            'Found remote {} path {} matching DTS prefix {}. '
            'It will be uploaded to bucket path {} and localized {} {}.'.format(
                input_type, path, prefix, s3_path,
                'from' if is_output else 'to',
                local_path),
            task_name=self.task_name)
        return LocalizedPath(path, s3_path, local_path, PathType.DTS, prefix=prefix)

    # The original message was raised with its %s placeholder unformatted.
    raise RuntimeError('Remote path %s does not match any of DTS prefixes.' % path)
def create_directory(self, path):
    """Create ``path`` with ``mkdir -p`` and report whether the command succeeded.

    :param path: directory to create; interpolated into the command as is.
    :return: ``True`` when the command result is 0, ``False`` otherwise
        (a warning is logged in that case).
    """
    mkdir_command = 'mkdir -p {path}'.format(path=path)
    exit_code = common.execute_cmd_command(mkdir_command, silent=True)
    if exit_code == 0:
        return True
    Logger.warn('Failed to create mount directory: {path}'.format(path=path),
                task_name=self.task_name)
    return False
def execute_mount(self, command, params):
    """Run a mount command and log the outcome.

    :param command: command passed to ``common.execute_cmd_command``.
    :param params: mapping providing the ``path`` and ``mount`` keys used in
        the log messages.
    """
    exit_code = common.execute_cmd_command(command, silent=True)
    if exit_code != 0:
        Logger.warn('--> Failed mounting {path} to {mount}'.format(**params),
                    task_name=self.task_name)
        return
    Logger.info('-->{path} mounted to {mount}'.format(**params), task_name=self.task_name)
def read(cls, report_file, task):
    """Read a count from the third line of a Flagstats report.

    The first two lines are skipped; the value is the integer that precedes
    the first ``+`` on the third line.

    :param report_file: path of the report to read.
    :param task: task name used for logging.
    :return: the parsed integer, or ``None`` when the file has fewer than three lines.
    """
    Logger.info("Reading Flagstats report from file %s." % report_file, task_name=task)
    with open(report_file, 'r') as report:
        for position, row in enumerate(report):
            if position >= 2:
                leading_field = row.split('+')[0]
                return int(leading_field.strip())
    return None
def __fill_trim_data(self, sample_metrics):
    """Store the post-trimming read count in ``sample_metrics``.

    Both trimmed FastQC archives (R1 and R2) under ``FastQC_Trimmed`` are
    read with ``FastQCReader.read``; only the first value of each result is
    used and their sum is stored under ``ReadsAfterTrim``.

    :param sample_metrics: mapping updated in place.
    """
    Logger.info("Fetching data from FASTQC reports after trimming.", task_name=self.task)

    first_mate_report = os.path.join(
        self.folder, "FastQC_Trimmed",
        self.file_suffix + ".Trimmomatic.R1.trimmed_fastqc.zip")
    first_total, _, _, _ = FastQCReader.read(first_mate_report, self.task)

    second_mate_report = os.path.join(
        self.folder, "FastQC_Trimmed",
        self.file_suffix + ".Trimmomatic.R2.trimmed_fastqc.zip")
    second_total, _, _, _ = FastQCReader.read(second_mate_report, self.task)

    sample_metrics["ReadsAfterTrim"] = first_total + second_total
def __fill_starting_data(self, sample_metrics):
    """Store metrics from the initial FastQC reports in ``sample_metrics``.

    Read totals and poor-quality read counts are summed over R1 and R2;
    read length and GC are taken from the R1 report only.

    :param sample_metrics: mapping updated in place with the keys
        ``StartingReads``, ``QCFailedReads``, ``ReadLength`` and ``GC``.
    """
    Logger.info("Fetching data from FASTQC Initial reports.", task_name=self.task)

    first_mate_report = os.path.join(self.folder, "FastQC_Initial",
                                     self.sample + "_R1_fastqc.zip")
    first_total, first_poor, first_gc, first_length = FastQCReader.read(
        first_mate_report, self.task)

    second_mate_report = os.path.join(self.folder, "FastQC_Initial",
                                      self.sample + "_R2_fastqc.zip")
    second_total, second_poor, _, _ = FastQCReader.read(second_mate_report, self.task)

    sample_metrics["StartingReads"] = first_total + second_total
    sample_metrics["QCFailedReads"] = first_poor + second_poor
    sample_metrics["ReadLength"] = first_length
    sample_metrics["GC"] = first_gc
def read(cls, report_file, task):
    """Compute the mean of the third tab-separated column of a coverage report.

    Every non-blank line counts as one base; its third column is parsed as
    an integer and accumulated.

    :param report_file: path of the tab-separated report.
    :param task: task name used for logging.
    :return: ``0`` for a report without data lines, otherwise the accumulated
        coverage divided by the number of data lines (using the interpreter's
        ``/`` semantics for integers).
    :raises IndexError: if a data line has fewer than three columns.
    :raises ValueError: if the third column is not an integer.
    """
    Logger.info("Reading Coverage report from file %s." % report_file, task_name=task)
    total_bases = 0
    total_coverage = 0
    with open(report_file, 'r') as report:
        # Iterate lazily instead of loading the whole report with readlines().
        for line in report:
            # Lines keep their trailing newline, so a plain truthiness test never
            # filters anything; strip first to really skip blank lines.
            if not line.strip():
                continue
            total_bases += 1
            total_coverage += int(line.split("\t")[2])
    return 0 if total_bases == 0 else total_coverage / total_bases
def main():
    """Parse the command line and mount data storages unless running under DTS.

    Arguments: ``--mount-root`` (required), ``--tmp-dir`` (required) and
    ``--task`` (defaults to ``MOUNT_DATA_STORAGES``). Mounting is skipped
    when the ``EXEC_ENVIRONMENT`` variable is set to ``DTS``.
    """
    parser = argparse.ArgumentParser()
    for required_option in ('--mount-root', '--tmp-dir'):
        parser.add_argument(required_option, required=True)
    parser.add_argument('--task', required=False, default=MOUNT_DATA_STORAGES)
    args = parser.parse_args()

    runs_under_dts = EXEC_ENVIRONMENT in os.environ and os.environ[EXEC_ENVIRONMENT] == DTS
    if not runs_under_dts:
        MountStorageTask(args.task).run(args.mount_root, args.tmp_dir)
        return
    Logger.success('Skipping cloud storage mount for execution environment %s' % DTS,
                   task_name=args.task)
def fetch_dts_registry(self):
    """Load the DTS registry and index it by path prefix.

    Loading is best effort: when ``self.api.load_dts_registry`` fails the
    error is logged and an empty mapping is returned.

    :return: dict mapping each registry prefix to that registry's ``url``.
    """
    result = {}
    try:
        dts_data = self.api.load_dts_registry()
    except Exception as e:
        # Exception (not BaseException) lets KeyboardInterrupt/SystemExit through.
        # str(e) replaces the deprecated e.message, which is empty for
        # exceptions built from several arguments.
        Logger.info("DTS is not available: %s" % str(e), task_name=self.task_name)
        return result
    for registry in dts_data:
        for prefix in registry['prefixes']:
            result[prefix] = registry['url']
    return result
def __wait_run_completion(self, run_id):
    """Poll a run once a minute until its status is no longer ``RUNNING``.

    :param run_id: numeric id of the run to poll (formatted with ``%d``).
    :return: the first status observed that differs from ``RUNNING``.
    """
    while True:
        status = self.api.load_run(run_id)['status']
        if status != 'RUNNING':
            break
        Logger.info('Run %d status is %s. Waiting for completion...' % (run_id, status),
                    task_name=self.task)
        time.sleep(60)
    Logger.info('Run %d finished with status %s' % (run_id, status), task_name=self.task)
    return status
def run(self):
    """Write variant metrics for every record of ``self.vcf_file`` to ``self.output_file``.

    A header is written first via ``self.__write_header``. Lines up to and
    including the ``#CHROM`` column header are skipped; every following
    non-blank line is passed to ``self.__process_variant``.
    """
    Logger.info("Reading %s file to collect variants metrics." % self.vcf_file,
                task_name=self.task)
    with open(self.output_file, 'w+') as output, open(self.vcf_file, 'r') as vcf:
        self.__write_header(output)
        lines_started = False
        # Stream the file: VCFs can be large and readlines() would hold all of it in memory.
        for vcf_line in vcf:
            if lines_started:
                # Lines keep their newline, so strip before testing for emptiness;
                # otherwise blank lines would be handed to the variant parser.
                if vcf_line.strip():
                    self.__process_variant(output, vcf_line)
            elif vcf_line.startswith("#CHROM"):
                lines_started = True
def _build_remote_path(self, path, input_type, path_type):
    """Build a ``LocalizedPath`` for a remote path that is used as is.

    For output parameters the local path is ``self.analysis_dir``. For any
    other type the ``scheme://netloc`` part is removed from ``path`` and the
    remainder is joined onto ``self.get_local_dir(input_type)``.

    :param path: remote path (used unchanged as both first and second
        ``LocalizedPath`` arguments).
    :param input_type: parameter type, compared with ``ParameterType.OUTPUT_PARAMETER``.
    :param path_type: path type; its lower-cased form appears in the log message.
    :return: the resulting ``LocalizedPath``.
    """
    if input_type == ParameterType.OUTPUT_PARAMETER:
        local_path = self.analysis_dir
    else:
        parsed = urlparse.urlparse(path)
        origin = '%s://%s' % (parsed.scheme, parsed.netloc)
        relative_path = path.replace(origin, '')
        local_path = self.join_paths(self.get_local_dir(input_type), relative_path)

    details = (path_type.lower(), input_type, path, local_path)
    Logger.info('Found %s %s path %s. It will be localized to %s.' % details,
                task_name=self.task_name)
    return LocalizedPath(path, path, local_path, path_type)
def get_variable_value(variable_name):
    """Return the value of an environment variable, split on the delimiter if present.

    :param variable_name: name of the environment variable to read.
    :return: ``None`` when the variable is unset or empty; a list of the
        non-empty parts when the value contains ``VARIABLE_DELIMITER``;
        otherwise the raw string value.
    """
    Logger.log_task_event(GENERATE_INPUTS_TASK, "Getting value of: {}".format(variable_name))
    # The original looked the value up under the undefined name `env_key`,
    # which raised NameError for every variable that was actually set.
    variable_value = os.environ.get(variable_name)
    if not variable_value:
        return None
    if VARIABLE_DELIMITER in variable_value:
        variable_value = [x for x in variable_value.split(VARIABLE_DELIMITER) if x]
    Logger.log_task_event(GENERATE_INPUTS_TASK,
                          "Value of {}:\n{}".format(variable_name, variable_value))
    return variable_value
def run(self, worker_pods, path, run_id):
    """Write a hostfile listing the master pod followed by every worker pod.

    Each worker pod is also passed to ``self.add_to_hosts``. Any exception
    is reported through ``self.fail_task`` instead of being propagated.

    :param worker_pods: iterable of pods exposing a ``name`` attribute.
    :param path: location of the hostfile to (over)write.
    :param run_id: id used to look up the master pod via ``self.kube.get_pod``.
    """
    try:
        Logger.info('Creating hostfile {}'.format(path), task_name=self.task_name)
        # `hostfile` instead of `file`, which shadows the Python 2 builtin.
        with open(path, 'w') as hostfile:
            master_pod = self.kube.get_pod(run_id)
            hostfile.write('{}\n'.format(master_pod.name))
            for pod in worker_pods:
                hostfile.write('{}\n'.format(pod.name))
                self.add_to_hosts(pod)
        Logger.success('Successfully created hostfile {}'.format(path),
                       task_name=self.task_name)
    except Exception as e:
        # str(e) instead of the deprecated e.message, which is empty for
        # multi-argument exceptions such as IOError(errno, strerror).
        self.fail_task(str(e))
def await_workers_start(self, nodes_number, parent_id):
    """Wait until all worker runs of ``parent_id`` are reported as started.

    Workers are polled up to 60 times with a 10 second pause between polls
    (approximately 10 minutes). Any exception, including the timeout, is
    reported through ``self.fail_task``.

    :param nodes_number: requested number of workers; ``0`` returns immediately.
    :param parent_id: id passed to ``self.get_workers``.
    :return: ``[]`` when no workers were requested, otherwise the value of
        ``self.get_started_workers`` once its length matches the number of
        workers; nothing is returned explicitly after a reported failure.
    """
    if nodes_number == 0:
        Logger.success('No workers requested. Processing will run on a master node',
                       task_name=self.task_name)
        return []
    try:
        Logger.info('Waiting for {} worker node(s)'.format(nodes_number),
                    task_name=self.task_name)
        # TODO: probably we shall check several times, as it is possible that workers are not yet submitted
        worker_ids = self.get_workers(parent_id)
        total_number = len(worker_ids)
        started = []
        # approximately 10 minutes
        attempts = 60
        while len(started) != total_number and attempts != 0:
            started = self.get_started_workers(worker_ids)
            attempts -= 1
            Logger.info('Started {} worker(s) of {} total'.format(len(started), total_number),
                        task_name=self.task_name)
            # Pause only when another poll will follow; no point in sleeping
            # once every worker is up or the attempts are exhausted.
            if len(started) != total_number and attempts != 0:
                time.sleep(10)
        if len(started) != total_number:
            raise RuntimeError('Failed to start all workers')
        Logger.success('All workers started', task_name=self.task_name)
        return started
    except Exception as e:
        # str(e) instead of the deprecated e.message, which is empty for
        # multi-argument exceptions.
        self.fail_task(str(e))
def run(self):
    """Launch a pipeline run for every sample in the sample sheet and wait for all of them.

    Inputs come from the ``ANALYSIS_FOLDER``, ``MACHINE_RUN_FOLDER`` and
    ``SAMPLE_SHEET`` environment variables. If any run does not succeed,
    each failure is logged and the process exits with status 1.

    :raises KeyError: if one of the environment variables is missing.
    """
    analysis_folder = os.environ['ANALYSIS_FOLDER']
    machine_run_folder = os.environ['MACHINE_RUN_FOLDER']
    sample_sheet = os.environ['SAMPLE_SHEET']
    Logger.info('Starting analytical processing for sample sheet %s' % sample_sheet,
                task_name=self.task)

    sheet_parser = SampleSheetParser(sample_sheet, [SAMPLE_ID, SAMPLE_NAME, SAMPLE_PROJECT])
    launched_runs = {}
    for sample in sheet_parser.parse_sample_sheet():
        Logger.info('Starting "%s" sample processing.' % sample[SAMPLE_NAME],
                    task_name=self.task)
        run_id = self.__run_sample(sample[SAMPLE_NAME], analysis_folder, machine_run_folder)
        launched_runs[sample[SAMPLE_NAME]] = run_id

    failed_runs = self.__wait_runs_completion(launched_runs)
    if not failed_runs:
        Logger.success("All samples processed successfully.", task_name=self.task)
        return

    for failed_sample, failed_run_id in failed_runs.iteritems():
        Logger.fail(
            'Processing failed for sample "%s". Check run %d logs for more information.'
            % (failed_sample, failed_run_id),
            task_name=self.task)
    sys.exit(1)
def read(cls, report_file, task):
    """Read the first column of the data row of an InsertSizeMetrics report.

    The data row is the first non-blank line after the line that starts with
    ``MEDIAN_INSERT_SIZE``.

    :param report_file: path of the tab-separated report.
    :param task: task name used for logging.
    :return: the first column of the data row as an integer, or ``0`` when no
        data row is found.
    :raises ValueError: if the first column is not an integer.
    """
    Logger.info("Reading InsertSizeMetrics report from file %s." % report_file,
                task_name=task)
    with open(report_file, 'r') as report:
        data_started = False
        for line in report:
            if data_started:
                # Lines keep their newline, so strip before testing for emptiness;
                # a blank line after the header must not be parsed as data.
                if line.strip():
                    chunks = line.split("\t")
                    # MEDIAN_INSERT_SIZE
                    return int(chunks[0])
            elif line.startswith("MEDIAN_INSERT_SIZE"):
                data_started = True
    return 0
def find_files(self, recursive=False):
    """List ``self.folder`` and wrap every entry in its own single-element list.

    :param recursive: forwarded to ``bucket.ls_s3``.
    :return: list of one-element lists, each holding ``self.folder`` joined
        with one listed entry.
    """
    Logger.info("Starting parsing input directory: {}.".format(self.folder),
                task_name=self.TASK_NAME)
    listing = bucket.ls_s3(self.folder, self.MAX_ATTEMPTS, recursive=recursive)
    result = [[os.path.join(self.folder, entry)] for entry in listing]
    Logger.success("Found {} directories to process.".format(len(result)),
                   task_name=self.TASK_NAME)
    return result
def read(cls, report_file, task):
    """Compute a duplicate read count from the data row of a MarkDuplicates report.

    The data row is the first non-blank line after the line that starts with
    ``LIBRARY``. The result is column 5 plus twice column 6 plus twice
    column 7 (zero-based, tab-separated).

    :param report_file: path of the tab-separated report.
    :param task: task name used for logging.
    :return: the computed count, or ``0`` when no data row is found.
    :raises IndexError: if the data row has fewer than eight columns.
    :raises ValueError: if one of the columns is not an integer.
    """
    Logger.info("Reading MarkDuplicates report from file %s." % report_file, task_name=task)
    with open(report_file, 'r') as report:
        data_started = False
        for line in report:
            if data_started:
                # Test a stripped copy only: the row itself must be split unstripped,
                # because a leading empty column would otherwise shift the indexes.
                if line.strip():
                    chunks = line.split("\t")
                    # UNPAIRED_READ_DUPLICATES READ_PAIR_DUPLICATES READ_PAIR_OPTICAL_DUPLICATES
                    return int(chunks[5]) + 2 * int(chunks[6]) + 2 * int(chunks[7])
            elif line.startswith("LIBRARY"):
                data_started = True
    return 0
def main():
    """Wait for a node to start, print its name and IP, and exit with status 0.

    Arguments: ``--parameter`` (zero or more values), ``--task-name`` and
    ``--run-id`` (integer); all are required.

    :raises RuntimeError: if waiting for the node (or printing it) fails.
    """
    # Local import keeps this entry point self-contained.
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument('--parameter', type=str, required=True, nargs='*')
    parser.add_argument('--task-name', required=True)
    parser.add_argument('--run-id', required=True, type=int)
    args = parser.parse_args()
    status = StatusEntry(TaskStatus.SUCCESS)
    try:
        node = WaitForNode().await_node_start(args.parameter, args.task_name, args.run_id)
        print(node.name + " " + node.ip)
        # sys.exit instead of the `exit` helper, which the site module installs for
        # interactive use and is not guaranteed to exist. SystemExit is not an
        # Exception subclass, so it passes through the handler below.
        sys.exit(0)
    except Exception as e:
        # str(e) instead of the deprecated e.message, which is empty for
        # multi-argument exceptions.
        Logger.warn(str(e))
        status = StatusEntry(TaskStatus.FAILURE)
    if status.status == TaskStatus.FAILURE:
        raise RuntimeError('Failed to setup cluster')
def pipe_log(message, status=TaskStatus.RUNNING):
    """Log ``message`` as a task event when API settings exist, else via ``logging``.

    The module-level ``api_url``, ``api_token``, ``script_path`` and
    ``current_run_id`` values are only read here.

    :param message: text to log.
    :param status: task status attached to the task event.
    """
    if not (api_url and api_token):
        # Log as always
        logging.info(message)
        return
    Logger.log_task_event(
        NODEUP_TASK,
        '[{}] {}'.format(current_run_id, message),
        run_id=current_run_id,
        instance=str(current_run_id),
        log_dir=script_path,
        api_url=api_url,
        status=status,
        omit_console=True)
def main():
    """Wait for the requested worker nodes and build the cluster hostfile.

    ``--nodes_number`` (integer, required) is read from the command line;
    the run id and hostfile location come from the ``RUN_ID`` and
    ``DEFAULT_HOSTFILE`` environment variables. On any failure the error is
    logged and ``ShutDownCluster`` is run with the workers known so far.

    :raises KeyError: if ``RUN_ID`` or ``DEFAULT_HOSTFILE`` is not set.
    :raises RuntimeError: if the cluster could not be set up.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--nodes_number', type=int, required=True)
    args = parser.parse_args()
    run_id = os.environ['RUN_ID']
    hostfile = os.environ['DEFAULT_HOSTFILE']
    status = StatusEntry(TaskStatus.SUCCESS)
    workers = []
    try:
        workers = CreateWorkerNodes().await_workers_start(args.nodes_number, run_id)
        BuildHostfile().run(workers, hostfile, run_id)
    except Exception as e:
        # str(e) instead of the deprecated e.message, which is empty for
        # multi-argument exceptions.
        Logger.warn(str(e))
        status = StatusEntry(TaskStatus.FAILURE)
        ShutDownCluster().run(workers, status)
    if status.status == TaskStatus.FAILURE:
        raise RuntimeError('Failed to setup cluster')
def __wait_runs_completion(self, launched_runs):
    """Poll all launched runs once a minute until none of them is ``RUNNING``.

    Every run is re-queried on each pass, including those already seen as
    finished.

    :param launched_runs: dict mapping sample name to run id.
    :return: dict of sample name to run id for the runs whose final status
        was not ``SUCCESS``.
    """
    finished, failed = {}, {}
    while True:
        for sample, run_id in launched_runs.iteritems():
            status = self.api.load_run(run_id)['status']
            Logger.info('Processing sample: %s. Run %d status is %s.'
                        % (sample, run_id, status),
                        task_name=self.task)
            if status == 'RUNNING':
                continue
            finished[sample] = run_id
            if status != 'SUCCESS':
                failed[sample] = run_id

        if len(finished) != len(launched_runs):
            time.sleep(60)
            continue

        Logger.info("Processing for all samples completed.", task_name=self.task)
        return failed
def child_run_active(self):
    """Tell whether the child run is still in the ``RUNNING`` status.

    The status request is retried up to ``self.RETRY_COUNT`` times, pausing
    ``self.POLL_TIMEOUT`` between failed attempts.

    :return: ``False`` when there is no child run, otherwise whether the
        loaded status equals ``RUNNING``.
    :raises RuntimeError: if every attempt to load the run fails.
    """
    if self.child_id is None:
        return False
    attempts = 0
    while attempts < self.RETRY_COUNT:
        try:
            run = self.api.load_run(self.child_id)
            return run['status'] == 'RUNNING'
        except Exception as e:
            # The original template had '' where the first placeholder belongs, so
            # the run id landed in the only {} and the error text was dropped.
            # str(e) also replaces the deprecated e.message.
            Logger.warn(
                "Failed to fetch child run ID '{}' status: {}.".format(
                    str(self.child_id), str(e)),
                task_name=self.TASK_NAME)
            attempts = attempts + 1
            time.sleep(self.POLL_TIMEOUT)
    Logger.fail("Exceeded maximum attempts to fetch child run status.")
    raise RuntimeError("Exceeded maximum attempts to fetch child run status.")
def get_running_samples(self):
    """Count the child pipelines of ``self.run_id`` that are in the ``RUNNING`` status.

    The request is retried up to ``self.RETRY_COUNT`` times, pausing
    ``self.POLL_TIMEOUT`` between failed attempts.

    :return: number of child runs whose ``status`` equals ``RUNNING``.
    :raises RuntimeError: if every attempt fails.
    """
    attempts = 0
    while attempts < self.RETRY_COUNT:
        try:
            child_runs = self.api.load_child_pipelines(self.run_id)
            return sum(1 for run in child_runs if run['status'] == 'RUNNING')
        except Exception as e:
            # str(e) instead of the deprecated e.message, which is empty for
            # multi-argument exceptions.
            Logger.warn("Failed to fetch running samples: {}.".format(str(e)),
                        task_name=self.TASK_NAME)
            attempts = attempts + 1
            time.sleep(self.POLL_TIMEOUT)
    Logger.fail("Exceeded maximum attempts to fetch running samples.")
    raise RuntimeError("Exceeded maximum attempts to fetch running samples.")
def transfer_dts(self, dts_locations, dts_registry, upload, rules=None):
    """Transfer DTS locations, one ``DataTransferServiceClient`` call per prefix.

    Locations are grouped by their ``prefix`` attribute; the service URL of
    each group is looked up in ``dts_registry``.

    :param dts_locations: iterable of paths exposing a ``prefix`` attribute.
    :param dts_registry: mapping of prefix to DTS service URL.
    :param upload: forwarded to ``self.create_dts_path``.
    :param rules: forwarded to ``self.create_dts_path``.
    :raises KeyError: if a prefix is missing from ``dts_registry``.
    """
    grouped_paths = {}
    for location in dts_locations:
        grouped_paths.setdefault(location.prefix, []).append(location)

    for prefix, paths in grouped_paths.iteritems():
        dts_url = dts_registry[prefix]
        Logger.info('Uploading {} paths using DTS service {}'.format(len(paths), dts_url),
                    self.task_name)
        dts_client = DataTransferServiceClient(dts_url, self.token, self.api_url,
                                               self.token, 10)
        transfer_items = [self.create_dts_path(path, upload, rules) for path in paths]
        dts_client.transfer_data(transfer_items, self.task_name)