def submit_job(db_settings, row, raw_data, indices, threads, jobs_folder):
    """Generate and export job file to a specific folder"""
    kwargs = {
        "pair": ('pair' in row[0]),
        "dUTP": ('dUTP' in row[0]),
        "workflow": os.path.splitext(os.path.basename(row[1]))[0],
        "template": row[2],
        "uid": row[3],
        "genome_db": row[4],    # not used
        "genome": row[5],
        "annotation": row[6],   # not used
        "annottable": row[7],   # not used
        "spike": ('spike' in row[8]),
        "forcerun": (int(row[9]) == 1),
        "clip_5p_end": int(row[10]),
        "clip_3p_end": int(row[11]),
        "raw_data": raw_data,
        "indices": indices,
        "threads": threads
    }
    jobs_folder = jobs_folder if os.path.isabs(jobs_folder) else os.path.join(os.getcwd(), jobs_folder)
    # Set the values used to fill in the job template.
    # Both upstream and downstream FASTQ paths are always generated, even if only the upstream one is used
    kwargs["fastq_file_upstream"] = os.path.join(kwargs["raw_data"], kwargs["uid"], kwargs["uid"] + '.fastq')
    kwargs["fastq_file_downstream"] = os.path.join(kwargs["raw_data"], kwargs["uid"], kwargs["uid"] + '_2.fastq')
    kwargs["star_indices_folder"] = os.path.join(kwargs["indices"], STAR_INDICES, kwargs["genome"])
    # RIBO_SUFFIX is appended to "genome" to get the folder name of the ribosomal Bowtie indices
    kwargs["bowtie_indices_folder"] = os.path.join(kwargs["indices"], BOWTIE_INDICES, kwargs["genome"] + RIBO_SUFFIX)
    kwargs["chrom_length"] = os.path.join(kwargs["indices"], BOWTIE_INDICES, kwargs["genome"], CHR_LENGTH_GENERIC_TSV)
    kwargs["annotation_input_file"] = os.path.join(kwargs["indices"], ANNOTATIONS, kwargs["genome"], ANNOTATION_GENERIC_TSV)
    kwargs["exclude_chr"] = "control" if kwargs["spike"] else ""
    kwargs["output_folder"] = os.path.join(kwargs["raw_data"], kwargs["uid"])
    job_file_basename = kwargs["workflow"] + '-' + kwargs["uid"] + '.json'
    output_filename = os.path.join(jobs_folder, JOBS_NEW, job_file_basename)
    running_filename = os.path.join(jobs_folder, JOBS_RUNNING, job_file_basename)
    if not os.path.isfile(kwargs["fastq_file_upstream"]) or (kwargs['pair'] and not os.path.isfile(kwargs["fastq_file_downstream"])):
        raise BiowFileNotFoundException(kwargs["uid"])
    filled_job_object = remove_not_set_inputs(json.loads(
        kwargs['template'].replace('\n', ' ').format(**kwargs)
        .replace("'True'", 'true').replace("'False'", 'false')
        .replace('"True"', 'true').replace('"False"', 'false')))
    filled_job_str = json.dumps(collections.OrderedDict(sorted(filled_job_object.items())), indent=4)
    # Check if the job file already exists in the new or running jobs folder
    if os.path.isfile(output_filename) or os.path.isfile(running_filename):
        raise BiowJobException(kwargs['uid'], message="Duplicate job file [{}]. It has already been created".format(job_file_basename))
    try:
        with open(output_filename, 'w') as output_file:
            output_file.write(filled_job_str)
    except Exception as ex:
        raise BiowJobException(kwargs['uid'], message="Failed to write job file: " + str(ex))

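# A minimal, standalone sketch of the template-filling technique used in submit_job above:
# str.format(**kwargs) substitutes the {placeholders} in the JSON template, and the chained
# replace() calls turn Python-style booleans into valid JSON literals before parsing.
# This helper and its example values are illustrative only and are not called anywhere else;
# it assumes the same json and collections imports the functions above already rely on.
def _fill_template_example(template, **kwargs):
    """Return the sorted JSON string produced by filling 'template' with 'kwargs'.

    >>> _fill_template_example('{{"uid": "{uid}", "pair": "{pair}"}}', uid="exp1", pair=True)
    '{"pair": true, "uid": "exp1"}'
    """
    filled = (template.replace('\n', ' ').format(**kwargs)
              .replace("'True'", 'true').replace("'False'", 'false')
              .replace('"True"', 'true').replace('"False"', 'false'))
    return json.dumps(collections.OrderedDict(sorted(json.loads(filled).items())))
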
def check_job(db_settings, uid, workflow, jobs_folder):
    """Check the status of the current job in the Airflow DB"""
    tasks, total = get_tasks(uid, db_settings)
    tasks = {k: v for k, v in tasks.items() if v}  # keep only the task states that have entries
    if not tasks:
        failed_file = os.path.join(jobs_folder, JOBS_FAIL,
                                   os.path.splitext(os.path.basename(workflow))[0] + '-' + uid + '.json')
        if os.path.isfile(failed_file):  # the job file was moved to the failed folder before it even started
            raise BiowJobException(uid, message="Job file is already marked as failed")
        return None, None
    if tasks.get("failed"):
        raise BiowWorkflowException(uid, message=tasks)
    elif total > 0 and len(tasks.get("success", [])) == total:  # all tasks exited with success
        return LIBSTATUS["SUCCESS_PROCESS"], "Complete"
    else:
        percent_complete = 0
        try:
            percent_complete = int(float(len(tasks.get("success", []))) / total * 100)
        except ZeroDivisionError:
            pass
        return LIBSTATUS["PROCESSING"], "Processing: " + str(percent_complete) + "%"

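# A minimal sketch of how a caller might consume the (status, message) pair returned by
# check_job above. The reporting function below is hypothetical and not part of this module;
# only check_job and the LIBSTATUS keys come from the surrounding code. Note that check_job
# itself raises BiowJobException or BiowWorkflowException on failed jobs.
def _report_job_status_example(db_settings, uid, workflow, jobs_folder):
    status, message = check_job(db_settings, uid, workflow, jobs_folder)
    if status is None:
        print("{0}: no tasks have been recorded yet".format(uid))
    elif status == LIBSTATUS["SUCCESS_PROCESS"]:
        print("{0}: {1}".format(uid, message))  # "Complete"
    else:
        print("{0}: {1}".format(uid, message))  # "Processing: NN%"
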
def raise_if_dag_exists(uid, db_settings):
    """Raise if DAG run with the dag_id already exists"""
    try:
        raise_if_dag_absent(uid, db_settings)
    except BiowBasicException:
        pass
    else:
        raise BiowJobException(uid, message='Duplicate dag_id. Use ForceRun')

def raise_if_file_exists(uid, filename):
    """Raise if the file already exists"""
    try:
        raise_if_file_absent(uid, filename)
    except BiowBasicException:
        pass
    else:
        raise BiowJobException(uid, message="File already exists {0}".format(filename))

def raise_if_table_exists(db_settings, uid, table, db):
    """Raise if the table already exists in the database"""
    try:
        raise_if_table_absent(db_settings, uid, table, db)
    except BiowBasicException:
        pass
    else:
        raise BiowJobException(uid, message="Table {0}.{1} already exists".format(db, table))

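# The raise_if_*_exists helpers above share the same inversion pattern: call the corresponding
# raise_if_*_absent check, treat its BiowBasicException as "object is missing, nothing to do",
# and raise BiowJobException only when the check returns silently (i.e. the object exists).
# A generic sketch of that pattern; this helper is hypothetical and unused in this module:
def _raise_if_present_example(uid, absent_check, message):
    # e.g. _raise_if_present_example(uid, lambda: raise_if_dag_absent(uid, db_settings),
    #                                'Duplicate dag_id. Use ForceRun')
    try:
        absent_check()       # raises BiowBasicException when the object is absent
    except BiowBasicException:
        pass                 # absent: the "exists" check passes
    else:
        raise BiowJobException(uid, message=message)  # no exception: the object already exists
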
def submit_job(db_settings, row, raw_data, indices, threads, jobs_folder):
    """Generate and export job file to a specific folder"""
    kwargs = {
        "pair": ('pair' in row[0]),
        "workflow": os.path.splitext(os.path.basename(row[1]))[0],
        "template": row[2],
        "genome_db": row[3],
        "genome": row[4],
        "uid": row[6],
        "exp_fragment_size": int(row[7]),
        "force_fragment_size": (int(row[8]) == 1),
        "forcerun": (int(row[9]) == 1),
        "clip_5p_end": int(row[10]),
        "clip_3p_end": int(row[11]),
        "broad_peak": (int(row[12]) == 2),
        "remove_duplicates": (int(row[13]) == 1),
        "genome_size": row[14],
        "control_id": row[15],
        "raw_data": raw_data,
        "indices": indices,
        "threads": threads
    }
    jobs_folder = jobs_folder if os.path.isabs(jobs_folder) else os.path.join(os.getcwd(), jobs_folder)
    # Both upstream and downstream FASTQ paths are always generated, even if only the upstream one is used
    kwargs["fastq_file_upstream"] = os.path.join(kwargs["raw_data"], kwargs["uid"], kwargs["uid"] + '.fastq')
    kwargs["fastq_file_downstream"] = os.path.join(kwargs["raw_data"], kwargs["uid"], kwargs["uid"] + '_2.fastq')
    kwargs["bowtie_indices_folder"] = os.path.join(kwargs["indices"], BOWTIE_INDICES, kwargs["genome"])
    kwargs["chrom_length"] = os.path.join(kwargs["indices"], BOWTIE_INDICES, kwargs["genome"], CHR_LENGTH_GENERIC_TSV)
    kwargs["annotation_input_file"] = os.path.join(kwargs["indices"], ANNOTATIONS, kwargs["genome"], ANNOTATION_GENERIC_TSV)
    kwargs["output_folder"] = os.path.join(kwargs["raw_data"], kwargs["uid"])
    output_filename = os.path.join(jobs_folder, JOBS_NEW, kwargs["workflow"] + '-' + kwargs["uid"] + '.json')
    running_filename = os.path.join(jobs_folder, JOBS_RUNNING, kwargs["workflow"] + '-' + kwargs["uid"] + '.json')
    kwargs["control_file"] = get_control(db_settings, **kwargs)  # raises BiowFileNotFoundException if the control is missing
    if not os.path.isfile(kwargs["fastq_file_upstream"]) or (kwargs['pair'] and not os.path.isfile(kwargs["fastq_file_downstream"])):
        raise BiowFileNotFoundException(kwargs["uid"])
    filled_job_object = remove_not_set_inputs(json.loads(
        kwargs['template'].replace('\n', ' ').format(**kwargs)
        .replace("'True'", 'true').replace("'False'", 'false')
        .replace('"True"', 'true').replace('"False"', 'false')))
    filled_job_str = json.dumps(collections.OrderedDict(sorted(filled_job_object.items())), indent=4)
    # Check if the job file already exists in the new or running jobs folder
    if os.path.isfile(output_filename) or os.path.isfile(running_filename):
        raise BiowJobException(kwargs['uid'], message="Duplicate job file. It has already been created")
    try:
        with open(output_filename, 'w') as output_file:
            output_file.write(filled_job_str)
    except Exception as ex:
        raise BiowJobException(kwargs['uid'], message="Failed to write job file: " + str(ex))

def raise_if_dag_absent(uid, db_settings):
    """Raise if no DAG with the given dag_id is found"""
    if not get_last_dag_id(uid, db_settings):
        raise BiowJobException(uid, message='DAG is not found')