def build_busco_process_script(cluster_name, current_run_dir):
    '''
    Build the current BUSCO process script.

    The generated bash script downloads the BUSCO lineage data, runs
    run_BUSCO.py on the transcriptome produced by the assembly software set
    in the BUSCO config file, and mails a report when it ends OK or WRONG.

    Parameters:
        cluster_name: cluster name, used in the log and mail texts written
            by the generated script.
        current_run_dir: run directory in the cluster where the generated
            script downloads the lineage data and runs BUSCO.

    Returns:
        (OK, error_list): control variable and list of error messages.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # get the BUSCO option dictionary
    busco_option_dict = xlib.get_option_dict(get_busco_config_file())

    # get the options
    experiment_id = busco_option_dict['identification']['experiment_id']
    assembly_software = busco_option_dict['identification']['assembly_software']
    assembly_dataset_id = busco_option_dict['identification']['assembly_dataset_id']
    assembly_type = busco_option_dict['identification']['assembly_type']
    ncpu = busco_option_dict['BUSCO parameters']['ncpu']
    lineage_data = busco_option_dict['BUSCO parameters']['lineage_data']
    lineage_data_file = '{0}.tar.gz'.format(lineage_data)
    lineage_data_url = 'http://busco.ezlab.org/v2/datasets/{0}'.format(lineage_data_file)
    mode = busco_option_dict['BUSCO parameters']['mode'].lower()
    evalue = busco_option_dict['BUSCO parameters']['evalue']
    limit = busco_option_dict['BUSCO parameters']['limit']
    species = busco_option_dict['BUSCO parameters']['species']
    # "long" is only compared against 'YES', so normalizing its case is safe
    long = busco_option_dict['BUSCO parameters']['long'].upper()
    # FIX: the value is passed verbatim to run_BUSCO.py, so it must NOT be
    # upper-cased (Augustus settings are case-sensitive); only the comparison
    # against 'NONE' below is done case-insensitively
    augustus_options = busco_option_dict['BUSCO parameters']['augustus_options']

    # set the transcriptome file path depending on the assembly software
    if assembly_software == xlib.get_soapdenovotrans_code():
        if assembly_type == 'CONTIGS':
            transcriptome_file = '{0}/{1}-{2}.contig'.format(xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id), experiment_id, assembly_dataset_id)
        elif assembly_type == 'SCAFFOLDS':
            transcriptome_file = '{0}/{1}-{2}.scafSeq'.format(xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id), experiment_id, assembly_dataset_id)
    elif assembly_software == xlib.get_transabyss_code():
        transcriptome_file = '{0}/transabyss-final.fa'.format(xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id))
    elif assembly_software == xlib.get_trinity_code():
        transcriptome_file = '{0}/Trinity.fasta'.format(xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id))
    elif assembly_software == xlib.get_star_code():
        transcriptome_file = '{0}/Trinity-GG.fasta'.format(xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id))
    elif assembly_software == xlib.get_cd_hit_est_code():
        transcriptome_file = '{0}/clustered-transcriptome.fasta'.format(xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id))
    elif assembly_software == xlib.get_transcript_filter_code():
        transcriptome_file = '{0}/filtered-transcriptome.fasta'.format(xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id))

    # write the BUSCO process script
    try:
        # exist_ok avoids the exists()/makedirs() race of the original code
        os.makedirs(os.path.dirname(get_busco_process_script()), exist_ok=True)
        with open(get_busco_process_script(), mode='w', encoding='utf8', newline='\n') as file_id:
            file_id.write('#!/bin/bash\n')
            file_id.write('#-------------------------------------------------------------------------------\n')
            file_id.write('BUSCO_PATH={0}/{1}/envs/{2}/bin\n'.format(xlib.get_cluster_app_dir(), xlib.get_miniconda3_name(), xlib.get_busco_bioconda_code()))
            file_id.write('export PATH=$BUSCO_PATH:$PATH\n')
            file_id.write('SEP="#########################################"\n')
            file_id.write('cd {0}/{1}/bin\n'.format(xlib.get_cluster_app_dir(), xlib.get_miniconda3_name()))
            file_id.write('source activate {0}\n'.format(xlib.get_busco_bioconda_code()))
            file_id.write('#-------------------------------------------------------------------------------\n')
            file_id.write('function init\n')
            file_id.write('{\n')
            file_id.write('    INIT_DATETIME=`date --utc +%s`\n')
            file_id.write('    FORMATTED_INIT_DATETIME=`date --date="@$INIT_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n')
            file_id.write('    echo "$SEP"\n')
            file_id.write('    echo "Script started in node $HOSTNAME of cluster {0} at $FORMATTED_INIT_DATETIME UTC."\n'.format(cluster_name))
            file_id.write('}\n')
            file_id.write('#-------------------------------------------------------------------------------\n')
            file_id.write('function download_lineage_data\n')
            file_id.write('{\n')
            file_id.write('    cd {0}\n'.format(current_run_dir))
            file_id.write('    echo "$SEP"\n')
            file_id.write('    echo "Downloading lineage data ..."\n')
            file_id.write('    wget --quiet --output-document ./{0} {1}\n'.format(lineage_data_file, lineage_data_url))
            file_id.write('    tar -xzvf ./{0}\n'.format(lineage_data_file))
            file_id.write('    rm ./{0}\n'.format(lineage_data_file))
            file_id.write('}\n')
            file_id.write('#-------------------------------------------------------------------------------\n')
            file_id.write('function run_busco_process\n')
            file_id.write('{\n')
            file_id.write('    cd {0}\n'.format(current_run_dir))
            file_id.write('    echo "$SEP"\n')
            file_id.write('    run_BUSCO.py --version\n')
            file_id.write('    echo "$SEP"\n')
            file_id.write('    /usr/bin/time \\\n')
            file_id.write('        --format="$SEP\\nElapsed real time (s): %e\\nCPU time in kernel mode (s): %S\\nCPU time in user mode (s): %U\\nPercentage of CPU: %P\\nMaximum resident set size(Kb): %M\\nAverage total memory use (Kb):%K" \\\n')
            file_id.write('        run_BUSCO.py \\\n')
            file_id.write('            --cpu={0} \\\n'.format(ncpu))
            file_id.write('            --lineage_path=./{0} \\\n'.format(lineage_data))
            file_id.write('            --mode={0} \\\n'.format(mode))
            file_id.write('            --evalue={0} \\\n'.format(evalue))
            file_id.write('            --limit={0} \\\n'.format(limit))
            if species.upper() != 'NONE':
                file_id.write('            --species={0} \\\n'.format(species))
            if long == 'YES':
                file_id.write('            --long \\\n')
            if augustus_options.upper() != 'NONE':
                # FIX: the original wrote the misspelled flag "--august_options";
                # the run_BUSCO.py flag for extra Augustus settings is
                # --augustus_parameters (see BUSCO user guide)
                file_id.write("            --augustus_parameters='{0}' \\\n".format(augustus_options))
            file_id.write('            --in={0} \\\n'.format(transcriptome_file))
            file_id.write('            --out={0}\n'.format(os.path.basename(current_run_dir)))
            file_id.write('    RC=$?\n')
            file_id.write('    if [ $RC -ne 0 ]; then manage_error run_BUSCO.py $RC; fi\n')
            file_id.write('}\n')
            file_id.write('#-------------------------------------------------------------------------------\n')
            file_id.write('function end\n')
            file_id.write('{\n')
            file_id.write('    END_DATETIME=`date --utc +%s`\n')
            file_id.write('    FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n')
            file_id.write('    calculate_duration\n')
            file_id.write('    echo "$SEP"\n')
            file_id.write('    echo "Script ended OK at $FORMATTED_END_DATETIME UTC with a run duration of $DURATION s ($FORMATTED_DURATION)."\n')
            file_id.write('    echo "$SEP"\n')
            file_id.write('    RECIPIENT={0}\n'.format(xconfiguration.get_contact_data()))
            file_id.write('    SUBJECT="{0}: {1} process"\n'.format(xlib.get_project_name(), xlib.get_busco_name()))
            file_id.write('    MESSAGE="The {0} process in node $HOSTNAME of cluster {1} ended OK at $FORMATTED_END_DATETIME with a run duration of $DURATION s ($FORMATTED_DURATION). Please review its log.<br/><br/>Regards,<br/>GI Genetica, Fisiologia e Historia Forestal<br/>Dpto. Sistemas y Recursos Naturales<br/>ETSI Montes, Forestal y del Medio Natural<br/>Universidad Politecnica de Madrid<br/>https://github.com/ggfhf/"\n'.format(xlib.get_busco_name(), cluster_name))
            file_id.write('    mail --append "Content-type: text/html;" --subject "$SUBJECT" "$RECIPIENT" <<< "$MESSAGE"\n')
            file_id.write('    exit 0\n')
            file_id.write('}\n')
            file_id.write('#-------------------------------------------------------------------------------\n')
            file_id.write('function manage_error\n')
            file_id.write('{\n')
            file_id.write('    END_DATETIME=`date --utc +%s`\n')
            file_id.write('    FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n')
            file_id.write('    calculate_duration\n')
            file_id.write('    echo "$SEP"\n')
            file_id.write('    echo "ERROR: $1 returned error $2"\n')
            file_id.write('    echo "Script ended WRONG at $FORMATTED_END_DATETIME UTC with a run duration of $DURATION s ($FORMATTED_DURATION)."\n')
            file_id.write('    echo "$SEP"\n')
            file_id.write('    RECIPIENT={0}\n'.format(xconfiguration.get_contact_data()))
            file_id.write('    SUBJECT="{0}: {1} process"\n'.format(xlib.get_project_name(), xlib.get_busco_name()))
            file_id.write('    MESSAGE="The {0} process in node $HOSTNAME of cluster {1} ended WRONG at $FORMATTED_END_DATETIME with a run duration of $DURATION s ($FORMATTED_DURATION). Please review its log.<br/><br/>Regards,<br/>GI Genetica, Fisiologia e Historia Forestal<br/>Dpto. Sistemas y Recursos Naturales<br/>ETSI Montes, Forestal y del Medio Natural<br/>Universidad Politecnica de Madrid<br/>https://github.com/ggfhf/"\n'.format(xlib.get_busco_name(), cluster_name))
            file_id.write('    mail --append "Content-type: text/html;" --subject "$SUBJECT" "$RECIPIENT" <<< "$MESSAGE"\n')
            file_id.write('    exit 3\n')
            file_id.write('}\n')
            file_id.write('#-------------------------------------------------------------------------------\n')
            file_id.write('function calculate_duration\n')
            file_id.write('{\n')
            file_id.write('    DURATION=`expr $END_DATETIME - $INIT_DATETIME`\n')
            file_id.write('    HH=`expr $DURATION / 3600`\n')
            file_id.write('    MM=`expr $DURATION % 3600 / 60`\n')
            file_id.write('    SS=`expr $DURATION % 60`\n')
            file_id.write('    FORMATTED_DURATION=`printf "%03d:%02d:%02d\\n" $HH $MM $SS`\n')
            file_id.write('}\n')
            file_id.write('#-------------------------------------------------------------------------------\n')
            file_id.write('init\n')
            file_id.write('download_lineage_data\n')
            file_id.write('run_busco_process\n')
            file_id.write('end\n')
    # FIX: a bare "except:" also swallowed SystemExit/KeyboardInterrupt
    except Exception:
        error_list.append('*** ERROR: The file {0} can not be created'.format(get_busco_process_script()))
        OK = False

    # return the control variable and the error list
    return (OK, error_list)
def validate_read_transfer_config_file(strict):
    '''
    Validate the read transfer config file.

    Parameters:
        strict: when True, a non-existing/inaccessible local file is reported
            as an error; when False, only as a warning.

    Returns:
        (OK, error_list): control variable and list of detected problems.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # intitialize variable used when value is not found
    not_found = '***NOTFOUND***'.upper()

    # get the read transfer config file
    read_transfer_config_file = get_read_transfer_config_file()

    # get the options dictionary
    # FIX: the original called xlib.get_option_dict() once BEFORE the "try",
    # so a syntax error raised there and the error branch could never run;
    # the call is now made only inside the protected block
    try:
        read_transfer_options_dict = xlib.get_option_dict(read_transfer_config_file)
    except Exception:
        error_list.append('*** ERROR: The syntax is WRONG.')
        OK = False
    else:

        # get the sections list
        sections_list = sorted(read_transfer_options_dict.keys())

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append('*** ERROR: the section "identification" is not found.')
            OK = False
        else:

            # check section "identification" - key "experiment_id"
            experiment_id = read_transfer_options_dict.get('identification', {}).get('experiment_id', not_found)
            if experiment_id == not_found:
                error_list.append('*** ERROR: the key "experiment_id" is not found in the section "identification".')
                OK = False

        # check section "file-1"
        if 'file-1' not in sections_list:
            error_list.append('*** ERROR: the section "file-1" is not found.')
            OK = False

        # check all sections "file-n"
        for section in sections_list:
            if section not in ['identification']:

                # verify than the section identification is like file-n
                if not re.match('^file-[0-9]+$', section):
                    error_list.append('*** ERROR: the section "{0}" has a wrong identification.'.format(section))
                    OK = False
                else:

                    # check section "file-n" - key "local_path"
                    local_path = read_transfer_options_dict.get(section, {}).get('local_path', not_found)
                    if local_path == not_found:
                        error_list.append('*** ERROR: the key "local_path" is not found in the section "{0}".'.format(section))
                        OK = False
                    else:
                        # probe the file: FileNotFoundError -> missing/inaccessible
                        # (warning when not strict); OSError -> invalid file name
                        try:
                            open(local_path, mode='r').close()
                        except FileNotFoundError:
                            if strict:
                                error_list.append('*** ERROR: the file {0} in the key "local_path" of the section "{1}" does not exist or it is not accessible.'.format(local_path, section))
                                OK = False
                            else:
                                error_list.append('*** WARNING: the file {0} in the key "local_path" of the section "{1}" does not exist or it is not accessible.'.format(local_path, section))
                        except OSError:
                            error_list.append('*** ERROR: the file name "{0}" in the key "local_path" of the section "{1}" is not correct.'.format(local_path, section))
                            OK = False

    # warn that the reads config file is not valid if there are any errors
    if not OK:
        error_list.append('\nThe read transfer config file is not valid. Please, correct this file or recreate it.')

    # return the control variable and the error list
    return (OK, error_list)
def check_htseq_count_config_file(strict):
    '''
    Check the htseq-count config file of a run.

    Parameters:
        strict: accepted for interface compatibility with the other config
            checkers; it is not used by the checks below.

    Returns:
        (OK, error_list): control variable and list of detected problems.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # intitialize variable used when value is not found
    not_found = '***NOTFOUND***'.upper()

    # get the option dictionary
    try:
        htseq_count_option_dict = xlib.get_option_dict(get_htseq_count_config_file())
    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append('*** ERROR: The option dictionary could not be built from the config file')
        OK = False
    else:

        # get the sections list
        sections_list = sorted(htseq_count_option_dict.keys())

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append('*** ERROR: the section "identification" is not found.')
            OK = False
        else:

            # check section "identification" - key "experiment_id"
            experiment_id = htseq_count_option_dict.get('identification', {}).get('experiment_id', not_found)
            if experiment_id == not_found:
                error_list.append('*** ERROR: the key "experiment_id" is not found in the section "identification".')
                OK = False

            # check section "identification" - key "reference_dataset_id"
            reference_dataset_id = htseq_count_option_dict.get('identification', {}).get('reference_dataset_id', not_found)
            if reference_dataset_id == not_found:
                error_list.append('*** ERROR: the key "reference_dataset_id" is not found in the section "identification".')
                OK = False

            # check section "identification" - key "annotation_file"
            annotation_file = htseq_count_option_dict.get('identification', {}).get('annotation_file', not_found)
            if annotation_file == not_found:
                error_list.append('*** ERROR: the key "annotation_file" is not found in the section "identification".')
                OK = False
            elif os.path.splitext(annotation_file)[1] not in ['.gtf', '.gff']:
                error_list.append('*** ERROR: the key "annotation_file" has to be a file name with .gtf/.gff extension.')
                OK = False

        # check section "alignment-dataset-1"
        if 'alignment-dataset-1' not in sections_list:
            error_list.append('*** ERROR: the section "alignment-dataset-1" is not found.')
            OK = False

        # check all sections "alignment-dataset-n"
        for section in sections_list:
            if section not in ['identification', 'htseq-count parameters']:

                # check than the section identification is like alignment-dataset-n
                if not re.match('^alignment-dataset-[0-9]+$', section):
                    error_list.append(f'*** ERROR: the section "{section}" has a wrong identification.')
                    OK = False
                else:

                    # check section "alignment-dataset-n" - key "alignment_software"
                    alignment_software = htseq_count_option_dict.get(section, {}).get('alignment_software', not_found)
                    if alignment_software == not_found:
                        error_list.append(f'*** ERROR: the key "alignment_software" is not found in the section "{section}".')
                        OK = False
                    elif not xlib.check_code(alignment_software, get_alignment_software_code_list(), case_sensitive=False):
                        error_list.append(f'*** ERROR: the key "alignment_software" has to be {get_alignment_software_code_list_text()}.')
                        OK = False

                    # check section "alignment-dataset-n" - key "alignment_dataset_id"
                    alignment_dataset_id = htseq_count_option_dict.get(section, {}).get('alignment_dataset_id', not_found)
                    if alignment_dataset_id == not_found:
                        error_list.append(f'*** ERROR: the key "alignment_dataset_id" is not found in the section "{section}".')
                        OK = False
                    elif not xlib.check_startswith(alignment_dataset_id, get_alignment_software_code_list(), case_sensitive=True):
                        error_list.append(f'*** ERROR: the key "alignment_dataset_id" has to start with {get_alignment_software_code_list_text()}.')
                        OK = False

        # check section "htseq-count parameters"
        if 'htseq-count parameters' not in sections_list:
            error_list.append('*** ERROR: the section "htseq-count parameters" is not found.')
            OK = False
        else:

            # check section "htseq-count parameters" - key "nprocesses"
            nprocesses = htseq_count_option_dict.get('htseq-count parameters', {}).get('nprocesses', not_found)
            if nprocesses == not_found:
                error_list.append('*** ERROR: the key "nprocesses" is not found in the section "htseq-count parameters".')
                OK = False
            elif not xlib.check_int(nprocesses, minimum=1):
                error_list.append('*** ERROR: the key "nprocesses" has to be an integer number greater than or equal to 1.')
                OK = False

            # check section "htseq-count parameters" - key "stranded"
            stranded = htseq_count_option_dict.get('htseq-count parameters', {}).get('stranded', not_found)
            if stranded == not_found:
                error_list.append('*** ERROR: the key "stranded" is not found in the section "htseq-count parameters".')
                OK = False
            elif not xlib.check_code(stranded, get_stranded_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "stranded" has to be {get_stranded_code_list_text()}.')
                OK = False

            # check section "htseq-count parameters" - key "minaqual"
            minaqual = htseq_count_option_dict.get('htseq-count parameters', {}).get('minaqual', not_found)
            if minaqual == not_found:
                error_list.append('*** ERROR: the key "minaqual" is not found in the section "htseq-count parameters".')
                OK = False
            elif not xlib.check_int(minaqual):
                error_list.append('*** ERROR: the key "minaqual" has to be an integer number.')
                OK = False

            # check section "htseq-count parameters" - key "type"
            # (renamed local: "type" shadowed the builtin)
            feature_type = htseq_count_option_dict.get('htseq-count parameters', {}).get('type', not_found)
            if feature_type == not_found:
                error_list.append('*** ERROR: the key "type" is not found in the section "htseq-count parameters".')
                OK = False

            # check section "htseq-count parameters" - key "idattr"
            idattr = htseq_count_option_dict.get('htseq-count parameters', {}).get('idattr', not_found)
            if idattr == not_found:
                error_list.append('*** ERROR: the key "idattr" is not found in the section "htseq-count parameters".')
                OK = False

            # check section "htseq-count parameters" - key "mode"
            mode = htseq_count_option_dict.get('htseq-count parameters', {}).get('mode', not_found)
            if mode == not_found:
                error_list.append('*** ERROR: the key "mode" is not found in the section "htseq-count parameters".')
                OK = False
            elif not xlib.check_code(mode, get_mode_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "mode" has to be {get_mode_code_list_text()}.')
                OK = False

            # check section "htseq-count parameters" - key "nonunique"
            nonunique = htseq_count_option_dict.get('htseq-count parameters', {}).get('nonunique', not_found)
            if nonunique == not_found:
                error_list.append('*** ERROR: the key "nonunique" is not found in the section "htseq-count parameters".')
                OK = False
            elif not xlib.check_code(nonunique, get_nonunique_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "nonunique" has to be {get_nonunique_code_list_text()}.')
                OK = False

            # check section "htseq-count parameters" - key "other_parameters"
            not_allowed_parameters_list = ['nprocesses', 'format', 'stranded', 'minaqual', 'type', 'idattr', 'mode', 'nonunique', 'quiet']
            other_parameters = htseq_count_option_dict.get('htseq-count parameters', {}).get('other_parameters', not_found)
            if other_parameters == not_found:
                error_list.append('*** ERROR: the key "other_parameters" is not found in the section "htseq-count parameters".')
                OK = False
            elif other_parameters.upper() != 'NONE':
                # FIX: the original assigned the check result directly to OK,
                # which could reset OK to True and mask errors detected above;
                # the results are combined instead of overwritten
                (is_ok, error_list2) = xlib.check_parameter_list(other_parameters, "other_parameters", not_allowed_parameters_list)
                OK = OK and is_ok
                error_list = error_list + error_list2

    # warn that the results config file is not valid if there are any errors
    if not OK:
        error_list.append(f'\nThe {xlib.get_htseq_count_name()} config file is not valid. Please, correct this file or recreate it.')

    # return the control variable and the error list
    return (OK, error_list)
def run_gzip_process(cluster_name, dataset_type, log, function=None):
    '''
    Run a gzip process.

    Builds the gzip process script and starter locally, uploads them to the
    current run directory in the cluster, makes them executable and submits
    the starter, logging progress to "log".

    Parameters:
        cluster_name: name of the cluster where the process runs.
        dataset_type: dataset type ('reference', 'database' or an
            experiment-bound type) used to select config/script paths.
        log: log device with a write() method.
        function: optional callable executed at the end (e.g. a GUI callback).

    Returns:
        OK: control variable (True when every step succeeded).
    '''

    # initialize the control variable
    OK = True

    # get the gzip code and name
    gzip_code = xlib.get_gzip_code()
    gzip_name = xlib.get_gzip_name()

    # get the gzip option dictionary
    gzip_option_dict = xlib.get_option_dict(get_gzip_config_file(dataset_type))

    # get the experiment identification
    experiment_id = gzip_option_dict['identification']['experiment_id']

    # get the gzip process script path in the local computer
    gzip_process_script = get_gzip_process_script(dataset_type)

    # get the gzip process starter path in the local computer
    gzip_process_starter = get_gzip_process_starter(dataset_type)

    # warn that the log window does not have to be closed
    if not isinstance(log, xlib.DevStdOut):
        log.write('This process might take several minutes. Do not close this window, please wait!\n')

    # check the gzip config file
    log.write(f'{xlib.get_separator()}\n')
    log.write(f'Checking the {gzip_name} config file ...\n')
    (OK, error_list) = check_gzip_config_file(dataset_type, strict=True)
    if OK:
        log.write('The file is OK.\n')
    else:
        log.write('*** ERROR: The config file is not valid.\n')
        log.write('Please correct this file or recreate the config files.\n')

    # create the SSH client connection
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write('Connecting the SSH client ...\n')
        (OK, error_list, ssh_client) = xssh.create_ssh_client_connection(cluster_name)
        if OK:
            log.write('The SSH client is connected.\n')
        else:
            for error in error_list:
                log.write(f'{error}\n')

    # create the SSH transport connection
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write('Connecting the SSH transport ...\n')
        (OK, error_list, ssh_transport) = xssh.create_ssh_transport_connection(cluster_name)
        if OK:
            log.write('The SSH transport is connected.\n')
        else:
            for error in error_list:
                log.write(f'{error}\n')

    # create the SFTP client
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write('Connecting the SFTP client ...\n')
        sftp_client = xssh.create_sftp_client(ssh_transport)
        log.write('The SFTP client is connected.\n')

    # warn that the requirements are being verified
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write('Checking process requirements ...\n')

    # check the master is running (EC2 state code 16 means "running")
    if OK:
        (master_state_code, master_state_name) = xec2.get_node_state(cluster_name)
        if master_state_code != 16:
            log.write(f'*** ERROR: The cluster {cluster_name} is not running. Its state is {master_state_code} ({master_state_name}).\n')
            OK = False

    # warn that the requirements are OK
    if OK:
        log.write('Process requirements are OK.\n')

    # determine the run directory in the cluster
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write('Determining the run directory in the cluster ...\n')
        if dataset_type == 'reference':
            current_run_dir = xlib.get_cluster_current_run_dir('reference', gzip_code)
        elif dataset_type == 'database':
            current_run_dir = xlib.get_cluster_current_run_dir('database', gzip_code)
        else:
            current_run_dir = xlib.get_cluster_current_run_dir(experiment_id, gzip_code)
        command = f'mkdir --parents {current_run_dir}'
        (OK, stdout, stderr) = xssh.execute_cluster_command(ssh_client, command)
        if OK:
            log.write(f'The directory path is {current_run_dir}.\n')
        else:
            log.write(f'*** ERROR: Wrong command ---> {command}\n')

    # build the gzip process script
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write(f'Building the process script {gzip_process_script} ...\n')
        (OK, error_list) = build_gzip_process_script(cluster_name, dataset_type, current_run_dir)
        if OK:
            log.write('The file is built.\n')
        else:
            log.write('*** ERROR: The file could not be built.\n')

    # upload the gzip process script to the cluster
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write(f'Uploading the process script {gzip_process_script} to the directory {current_run_dir} ...\n')
        cluster_path = f'{current_run_dir}/{os.path.basename(gzip_process_script)}'
        (OK, error_list) = xssh.put_file(sftp_client, gzip_process_script, cluster_path)
        if OK:
            log.write('The file is uploaded.\n')
        else:
            for error in error_list:
                log.write(f'{error}\n')

    # set run permission to the gzip process script in the cluster
    # FIX: "permision" misspelling corrected in the log messages
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write(f'Setting on the run permission of {current_run_dir}/{os.path.basename(gzip_process_script)} ...\n')
        command = f'chmod u+x {current_run_dir}/{os.path.basename(gzip_process_script)}'
        (OK, stdout, stderr) = xssh.execute_cluster_command(ssh_client, command)
        if OK:
            log.write('The run permission is set.\n')
        else:
            log.write(f'*** ERROR: Wrong command ---> {command}\n')

    # build the gzip process starter
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write(f'Building the process starter {gzip_process_starter} ...\n')
        (OK, error_list) = build_gzip_process_starter(dataset_type, current_run_dir)
        if OK:
            log.write('The file is built.\n')
        else:
            # FIX: error prefix normalized to '*** ERROR:' like the rest of the file
            log.write('*** ERROR: The file could not be built.\n')

    # upload the gzip process starter to the cluster
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write(f'Uploading the process starter {gzip_process_starter} to the directory {current_run_dir} ...\n')
        cluster_path = f'{current_run_dir}/{os.path.basename(gzip_process_starter)}'
        (OK, error_list) = xssh.put_file(sftp_client, gzip_process_starter, cluster_path)
        if OK:
            log.write('The file is uploaded.\n')
        else:
            for error in error_list:
                log.write(f'{error}\n')

    # set run permission to the gzip process starter in the cluster
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write(f'Setting on the run permission of {current_run_dir}/{os.path.basename(gzip_process_starter)} ...\n')
        command = f'chmod u+x {current_run_dir}/{os.path.basename(gzip_process_starter)}'
        (OK, stdout, stderr) = xssh.execute_cluster_command(ssh_client, command)
        if OK:
            log.write('The run permission is set.\n')
        else:
            log.write(f'*** ERROR: Wrong command ---> {command}\n')

    # submit the gzip process
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write(f'Submitting the process script {current_run_dir}/{os.path.basename(gzip_process_starter)} ...\n')
        OK = xssh.submit_script(cluster_name, ssh_client, current_run_dir, os.path.basename(gzip_process_starter), log)

    # close the SSH transport connection
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write('Closing the SSH transport connection ...\n')
        xssh.close_ssh_transport_connection(ssh_transport)
        log.write('The connection is closed.\n')

    # close the SSH client connection
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write('Closing the SSH client connection ...\n')
        xssh.close_ssh_client_connection(ssh_client)
        log.write('The connection is closed.\n')

    # warn that the log window can be closed
    if not isinstance(log, xlib.DevStdOut):
        log.write(f'{xlib.get_separator()}\n')
        log.write('You can close this window now.\n')

    # execute final function
    if function is not None:
        function()

    # return the control variable
    return OK
def build_gzip_process_script(cluster_name, dataset_type, current_run_dir):
    '''
    Build the current gzip process script.

    The script is written to the local path returned by
    get_gzip_process_script() and is intended to be uploaded to the cluster:
    it compresses or decompresses the files of a dataset, timing each command
    with /usr/bin/time, and reports success/failure through status files and
    an AWS SES mail.

    :param cluster_name: cluster name, embedded in the generated script log.
    :param dataset_type: dataset type used only to locate the gzip config file.
    :param current_run_dir: cluster run directory the generated script cd's into.
    :return: (OK, error_list) — OK is False when the script file could not be
        written; error_list holds the corresponding messages.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # get the gzip option dictionary from the gzip config file of this dataset type
    gzip_option_dict = xlib.get_option_dict(get_gzip_config_file(dataset_type))

    # get the options
    experiment_id = gzip_option_dict['identification']['experiment_id']
    # dataset_type_2 is the type read from the config file itself; it may differ
    # from the dataset_type argument, which only selects which config file to read
    dataset_type_2 = gzip_option_dict['identification']['dataset_type']
    dataset_id = gzip_option_dict['identification']['dataset_id']
    # action is 'compress' or 'decompress'
    action = gzip_option_dict['gzip parameters']['action']

    # get the sections list
    sections_list = []
    for section in gzip_option_dict.keys():
        sections_list.append(section)
    sections_list.sort()

    # build the dataset subdirectory and file name lists
    dataset_subdirectory_list = []
    file_name_list = []
    for section in sections_list:
        # if the section identification is like file-n
        if re.match('^file-[0-9]+$', section):
            dataset_subdirectory = gzip_option_dict[section]['dataset_subdirectory']
            dataset_subdirectory_list.append(dataset_subdirectory)
            file_name = gzip_option_dict[section]['file_name']
            file_name_list.append(file_name)

    # get the dataset directory
    # NOTE(review): dataset_dir stays unassigned when dataset_type_2 has an
    # unexpected value, which would raise NameError below — presumably the value
    # was validated when the config file was checked; TODO confirm.
    if dataset_type_2 == 'reference':
        dataset_dir = xlib.get_cluster_reference_dataset_dir(dataset_id)
    elif dataset_type_2 == 'database':
        dataset_dir = xlib.get_cluster_database_dataset_dir(dataset_id)
    elif dataset_type_2 == 'read':
        dataset_dir = xlib.get_cluster_experiment_read_dataset_dir(experiment_id, dataset_id)
    elif dataset_type_2 == 'result':
        dataset_dir = xlib.get_cluster_experiment_result_dataset_dir(experiment_id, dataset_id)
    elif dataset_type_2 == 'whole-result':
        dataset_dir = xlib.get_cluster_experiment_result_dataset_dir(experiment_id, dataset_id)

    # write the gzip process script
    # NOTE(review): this file is written with encoding 'iso-8859-1' while other
    # script builders in this module use 'utf8' — confirm which is intended.
    try:
        if not os.path.exists(os.path.dirname(get_gzip_process_script(dataset_type_2))):
            os.makedirs(os.path.dirname(get_gzip_process_script(dataset_type_2)))
        with open(get_gzip_process_script(dataset_type_2), mode='w', encoding='iso-8859-1', newline='\n') as script_file_id:
            # --- script header: host identification and AWS credentials ---
            script_file_id.write( '#!/bin/bash\n')
            script_file_id.write( '#-------------------------------------------------------------------------------\n')
            script_file_id.write( 'SEP="#########################################"\n')
            script_file_id.write( 'export HOST_IP=`curl --silent checkip.amazonaws.com`\n')
            script_file_id.write( 'export HOST_ADDRESS="ec2-${HOST_IP//./-}-compute-1.amazonaws.com"\n')
            script_file_id.write( 'export AWS_CONFIG_FILE=/home/ubuntu/.aws/config\n')
            script_file_id.write( 'export AWS_SHARED_CREDENTIALS_FILE=/home/ubuntu/.aws/credentials\n')
            script_file_id.write( '#-------------------------------------------------------------------------------\n')
            # --- status files used to signal the outcome of the run ---
            script_file_id.write(f'STATUS_DIR={xlib.get_status_dir(current_run_dir)}\n')
            script_file_id.write(f'SCRIPT_STATUS_OK={xlib.get_status_ok(current_run_dir)}\n')
            script_file_id.write(f'SCRIPT_STATUS_WRONG={xlib.get_status_wrong(current_run_dir)}\n')
            script_file_id.write( 'mkdir --parents $STATUS_DIR\n')
            script_file_id.write( 'if [ -f $SCRIPT_STATUS_OK ]; then rm $SCRIPT_STATUS_OK; fi\n')
            script_file_id.write( 'if [ -f $SCRIPT_STATUS_WRONG ]; then rm $SCRIPT_STATUS_WRONG; fi\n')
            script_file_id.write( '#-------------------------------------------------------------------------------\n')
            # --- shell function: init (start banner with timestamps and host data) ---
            script_file_id.write( 'function init\n')
            script_file_id.write( '{\n')
            script_file_id.write( '    INIT_DATETIME=`date --utc +%s`\n')
            script_file_id.write( '    FORMATTED_INIT_DATETIME=`date --date="@$INIT_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n')
            script_file_id.write( '    echo "$SEP"\n')
            script_file_id.write( '    echo "Script started at $FORMATTED_INIT_DATETIME+00:00."\n')
            script_file_id.write( '    echo "$SEP"\n')
            script_file_id.write(f'    echo "CLUSTER: {cluster_name}"\n')
            script_file_id.write( '    echo "HOST NAME: $HOSTNAME"\n')
            script_file_id.write( '    echo "HOST IP: $HOST_IP"\n')
            script_file_id.write( '    echo "HOST ADDRESS: $HOST_ADDRESS"\n')
            script_file_id.write( '}\n')
            script_file_id.write( '#-------------------------------------------------------------------------------\n')
            # --- shell function: run_gzip_process (the actual work) ---
            script_file_id.write( '{0}\n'.format('function run_gzip_process'))
            script_file_id.write( '{\n')
            if dataset_type_2 in ['reference', 'database', 'read', 'result']:
                # per-file mode: one timed gzip command per configured file
                script_file_id.write(f'    cd {current_run_dir}\n')
                for i in range(len(dataset_subdirectory_list)):
                    script_file_id.write( '    echo "$SEP"\n')
                    script_file_id.write( '{0}\n'.format('    echo "Compressing/decompressing {0}/{1}/{2} ..."'.format(dataset_dir, dataset_subdirectory_list[i], file_name_list[i])))
                    script_file_id.write( '    /usr/bin/time \\\n')
                    script_file_id.write( '{0}\n'.format('        --format="Elapsed real time (s): %e\\nCPU time in kernel mode (s): %S\\nCPU time in user mode (s): %U\\nPercentage of CPU: %P\\nMaximum resident set size(Kb): %M\\nAverage total memory use (Kb):%K" \\'))
                    if action == 'compress':
                        script_file_id.write( '{0}\n'.format('        gzip {0}/{1}/{2}'.format(dataset_dir, dataset_subdirectory_list[i], file_name_list[i])))
                    elif action == 'decompress':
                        script_file_id.write( '{0}\n'.format('        gzip --decompress {0}/{1}/{2}'.format(dataset_dir, dataset_subdirectory_list[i], file_name_list[i])))
                    script_file_id.write( '    RC=$?\n')
                    script_file_id.write( '{0}\n'.format('    if [ $RC -ne 0 ]; then manage_error gzip $RC; fi'))
            elif dataset_type_2 == 'whole-result':
                # whole-dataset mode: tar the directory (or extract the archive),
                # then remove dataset_dir
                # NOTE(review): the rm -rf step runs for both compress and
                # decompress actions — confirm removal is intended after decompress.
                script_file_id.write(f'    cd {current_run_dir}\n')
                script_file_id.write( '    echo "$SEP"\n')
                script_file_id.write( '{0}\n'.format('    echo "Compressing/decompressing {0} ..."'.format(dataset_dir)))
                script_file_id.write( '    /usr/bin/time \\\n')
                script_file_id.write( '{0}\n'.format('        --format="Elapsed real time (s): %e\\nCPU time in kernel mode (s): %S\\nCPU time in user mode (s): %U\\nPercentage of CPU: %P\\nMaximum resident set size(Kb): %M\\nAverage total memory use (Kb):%K" \\'))
                if action == 'compress':
                    script_file_id.write( '{0}\n'.format('        tar --create --gzip --verbose --file={0}.tar.gz {0}'.format(dataset_dir)))
                elif action == 'decompress':
                    script_file_id.write( '{0}\n'.format('        tar --extract --gzip --verbose --file={0} --directory=/'.format(dataset_dir)))
                script_file_id.write( '    RC=$?\n')
                script_file_id.write( '    if [ $RC -ne 0 ]; then manage_error tar $RC; fi\n')
                script_file_id.write( '    echo "$SEP"\n')
                script_file_id.write( '{0}\n'.format('    echo "Removing {0} ..."'.format(dataset_dir)))
                script_file_id.write( '    /usr/bin/time \\\n')
                script_file_id.write( '{0}\n'.format('        --format="Elapsed real time (s): %e\\nCPU time in kernel mode (s): %S\\nCPU time in user mode (s): %U\\nPercentage of CPU: %P\\nMaximum resident set size(Kb): %M\\nAverage total memory use (Kb):%K" \\'))
                script_file_id.write( '{0}\n'.format('        rm -rf {0}'.format(dataset_dir)))
                script_file_id.write( '    RC=$?\n')
                script_file_id.write( '{0}\n'.format('    if [ $RC -ne 0 ]; then manage_error rm $RC; fi'))
            script_file_id.write( '}\n')
            script_file_id.write( '#-------------------------------------------------------------------------------\n')
            # --- shell function: end (success epilogue: mail, status file, exit 0) ---
            script_file_id.write( 'function end\n')
            script_file_id.write( '{\n')
            script_file_id.write( '    END_DATETIME=`date --utc +%s`\n')
            script_file_id.write( '    FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n')
            script_file_id.write( '    calculate_duration\n')
            script_file_id.write( '    echo "$SEP"\n')
            script_file_id.write( '    echo "Script ended OK at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."\n')
            script_file_id.write( '    echo "$SEP"\n')
            script_file_id.write( '    send_mail ok\n')
            script_file_id.write( '    touch $SCRIPT_STATUS_OK\n')
            script_file_id.write( '    exit 0\n')
            script_file_id.write( '}\n')
            script_file_id.write( '#-------------------------------------------------------------------------------\n')
            # --- shell function: manage_error (failure epilogue: mail, status file, exit 3) ---
            script_file_id.write( 'function manage_error\n')
            script_file_id.write( '{\n')
            script_file_id.write( '    END_DATETIME=`date --utc +%s`\n')
            script_file_id.write( '    FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n')
            script_file_id.write( '    calculate_duration\n')
            script_file_id.write( '    echo "$SEP"\n')
            script_file_id.write( '    echo "ERROR: $1 returned error $2"\n')
            script_file_id.write( '    echo "Script ended WRONG at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."\n')
            script_file_id.write( '    echo "$SEP"\n')
            script_file_id.write( '    send_mail wrong\n')
            script_file_id.write( '    touch $SCRIPT_STATUS_WRONG\n')
            script_file_id.write( '    exit 3\n')
            script_file_id.write( '}\n')
            script_file_id.write( '#-------------------------------------------------------------------------------\n')
            # --- shell function: send_mail (notify result through AWS SES) ---
            process_name = f'{xlib.get_gzip_name()} process'
            mail_message_ok = xlib.get_mail_message_ok(process_name, cluster_name)
            mail_message_wrong = xlib.get_mail_message_wrong(process_name, cluster_name)
            script_file_id.write( 'function send_mail\n')
            script_file_id.write( '{\n')
            script_file_id.write(f'    SUBJECT="{xlib.get_project_name()}: {process_name}"\n')
            script_file_id.write( '    if [ "$1" == "ok" ]; then\n')
            script_file_id.write(f'        MESSAGE="{mail_message_ok}"\n')
            script_file_id.write( '    elif [ "$1" == "wrong" ]; then\n')
            script_file_id.write(f'        MESSAGE="{mail_message_wrong}"\n')
            script_file_id.write( '    else\n')
            script_file_id.write( '        MESSAGE=""\n')
            script_file_id.write( '    fi\n')
            # the destination JSON file for "aws ses send-email"
            script_file_id.write( '    DESTINATION_FILE=mail-destination.json\n')
            script_file_id.write( '    echo "{" > $DESTINATION_FILE\n')
            script_file_id.write(f'    echo " \\\"ToAddresses\\\": [\\\"{xconfiguration.get_contact_data()}\\\"]," >> $DESTINATION_FILE\n')
            script_file_id.write( '    echo " \\\"CcAddresses\\\": []," >> $DESTINATION_FILE\n')
            script_file_id.write( '    echo " \\\"BccAddresses\\\": []" >> $DESTINATION_FILE\n')
            script_file_id.write( '    echo "}" >> $DESTINATION_FILE\n')
            # the message JSON file for "aws ses send-email"
            script_file_id.write( '    MESSAGE_FILE=mail-message.json\n')
            script_file_id.write( '    echo "{" > $MESSAGE_FILE\n')
            script_file_id.write( '    echo " \\\"Subject\\\": {" >> $MESSAGE_FILE\n')
            script_file_id.write( '    echo " \\\"Data\\\": \\\"$SUBJECT\\\"," >> $MESSAGE_FILE\n')
            script_file_id.write( '    echo " \\\"Charset\\\": \\\"UTF-8\\\"" >> $MESSAGE_FILE\n')
            script_file_id.write( '    echo " }," >> $MESSAGE_FILE\n')
            script_file_id.write( '    echo " \\\"Body\\\": {" >> $MESSAGE_FILE\n')
            script_file_id.write( '    echo " \\\"Html\\\": {" >> $MESSAGE_FILE\n')
            script_file_id.write( '    echo " \\\"Data\\\": \\\"$MESSAGE\\\"," >> $MESSAGE_FILE\n')
            script_file_id.write( '    echo " \\\"Charset\\\": \\\"UTF-8\\\"" >> $MESSAGE_FILE\n')
            script_file_id.write( '    echo " }" >> $MESSAGE_FILE\n')
            script_file_id.write( '    echo " }" >> $MESSAGE_FILE\n')
            script_file_id.write( '    echo "}" >> $MESSAGE_FILE\n')
            script_file_id.write(f'    aws ses send-email --from {xconfiguration.get_contact_data()} --destination file://$DESTINATION_FILE --message file://$MESSAGE_FILE\n')
            script_file_id.write( '}\n')
            script_file_id.write( '#-------------------------------------------------------------------------------\n')
            # --- shell function: calculate_duration (HH:MM:SS from the two timestamps) ---
            script_file_id.write( 'function calculate_duration\n')
            script_file_id.write( '{\n')
            script_file_id.write( '    DURATION=`expr $END_DATETIME - $INIT_DATETIME`\n')
            script_file_id.write( '    HH=`expr $DURATION / 3600`\n')
            script_file_id.write( '    MM=`expr $DURATION % 3600 / 60`\n')
            script_file_id.write( '    SS=`expr $DURATION % 60`\n')
            script_file_id.write( '    FORMATTED_DURATION=`printf "%03d:%02d:%02d\\n" $HH $MM $SS`\n')
            script_file_id.write( '}\n')
            script_file_id.write( '#-------------------------------------------------------------------------------\n')
            # --- main sequence of the generated script ---
            script_file_id.write( 'init\n')
            script_file_id.write( '{0}\n'.format('run_gzip_process'))
            script_file_id.write( 'end\n')
    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append('*** ERROR: The file {0} can not be created'.format(get_gzip_process_script(dataset_type_2)))
        OK = False

    # return the control variable and the error list
    return (OK, error_list)
def validate_gmap_config_file(strict):
    '''
    Validate the GMAP config file of a run.

    :param strict: unused here; kept for interface compatibility with the
        other validate_*_config_file functions of this module.
    :return: (OK, error_list) — OK is True when the config file is valid;
        error_list holds one message per detected problem.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # initialize the variable used when a value is not found
    not_found = '***NOTFOUND***'.upper()

    # get the option dictionary (any parsing problem is reported as a syntax error)
    try:
        gmap_option_dict = xlib.get_option_dict(get_gmap_config_file())
    except Exception:
        error_list.append('*** ERROR: The syntax is WRONG.')
        OK = False
    else:

        # get the sections list
        sections_list = sorted(gmap_option_dict.keys())

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append('*** ERROR: the section "identification" is not found.')
            OK = False
        else:

            # check section "identification" - key "experiment_id"
            experiment_id = gmap_option_dict.get('identification', {}).get('experiment_id', not_found)
            if experiment_id == not_found:
                error_list.append('*** ERROR: the key "experiment_id" is not found in the section "identification".')
                OK = False

            # check section "identification" - key "reference_dataset_id"
            reference_dataset_id = gmap_option_dict.get('identification', {}).get('reference_dataset_id', not_found)
            if reference_dataset_id == not_found:
                error_list.append('*** ERROR: the key "reference_dataset_id" is not found in the section "identification".')
                OK = False

            # check section "identification" - key "reference_file"
            reference_file = gmap_option_dict.get('identification', {}).get('reference_file', not_found)
            if reference_file == not_found:
                error_list.append('*** ERROR: the key "reference_file" is not found in the section "identification".')
                OK = False

            # check section "identification" - key "assembly_software"
            assembly_software = gmap_option_dict.get('identification', {}).get('assembly_software', not_found)
            if assembly_software == not_found:
                error_list.append('*** ERROR: the key "assembly_software" is not found in the section "identification".')
                OK = False
            elif assembly_software not in [xlib.get_soapdenovotrans_code(), xlib.get_transabyss_code(), xlib.get_trinity_code(), xlib.get_star_code(), xlib.get_cd_hit_est_code(), xlib.get_transcript_filter_code()]:
                # fixed: the last "OR" is now lowercase "or", consistent with the rest of the message
                error_list.append('*** ERROR: the key "assembly_software" value in the section "identification" must be {0} or {1} or {2} or {3} or {4} or {5}.'.format(xlib.get_soapdenovotrans_code(), xlib.get_transabyss_code(), xlib.get_trinity_code(), xlib.get_star_code(), xlib.get_cd_hit_est_code(), xlib.get_transcript_filter_code()))
                OK = False

            # check section "identification" - key "assembly_dataset_id"
            assembly_dataset_id = gmap_option_dict.get('identification', {}).get('assembly_dataset_id', not_found)
            if assembly_dataset_id == not_found:
                error_list.append('*** ERROR: the key "assembly_dataset_id" is not found in the section "identification".')
                OK = False
            elif not assembly_dataset_id.startswith(xlib.get_soapdenovotrans_code()) and not assembly_dataset_id.startswith(xlib.get_transabyss_code()) and not assembly_dataset_id.startswith(xlib.get_trinity_code()) and not assembly_dataset_id.startswith(xlib.get_star_code()) and not assembly_dataset_id.startswith(xlib.get_cd_hit_est_code()) and not assembly_dataset_id.startswith(xlib.get_transcript_filter_code()):
                # fixed: the last placeholder used get_transcript_filter_code();
                # the message lists software *names*, so use get_transcript_filter_name()
                error_list.append('*** ERROR: the key "assembly_dataset_id" value is not a {0} nor {1} nor {2} nor {3} nor {4} nor {5} assembly.'.format(xlib.get_soapdenovotrans_name(), xlib.get_transabyss_name(), xlib.get_trinity_name(), xlib.get_star_name(), xlib.get_cd_hit_est_name(), xlib.get_transcript_filter_name()))
                OK = False

            # check section "identification" - key "assembly_type"
            assembly_type = gmap_option_dict.get('identification', {}).get('assembly_type', not_found)
            if assembly_type == not_found:
                error_list.append('*** ERROR: the key "assembly_type" is not found in the section "identification".')
                OK = False
            elif assembly_dataset_id.startswith(xlib.get_soapdenovotrans_code()):
                # SOAPdenovo-Trans assemblies must say whether contigs or scaffolds are used
                if assembly_type.upper() not in ['CONTIGS', 'SCAFFOLDS']:
                    error_list.append('*** ERROR: the key "assembly_type" must be "CONTIGS" or "SCAFFOLDS" when {0} is the assembly software.'.format(xlib.get_soapdenovotrans_name()))
                    OK = False
            elif assembly_dataset_id.startswith(xlib.get_transabyss_code()) or assembly_dataset_id.startswith(xlib.get_trinity_code()) or assembly_dataset_id.startswith(xlib.get_star_code()) or assembly_dataset_id.startswith(xlib.get_cd_hit_est_code()) or assembly_dataset_id.startswith(xlib.get_transcript_filter_code()):
                # every other assembler produces a single output, so the type must be NONE
                if assembly_type.upper() != 'NONE':
                    error_list.append('*** ERROR: the key "assembly_type" must be "NONE" when {0} or {1} or {2} or {3} or {4} is the assembly software.'.format(xlib.get_transabyss_name(), xlib.get_trinity_name(), xlib.get_star_name(), xlib.get_cd_hit_est_name(), xlib.get_transcript_filter_name()))
                    OK = False

        # check section "GMAP parameters"
        if 'GMAP parameters' not in sections_list:
            error_list.append('*** ERROR: the section "GMAP parameters" is not found.')
            OK = False
        else:

            # check section "GMAP parameters" - key "threads" (integer >= 1)
            threads = gmap_option_dict.get('GMAP parameters', {}).get('threads', not_found)
            if threads == not_found:
                error_list.append('*** ERROR: the key "threads" is not found in the section "GMAP parameters".')
                OK = False
            else:
                try:
                    if int(threads) < 1:
                        error_list.append('*** ERROR: the key "threads" in the section "GMAP parameters" must be an integer value greater or equal to 1.')
                        OK = False
                except Exception:
                    error_list.append('*** ERROR: the key "threads" in the section "GMAP parameters" must be an integer value greater or equal to 1.')
                    OK = False

            # check section "GMAP parameters" - key "kmer" (NONE or integer in [1, 16])
            kmer = gmap_option_dict.get('GMAP parameters', {}).get('kmer', not_found)
            if kmer == not_found:
                error_list.append('*** ERROR: the key "kmer" is not found in the section "GMAP parameters".')
                OK = False
            else:
                try:
                    if kmer.upper() != 'NONE' and (int(kmer) < 1 or int(kmer) > 16):
                        error_list.append('*** ERROR: the key "kmer" in the section "GMAP parameters" must be an integer value between 1 and 16 or NONE.')
                        OK = False
                except Exception:
                    error_list.append('*** ERROR: the key "kmer" in the section "GMAP parameters" must be an integer value between 1 and 16 or NONE.')
                    OK = False

            # check section "GMAP parameters" - key "sampling" (NONE or integer >= 1)
            sampling = gmap_option_dict.get('GMAP parameters', {}).get('sampling', not_found)
            if sampling == not_found:
                error_list.append('*** ERROR: the key "sampling" is not found in the section "GMAP parameters".')
                OK = False
            else:
                try:
                    if sampling.upper() != 'NONE' and int(sampling) < 1:
                        error_list.append('*** ERROR: the key "sampling" in the section "GMAP parameters" must be an integer value greater or equal to 1 or NONE.')
                        OK = False
                except Exception:
                    error_list.append('*** ERROR: the key "sampling" in the section "GMAP parameters" must be an integer value greater or equal to 1 or NONE.')
                    OK = False

            # check section "GMAP parameters" - key "input-buffer-size" (integer >= 1)
            input_buffer_size = gmap_option_dict.get('GMAP parameters', {}).get('input-buffer-size', not_found)
            if input_buffer_size == not_found:
                error_list.append('*** ERROR: the key "input-buffer-size" is not found in the section "GMAP parameters".')
                OK = False
            else:
                try:
                    if int(input_buffer_size) < 1:
                        error_list.append('*** ERROR: the key "input-buffer-size" in the section "GMAP parameters" must be an integer value greater or equal to 1.')
                        OK = False
                except Exception:
                    error_list.append('*** ERROR: the key "input-buffer-size" in the section "GMAP parameters" must be an integer value greater or equal to 1.')
                    OK = False

            # check section "GMAP parameters" - key "output-buffer-size" (integer >= 1)
            output_buffer_size = gmap_option_dict.get('GMAP parameters', {}).get('output-buffer-size', not_found)
            if output_buffer_size == not_found:
                error_list.append('*** ERROR: the key "output-buffer-size" is not found in the section "GMAP parameters".')
                OK = False
            else:
                try:
                    if int(output_buffer_size) < 1:
                        error_list.append('*** ERROR: the key "output-buffer-size" in the section "GMAP parameters" must be an integer value greater or equal to 1.')
                        OK = False
                except Exception:
                    error_list.append('*** ERROR: the key "output-buffer-size" in the section "GMAP parameters" must be an integer value greater or equal to 1.')
                    OK = False

            # check section "GMAP parameters" - key "prunelevel" (one of 0..3)
            prunelevel = gmap_option_dict.get('GMAP parameters', {}).get('prunelevel', not_found)
            if prunelevel == not_found:
                error_list.append('*** ERROR: the key "prunelevel" is not found in the section "GMAP parameters".')
                OK = False
            elif prunelevel not in ['0', '1', '2', '3']:
                error_list.append('*** ERROR: the key "prunelevel" in the section "GMAP parameters" must be 0 (no pruning) or 1 (poor seqs) or 2 (repetitive seqs) or 3 (poor and repetitive).')
                OK = False

            # check section "GMAP parameters" - key "format"
            # (local renamed from "format" to avoid shadowing the builtin)
            format_value = gmap_option_dict.get('GMAP parameters', {}).get('format', not_found)
            if format_value == not_found:
                error_list.append('*** ERROR: the key "format" is not found in the section "GMAP parameters".')
                OK = False
            elif format_value.upper() not in ['COMPRESS', 'SUMMARY', 'ALIGN', 'PLS', 'GFF3_GENE', 'SPLICESITES', 'INTRONS', 'MAP_EXONS', 'MAP_RANGES', 'COORDS']:
                error_list.append('*** ERROR: the key "format" in the section "GMAP parameters" must be COMPRESS or SUMMARY or ALIGN or PLS or GFF3_GENE or SPLICESITES or INTRONS or MAP_EXONS or MAP_RANGES or COORDS.')
                OK = False

            # check section "GMAP parameters" - key "other_parameters"
            # parameters controlled by NGScloud may not be repeated here
            not_allowed_parameters_list = ['nthreads', 'kmer', 'sampling', 'input-buffer-size', 'output-buffer-size', 'prunelevel', 'compress', 'summary', 'align', 'format']
            other_parameters = gmap_option_dict.get('GMAP parameters', {}).get('other_parameters', not_found)
            if other_parameters == not_found:
                error_list.append('*** ERROR: the key "other_parameters" is not found in the section "GMAP parameters".')
                OK = False
            elif other_parameters.upper() != 'NONE':
                # parameters come as a semicolon-separated list of "--name" or "--name=value"
                parameter_list = [x.strip() for x in other_parameters.split(';')]
                for parameter in parameter_list:
                    try:
                        if parameter.find('=') > 0:
                            mo = re.search(r'^--(.+)=(.+)$', parameter)
                        else:
                            mo = re.search(r'^--(.+)$', parameter)
                        # mo is None when the parameter is malformed -> AttributeError -> handled below
                        parameter_name = mo.group(1).strip()
                    except Exception:
                        error_list.append('*** ERROR: the value of the key "other_parameters" in the section "GMAP parameters" must be NONE or a valid parameter list.')
                        OK = False
                        break
                    else:
                        if parameter_name in not_allowed_parameters_list:
                            error_list.append('*** ERROR: the parameter {0} is not allowed in the key "other_parameters" of the section "GMAP parameters" because it is controled by NGScloud.'.format(parameter_name))
                            OK = False

    # warn that the results config file is not valid if there are any errors
    if not OK:
        error_list.append('\nThe {0} config file is not valid. Please, correct this file or recreate it.'.format(xlib.get_gmap_name()))

    # return the control variable and the error list
    return (OK, error_list)
def upload_database_dataset(cluster_name, log, function=None):
    '''
    Upload the database dataset to the cluster.

    :param cluster_name: name of the cluster to connect to.
    :param log: log object with a write() method; when it is not a
        xlib.DevStdOut, extra window-handling messages are written.
    :param function: optional callable executed at the end of the process.
    :return: True when every step succeeded, False otherwise.
    '''

    # initialize the control variable
    OK = True

    # warn that the log window does not have to be closed
    if not isinstance(log, xlib.DevStdOut):
        log.write('This process might take several minutes. Do not close this window, please wait!\n')

    # check the database transfer config file
    log.write(f'{xlib.get_separator()}\n')
    log.write('Checking the database transfer config file ...\n')
    # NOTE(review): sibling validators in this module return an (OK, error_list)
    # tuple; if check_database_transfer_config_file() does too, this condition is
    # always truthy and invalid files would never be reported — confirm its
    # return type is a plain bool.
    if check_database_transfer_config_file(strict=True):
        log.write('The file is OK.\n')
    else:
        log.write('*** ERROR: The database transfer config file is not valid.\n')
        log.write('Please correct this file or recreate the config files.\n')
        OK = False

    # create the SSH client connection
    if OK:
        (OK, error_list, ssh_client) = xssh.create_ssh_client_connection(cluster_name)
        for error in error_list:
            log.write(f'{error}\n')

    # create the SSH transport connection
    if OK:
        (OK, error_list, ssh_transport) = xssh.create_ssh_transport_connection(cluster_name)
        for error in error_list:
            log.write(f'{error}\n')

    # create the SFTP client
    if OK:
        sftp_client = xssh.create_sftp_client(ssh_transport)

    # upload the database dataset
    if OK:

        # get the option dictionary
        database_transfer_options_dict = xlib.get_option_dict(get_database_transfer_config_file())

        # get the database dataset identification and the local directory of the database files
        database_dataset_id = database_transfer_options_dict['identification']['database_dataset_id']
        local_dir = database_transfer_options_dict['identification']['local_dir']

        # set the cluster database directory
        cluster_database_dir = '{0}/{1}'.format(xlib.get_cluster_database_dir(), database_dataset_id)

        # create the data directory in the cluster
        log.write(f'{xlib.get_separator()}\n')
        log.write('The database directory {0} in the cluster is being created ...\n'.format(cluster_database_dir))
        command = 'mkdir --parents {0}'.format(cluster_database_dir)
        (OK, stdout, stderr) = xssh.execute_cluster_command(ssh_client, command)
        if OK:
            log.write('The directory is created.\n')
        else:
            log.write(f'*** ERROR: Wrong command ---> {command}\n')

        # get the sections list
        # NOTE(review): when the mkdir above failed, OK is False but the file
        # loop below still runs and attempts the uploads — confirm intended.
        sections_list = []
        for section in database_transfer_options_dict.keys():
            sections_list.append(section)
        sections_list.sort()

        # for each section "file-n"
        for section in sections_list:

            # check than the section identification is like file-n
            if re.match('^file-[0-9]+$', section):

                # get the file name
                file_name = database_transfer_options_dict[section]['file_name']

                # set the local path and cluster path
                local_path = '{0}/{1}'.format(local_dir, file_name)
                cluster_path = '{0}/{1}'.format(cluster_database_dir, file_name)

                # upload the database file to the cluster; stop at the first failure
                log.write(f'{xlib.get_separator()}\n')
                log.write('The file {0} is being uploaded to {1} ...\n'.format(file_name, cluster_database_dir))
                (OK, error_list) = xssh.put_file(sftp_client, local_path, cluster_path)
                if OK:
                    log.write('The file has been uploaded.\n')
                else:
                    for error in error_list:
                        log.write(f'{error}\n')
                    break

    # close the SSH transport connection
    # NOTE(review): the connections are closed only while OK is still True, so
    # they leak whenever any step above failed — consider closing unconditionally.
    if OK:
        xssh.close_ssh_transport_connection(ssh_transport)

    # close the SSH client connection
    if OK:
        xssh.close_ssh_client_connection(ssh_client)

    # warn that the log window can be closed
    if not isinstance(log, xlib.DevStdOut):
        log.write(f'{xlib.get_separator()}\n')
        log.write('You can close this window now.\n')

    # execute final function
    if function is not None:
        function()

    # return the control variable
    return OK
def build_express_process_script(cluster_name, current_run_dir):
    '''
    Build the current eXpress process script.

    Parameters:
        cluster_name: the name of the cluster where the script will run.
        current_run_dir: the cluster run directory of the process.

    Returns:
        (OK, error_list): the control variable and the error message list.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # get the eXpress option dictionary
    express_option_dict = xlib.get_option_dict(get_express_config_file())

    # get the options
    experiment_id = express_option_dict['identification']['experiment_id']
    assembly_software = express_option_dict['identification']['assembly_software']
    assembly_dataset_id = express_option_dict['identification']['assembly_dataset_id']
    assembly_type = express_option_dict['identification']['assembly_type']
    frag_len_mean = express_option_dict['eXpress parameters']['frag-len-mean']
    frag_len_stddev = express_option_dict['eXpress parameters']['frag-len-stddev']
    library_type = express_option_dict['eXpress parameters']['library_type']
    max_indel_size = express_option_dict['eXpress parameters']['max-indel-size']
    no_bias_correct = express_option_dict['eXpress parameters']['no-bias-correct']
    no_error_model = express_option_dict['eXpress parameters']['no-error-model']
    other_parameters = express_option_dict['eXpress parameters']['other_parameters']

    # get the sections list
    sections_list = []
    for section in express_option_dict.keys():
        sections_list.append(section)
    sections_list.sort()

    # build alignment dataset identification list
    alignment_software_list = []
    alignment_dataset_id_list = []
    for section in sections_list:
        # if the section identification is like alignment-dataset-n
        if re.match('^alignment-dataset-[0-9]+$', section):
            alignment_software_list.append(express_option_dict[section]['alignment_software'])
            alignment_dataset_id_list.append(express_option_dict[section]['alignment_dataset_id'])

    # set the transcriptome file path depending on the assembly software
    # NOTE(review): if assembly_software matches none of these codes,
    # transcriptome_file stays unbound — presumably prevented by the config
    # file check; confirm.
    if assembly_software == xlib.get_soapdenovotrans_code():
        if assembly_type == 'CONTIGS':
            transcriptome_file = f'{xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id)}/{experiment_id}-{assembly_dataset_id}.contig'
        elif assembly_type == 'SCAFFOLDS':
            transcriptome_file = f'{xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id)}/{experiment_id}-{assembly_dataset_id}.scafSeq'
    elif assembly_software == xlib.get_transabyss_code():
        transcriptome_file = f'{xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id)}/transabyss-final.fa'
    elif assembly_software == xlib.get_trinity_code():
        transcriptome_file = f'{xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id)}/Trinity.fasta'
    elif assembly_software == xlib.get_ggtrinity_code():
        transcriptome_file = f'{xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id)}/Trinity-GG.fasta'
    elif assembly_software == xlib.get_cd_hit_est_code():
        transcriptome_file = f'{xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id)}/clustered-transcriptome.fasta'
    elif assembly_software == xlib.get_transcript_filter_code():
        transcriptome_file = f'{xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id)}/filtered-transcriptome.fasta'

    # write the eXpress process script
    try:
        if not os.path.exists(os.path.dirname(get_express_process_script())):
            os.makedirs(os.path.dirname(get_express_process_script()))
        with open(get_express_process_script(), mode='w', encoding='iso-8859-1', newline='\n') as script_file_id:
            script_file_id.write( '#!/bin/bash\n')
            script_file_id.write( '#-------------------------------------------------------------------------------\n')
            script_file_id.write( 'SEP="#########################################"\n')
            script_file_id.write( 'export HOST_IP=`curl --silent checkip.amazonaws.com`\n')
            script_file_id.write( 'export HOST_ADDRESS="ec2-${HOST_IP//./-}-compute-1.amazonaws.com"\n')
            script_file_id.write( 'export AWS_CONFIG_FILE=/home/ubuntu/.aws/config\n')
            script_file_id.write( 'export AWS_SHARED_CREDENTIALS_FILE=/home/ubuntu/.aws/credentials\n')
            script_file_id.write( '#-------------------------------------------------------------------------------\n')
            script_file_id.write(f'MINICONDA3_BIN_PATH={xlib.get_cluster_app_dir()}/{xlib.get_miniconda3_name()}/bin\n')
            script_file_id.write(f'export PATH=$MINICONDA3_BIN_PATH:$PATH\n')
            script_file_id.write( '#-------------------------------------------------------------------------------\n')
            script_file_id.write(f'CURRENT_DIR={current_run_dir}\n')
            script_file_id.write( '#-------------------------------------------------------------------------------\n')
            script_file_id.write(f'STATUS_DIR={xlib.get_status_dir(current_run_dir)}\n')
            script_file_id.write(f'SCRIPT_STATUS_OK={xlib.get_status_ok(current_run_dir)}\n')
            script_file_id.write(f'SCRIPT_STATUS_WRONG={xlib.get_status_wrong(current_run_dir)}\n')
            script_file_id.write( 'mkdir --parents $STATUS_DIR\n')
            script_file_id.write( 'if [ -f $SCRIPT_STATUS_OK ]; then rm $SCRIPT_STATUS_OK; fi\n')
            script_file_id.write( 'if [ -f $SCRIPT_STATUS_WRONG ]; then rm $SCRIPT_STATUS_WRONG; fi\n')
            script_file_id.write( '#-------------------------------------------------------------------------------\n')
            script_file_id.write( 'function init\n')
            script_file_id.write( '{\n')
            script_file_id.write( '    INIT_DATETIME=`date --utc +%s`\n')
            script_file_id.write( '    FORMATTED_INIT_DATETIME=`date --date="@$INIT_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n')
            script_file_id.write( '    echo "$SEP"\n')
            script_file_id.write( '    echo "Script started at $FORMATTED_INIT_DATETIME+00:00."\n')
            script_file_id.write( '    echo "$SEP"\n')
            script_file_id.write(f'    echo "CLUSTER: {cluster_name}"\n')
            script_file_id.write( '    echo "HOST NAME: $HOSTNAME"\n')
            script_file_id.write( '    echo "HOST IP: $HOST_IP"\n')
            script_file_id.write( '    echo "HOST ADDRESS: $HOST_ADDRESS"\n')
            script_file_id.write( '}\n')
            script_file_id.write( '#-------------------------------------------------------------------------------\n')
            script_file_id.write( 'function run_express_process\n')
            script_file_id.write( '{\n')
            script_file_id.write(f'    source activate {xlib.get_express_anaconda_code()}\n')
            script_file_id.write(f'    cd $CURRENT_DIR\n')
            # emit one quantitation loop per alignment dataset
            for i in range(len(alignment_dataset_id_list)):
                alignment_files = f'{xlib.get_cluster_experiment_result_dataset_dir(experiment_id, alignment_dataset_id_list[i])}/*.sorted.bam'
                script_file_id.write(f'    SORTED_BAM_LIST={alignment_dataset_id_list[i]}-sorted-bam-files.txt\n')
                script_file_id.write(f'    ls {alignment_files} > $SORTED_BAM_LIST\n')
                script_file_id.write( '    while read FILE_BAM; do\n')
                script_file_id.write( '        NAME=`basename $FILE_BAM`\n')
                # strip the ".sorted.bam" suffix (11 characters) from the file name
                script_file_id.write( '        NAME=${NAME:0:-11}\n')
                script_file_id.write(f'        SUBDIR={alignment_dataset_id_list[i]}-$NAME\n')
                script_file_id.write(f'        mkdir --parents $CURRENT_DIR/$SUBDIR\n')
                script_file_id.write( '        echo "$SEP"\n')
                script_file_id.write(f'        echo "Quantitating alignment dataset {alignment_dataset_id_list[i]} - library $SUBDIR ..."\n')
                script_file_id.write( '        /usr/bin/time \\\n')
                script_file_id.write(f'            --format="{xlib.get_time_output_format(separator=False)}" \\\n')
                script_file_id.write( '            express \\\n')
                script_file_id.write( '                --no-update-check \\\n')
                script_file_id.write(f'                --frag-len-mean {frag_len_mean} \\\n')
                script_file_id.write(f'                --frag-len-stddev {frag_len_stddev} \\\n')
                if library_type.lower() == 'fr-stranded':
                    script_file_id.write( '                --fr-stranded \\\n')
                elif library_type.lower() == 'rf-stranded':
                    script_file_id.write( '                --rf-stranded \\\n')
                elif library_type.lower() == 'f-stranded':
                    script_file_id.write( '                --f-stranded \\\n')
                elif library_type.lower() == 'r-stranded':
                    script_file_id.write( '                --r-stranded \\\n')
                script_file_id.write(f'                --max-indel-size {max_indel_size} \\\n')
                if no_bias_correct.upper() == 'YES':
                    script_file_id.write( '                --no-bias-correct \\\n')
                if no_error_model.upper() == 'YES':
                    script_file_id.write( '                --no-error-model \\\n')
                if other_parameters.upper() != 'NONE':
                    parameter_list = [x.strip() for x in other_parameters.split(';')]
                    # bug fix: the inner loop variable used to be "i", shadowing
                    # the outer alignment-dataset loop variable
                    for j in range(len(parameter_list)):
                        if parameter_list[j].find('=') > 0:
                            pattern = r'^--(.+)=(.+)$'
                            mo = re.search(pattern, parameter_list[j])
                            parameter_name = mo.group(1).strip()
                            parameter_value = mo.group(2).strip()
                            script_file_id.write(f'                --{parameter_name}={parameter_value} \\\n')
                        else:
                            pattern = r'^--(.+)$'
                            mo = re.search(pattern, parameter_list[j])
                            parameter_name = mo.group(1).strip()
                            script_file_id.write(f'                --{parameter_name} \\\n')
                script_file_id.write( '                --output-dir $CURRENT_DIR/$SUBDIR \\\n')
                script_file_id.write(f'                {transcriptome_file} \\\n')
                script_file_id.write( '                $FILE_BAM\n')
                script_file_id.write( '        RC=$?\n')
                script_file_id.write( '        if [ $RC -ne 0 ]; then manage_error express $RC; fi\n')
                script_file_id.write( '        echo "Quantitation is done."\n')
                script_file_id.write( '    done < $SORTED_BAM_LIST\n')
            script_file_id.write( '    conda deactivate\n')
            script_file_id.write( '}\n')
            script_file_id.write( '#-------------------------------------------------------------------------------\n')
            script_file_id.write( 'function end\n')
            script_file_id.write( '{\n')
            script_file_id.write( '    END_DATETIME=`date --utc +%s`\n')
            script_file_id.write( '    FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n')
            script_file_id.write( '    calculate_duration\n')
            script_file_id.write( '    echo "$SEP"\n')
            script_file_id.write( '    echo "Script ended OK at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."\n')
            script_file_id.write( '    echo "$SEP"\n')
            script_file_id.write( '    send_mail ok\n')
            script_file_id.write( '    touch $SCRIPT_STATUS_OK\n')
            script_file_id.write( '    exit 0\n')
            script_file_id.write( '}\n')
            script_file_id.write( '#-------------------------------------------------------------------------------\n')
            script_file_id.write( 'function manage_error\n')
            script_file_id.write( '{\n')
            script_file_id.write( '    END_DATETIME=`date --utc +%s`\n')
            script_file_id.write( '    FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n')
            script_file_id.write( '    calculate_duration\n')
            script_file_id.write( '    echo "$SEP"\n')
            script_file_id.write( '    echo "ERROR: $1 returned error $2"\n')
            script_file_id.write( '    echo "Script ended WRONG at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."\n')
            script_file_id.write( '    echo "$SEP"\n')
            script_file_id.write( '    send_mail wrong\n')
            script_file_id.write( '    touch $SCRIPT_STATUS_WRONG\n')
            script_file_id.write( '    exit 3\n')
            script_file_id.write( '}\n')
            script_file_id.write( '#-------------------------------------------------------------------------------\n')
            process_name = f'{xlib.get_express_name()} process'
            mail_message_ok = xlib.get_mail_message_ok(process_name, cluster_name)
            mail_message_wrong = xlib.get_mail_message_wrong(process_name, cluster_name)
            script_file_id.write( 'function send_mail\n')
            script_file_id.write( '{\n')
            script_file_id.write(f'    SUBJECT="{xlib.get_project_name()}: {process_name}"\n')
            script_file_id.write( '    if [ "$1" == "ok" ]; then\n')
            script_file_id.write(f'        MESSAGE="{mail_message_ok}"\n')
            script_file_id.write( '    elif [ "$1" == "wrong" ]; then\n')
            script_file_id.write(f'        MESSAGE="{mail_message_wrong}"\n')
            script_file_id.write( '    else\n')
            script_file_id.write( '        MESSAGE=""\n')
            script_file_id.write( '    fi\n')
            script_file_id.write( '    DESTINATION_FILE=mail-destination.json\n')
            script_file_id.write( '    echo "{" > $DESTINATION_FILE\n')
            script_file_id.write(f'    echo "    \\\"ToAddresses\\\": [\\\"{xconfiguration.get_contact_data()}\\\"]," >> $DESTINATION_FILE\n')
            script_file_id.write( '    echo "    \\\"CcAddresses\\\": []," >> $DESTINATION_FILE\n')
            script_file_id.write( '    echo "    \\\"BccAddresses\\\": []" >> $DESTINATION_FILE\n')
            script_file_id.write( '    echo "}" >> $DESTINATION_FILE\n')
            script_file_id.write( '    MESSAGE_FILE=mail-message.json\n')
            script_file_id.write( '    echo "{" > $MESSAGE_FILE\n')
            script_file_id.write( '    echo "    \\\"Subject\\\": {" >> $MESSAGE_FILE\n')
            script_file_id.write( '    echo "        \\\"Data\\\": \\\"$SUBJECT\\\"," >> $MESSAGE_FILE\n')
            script_file_id.write( '    echo "        \\\"Charset\\\": \\\"UTF-8\\\"" >> $MESSAGE_FILE\n')
            script_file_id.write( '    echo "    }," >> $MESSAGE_FILE\n')
            script_file_id.write( '    echo "    \\\"Body\\\": {" >> $MESSAGE_FILE\n')
            script_file_id.write( '    echo "        \\\"Html\\\": {" >> $MESSAGE_FILE\n')
            script_file_id.write( '    echo "            \\\"Data\\\": \\\"$MESSAGE\\\"," >> $MESSAGE_FILE\n')
            script_file_id.write( '    echo "            \\\"Charset\\\": \\\"UTF-8\\\"" >> $MESSAGE_FILE\n')
            script_file_id.write( '    echo "        }" >> $MESSAGE_FILE\n')
            script_file_id.write( '    echo "    }" >> $MESSAGE_FILE\n')
            script_file_id.write( '    echo "}" >> $MESSAGE_FILE\n')
            script_file_id.write(f'    aws ses send-email --from {xconfiguration.get_contact_data()} --destination file://$DESTINATION_FILE --message file://$MESSAGE_FILE\n')
            script_file_id.write( '}\n')
            script_file_id.write( '#-------------------------------------------------------------------------------\n')
            script_file_id.write( 'function calculate_duration\n')
            script_file_id.write( '{\n')
            script_file_id.write( '    DURATION=`expr $END_DATETIME - $INIT_DATETIME`\n')
            script_file_id.write( '    HH=`expr $DURATION / 3600`\n')
            script_file_id.write( '    MM=`expr $DURATION % 3600 / 60`\n')
            script_file_id.write( '    SS=`expr $DURATION % 60`\n')
            script_file_id.write( '    FORMATTED_DURATION=`printf "%03d:%02d:%02d\\n" $HH $MM $SS`\n')
            script_file_id.write( '}\n')
            script_file_id.write( '#-------------------------------------------------------------------------------\n')
            script_file_id.write( 'init\n')
            script_file_id.write( 'run_express_process\n')
            script_file_id.write( 'end\n')
    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append(f'*** ERROR: The file {get_express_process_script()} can not be created')
        OK = False

    # return the control variable and the error list
    return (OK, error_list)
def check_cd_hit_est_config_file(strict):
    '''
    check the CD-HIT-EST config file of a run.

    Returns (OK, error_list): the control variable and the list of error
    messages describing every invalid or missing key found.
    '''
    # NOTE(review): the parameter "strict" is not used in this body — confirm
    # whether it is needed or kept only for signature consistency.

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # intitialize variable used when value is not found
    not_found = '***NOTFOUND***'.upper()

    # get the option dictionary
    try:
        cd_hit_est_option_dict = xlib.get_option_dict(get_cd_hit_est_config_file())
    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append('*** ERROR: The option dictionary could not be built from the config file')
        OK = False
    else:

        # get the sections list
        sections_list = []
        for section in cd_hit_est_option_dict.keys():
            sections_list.append(section)
        sections_list.sort()

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append('*** ERROR: the section "identification" is not found.')
            OK = False
        else:

            # check section "identification" - key "experiment_id"
            experiment_id = cd_hit_est_option_dict.get('identification', {}).get('experiment_id', not_found)
            if experiment_id == not_found:
                error_list.append('*** ERROR: the key "experiment_id" is not found in the section "identification".')
                OK = False

            # check section "identification" - key "assembly_software"
            assembly_software = cd_hit_est_option_dict.get('identification', {}).get('assembly_software', not_found)
            if assembly_software == not_found:
                error_list.append('*** ERROR: the key "assembly_software" is not found in the section "identification".')
                OK = False
            elif not xlib.check_code(assembly_software, get_assembly_software_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "assembly_software" has to be {get_assembly_software_code_list_text()}.')
                OK = False

            # check section "identification" - key "assembly_dataset_id"
            assembly_dataset_id = cd_hit_est_option_dict.get('identification', {}).get('assembly_dataset_id', not_found)
            if assembly_dataset_id == not_found:
                error_list.append('*** ERROR: the key "assembly_dataset_id" is not found in the section "identification".')
                OK = False
            elif not xlib.check_startswith(assembly_dataset_id, get_assembly_software_code_list(), case_sensitive=True):
                error_list.append(f'*** ERROR: the key "assembly_dataset_id" has to start with {get_assembly_software_code_list_text()}.')
                OK = False

            # check section "identification" - key "assembly_type"
            # CONTIGS/SCAFFOLDS only make sense for SOAPdenovo-Trans datasets;
            # every other assembler requires NONE
            assembly_type = cd_hit_est_option_dict.get('identification', {}).get('assembly_type', not_found)
            if assembly_type == not_found:
                error_list.append('*** ERROR: the key "assembly_type" is not found in the section "identification".')
                OK = False
            elif assembly_dataset_id.startswith(xlib.get_soapdenovotrans_code()) and assembly_type.upper() not in ['CONTIGS', 'SCAFFOLDS'] or \
                not assembly_dataset_id.startswith(xlib.get_soapdenovotrans_code()) and assembly_type.upper() != 'NONE':
                error_list.append(f'*** ERROR: the key "assembly_type" has to be CONTIGS or SCAFFOLDS in {xlib.get_soapdenovotrans_name()} or NONE in any other case.')
                OK = False

        # check section "CD-HIT-EST parameters"
        if 'CD-HIT-EST parameters' not in sections_list:
            error_list.append('*** ERROR: the section "CD-HIT-EST parameters" is not found.')
            OK = False
        else:

            # check section "CD-HIT-EST parameters" - key "threads"
            threads = cd_hit_est_option_dict.get('CD-HIT-EST parameters', {}).get('threads', not_found)
            if threads == not_found:
                error_list.append('*** ERROR: the key "threads" is not found in the section "CD-HIT-EST parameters".')
                OK = False
            elif not xlib.check_int(threads, minimum=0):
                error_list.append('*** ERROR: the key "threads" has to be an integer number greater than or equal to 0.')
                OK = False

            # check section "CD-HIT-EST parameters" - key "memory_limit"
            memory_limit = cd_hit_est_option_dict.get('CD-HIT-EST parameters', {}).get('memory_limit', not_found)
            if memory_limit == not_found:
                error_list.append('*** ERROR: the key "memory_limit" is not found in the section "CD-HIT-EST parameters".')
                OK = False
            elif not xlib.check_int(memory_limit, minimum=0):
                error_list.append('*** ERROR: the key "memory_limit" has to be an integer number greater than or equal to 0.')
                OK = False

            # check section "CD-HIT-EST parameters" - key "seq_identity_threshold"
            seq_identity_threshold = cd_hit_est_option_dict.get('CD-HIT-EST parameters', {}).get('seq_identity_threshold', not_found)
            if seq_identity_threshold == not_found:
                error_list.append('*** ERROR: the key "seq_identity_threshold" is not found in the section "CD-HIT-EST parameters".')
                OK = False
            elif not xlib.check_float(seq_identity_threshold, minimum=0., maximum=1.):
                error_list.append('*** ERROR: the key "seq_identity_threshold" has to be a float number between 0.0 and 1.0.')
                OK = False

            # check section "CD-HIT-EST parameters" - key "word_length"
            word_length = cd_hit_est_option_dict.get('CD-HIT-EST parameters', {}).get('word_length', not_found)
            if word_length == not_found:
                error_list.append('*** ERROR: the key "word_length" is not found in the section "CD-HIT-EST parameters".')
                OK = False
            elif not xlib.check_int(word_length, minimum=1):
                error_list.append('*** ERROR: the key "word_length" has to be an integer number greater than or equal to 1.')
                OK = False

            # check section "CD-HIT-EST parameters" - key "mask"
            # (only presence is checked — the value itself is not validated)
            mask = cd_hit_est_option_dict.get('CD-HIT-EST parameters', {}).get('mask', not_found)
            if mask == not_found:
                error_list.append('*** ERROR: the key "mask" is not found in the section "CD-HIT-EST parameters".')
                OK = False

            # check section "CD-HIT-EST parameters" - key "match"
            match = cd_hit_est_option_dict.get('CD-HIT-EST parameters', {}).get('match', not_found)
            if match == not_found:
                error_list.append('*** ERROR: the key "match" is not found in the section "CD-HIT-EST parameters".')
                OK = False
            elif not xlib.check_int(match):
                error_list.append('*** ERROR: the key "match" has to be an integer number.')
                OK = False

            # check section "CD-HIT-EST parameters" - key "mismatch"
            mismatch = cd_hit_est_option_dict.get('CD-HIT-EST parameters', {}).get('mismatch', not_found)
            if mismatch == not_found:
                error_list.append('*** ERROR: the key "mismatch" is not found in the section "CD-HIT-EST parameters".')
                OK = False
            elif not xlib.check_int(mismatch):
                error_list.append('*** ERROR: the key "mismatch" has to be an integer number.')
                OK = False

            # check section "CD-HIT-EST parameters" - key "other_parameters"
            # parameters already emitted explicitly by the process script must
            # not be repeated here
            not_allowed_parameters_list = ['T', 'M', 'c', 'n', 'mask', 'match', 'mismatch']
            other_parameters = cd_hit_est_option_dict.get('CD-HIT-EST parameters', {}).get('other_parameters', not_found)
            if other_parameters == not_found:
                error_list.append('*** ERROR: the key "other_parameters" is not found in the section "CD-HIT-EST parameters".')
                OK = False
            elif other_parameters.upper() != 'NONE':
                # NOTE(review): this assignment overwrites the accumulated OK —
                # a passing parameter list could reset earlier failures; confirm.
                (OK, error_list2) = xlib.check_parameter_list(other_parameters, "other_parameters", not_allowed_parameters_list)
                error_list = error_list + error_list2

    # warn that the results config file is not valid if there are any errors
    if not OK:
        error_list.append(f'\nThe {xlib.get_cd_hit_est_name()} config file is not valid. Please, correct this file or recreate it.')

    # return the control variable and the error list
    return (OK, error_list)
def download_result_dataset(cluster_name, log, function=None):
    '''
    Download the result dataset of a run from the cluster.

    Parameters:
        cluster_name: the name of the cluster where files are downloaded from.
        log: the log writer (a xlib.DevStdOut instance or a dialog log object).
        function: an optional callable executed when the process ends.

    Returns:
        OK: the control variable (True if every step succeeded).
    '''

    # initialize the control variable
    OK = True

    # get the read transfer config file
    result_transfer_config_file = get_result_transfer_config_file()

    # warn that the log window must not be closed
    if not isinstance(log, xlib.DevStdOut):
        log.write('This process might take several minutes. Do not close this window, please wait!\n')

    # get and validate the result transfer config file
    log.write('{0}\n'.format(xlib.get_separator()))
    # bug fix: the previous message text was grammatically broken
    # ("is been validating")
    log.write('The result transfer config file is being validated ...\n')
    if validate_result_transfer_config_file(strict=True):
        log.write('The config file is OK.\n')
    else:
        log.write('*** ERROR: The result transfer config file is not valid.\n')
        log.write('Please correct this file or recreate the config files.\n')
        OK = False

    # create the SSH client connection
    if OK:
        (OK, error_list, ssh_client) = xssh.create_ssh_client_connection(cluster_name, 'master')
        for error in error_list:
            log.write('{0}\n'.format(error))

    # create the SSH transport connection
    if OK:
        (OK, error_list, ssh_transport) = xssh.create_ssh_transport_connection(cluster_name, 'master')
        for error in error_list:
            log.write('{0}\n'.format(error))

    # create the SFTP client
    if OK:
        sftp_client = xssh.create_sftp_client(ssh_transport)

    # get the options dictionary
    if OK:
        result_transfer_options_dict = xlib.get_option_dict(result_transfer_config_file)

    # download the result dataset
    if OK:

        # get the sections list
        sections_list = []
        for section in result_transfer_options_dict.keys():
            sections_list.append(section)
        sections_list.sort()

        # get the experiment identification, run identification and local directory from the section "identification"
        experiment_id = result_transfer_options_dict['identification']['experiment_id']
        result_dataset_id = result_transfer_options_dict['identification']['result_dataset_id']
        status = result_transfer_options_dict['identification']['status'].lower()
        local_dir = result_transfer_options_dict['identification']['local_dir']

        # download files when the status is uncompressed
        if status == 'uncompressed':

            # for each section "file-n"
            for section in sections_list:

                # verify than the section identification is like file-n
                if re.match('^file-[0-9]+$', section):

                    # get the dataset subdirectory and file name
                    dataset_subdirectory = result_transfer_options_dict[section]['dataset_subdirectory']
                    file_name = result_transfer_options_dict[section]['file_name']

                    # verify if the dataset subdirectory is created
                    pathlib.Path(os.path.normpath('{0}/{1}'.format(local_dir, dataset_subdirectory))).mkdir(parents=True, exist_ok=True)

                    # assign the cluster path and local path
                    cluster_path = '{0}/{1}/{2}/{3}/{4}'.format(xlib.get_cluster_result_dir(), experiment_id, result_dataset_id, dataset_subdirectory, file_name)
                    local_path = os.path.normpath('{0}/{1}/{2}'.format(local_dir, dataset_subdirectory, file_name))

                    # download the result file from the cluster
                    log.write('{0}\n'.format(xlib.get_separator()))
                    log.write('Downloading the file {0} to {1} ...\n'.format(cluster_path, local_dir))
                    (OK, error_list) = xssh.get_file(sftp_client, cluster_path, local_path)
                    if OK:
                        log.write('The file has been downloaded.\n')
                    else:
                        for error in error_list:
                            log.write('{0}\n'.format(error))
                        # stop downloading after the first failed transfer
                        break

        # download files when the status is compressed
        elif status == 'compressed':

            # assign the cluster path and local path
            cluster_path = '{0}/{1}/{2}'.format(xlib.get_cluster_result_dir(), experiment_id, result_dataset_id)
            local_path = '{0}/{1}'.format(local_dir, result_dataset_id)

            # download the result file from the cluster
            log.write('{0}\n'.format(xlib.get_separator()))
            log.write('Downloading the file {0} to {1} ...\n'.format(cluster_path, local_dir))
            (OK, error_list) = xssh.get_file(sftp_client, cluster_path, local_path)
            if OK:
                log.write('The file has been downloaded.\n')
            else:
                for error in error_list:
                    log.write('{0}\n'.format(error))

    # close the SSH transport connection
    # NOTE(review): connections are only closed on success — on failure they
    # leak; confirm whether this matches the behavior of sibling functions.
    if OK:
        xssh.close_ssh_transport_connection(ssh_transport)

    # close the SSH client connection
    if OK:
        xssh.close_ssh_client_connection(ssh_client)

    # warn that the log window can be closed
    if not isinstance(log, xlib.DevStdOut):
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('You can close this window now.\n')

    # execute final function
    if function is not None:
        function()

    # return the control variable
    return OK
def check_express_config_file(strict):
    '''
    Check the eXpress config file of a run.

    Returns (OK, error_list): the control variable and the list of error
    messages describing every invalid or missing key found.
    '''
    # NOTE(review): the parameter "strict" is not used in this body — confirm
    # whether it is needed or kept only for signature consistency.

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # intitialize variable used when value is not found
    not_found = '***NOTFOUND***'.upper()

    # get the option dictionary
    try:
        express_option_dict = xlib.get_option_dict(get_express_config_file())
    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append('*** ERROR: The option dictionary could not be built from the config file')
        OK = False
    else:

        # get the sections list
        sections_list = []
        for section in express_option_dict.keys():
            sections_list.append(section)
        sections_list.sort()

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append('*** ERROR: the section "identification" is not found.')
            OK = False
        else:

            # check section "identification" - key "experiment_id"
            experiment_id = express_option_dict.get('identification', {}).get('experiment_id', not_found)
            if experiment_id == not_found:
                error_list.append('*** ERROR: the key "experiment_id" is not found in the section "identification".')
                OK = False

            # check section "identification" - key "assembly_software"
            assembly_software = express_option_dict.get('identification', {}).get('assembly_software', not_found)
            if assembly_software == not_found:
                error_list.append('*** ERROR: the key "assembly_software" is not found in the section "identification".')
                OK = False
            elif not xlib.check_code(assembly_software, get_assembly_software_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "assembly_software" has to be {get_assembly_software_code_list_text()}.')
                OK = False

            # check section "identification" - key "assembly_dataset_id"
            assembly_dataset_id = express_option_dict.get('identification', {}).get('assembly_dataset_id', not_found)
            if assembly_dataset_id == not_found:
                error_list.append('*** ERROR: the key "assembly_dataset_id" is not found in the section "identification".')
                OK = False
            elif not xlib.check_startswith(assembly_dataset_id, get_assembly_software_code_list(), case_sensitive=True):
                error_list.append(f'*** ERROR: the key "assembly_dataset_id" has to start with {get_assembly_software_code_list_text()}.')
                OK = False

            # check section "identification" - key "assembly_type"
            # CONTIGS/SCAFFOLDS only make sense for SOAPdenovo-Trans datasets;
            # every other assembler requires NONE
            assembly_type = express_option_dict.get('identification', {}).get('assembly_type', not_found)
            if assembly_type == not_found:
                error_list.append('*** ERROR: the key "assembly_type" is not found in the section "identification".')
                OK = False
            elif assembly_dataset_id.startswith(xlib.get_soapdenovotrans_code()) and assembly_type.upper() not in ['CONTIGS', 'SCAFFOLDS'] or \
                not assembly_dataset_id.startswith(xlib.get_soapdenovotrans_code()) and assembly_type.upper() != 'NONE':
                error_list.append(f'*** ERROR: the key "assembly_type" has to be CONTIGS or SCAFFOLDS in {xlib.get_soapdenovotrans_name()} or NONE in any other case.')
                OK = False

        # check section "alignment-dataset-1"
        if 'alignment-dataset-1' not in sections_list:
            error_list.append('*** ERROR: the section "alignment-dataset-1" is not found.')
            OK = False

        # check all sections "alignment-dataset-n"
        for section in sections_list:

            if section not in ['identification', 'eXpress parameters']:

                # check than the section identification is like alignment-dataset-n
                if not re.match('^alignment-dataset-[0-9]+$', section):
                    error_list.append(f'*** ERROR: the section "{section}" has a wrong identification.')
                    OK = False

                else:

                    # check section "alignment-dataset-n" - key "alignment_software"
                    alignment_software = express_option_dict.get(section, {}).get('alignment_software', not_found)
                    if alignment_software == not_found:
                        error_list.append(f'*** ERROR: the key "alignment_software" is not found in the section "{section}".')
                        OK = False
                    elif not xlib.check_code(alignment_software, get_alignment_software_code_list(), case_sensitive=False):
                        error_list.append(f'*** ERROR: the key "alignment_software" has to be {get_alignment_software_code_list_text()}.')
                        OK = False

                    # check section "alignment-dataset-n" - key "alignment_dataset_id"
                    alignment_dataset_id = express_option_dict.get(section, {}).get('alignment_dataset_id', not_found)
                    if alignment_dataset_id == not_found:
                        error_list.append(f'*** ERROR: the key "alignment_dataset_id" is not found in the section "{section}".')
                        OK = False
                    elif not xlib.check_startswith(alignment_dataset_id, get_alignment_software_code_list(), case_sensitive=True):
                        error_list.append(f'*** ERROR: the key "alignment_dataset_id" has to start with {get_alignment_software_code_list_text()}.')
                        OK = False

        # check section "eXpress parameters"
        if 'eXpress parameters' not in sections_list:
            error_list.append('*** ERROR: the section "eXpress parameters" is not found.')
            OK = False
        else:

            # check section "express parameters" - key "frag-len-mean"
            frag_len_mean = express_option_dict.get('eXpress parameters', {}).get('frag-len-mean', not_found)
            if frag_len_mean == not_found:
                error_list.append('*** ERROR: the key "frag-len-mean" is not found in the section "eXpress parameters".')
                OK = False
            elif not xlib.check_int(frag_len_mean, minimum=1):
                error_list.append('*** ERROR: the key "frag-len-mean" has to be an integer number greater than or equal to 1.')
                OK = False

            # check section "express parameters" - key "frag-len-stddev"
            frag_len_stddev = express_option_dict.get('eXpress parameters', {}).get('frag-len-stddev', not_found)
            if frag_len_stddev == not_found:
                error_list.append('*** ERROR: the key "frag-len-stddev" is not found in the section "eXpress parameters".')
                OK = False
            elif not xlib.check_int(frag_len_stddev, minimum=1):
                error_list.append('*** ERROR: the key "frag-len-stddev" has to be an integer number greater than or equal to 1.')
                OK = False

            # check section "eXpress parameters" - key "library_type"
            library_type = express_option_dict.get('eXpress parameters', {}).get('library_type', not_found)
            if library_type == not_found:
                error_list.append('*** ERROR: the key "library_type" is not found in the section "eXpress parameters".')
                OK = False
            elif not xlib.check_code(library_type, get_library_type_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "library_type" has to be {get_library_type_code_list_text()}.')
                OK = False

            # check section "eXpress parameters" - key "max-indel-size"
            max_indel_size = express_option_dict.get('eXpress parameters', {}).get('max-indel-size', not_found)
            if max_indel_size == not_found:
                error_list.append('*** ERROR: the key "max-indel-size" is not found in the section "eXpress parameters".')
                OK = False
            elif not xlib.check_int(max_indel_size, minimum=0):
                error_list.append('*** ERROR: the key "max-indel-size" has to be an integer number greater than or equal to 0.')
                OK = False

            # check section "eXpress parameters" - key "no-bias-correct"
            no_bias_correct = express_option_dict.get('eXpress parameters', {}).get('no-bias-correct', not_found)
            if no_bias_correct == not_found:
                error_list.append('*** ERROR: the key "no-bias-correct" is not found in the section "eXpress parameters".')
                OK = False
            elif not xlib.check_code(no_bias_correct, get_no_bias_correct_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "no-bias-correct" has to be {get_no_bias_correct_code_list_text()}.')
                OK = False

            # check section "eXpress parameters" - key "no-error-model"
            no_error_model = express_option_dict.get('eXpress parameters', {}).get('no-error-model', not_found)
            if no_error_model == not_found:
                error_list.append('*** ERROR: the key "no-error-model" is not found in the section "eXpress parameters".')
                OK = False
            elif not xlib.check_code(no_error_model, get_no_error_model_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "no-error-model" has to be {get_no_error_model_code_list_text()}.')
                OK = False

            # check section "eXpress parameters" - key "other_parameters"
            # parameters already emitted explicitly by the process script must
            # not be repeated here
            not_allowed_parameters_list = ['no-update-check', 'frag-len-mean', 'frag-len-stddev', 'max-indel-size', 'fr-stranded', 'rf-stranded', 'f-stranded', 'r-stranded', 'no-bias-correct', 'no-error-model', 'output-dir']
            other_parameters = express_option_dict.get('eXpress parameters', {}).get('other_parameters', not_found)
            if other_parameters == not_found:
                error_list.append('*** ERROR: the key "other_parameters" is not found in the section "eXpress parameters".')
                OK = False
            elif other_parameters.upper() != 'NONE':
                # NOTE(review): this assignment overwrites the accumulated OK —
                # a passing parameter list could reset earlier failures; confirm.
                (OK, error_list2) = xlib.check_parameter_list(other_parameters, "other_parameters", not_allowed_parameters_list)
                error_list = error_list + error_list2

    # warn that the results config file is not valid if there are any errors
    if not OK:
        error_list.append(f'\nThe {xlib.get_express_name()} config file is not valid. Please, correct this file or recreate it.')

    # return the control variable and the error list
    return (OK, error_list)
def validate_result_transfer_config_file(strict):
    '''
    Validate the result transfer config file of a run.

    Parameters:
        strict: kept for interface compatibility (not used by the checks below).

    Returns:
        (OK, error_list): OK is True when the config file is valid; error_list
        holds one message per detected problem.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # intitialize variable used when value is not found
    not_found = '***NOTFOUND***'.upper()

    # initialize the status so the "file-n" checks below are safe even when
    # the section "identification" is missing (the original code raised a
    # NameError in that case)
    status = 'WRONG'

    # get the result transfer config file
    result_transfer_config_file = get_result_transfer_config_file()

    # get the options dictionary
    try:
        result_transfer_options_dict = xlib.get_option_dict(result_transfer_config_file)
    except:
        error_list.append('*** ERROR: The syntax is WRONG.')
        OK = False
    else:

        # get the sections list
        sections_list = []
        for section in result_transfer_options_dict.keys():
            sections_list.append(section)
        sections_list.sort()

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append('*** ERROR: the section "identification" is not found.')
            OK = False
        else:

            # check section "identification" - key "experiment_id"
            experiment_id = result_transfer_options_dict.get('identification', {}).get('experiment_id', not_found)
            if experiment_id == not_found:
                error_list.append('*** ERROR: the key "experiment_id" is not found in the section "identification".')
                OK = False
            elif not experiment_id.isidentifier():
                error_list.append('*** ERROR: the key "experiment_id" value in the section "identification" has some non-alphanumeric characters')
                OK = False

            # check section "identification" - key "result_dataset_id"
            result_dataset_id = result_transfer_options_dict.get('identification', {}).get('result_dataset_id', not_found)
            if result_dataset_id == not_found:
                error_list.append('*** ERROR: the key "result_dataset_id" is not found in the section "identification".')
                OK = False

            # check section "identification" - key "status"
            # (compare against the sentinel BEFORE lowercasing: the original
            # code lowercased first, so the missing-key test could never fire)
            status = result_transfer_options_dict.get('identification', {}).get('status', not_found)
            if status == not_found:
                error_list.append('*** ERROR: the key "status" is not found in the section "identification".')
                OK = False
                status = 'WRONG'
            else:
                status = status.lower()
                if status not in ['compressed', 'uncompressed']:
                    error_list.append('*** ERROR: the key "status" value in the section "identification" must be uncompressed or compressed.')
                    OK = False
                    # mark the status as invalid so the "file-n" checks are
                    # skipped (the original "status == 'WRONG'" was a no-op
                    # comparison, not an assignment)
                    status = 'WRONG'

            # check section "identification" - key "local_dir"
            local_dir = result_transfer_options_dict.get('identification', {}).get('local_dir', not_found)
            if local_dir == not_found:
                error_list.append('*** ERROR: the key "local_dir" is not found in the section "identification".')
                OK = False
            elif not xlib.existing_dir(local_dir):
                error_list.append('*** ERROR: the key "local_dir" value in the section "identification" is a non existing directory path.')
                OK = False

        # check section "file-1"
        if status == 'uncompressed':
            if 'file-1' not in sections_list:
                error_list.append('*** ERROR: the section "file-1" is not found.')
                OK = False

        # check all sections "file-n"
        if status == 'uncompressed':
            for section in sections_list:
                if section not in ['identification']:

                    # verify that the section identification is like file-n
                    if not re.match('^file-[0-9]+$', section):
                        error_list.append('*** ERROR: the section "{0}" has a wrong identification.'.format(section))
                        OK = False

                    else:

                        # check section "file-n" - key "dataset_subdirectory"
                        dataset_subdirectory = result_transfer_options_dict.get(section, {}).get('dataset_subdirectory', not_found)
                        if dataset_subdirectory == not_found:
                            error_list.append('*** ERROR: the key "dataset_subdirectory" is not found in the section "{0}".'.format(section))
                            OK = False
                        elif not xlib.is_valid_path(dataset_subdirectory, 'linux'):
                            error_list.append('*** ERROR: the file {0} in the key "dataset_subdirectory" of the section "{1}" has a non valid file name.'.format(dataset_subdirectory, section))
                            OK = False

                        # check section "file-n" - key "file_name"
                        file_name = result_transfer_options_dict.get(section, {}).get('file_name', not_found)
                        if file_name == not_found:
                            error_list.append('*** ERROR: the key "file_name" is not found in the section "{0}".'.format(section))
                            OK = False
                        elif not xlib.is_valid_path(file_name, 'linux'):
                            error_list.append('*** ERROR: the file {0} in the key "file_name" of the section "{1}" has a non valid file name.'.format(file_name, section))
                            OK = False

    # warn that the results config file is not valid if there are any errors
    if not OK:
        error_list.append('\nThe result transfer config file is not valid. Please, correct this file or recreate it.')

    # return the control variable and the error list
    return (OK, error_list)
def build_quast_process_script(cluster_name, current_run_dir):
    '''
    Build the current QUAST process script.

    Parameters:
        cluster_name: name of the cluster where the script will run.
        current_run_dir: cluster directory of the current run.

    Returns:
        (OK, error_list): OK is True when the script file is created.
    '''

    # control variable and error accumulator returned to the caller
    OK = True
    error_list = []

    # read the QUAST option dictionary
    quast_option_dict = xlib.get_option_dict(get_quast_config_file())

    # extract every option the script depends on
    experiment_id = quast_option_dict['identification']['experiment_id']
    reference_dataset_id = quast_option_dict['identification']['reference_dataset_id']
    reference_file = quast_option_dict['identification']['reference_file']
    assembly_software = quast_option_dict['identification']['assembly_software']
    assembly_dataset_id = quast_option_dict['identification']['assembly_dataset_id']
    assembly_type = quast_option_dict['identification']['assembly_type']
    threads = quast_option_dict['QUAST parameters']['threads']

    # resolve the reference file path when a reference dataset is given
    if reference_dataset_id.upper() != 'NONE':
        reference_file_path = xlib.get_cluster_reference_file(reference_dataset_id, reference_file)

    # resolve the transcriptome file path from the assembly software
    if assembly_software == xlib.get_soapdenovotrans_code():
        if assembly_type.upper() == 'CONTIGS':
            transcriptome_file = f'{xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id)}/{experiment_id}-{assembly_dataset_id}.contig'
        elif assembly_type.upper() == 'SCAFFOLDS':
            transcriptome_file = f'{xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id)}/{experiment_id}-{assembly_dataset_id}.scafSeq'
    elif assembly_software == xlib.get_transabyss_code():
        transcriptome_file = f'{xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id)}/transabyss-final.fa'
    elif assembly_software == xlib.get_trinity_code():
        transcriptome_file = f'{xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id)}/Trinity.fasta'
    elif assembly_software == xlib.get_star_code():
        transcriptome_file = f'{xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id)}/Trinity-GG.fasta'
    elif assembly_software == xlib.get_cd_hit_est_code():
        transcriptome_file = f'{xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id)}/clustered-transcriptome.fasta'
    elif assembly_software == xlib.get_transcript_filter_code():
        transcriptome_file = f'{xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id)}/filtered-transcriptome.fasta'

    # get the QUAST process script name
    quast_process_script = get_quast_process_script()

    # write the QUAST process script
    try:
        if not os.path.exists(os.path.dirname(quast_process_script)):
            os.makedirs(os.path.dirname(quast_process_script))
        with open(quast_process_script, mode='w', encoding='utf8', newline='\n') as script_id:
            # environment setup
            script_id.write('#!/bin/bash\n')
            script_id.write('#-------------------------------------------------------------------------------\n')
            script_id.write(f'QUAST_PATH={xlib.get_cluster_app_dir()}/{xlib.get_miniconda3_name()}/envs/{xlib.get_quast_bioconda_code()}/bin\n')
            script_id.write('PATH=$QUAST_PATH:$PATH\n')
            script_id.write('SEP="#########################################"\n')
            script_id.write(f'cd {xlib.get_cluster_app_dir()}/{xlib.get_miniconda3_name()}/bin\n')
            script_id.write(f'source activate {xlib.get_quast_bioconda_code()}\n')
            # init: record the start time and announce the run
            script_id.write('#-------------------------------------------------------------------------------\n')
            script_id.write('function init\n')
            script_id.write('{\n')
            script_id.write(' INIT_DATETIME=`date --utc +%s`\n')
            script_id.write(' FORMATTED_INIT_DATETIME=`date --date="@$INIT_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n')
            script_id.write(' echo "$SEP"\n')
            script_id.write(f' echo "Script started in node $HOSTNAME of cluster {cluster_name} at $FORMATTED_INIT_DATETIME UTC."\n')
            script_id.write('}\n')
            # run_quast_process: the actual QUAST invocation under /usr/bin/time
            script_id.write('#-------------------------------------------------------------------------------\n')
            script_id.write('function run_quast_process\n')
            script_id.write('{\n')
            script_id.write(f' cd {current_run_dir}\n')
            script_id.write(' echo "$SEP"\n')
            script_id.write(' quast.py --version\n')
            script_id.write(' echo "$SEP"\n')
            script_id.write(' /usr/bin/time \\\n')
            script_id.write(' --format="$SEP\\nElapsed real time (s): %e\\nCPU time in kernel mode (s): %S\\nCPU time in user mode (s): %U\\nPercentage of CPU: %P\\nMaximum resident set size(Kb): %M\\nAverage total memory use (Kb):%K" \\\n')
            script_id.write(' quast.py \\\n')
            script_id.write(f' --threads {threads} \\\n')
            script_id.write(f' --output-dir {current_run_dir} \\\n')
            if reference_dataset_id.upper() != 'NONE':
                script_id.write(f' -R {reference_file_path} \\\n')
            if assembly_type.upper() == 'SCAFFOLDS':
                script_id.write(' --scaffolds \\\n')
            script_id.write(f' {transcriptome_file}\n')
            script_id.write(' RC=$?\n')
            script_id.write(' if [ $RC -ne 0 ]; then manage_error quast.py $RC; fi\n')
            script_id.write('}\n')
            # end: success epilogue, mail notification and exit 0
            script_id.write('#-------------------------------------------------------------------------------\n')
            script_id.write('function end\n')
            script_id.write('{\n')
            script_id.write(' END_DATETIME=`date --utc +%s`\n')
            script_id.write(' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n')
            script_id.write(' calculate_duration\n')
            script_id.write(' echo "$SEP"\n')
            script_id.write(' echo "Script ended OK at $FORMATTED_END_DATETIME UTC with a run duration of $DURATION s ($FORMATTED_DURATION)."\n')
            script_id.write(' echo "$SEP"\n')
            script_id.write(f' RECIPIENT={xconfiguration.get_contact_data()}\n')
            script_id.write(f' SUBJECT="{xlib.get_project_name()}: {xlib.get_quast_name()} process"\n')
            script_id.write(f' MESSAGE="The {xlib.get_quast_name()} process in node $HOSTNAME of cluster {cluster_name} ended OK at $FORMATTED_END_DATETIME with a run duration of $DURATION s ($FORMATTED_DURATION). Please review its log.<br/><br/>Regards,<br/>GI Genetica, Fisiologia e Historia Forestal<br/>Dpto. Sistemas y Recursos Naturales<br/>ETSI Montes, Forestal y del Medio Natural<br/>Universidad Politecnica de Madrid<br/>https://github.com/ggfhf/"\n')
            script_id.write(' mail --append "Content-type: text/html;" --subject "$SUBJECT" "$RECIPIENT" <<< "$MESSAGE"\n')
            script_id.write(' exit 0\n')
            script_id.write('}\n')
            # manage_error: failure epilogue, mail notification and exit 3
            script_id.write('#-------------------------------------------------------------------------------\n')
            script_id.write('function manage_error\n')
            script_id.write('{\n')
            script_id.write(' END_DATETIME=`date --utc +%s`\n')
            script_id.write(' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n')
            script_id.write(' calculate_duration\n')
            script_id.write(' echo "$SEP"\n')
            script_id.write(' echo "ERROR: $1 returned error $2"\n')
            script_id.write(' echo "Script ended WRONG at $FORMATTED_END_DATETIME UTC with a run duration of $DURATION s ($FORMATTED_DURATION)."\n')
            script_id.write(' echo "$SEP"\n')
            script_id.write(f' RECIPIENT={xconfiguration.get_contact_data()}\n')
            script_id.write(f' SUBJECT="{xlib.get_project_name()}: {xlib.get_quast_name()} process"\n')
            script_id.write(f' MESSAGE="The {xlib.get_quast_name()} process in node $HOSTNAME of cluster {cluster_name} ended WRONG at $FORMATTED_END_DATETIME with a run duration of $DURATION s ($FORMATTED_DURATION). Please review its log.<br/><br/>Regards,<br/>GI Genetica, Fisiologia e Historia Forestal<br/>Dpto. Sistemas y Recursos Naturales<br/>ETSI Montes, Forestal y del Medio Natural<br/>Universidad Politecnica de Madrid<br/>https://github.com/ggfhf/"\n')
            script_id.write(' mail --append "Content-type: text/html;" --subject "$SUBJECT" "$RECIPIENT" <<< "$MESSAGE"\n')
            script_id.write(' exit 3\n')
            script_id.write('}\n')
            # calculate_duration: derive $DURATION and $FORMATTED_DURATION
            script_id.write('#-------------------------------------------------------------------------------\n')
            script_id.write('function calculate_duration\n')
            script_id.write('{\n')
            script_id.write(' DURATION=`expr $END_DATETIME - $INIT_DATETIME`\n')
            script_id.write(' HH=`expr $DURATION / 3600`\n')
            script_id.write(' MM=`expr $DURATION % 3600 / 60`\n')
            script_id.write(' SS=`expr $DURATION % 60`\n')
            script_id.write(' FORMATTED_DURATION=`printf "%03d:%02d:%02d\\n" $HH $MM $SS`\n')
            script_id.write('}\n')
            # top-level call sequence
            script_id.write('#-------------------------------------------------------------------------------\n')
            script_id.write('init\n')
            script_id.write('run_quast_process\n')
            script_id.write('end\n')
    except:
        error_list.append(f'*** ERROR: The file {quast_process_script} can not be created')
        OK = False

    # return the control variable and the error list
    return (OK, error_list)
def validate_quast_config_file(strict):
    '''
    Validate the QUAST config file of a run.

    Parameters:
        strict: kept for interface compatibility (not used by the checks below).

    Returns:
        (OK, error_list): OK is True when the config file is valid; error_list
        holds one message per detected problem.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # intitialize variable used when value is not found
    not_found = '***NOTFOUND***'.upper()

    # get the option dictionary
    try:
        quast_option_dict = xlib.get_option_dict(get_quast_config_file())
    except:
        error_list.append('*** ERROR: The syntax is WRONG.')
        OK = False
    else:

        # get the sections list
        sections_list = []
        for section in quast_option_dict.keys():
            sections_list.append(section)
        sections_list.sort()

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append('*** ERROR: the section "identification" is not found.')
            OK = False
        else:

            # check section "identification" - key "experiment_id"
            experiment_id = quast_option_dict.get('identification', {}).get('experiment_id', not_found)
            if experiment_id == not_found:
                error_list.append('*** ERROR: the key "experiment_id" is not found in the section "identification".')
                OK = False

            # check section "identification" - key "reference_dataset_id"
            reference_dataset_id = quast_option_dict.get('identification', {}).get('reference_dataset_id', not_found)
            if reference_dataset_id == not_found:
                error_list.append('*** ERROR: the key "reference_dataset_id" is not found in the section "identification".')
                OK = False

            # check section "identification" - key "reference_file"
            reference_file = quast_option_dict.get('identification', {}).get('reference_file', not_found)
            if reference_file == not_found:
                error_list.append('*** ERROR: the key "reference_file" is not found in the section "identification".')
                OK = False

            # check section "identification" - key "assembly_software"
            assembly_software = quast_option_dict.get('identification', {}).get('assembly_software', not_found)
            if assembly_software == not_found:
                error_list.append('*** ERROR: the key "assembly_software" is not found in the section "identification".')
                OK = False
            elif assembly_software not in [xlib.get_soapdenovotrans_code(), xlib.get_transabyss_code(), xlib.get_trinity_code(), xlib.get_star_code(), xlib.get_cd_hit_est_code(), xlib.get_transcript_filter_code()]:
                # message fixed: the original capitalized the last "or"
                error_list.append('*** ERROR: the key "assembly_software" value in the section "identification" must be {0} or {1} or {2} or {3} or {4} or {5}.'.format(xlib.get_soapdenovotrans_code(), xlib.get_transabyss_code(), xlib.get_trinity_code(), xlib.get_star_code(), xlib.get_cd_hit_est_code(), xlib.get_transcript_filter_code()))
                OK = False

            # check section "identification" - key "assembly_dataset_id"
            assembly_dataset_id = quast_option_dict.get('identification', {}).get('assembly_dataset_id', not_found)
            if assembly_dataset_id == not_found:
                error_list.append('*** ERROR: the key "assembly_dataset_id" is not found in the section "identification".')
                OK = False
            elif not assembly_dataset_id.startswith(xlib.get_soapdenovotrans_code()) and not assembly_dataset_id.startswith(xlib.get_transabyss_code()) and not assembly_dataset_id.startswith(xlib.get_trinity_code()) and not assembly_dataset_id.startswith(xlib.get_star_code()) and not assembly_dataset_id.startswith(xlib.get_cd_hit_est_code()) and not assembly_dataset_id.startswith(xlib.get_transcript_filter_code()):
                # message fixed: the original injected the transcript filter
                # *code* into a list of software *names*
                error_list.append('*** ERROR: the key "assembly_dataset_id" value is not a {0} nor {1} nor {2} nor {3} nor {4} nor {5} assembly.'.format(xlib.get_soapdenovotrans_name(), xlib.get_transabyss_name(), xlib.get_trinity_name(), xlib.get_star_name(), xlib.get_cd_hit_est_name(), xlib.get_transcript_filter_name()))
                OK = False

            # check section "identification" - key "assembly_type"
            assembly_type = quast_option_dict.get('identification', {}).get('assembly_type', not_found)
            if assembly_type == not_found:
                error_list.append('*** ERROR: the key "assembly_type" is not found in the section "identification".')
                OK = False
            elif assembly_dataset_id.startswith(xlib.get_soapdenovotrans_code()):
                if assembly_type.upper() not in ['CONTIGS', 'SCAFFOLDS']:
                    error_list.append('*** ERROR: the key "assembly_type" must be "CONTIGS" or "SCAFFOLDS" when {0} is the assembly software.'.format(xlib.get_soapdenovotrans_name()))
                    OK = False
            elif assembly_dataset_id.startswith(xlib.get_transabyss_code()) or assembly_dataset_id.startswith(xlib.get_trinity_code()) or assembly_dataset_id.startswith(xlib.get_star_code()) or assembly_dataset_id.startswith(xlib.get_cd_hit_est_code()) or assembly_dataset_id.startswith(xlib.get_transcript_filter_code()):
                if assembly_type.upper() != 'NONE':
                    error_list.append('*** ERROR: the key "assembly_type" must be "NONE" when {0} or {1} or {2} or {3} or {4} is the assembly software.'.format(xlib.get_transabyss_name(), xlib.get_trinity_name(), xlib.get_star_name(), xlib.get_cd_hit_est_name(), xlib.get_transcript_filter_name()))
                    OK = False

        # check section "QUAST parameters"
        if 'QUAST parameters' not in sections_list:
            error_list.append('*** ERROR: the section "QUAST parameters" is not found.')
            OK = False
        else:

            # check section "QUAST parameters" - key "threads"
            # (xlib.check_int is used here for consistency with the other
            # validators in this module instead of a hand-rolled try/except)
            threads = quast_option_dict.get('QUAST parameters', {}).get('threads', not_found)
            if threads == not_found:
                error_list.append('*** ERROR: the key "threads" is not found in the section "QUAST parameters".')
                OK = False
            elif not xlib.check_int(threads, minimum=1):
                error_list.append('*** ERROR: the key "threads" in the section "QUAST parameters" must be an integer value greater or equal to 1.')
                OK = False

    # warn that the results config file is not valid if there are any errors
    if not OK:
        error_list.append('\nThe {0} config file is not valid. Please, correct this file or recreate it.'.format(xlib.get_quast_name()))

    # return the control variable and the error list
    return (OK, error_list)
def build_busco_process_script(cluster_name, current_run_dir):
    '''
    Build the current BUSCO process script.

    Parameters:
        cluster_name: name of the cluster where the script will run.
        current_run_dir: cluster directory of the current run.

    Returns:
        (OK, error_list): OK is True when the script file is created;
        error_list holds one message per detected problem.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # get the BUSCO option dictionary
    busco_option_dict = xlib.get_option_dict(get_busco_config_file())

    # get the options
    experiment_id = busco_option_dict['identification']['experiment_id']
    assembly_software = busco_option_dict['identification']['assembly_software']
    assembly_dataset_id = busco_option_dict['identification']['assembly_dataset_id']
    assembly_type = busco_option_dict['identification']['assembly_type']
    ncpu = busco_option_dict['BUSCO parameters']['ncpu']
    lineage_data_url = busco_option_dict['BUSCO parameters']['lineage_data_url']
    mode = busco_option_dict['BUSCO parameters']['mode'].lower()
    evalue = busco_option_dict['BUSCO parameters']['evalue']
    limit = busco_option_dict['BUSCO parameters']['limit']
    species = busco_option_dict['BUSCO parameters']['species']
    long_option = busco_option_dict['BUSCO parameters']['long'].upper()
    # keep the raw option string: it is passed verbatim to BUSCO, so the
    # previous .upper() here corrupted case-sensitive Augustus options;
    # only the comparison against 'NONE' below is case-insensitive
    augustus_options = busco_option_dict['BUSCO parameters']['augustus_options']

    # get the file name and the dataset name from the lineage data url
    lineage_data_file = lineage_data_url.split('/')[-1]
    # the dataset name is the file name up to its first dot
    # (e.g. "name_odb10.2019-11-20.tar.gz" -> "name_odb10")
    point_pos = lineage_data_file.find('.')
    lineage_data = lineage_data_file[:point_pos]

    # set the transcriptome file path
    # (assembly_type is compared case-insensitively, consistently with the
    # other process script builders in this module)
    if assembly_software == xlib.get_soapdenovotrans_code():
        if assembly_type.upper() == 'CONTIGS':
            transcriptome_file = f'{xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id)}/{experiment_id}-{assembly_dataset_id}.contig'
        elif assembly_type.upper() == 'SCAFFOLDS':
            transcriptome_file = f'{xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id)}/{experiment_id}-{assembly_dataset_id}.scafSeq'
    elif assembly_software == xlib.get_transabyss_code():
        transcriptome_file = f'{xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id)}/transabyss-final.fa'
    elif assembly_software == xlib.get_trinity_code():
        transcriptome_file = f'{xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id)}/Trinity.fasta'
    elif assembly_software == xlib.get_ggtrinity_code():
        transcriptome_file = f'{xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id)}/Trinity-GG.fasta'
    elif assembly_software == xlib.get_cd_hit_est_code():
        transcriptome_file = f'{xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id)}/clustered-transcriptome.fasta'
    elif assembly_software == xlib.get_transcript_filter_code():
        transcriptome_file = f'{xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id)}/filtered-transcriptome.fasta'

    # get the BUSCO process script name
    busco_process_script = get_busco_process_script()

    # write the BUSCO process script
    try:
        if not os.path.exists(os.path.dirname(busco_process_script)):
            os.makedirs(os.path.dirname(busco_process_script))
        with open(busco_process_script, mode='w', encoding='iso-8859-1', newline='\n') as script_file_id:
            # environment setup (AWS metadata and Miniconda on the PATH)
            script_file_id.write('#!/bin/bash\n')
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write('SEP="#########################################"\n')
            script_file_id.write('export HOST_IP=`curl --silent checkip.amazonaws.com`\n')
            script_file_id.write('export HOST_ADDRESS="ec2-${HOST_IP//./-}-compute-1.amazonaws.com"\n')
            script_file_id.write('export AWS_CONFIG_FILE=/home/ubuntu/.aws/config\n')
            script_file_id.write('export AWS_SHARED_CREDENTIALS_FILE=/home/ubuntu/.aws/credentials\n')
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write(f'MINICONDA3_BIN_PATH={xlib.get_cluster_app_dir()}/{xlib.get_miniconda3_name()}/bin\n')
            script_file_id.write(f'export PATH=$MINICONDA3_BIN_PATH:$PATH\n')
            # status files signalling the run outcome to the platform
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write(f'STATUS_DIR={xlib.get_status_dir(current_run_dir)}\n')
            script_file_id.write(f'SCRIPT_STATUS_OK={xlib.get_status_ok(current_run_dir)}\n')
            script_file_id.write(f'SCRIPT_STATUS_WRONG={xlib.get_status_wrong(current_run_dir)}\n')
            script_file_id.write('mkdir --parents $STATUS_DIR\n')
            script_file_id.write('if [ -f $SCRIPT_STATUS_OK ]; then rm $SCRIPT_STATUS_OK; fi\n')
            script_file_id.write('if [ -f $SCRIPT_STATUS_WRONG ]; then rm $SCRIPT_STATUS_WRONG; fi\n')
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write(f'CURRENT_DIR={current_run_dir}\n')
            # init: record the start time and announce the run
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write('function init\n')
            script_file_id.write('{\n')
            script_file_id.write(' INIT_DATETIME=`date --utc +%s`\n')
            script_file_id.write(' FORMATTED_INIT_DATETIME=`date --date="@$INIT_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n')
            script_file_id.write(' echo "$SEP"\n')
            script_file_id.write(' echo "Script started at $FORMATTED_INIT_DATETIME+00:00."\n')
            script_file_id.write(' echo "$SEP"\n')
            script_file_id.write(f' echo "CLUSTER: {cluster_name}"\n')
            script_file_id.write(' echo "HOST NAME: $HOSTNAME"\n')
            script_file_id.write(' echo "HOST IP: $HOST_IP"\n')
            script_file_id.write(' echo "HOST ADDRESS: $HOST_ADDRESS"\n')
            script_file_id.write('}\n')
            # download_lineage_data: fetch and unpack the lineage dataset
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write('function download_lineage_data\n')
            script_file_id.write('{\n')
            script_file_id.write(' cd $CURRENT_DIR\n')
            script_file_id.write(' echo "$SEP"\n')
            script_file_id.write(' echo "Downloading lineage data ..."\n')
            download_script = f'import requests; r = requests.get(\'{lineage_data_url}\') ; open(\'{lineage_data_file}\' , \'wb\').write(r.content)'
            script_file_id.write(f' $MINICONDA3_BIN_PATH/python3 -c "{download_script}"\n')
            script_file_id.write(' RC=$?\n')
            script_file_id.write(' if [ $RC -ne 0 ]; then manage_error download_script $RC; fi\n')
            script_file_id.write(f' tar -xzvf ./{lineage_data_file}\n')
            script_file_id.write(' RC=$?\n')
            script_file_id.write(' if [ $RC -ne 0 ]; then manage_error tar $RC; fi\n')
            script_file_id.write(f' rm ./{lineage_data_file}\n')
            script_file_id.write(' RC=$?\n')
            script_file_id.write(' if [ $RC -ne 0 ]; then manage_error rm $RC; fi\n')
            script_file_id.write(' echo "Lineage data are downloaded."\n')
            script_file_id.write('}\n')
            # run_busco_process: the actual BUSCO invocation under /usr/bin/time
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write('function run_busco_process\n')
            script_file_id.write('{\n')
            script_file_id.write(f' source activate {xlib.get_busco_anaconda_code()}\n')
            script_file_id.write(' cd $CURRENT_DIR\n')
            script_file_id.write(' echo "$SEP"\n')
            script_file_id.write(' echo "Assessing the transcriptome quality ..."\n')
            script_file_id.write(' /usr/bin/time \\\n')
            script_file_id.write(f' --format="{xlib.get_time_output_format(separator=False)}" \\\n')
            script_file_id.write(' busco \\\n')
            script_file_id.write(f' --cpu={ncpu} \\\n')
            script_file_id.write(f' --lineage_dataset=./{lineage_data} \\\n')
            script_file_id.write(f' --mode={mode} \\\n')
            script_file_id.write(f' --evalue={evalue} \\\n')
            script_file_id.write(f' --limit={limit} \\\n')
            if species.upper() != 'NONE':
                script_file_id.write(f' --species={species} \\\n')
            if long_option == 'YES':
                script_file_id.write(' --long \\\n')
            if augustus_options.upper() != 'NONE':
                # NOTE(review): "--august_options" does not match the BUSCO
                # documented flag ("--augustus_parameters"); confirm against
                # the BUSCO version installed in the conda environment
                script_file_id.write(f' --august_options="{augustus_options}" \\\n')
            script_file_id.write(f' --in={transcriptome_file} \\\n')
            script_file_id.write(f' --out={os.path.basename(current_run_dir)}\n')
            script_file_id.write(' RC=$?\n')
            script_file_id.write(' if [ $RC -ne 0 ]; then manage_error run_BUSCO.py $RC; fi\n')
            script_file_id.write(' echo "The assessment is done."\n')
            script_file_id.write(' conda deactivate\n')
            script_file_id.write('}\n')
            # end: success epilogue, mail notification and exit 0
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write('function end\n')
            script_file_id.write('{\n')
            script_file_id.write(' END_DATETIME=`date --utc +%s`\n')
            script_file_id.write(' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n')
            script_file_id.write(' calculate_duration\n')
            script_file_id.write(' echo "$SEP"\n')
            script_file_id.write(' echo "Script ended OK at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."\n')
            script_file_id.write(' echo "$SEP"\n')
            script_file_id.write(' send_mail ok\n')
            script_file_id.write(' touch $SCRIPT_STATUS_OK\n')
            script_file_id.write(' exit 0\n')
            script_file_id.write('}\n')
            # manage_error: failure epilogue, mail notification and exit 3
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write('function manage_error\n')
            script_file_id.write('{\n')
            script_file_id.write(' END_DATETIME=`date --utc +%s`\n')
            script_file_id.write(' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n')
            script_file_id.write(' calculate_duration\n')
            script_file_id.write(' echo "$SEP"\n')
            script_file_id.write(' echo "ERROR: $1 returned error $2"\n')
            script_file_id.write(' echo "Script ended WRONG at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."\n')
            script_file_id.write(' echo "$SEP"\n')
            script_file_id.write(' send_mail wrong\n')
            script_file_id.write(' touch $SCRIPT_STATUS_WRONG\n')
            script_file_id.write(' exit 3\n')
            script_file_id.write('}\n')
            # send_mail: build the SES destination/message JSON files and send
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            process_name = f'{xlib.get_busco_name()} process'
            mail_message_ok = xlib.get_mail_message_ok(process_name, cluster_name)
            mail_message_wrong = xlib.get_mail_message_wrong(process_name, cluster_name)
            script_file_id.write('function send_mail\n')
            script_file_id.write('{\n')
            script_file_id.write(f' SUBJECT="{xlib.get_project_name()}: {process_name}"\n')
            script_file_id.write(' if [ "$1" == "ok" ]; then\n')
            script_file_id.write(f' MESSAGE="{mail_message_ok}"\n')
            script_file_id.write(' elif [ "$1" == "wrong" ]; then\n')
            script_file_id.write(f' MESSAGE="{mail_message_wrong}"\n')
            script_file_id.write(' else\n')
            script_file_id.write(' MESSAGE=""\n')
            script_file_id.write(' fi\n')
            script_file_id.write(' DESTINATION_FILE=mail-destination.json\n')
            script_file_id.write(' echo "{" > $DESTINATION_FILE\n')
            script_file_id.write(f' echo " \\\"ToAddresses\\\": [\\\"{xconfiguration.get_contact_data()}\\\"]," >> $DESTINATION_FILE\n')
            script_file_id.write(' echo " \\\"CcAddresses\\\": []," >> $DESTINATION_FILE\n')
            script_file_id.write(' echo " \\\"BccAddresses\\\": []" >> $DESTINATION_FILE\n')
            script_file_id.write(' echo "}" >> $DESTINATION_FILE\n')
            script_file_id.write(' MESSAGE_FILE=mail-message.json\n')
            script_file_id.write(' echo "{" > $MESSAGE_FILE\n')
            script_file_id.write(' echo " \\\"Subject\\\": {" >> $MESSAGE_FILE\n')
            script_file_id.write(' echo " \\\"Data\\\": \\\"$SUBJECT\\\"," >> $MESSAGE_FILE\n')
            script_file_id.write(' echo " \\\"Charset\\\": \\\"UTF-8\\\"" >> $MESSAGE_FILE\n')
            script_file_id.write(' echo " }," >> $MESSAGE_FILE\n')
            script_file_id.write(' echo " \\\"Body\\\": {" >> $MESSAGE_FILE\n')
            script_file_id.write(' echo " \\\"Html\\\": {" >> $MESSAGE_FILE\n')
            script_file_id.write(' echo " \\\"Data\\\": \\\"$MESSAGE\\\"," >> $MESSAGE_FILE\n')
            script_file_id.write(' echo " \\\"Charset\\\": \\\"UTF-8\\\"" >> $MESSAGE_FILE\n')
            script_file_id.write(' echo " }" >> $MESSAGE_FILE\n')
            script_file_id.write(' echo " }" >> $MESSAGE_FILE\n')
            script_file_id.write(' echo "}" >> $MESSAGE_FILE\n')
            script_file_id.write(f' aws ses send-email --from {xconfiguration.get_contact_data()} --destination file://$DESTINATION_FILE --message file://$MESSAGE_FILE\n')
            script_file_id.write('}\n')
            # calculate_duration: derive $DURATION and $FORMATTED_DURATION
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write('function calculate_duration\n')
            script_file_id.write('{\n')
            script_file_id.write(' DURATION=`expr $END_DATETIME - $INIT_DATETIME`\n')
            script_file_id.write(' HH=`expr $DURATION / 3600`\n')
            script_file_id.write(' MM=`expr $DURATION % 3600 / 60`\n')
            script_file_id.write(' SS=`expr $DURATION % 60`\n')
            script_file_id.write(' FORMATTED_DURATION=`printf "%03d:%02d:%02d\\n" $HH $MM $SS`\n')
            script_file_id.write('}\n')
            # top-level call sequence
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write('init\n')
            script_file_id.write('download_lineage_data\n')
            script_file_id.write('run_busco_process\n')
            script_file_id.write('end\n')
    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append(f'*** ERROR: The file {busco_process_script} can not be created')
        OK = False

    # return the control variable and the error list
    return (OK, error_list)
def build_cd_hit_est_process_script(cluster_name, current_run_dir):
    '''
    Build the current CD-HIT-EST process script.

    Parameters:
        cluster_name: name of the cluster where the process will run (only
            used in the log and mail texts embedded in the generated script).
        current_run_dir: run directory in the cluster where CD-HIT-EST is
            executed and where the clustered transcriptome is written.

    Returns:
        (OK, error_list): OK is False when the script file can not be
            created; error_list holds the corresponding messages.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # get the option dictionary
    cd_hit_est_option_dict = xlib.get_option_dict(get_cd_hit_est_config_file())

    # get the options
    experiment_id = cd_hit_est_option_dict['identification']['experiment_id']
    assembly_software = cd_hit_est_option_dict['identification']['assembly_software']
    assembly_dataset_id = cd_hit_est_option_dict['identification']['assembly_dataset_id']
    assembly_type = cd_hit_est_option_dict['identification']['assembly_type']
    threads = cd_hit_est_option_dict['CD-HIT-EST parameters']['threads']
    memory_limit = cd_hit_est_option_dict['CD-HIT-EST parameters']['memory_limit']
    seq_identity_threshold = cd_hit_est_option_dict['CD-HIT-EST parameters']['seq_identity_threshold']
    word_length = cd_hit_est_option_dict['CD-HIT-EST parameters']['word_length']
    mask = cd_hit_est_option_dict['CD-HIT-EST parameters']['mask']
    match = cd_hit_est_option_dict['CD-HIT-EST parameters']['match']
    mismatch = cd_hit_est_option_dict['CD-HIT-EST parameters']['mismatch']
    other_parameters = cd_hit_est_option_dict['CD-HIT-EST parameters']['other_parameters']

    # set the transcriptome file path depending on the assembly software
    # (the file name convention of every assembler is fixed by its builder)
    if assembly_software == xlib.get_soapdenovotrans_code():
        if assembly_type == 'CONTIGS':
            transcriptome_file = f'{xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id)}/{experiment_id}-{assembly_dataset_id}.contig'
        elif assembly_type == 'SCAFFOLDS':
            transcriptome_file = f'{xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id)}/{experiment_id}-{assembly_dataset_id}.scafSeq'
    elif assembly_software == xlib.get_transabyss_code():
        transcriptome_file = f'{xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id)}/transabyss-final.fa'
    elif assembly_software == xlib.get_trinity_code():
        transcriptome_file = f'{xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id)}/Trinity.fasta'
    elif assembly_software == xlib.get_ggtrinity_code():
        transcriptome_file = f'{xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id)}/Trinity-GG.fasta'
    elif assembly_software == xlib.get_cd_hit_est_code():
        transcriptome_file = f'{xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id)}/clustered-transcriptome.fasta'
    elif assembly_software == xlib.get_transcript_filter_code():
        transcriptome_file = f'{xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id)}/filtered-transcriptome.fasta'

    # set the output file path
    if OK:
        output_file = f'{current_run_dir}/clustered-transcriptome.fasta'

    # write the CD-HIT-EST process script
    try:
        if not os.path.exists(os.path.dirname(get_cd_hit_est_process_script())):
            os.makedirs(os.path.dirname(get_cd_hit_est_process_script()))
        with open(get_cd_hit_est_process_script(), mode='w', encoding='iso-8859-1', newline='\n') as script_file_id:
            script_file_id.write('#!/bin/bash\n')
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write('SEP="#########################################"\n')
            script_file_id.write('export HOST_IP=`curl --silent checkip.amazonaws.com`\n')
            script_file_id.write('export HOST_ADDRESS="ec2-${HOST_IP//./-}-compute-1.amazonaws.com"\n')
            script_file_id.write('export AWS_CONFIG_FILE=/home/ubuntu/.aws/config\n')
            script_file_id.write('export AWS_SHARED_CREDENTIALS_FILE=/home/ubuntu/.aws/credentials\n')
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write(f'MINICONDA3_BIN_PATH={xlib.get_cluster_app_dir()}/{xlib.get_miniconda3_name()}/bin\n')
            script_file_id.write(f'export PATH=$MINICONDA3_BIN_PATH:$PATH\n')
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write(f'STATUS_DIR={xlib.get_status_dir(current_run_dir)}\n')
            script_file_id.write(f'SCRIPT_STATUS_OK={xlib.get_status_ok(current_run_dir)}\n')
            script_file_id.write(f'SCRIPT_STATUS_WRONG={xlib.get_status_wrong(current_run_dir)}\n')
            script_file_id.write('mkdir --parents $STATUS_DIR\n')
            script_file_id.write('if [ -f $SCRIPT_STATUS_OK ]; then rm $SCRIPT_STATUS_OK; fi\n')
            script_file_id.write('if [ -f $SCRIPT_STATUS_WRONG ]; then rm $SCRIPT_STATUS_WRONG; fi\n')
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write(f'CURRENT_DIR={current_run_dir}\n')
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write('function init\n')
            script_file_id.write('{\n')
            script_file_id.write('    INIT_DATETIME=`date --utc +%s`\n')
            script_file_id.write('    FORMATTED_INIT_DATETIME=`date --date="@$INIT_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n')
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write('    echo "Script started at $FORMATTED_INIT_DATETIME+00:00."\n')
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write(f'    echo "CLUSTER: {cluster_name}"\n')
            script_file_id.write('    echo "HOST NAME: $HOSTNAME"\n')
            script_file_id.write('    echo "HOST IP: $HOST_IP"\n')
            script_file_id.write('    echo "HOST ADDRESS: $HOST_ADDRESS"\n')
            script_file_id.write('}\n')
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write('function run_cd_hit_est_process\n')
            script_file_id.write('{\n')
            script_file_id.write(f'    source activate {xlib.get_cd_hit_anaconda_code()}\n')
            script_file_id.write('    cd $CURRENT_DIR\n')
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write('    echo "Filtering transcriptome ..."\n')
            script_file_id.write('    /usr/bin/time \\\n')
            script_file_id.write(f'        --format="{xlib.get_time_output_format()}" \\\n')
            script_file_id.write('        cd-hit-est \\\n')
            script_file_id.write(f'            -T {threads} \\\n')
            script_file_id.write(f'            -M {memory_limit} \\\n')
            script_file_id.write(f'            -i {transcriptome_file} \\\n')
            script_file_id.write(f'            -c {seq_identity_threshold} \\\n')
            script_file_id.write(f'            -n {word_length} \\\n')
            script_file_id.write(f'            -mask {mask} \\\n')
            script_file_id.write(f'            -match {match} \\\n')
            script_file_id.write(f'            -mismatch {mismatch} \\\n')
            if other_parameters.upper() == 'NONE':
                script_file_id.write(f'            -o {output_file}\n')
            else:
                script_file_id.write(f'            -o {output_file} \\\n')
                # extra parameters come as a ';'-separated list of '--name=value'
                # or '--name' items; they are rewritten with a single dash as
                # CD-HIT-EST expects
                parameter_list = [x.strip() for x in other_parameters.split(';')]
                for i in range(len(parameter_list)):
                    if parameter_list[i].find('=') > 0:
                        pattern = r'^--(.+)=(.+)$'
                        mo = re.search(pattern, parameter_list[i])
                        parameter_name = mo.group(1).strip()
                        parameter_value = mo.group(2).strip()
                        if i < len(parameter_list) - 1:
                            script_file_id.write(f'            -{parameter_name} {parameter_value} \\\n')
                        else:
                            script_file_id.write(f'            -{parameter_name} {parameter_value}\n')
                    else:
                        pattern = r'^--(.+)$'
                        mo = re.search(pattern, parameter_list[i])
                        parameter_name = mo.group(1).strip()
                        # FIX: the condition was "i < len(parameter_list)", which is
                        # always true inside the loop, so the last value-less
                        # parameter was emitted with a dangling " \" continuation,
                        # corrupting the generated bash command
                        if i < len(parameter_list) - 1:
                            script_file_id.write(f'            -{parameter_name} \\\n')
                        else:
                            script_file_id.write(f'            -{parameter_name}\n')
            script_file_id.write('    RC=$?\n')
            script_file_id.write('    if [ $RC -ne 0 ]; then manage_error cd-hit-est $RC; fi\n')
            script_file_id.write('    echo "The transcriptome is filtered."\n')
            script_file_id.write('    conda deactivate\n')
            script_file_id.write('}\n')
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write('function end\n')
            script_file_id.write('{\n')
            script_file_id.write('    END_DATETIME=`date --utc +%s`\n')
            script_file_id.write('    FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n')
            script_file_id.write('    calculate_duration\n')
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write('    echo "Script ended OK at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."\n')
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write('    send_mail ok\n')
            script_file_id.write('    touch $SCRIPT_STATUS_OK\n')
            script_file_id.write('    exit 0\n')
            script_file_id.write('}\n')
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write('function manage_error\n')
            script_file_id.write('{\n')
            script_file_id.write('    END_DATETIME=`date --utc +%s`\n')
            script_file_id.write('    FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n')
            script_file_id.write('    calculate_duration\n')
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write('    echo "ERROR: $1 returned error $2"\n')
            script_file_id.write('    echo "Script ended WRONG at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."\n')
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write('    send_mail wrong\n')
            script_file_id.write('    touch $SCRIPT_STATUS_WRONG\n')
            script_file_id.write('    exit 3\n')
            script_file_id.write('}\n')
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            process_name = f'{xlib.get_cd_hit_est_name()} process'
            mail_message_ok = xlib.get_mail_message_ok(process_name, cluster_name)
            mail_message_wrong = xlib.get_mail_message_wrong(process_name, cluster_name)
            script_file_id.write('function send_mail\n')
            script_file_id.write('{\n')
            script_file_id.write(f'    SUBJECT="{xlib.get_project_name()}: {process_name}"\n')
            script_file_id.write('    if [ "$1" == "ok" ]; then\n')
            script_file_id.write(f'        MESSAGE="{mail_message_ok}"\n')
            script_file_id.write('    elif [ "$1" == "wrong" ]; then\n')
            script_file_id.write(f'        MESSAGE="{mail_message_wrong}"\n')
            script_file_id.write('    else\n')
            script_file_id.write('         MESSAGE=""\n')
            script_file_id.write('    fi\n')
            script_file_id.write('    DESTINATION_FILE=mail-destination.json\n')
            script_file_id.write('    echo "{" > $DESTINATION_FILE\n')
            script_file_id.write(f'    echo "    \\\"ToAddresses\\\":  [\\\"{xconfiguration.get_contact_data()}\\\"]," >> $DESTINATION_FILE\n')
            script_file_id.write('    echo "    \\\"CcAddresses\\\":  []," >> $DESTINATION_FILE\n')
            script_file_id.write('    echo "    \\\"BccAddresses\\\":  []" >> $DESTINATION_FILE\n')
            script_file_id.write('    echo "}" >> $DESTINATION_FILE\n')
            script_file_id.write('    MESSAGE_FILE=mail-message.json\n')
            script_file_id.write('    echo "{" > $MESSAGE_FILE\n')
            script_file_id.write('    echo "    \\\"Subject\\\": {" >> $MESSAGE_FILE\n')
            script_file_id.write('    echo "        \\\"Data\\\":  \\\"$SUBJECT\\\"," >> $MESSAGE_FILE\n')
            script_file_id.write('    echo "        \\\"Charset\\\":  \\\"UTF-8\\\"" >> $MESSAGE_FILE\n')
            script_file_id.write('    echo "    }," >> $MESSAGE_FILE\n')
            script_file_id.write('    echo "    \\\"Body\\\": {" >> $MESSAGE_FILE\n')
            script_file_id.write('    echo "        \\\"Html\\\": {" >> $MESSAGE_FILE\n')
            script_file_id.write('    echo "            \\\"Data\\\":  \\\"$MESSAGE\\\"," >> $MESSAGE_FILE\n')
            script_file_id.write('    echo "            \\\"Charset\\\":  \\\"UTF-8\\\"" >> $MESSAGE_FILE\n')
            script_file_id.write('    echo "        }" >> $MESSAGE_FILE\n')
            script_file_id.write('    echo "    }" >> $MESSAGE_FILE\n')
            script_file_id.write('    echo "}" >> $MESSAGE_FILE\n')
            script_file_id.write(f'    aws ses send-email --from {xconfiguration.get_contact_data()} --destination file://$DESTINATION_FILE --message file://$MESSAGE_FILE\n')
            script_file_id.write('}\n')
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write('function calculate_duration\n')
            script_file_id.write('{\n')
            script_file_id.write('    DURATION=`expr $END_DATETIME - $INIT_DATETIME`\n')
            script_file_id.write('    HH=`expr $DURATION / 3600`\n')
            script_file_id.write('    MM=`expr $DURATION % 3600 / 60`\n')
            script_file_id.write('    SS=`expr $DURATION % 60`\n')
            script_file_id.write('    FORMATTED_DURATION=`printf "%03d:%02d:%02d\\n" $HH $MM $SS`\n')
            script_file_id.write('}\n')
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write('init\n')
            script_file_id.write('run_cd_hit_est_process\n')
            script_file_id.write('end\n')
    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append(f'*** ERROR: The file {get_cd_hit_est_process_script()} can not be created')
        OK = False

    # return the control variable and the error list
    return (OK, error_list)
def run_gmap_process(cluster_name, log, function=None):
    '''
    Run a GMAP process.

    cluster_name: name of the cluster where the process is submitted.
    log: object with a write(text) method (a xlib.DevStdOut or a log window
        writer) that receives the progress messages.
    function: optional callable executed at the end of the process
        (e.g. to re-enable GUI controls).

    Returns the control variable OK (True when every step succeeded).
    '''

    # initialize the control variable
    OK = True

    # get the GMAP option dictionary
    gmap_option_dict = xlib.get_option_dict(get_gmap_config_file())

    # get the experiment identification
    experiment_id = gmap_option_dict['identification']['experiment_id']

    # warn that the log window must not be closed
    if not isinstance(log, xlib.DevStdOut):
        log.write('This process might take several minutes. Do not close this window, please wait!\n')

    # validate the GMAP config file
    log.write('{0}\n'.format(xlib.get_separator()))
    log.write('Validating the {0} config file ...\n'.format(xlib.get_gmap_name()))
    (OK, error_list) = validate_gmap_config_file(strict=True)
    if OK:
        log.write('The config file is OK.\n')
    else:
        log.write('*** ERROR: The config file is not valid.\n')
        log.write('Please correct this file or recreate the config files.\n')

    # create the SSH client connection
    if OK:
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('Connecting the SSH client ...\n')
        (OK, error_list, ssh_client) = xssh.create_ssh_client_connection(cluster_name, 'master')
        if OK:
            log.write('The SSH client is connected.\n')
        else:
            for error in error_list:
                log.write('{0}\n'.format(error))

    # create the SSH transport connection
    if OK:
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('Connecting the SSH transport ...\n')
        (OK, error_list, ssh_transport) = xssh.create_ssh_transport_connection(cluster_name, 'master')
        if OK:
            log.write('The SSH transport is connected.\n')
        else:
            for error in error_list:
                log.write('{0}\n'.format(error))

    # create the SFTP client
    if OK:
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('Connecting the SFTP client ...\n')
        sftp_client = xssh.create_sftp_client(ssh_transport)
        log.write('The SFTP client is connected.\n')

    # warn that the requirements are being verified
    if OK:
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('Verifying process requirements ...\n')

    # verify the master is running
    # (state code 16 is the EC2 "running" state)
    if OK:
        (master_state_code, master_state_name) = xec2.get_node_state(cluster_name, 'master')
        if master_state_code != 16:
            log.write('*** ERROR: The cluster {0} is not running. Its state is {1} ({2}).\n'.format(cluster_name, master_state_code, master_state_name))
            OK = False

    # verify the GMAP-GSNAP is setup
    if OK:
        (OK, error_list, is_setup) = xbioinfoapp.is_setup_bioconda_package(xlib.get_gmap_gsnap_bioconda_code(), cluster_name, True, ssh_client)
        if OK:
            if not is_setup:
                log.write('*** ERROR: {0} is not setup.\n'.format(xlib.get_gmap_name()))
                OK = False
        else:
            log.write('*** ERROR: The verification of {0} setup could not be performed.\n'.format(xlib.get_gmap_name()))

    # warn that the requirements are OK
    if OK:
        log.write('Process requirements are OK.\n')

    # determine the run directory in the cluster
    if OK:
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('Determining the run directory in the cluster ...\n')
        current_run_dir = xlib.get_cluster_current_run_dir(experiment_id, xlib.get_gmap_code())
        command = 'mkdir --parents {0}'.format(current_run_dir)
        (OK, stdout, stderr) = xssh.execute_cluster_command(ssh_client, command)
        if OK:
            log.write('The directory path is {0}.\n'.format(current_run_dir))
        else:
            log.write('*** ERROR: Wrong command ---> {0}\n'.format(command))

    # build the GMAP process script
    if OK:
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('Building the process script {0} ...\n'.format(get_gmap_process_script()))
        (OK, error_list) = build_gmap_process_script(cluster_name, current_run_dir)
        if OK:
            log.write('The file is built.\n')
        if not OK:
            log.write('*** ERROR: The file could not be built.\n')

    # upload the GMAP process script in the cluster
    if OK:
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('Uploading the process script {0} in the directory {1} of the master ...\n'.format(get_gmap_process_script(), current_run_dir))
        cluster_path = '{0}/{1}'.format(current_run_dir, os.path.basename(get_gmap_process_script()))
        (OK, error_list) = xssh.put_file(sftp_client, get_gmap_process_script(), cluster_path)
        if OK:
            log.write('The file is uploaded.\n')
        else:
            for error in error_list:
                log.write('{0}\n'.format(error))

    # set run permision to the GMAP process script in the cluster
    if OK:
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('Setting on the run permision of {0}/{1} ...\n'.format(current_run_dir, os.path.basename(get_gmap_process_script())))
        command = 'chmod u+x {0}/{1}'.format(current_run_dir, os.path.basename(get_gmap_process_script()))
        (OK, stdout, stderr) = xssh.execute_cluster_command(ssh_client, command)
        if OK:
            log.write('The run permision is set.\n')
        else:
            log.write('*** ERROR: Wrong command ---> {0}\n'.format(command))

    # build the GMAP process starter
    if OK:
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('Building the process starter {0} ...\n'.format(get_gmap_process_starter()))
        (OK, error_list) = build_gmap_process_starter(current_run_dir)
        if OK:
            log.write('The file is built.\n')
        if not OK:
            log.write('***ERROR: The file could not be built.\n')

    # upload the GMAP process starter in the cluster
    if OK:
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('Uploading the process starter {0} in the directory {1} of the master ...\n'.format(get_gmap_process_starter(), current_run_dir))
        cluster_path = '{0}/{1}'.format(current_run_dir, os.path.basename(get_gmap_process_starter()))
        (OK, error_list) = xssh.put_file(sftp_client, get_gmap_process_starter(), cluster_path)
        if OK:
            log.write('The file is uploaded.\n')
        else:
            for error in error_list:
                log.write('{0}\n'.format(error))

    # set run permision to the GMAP process starter in the cluster
    if OK:
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('Setting on the run permision of {0}/{1} ...\n'.format(current_run_dir, os.path.basename(get_gmap_process_starter())))
        command = 'chmod u+x {0}/{1}'.format(current_run_dir, os.path.basename(get_gmap_process_starter()))
        (OK, stdout, stderr) = xssh.execute_cluster_command(ssh_client, command)
        if OK:
            log.write('The run permision is set.\n')
        else:
            log.write('*** ERROR: Wrong command ---> {0}\n'.format(command))

    # submit the GMAP process
    # (the starter is queued through SGE's qsub on the master node)
    if OK:
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('Submitting the process script {0}/{1} ...\n'.format(current_run_dir, os.path.basename(get_gmap_process_starter())))
        sge_env = xcluster.get_sge_env()
        command = '{0}; qsub -V -b n -cwd {1}/{2}'.format(sge_env, current_run_dir, os.path.basename(get_gmap_process_starter()))
        (OK, stdout, stderr) = xssh.execute_cluster_command(ssh_client, command)
        if OK:
            for line in stdout:
                log.write('{0}\n'.format(line))
        else:
            log.write('*** ERROR: Wrong command ---> {0}\n'.format(command))

    # close the SSH transport connection
    if OK:
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('Closing the SSH transport connection ...\n')
        xssh.close_ssh_transport_connection(ssh_transport)
        log.write('The connection is closed.\n')

    # close the SSH client connection
    # NOTE(review): when a step fails after the connections are opened, they
    # are not closed here because OK is False — presumably released by the
    # caller or on process exit; verify against callers.
    if OK:
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('Closing the SSH client connection ...\n')
        xssh.close_ssh_client_connection(ssh_client)
        log.write('The connection is closed.\n')

    # warn that the log window can be closed
    if not isinstance(log, xlib.DevStdOut):
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('You can close this window now.\n')

    # execute final function
    if function is not None:
        function()

    # return the control variable
    return OK
def build_infrastructure_software_installation_script(cluster_name):
    '''
    Build the infrastructure software installation script.

    Parameters:
        cluster_name: name of the cluster; used in the log and mail texts
            embedded in the generated script.

    Returns:
        (OK, error_list): OK is False when the script file can not be
            created; error_list holds the corresponding messages.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # get the connetion data
    (user_id, access_key_id, secret_access_key) = xconfiguration.get_basic_aws_data()

    # get the old region and user identification
    current_region_name = xconfiguration.get_current_region_name()

    # get the NGScloud config file
    ngscloud_config_file = xconfiguration.get_ngscloud_config_file()

    # get the option dictionary corresponding to the NGScloud config file
    ngscloud_options_dict = xlib.get_option_dict(ngscloud_config_file)

    # get the dataset structure and NGScloud_volume
    dataset_structure = ngscloud_options_dict['dataset info']['dataset_structure']

    # write the infrastructure software installation script
    try:
        if not os.path.exists(os.path.dirname(get_infrastructure_software_installation_script())):
            os.makedirs(os.path.dirname(get_infrastructure_software_installation_script()))
        with open(get_infrastructure_software_installation_script(), mode='w', encoding='iso-8859-1', newline='\n') as script_file_id:
            script_file_id.write('#!/bin/bash\n')
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write('SEP="#########################################"\n')
            script_file_id.write('export HOST_IP=`curl --silent checkip.amazonaws.com`\n')
            script_file_id.write('export HOST_ADDRESS="ec2-${HOST_IP//./-}-compute-1.amazonaws.com"\n')
            script_file_id.write('export AWS_CONFIG_FILE=/home/ubuntu/.aws/config\n')
            script_file_id.write('export AWS_SHARED_CREDENTIALS_FILE=/home/ubuntu/.aws/credentials\n')
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write('function init\n')
            script_file_id.write('{\n')
            script_file_id.write('    INIT_DATETIME=`date --utc +%s`\n')
            script_file_id.write('    FORMATTED_INIT_DATETIME=`date --date="@$INIT_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n')
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write('    echo "Script started at $FORMATTED_INIT_DATETIME+00:00."\n')
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write(f'    echo "CLUSTER: {cluster_name}"\n')
            script_file_id.write('    echo "HOST NAME: $HOSTNAME"\n')
            script_file_id.write('    echo "HOST IP: $HOST_IP"\n')
            script_file_id.write('    echo "HOST ADDRESS: $HOST_ADDRESS"\n')
            script_file_id.write('}\n')
            # the dataset structure is only created when there is no dedicated
            # multi-volume layout (single volume or no volume at all)
            if dataset_structure in [xconfiguration.get_dataset_structure_singlevolume(), xconfiguration.get_dataset_structure_none()]:
                script_file_id.write('#-------------------------------------------------------------------------------\n')
                script_file_id.write('function create_dataset_structure\n')
                script_file_id.write('{\n')
                script_file_id.write('    echo "$SEP"\n')
                script_file_id.write('    echo "Creating the dataset structure ..."\n')
                script_file_id.write(f'    sudo mkdir --parents {xlib.get_cluster_app_dir()}\n')
                script_file_id.write(f'    sudo mkdir --parents {xlib.get_cluster_database_dir()}\n')
                script_file_id.write(f'    sudo mkdir --parents {xlib.get_cluster_read_dir()}\n')
                script_file_id.write(f'    sudo mkdir --parents {xlib.get_cluster_reference_dir()}\n')
                script_file_id.write(f'    sudo mkdir --parents {xlib.get_cluster_result_dir()}\n')
                script_file_id.write('    echo "The dataset structure is created."\n')
                script_file_id.write('}\n')
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write('function install_awscli\n')
            script_file_id.write('{\n')
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write('    echo "Installing the AWS CLI ..."\n')
            script_file_id.write(f'    unzip {xlib.get_awscli_name()}.zip\n')
            script_file_id.write('    RC=$?\n')
            # FIX: the error checks below re-ran the failing command with the
            # return code as argument ("then unzip $RC", "then install $RC",
            # "then rm $RC") instead of reporting the error; they now call
            # manage_error like every other function of the generated script
            script_file_id.write('    if [ $RC -ne 0 ]; then manage_error unzip $RC; fi\n')
            script_file_id.write('    sudo ./aws/install\n')
            script_file_id.write('    RC=$?\n')
            script_file_id.write('    if [ $RC -ne 0 ]; then manage_error install $RC; fi\n')
            script_file_id.write('    rm -rf aws\n')
            script_file_id.write('    RC=$?\n')
            script_file_id.write('    if [ $RC -ne 0 ]; then manage_error rm $RC; fi\n')
            script_file_id.write(f'    rm {xlib.get_awscli_name()}.zip\n')
            script_file_id.write('    RC=$?\n')
            script_file_id.write('    if [ $RC -ne 0 ]; then manage_error rm $RC; fi\n')
            script_file_id.write('    echo "The package is installed."\n')
            script_file_id.write('}\n')
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write('function setup_aws\n')
            script_file_id.write('{\n')
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write('    echo "Setting up AWS ..."\n')
            script_file_id.write('    UBUNTU_AWS_DIR=/home/ubuntu/.aws\n')
            script_file_id.write('    mkdir --parents $UBUNTU_AWS_DIR\n')
            script_file_id.write(f'    CONFIG_FILE=$UBUNTU_AWS_DIR/config\n')
            script_file_id.write('    echo "[default]" > $CONFIG_FILE\n')
            script_file_id.write(f'    echo "region = {current_region_name}" >> $CONFIG_FILE\n')
            script_file_id.write('    CREDENTIALS_FILE=$UBUNTU_AWS_DIR/credentials\n')
            script_file_id.write('    echo "[default]" > $CREDENTIALS_FILE\n')
            script_file_id.write(f'    echo "aws_access_key_id = {access_key_id}" >> $CREDENTIALS_FILE\n')
            script_file_id.write(f'    echo "aws_secret_access_key = {secret_access_key}" >> $CREDENTIALS_FILE\n')
            script_file_id.write('    sudo echo "AWS is set up."\n')
            script_file_id.write('}\n')
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write('function fix_source_list\n')
            script_file_id.write('{\n')
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write('    echo "Fixing file /etc/apt/sources.list ..."\n')
            script_file_id.write('    sed -i "s/us-east-1.ec2.archive.ubuntu.com/old-releases.ubuntu.com/g" /etc/apt/sources.list\n')
            script_file_id.write('    RC=$?\n')
            script_file_id.write('    if [ $RC -ne 0 ]; then manage_error sed $RC; fi\n')
            script_file_id.write('    sed -i "s/security.ubuntu.com/old-releases.ubuntu\.com/g" /etc/apt/sources.list\n')
            script_file_id.write('    RC=$?\n')
            script_file_id.write('    if [ $RC -ne 0 ]; then manage_error sed $RC; fi\n')
            script_file_id.write('    apt-get update\n')
            script_file_id.write('    RC=$?\n')
            script_file_id.write('    if [ $RC -ne 0 ]; then manage_error apt-get $RC; fi\n')
            script_file_id.write('    echo\n')
            script_file_id.write('    echo "The file is fixed."\n')
            script_file_id.write('}\n')
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write('function install_xorg\n')
            script_file_id.write('{\n')
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write('    echo "Installing the package xorg ..."\n')
            script_file_id.write('    sudo apt-get --assume-yes install xorg\n')
            script_file_id.write('    RC=$?\n')
            script_file_id.write('    if [ $RC -ne 0 ]; then manage_error apt-get $RC; fi\n')
            script_file_id.write('    echo "The package is installed."\n')
            script_file_id.write('}\n')
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write('function install_libtbb2\n')
            script_file_id.write('{\n')
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write('    echo "Installing the package libtbb2 ..."\n')
            script_file_id.write('    echo\n')
            script_file_id.write('    apt-get --assume-yes install libtbb2\n')
            script_file_id.write('    RC=$?\n')
            script_file_id.write('    if [ $RC -ne 0 ]; then manage_error apt-get $RC; fi\n')
            script_file_id.write('    echo\n')
            script_file_id.write('    echo "The package is installed."\n')
            script_file_id.write('}\n')
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write('function install_libxt6\n')
            script_file_id.write('{\n')
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write('    echo "Installing the package libxt6 ..."\n')
            script_file_id.write('    sudo apt-get --assume-yes install libxt6\n')
            script_file_id.write('    RC=$?\n')
            script_file_id.write('    if [ $RC -ne 0 ]; then manage_error apt-get $RC; fi\n')
            script_file_id.write('    echo "The package is installed."\n')
            script_file_id.write('}\n')
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write('function install_parallel\n')
            script_file_id.write('{\n')
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write('    echo "Installing the package parallel ..."\n')
            script_file_id.write('    sudo apt-get --assume-yes install parallel\n')
            script_file_id.write('    RC=$?\n')
            script_file_id.write('    if [ $RC -ne 0 ]; then manage_error apt-get $RC; fi\n')
            script_file_id.write('    echo "The package is installed."\n')
            script_file_id.write('}\n')
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write('function install_texlive\n')
            script_file_id.write('{\n')
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write('    echo "Installing the package texlive ..."\n')
            script_file_id.write('    sudo apt-get --assume-yes install texlive-latex-base\n')
            script_file_id.write('    RC=$?\n')
            script_file_id.write('    if [ $RC -ne 0 ]; then manage_error apt-get $RC; fi\n')
            script_file_id.write('    sudo apt-get --assume-yes install texlive-fonts-recommended\n')
            script_file_id.write('    RC=$?\n')
            script_file_id.write('    if [ $RC -ne 0 ]; then manage_error apt-get $RC; fi\n')
            script_file_id.write('    sudo apt-get --assume-yes install texlive-fonts-extra\n')
            script_file_id.write('    RC=$?\n')
            script_file_id.write('    if [ $RC -ne 0 ]; then manage_error apt-get $RC; fi\n')
            script_file_id.write('    sudo apt-get --assume-yes install texlive-latex-extra\n')
            script_file_id.write('    RC=$?\n')
            script_file_id.write('    if [ $RC -ne 0 ]; then manage_error apt-get $RC; fi\n')
            script_file_id.write('    echo "The package is installed."\n')
            script_file_id.write('}\n')
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write('function uninstall_mysql\n')
            script_file_id.write('{\n')
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write('    echo "Uninstalling MySQL ..."\n')
            script_file_id.write('    sudo apt-get purge --auto-remove --assume-yes mysql-client mysql-client-5.5 mysql-client-core-5.5 mysql-common mysql-server mysql-server-5.5 mysql-server-core-5.5\n')
            script_file_id.write('    RC=$?\n')
            script_file_id.write('    if [ $RC -ne 0 ]; then manage_error apt-get $RC; fi\n')
            script_file_id.write('    echo "MySQL is uninstalled."\n')
            script_file_id.write('}\n')
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            # NOTE(review): create_swapfile is generated but not called in the
            # main sequence below (the call is commented out)
            script_file_id.write('function create_swapfile\n')
            script_file_id.write('{\n')
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write('    echo "Creating a file which will be used for swap ..."\n')
            script_file_id.write('    sudo dd if=/dev/zero of=/swapfile bs=1024 count=2097152\n')
            script_file_id.write('    RC=$?\n')
            script_file_id.write('    if [ $RC -ne 0 ]; then manage_error dd $RC; fi\n')
            script_file_id.write('    sudo chmod 600 /swapfile\n')
            script_file_id.write('    RC=$?\n')
            script_file_id.write('    if [ $RC -ne 0 ]; then manage_error chmod $RC; fi\n')
            script_file_id.write('    sudo mkswap /swapfile\n')
            script_file_id.write('    RC=$?\n')
            script_file_id.write('    if [ $RC -ne 0 ]; then manage_error mkswap $RC; fi\n')
            script_file_id.write('    sudo swapon /swapfile\n')
            script_file_id.write('    RC=$?\n')
            script_file_id.write('    if [ $RC -ne 0 ]; then manage_error swapon $RC; fi\n')
            script_file_id.write('    sudo echo "/swapfile swap swap defaults 0 0" >> /etc/fstab\n')
            script_file_id.write('    RC=$?\n')
            script_file_id.write('    if [ $RC -ne 0 ]; then manage_error echo $RC; fi\n')
            script_file_id.write('    echo\n')
            script_file_id.write('    echo "The file is created."\n')
            script_file_id.write('}\n')
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write('function end\n')
            script_file_id.write('{\n')
            script_file_id.write('    END_DATETIME=`date --utc +%s`\n')
            script_file_id.write('    FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n')
            script_file_id.write('    calculate_duration\n')
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write('    echo "Script ended OK at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."\n')
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write('    send_mail ok\n')
            script_file_id.write('    exit 0\n')
            script_file_id.write('}\n')
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write('function manage_error\n')
            script_file_id.write('{\n')
            script_file_id.write('    END_DATETIME=`date --utc +%s`\n')
            script_file_id.write('    FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n')
            script_file_id.write('    calculate_duration\n')
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write('    echo "ERROR: $1 returned error $2"\n')
            script_file_id.write('    echo "Script ended WRONG at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."\n')
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write('    send_mail wrong\n')
            script_file_id.write('    exit 3\n')
            script_file_id.write('}\n')
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            process_name = 'Infrastructure software installation'
            mail_message_ok = xlib.get_mail_message_ok(process_name, cluster_name)
            mail_message_wrong = xlib.get_mail_message_wrong(process_name, cluster_name)
            script_file_id.write('function send_mail\n')
            script_file_id.write('{\n')
            script_file_id.write(f'    SUBJECT="{xlib.get_project_name()}: {process_name}"\n')
            script_file_id.write('    if [ "$1" == "ok" ]; then\n')
            script_file_id.write(f'        MESSAGE="{mail_message_ok}"\n')
            script_file_id.write('    elif [ "$1" == "wrong" ]; then\n')
            script_file_id.write(f'        MESSAGE="{mail_message_wrong}"\n')
            script_file_id.write('    else\n')
            script_file_id.write('         MESSAGE=""\n')
            script_file_id.write('    fi\n')
            script_file_id.write('    DESTINATION_FILE=mail-destination.json\n')
            script_file_id.write('    echo "{" > $DESTINATION_FILE\n')
            script_file_id.write(f'    echo "    \\\"ToAddresses\\\":  [\\\"{xconfiguration.get_contact_data()}\\\"]," >> $DESTINATION_FILE\n')
            script_file_id.write('    echo "    \\\"CcAddresses\\\":  []," >> $DESTINATION_FILE\n')
            script_file_id.write('    echo "    \\\"BccAddresses\\\":  []" >> $DESTINATION_FILE\n')
            script_file_id.write('    echo "}" >> $DESTINATION_FILE\n')
            script_file_id.write('    MESSAGE_FILE=mail-message.json\n')
            script_file_id.write('    echo "{" > $MESSAGE_FILE\n')
            script_file_id.write('    echo "    \\\"Subject\\\": {" >> $MESSAGE_FILE\n')
            script_file_id.write('    echo "        \\\"Data\\\":  \\\"$SUBJECT\\\"," >> $MESSAGE_FILE\n')
            script_file_id.write('    echo "        \\\"Charset\\\":  \\\"UTF-8\\\"" >> $MESSAGE_FILE\n')
            script_file_id.write('    echo "    }," >> $MESSAGE_FILE\n')
            script_file_id.write('    echo "    \\\"Body\\\": {" >> $MESSAGE_FILE\n')
            script_file_id.write('    echo "        \\\"Html\\\": {" >> $MESSAGE_FILE\n')
            script_file_id.write('    echo "            \\\"Data\\\":  \\\"$MESSAGE\\\"," >> $MESSAGE_FILE\n')
            script_file_id.write('    echo "            \\\"Charset\\\":  \\\"UTF-8\\\"" >> $MESSAGE_FILE\n')
            script_file_id.write('    echo "        }" >> $MESSAGE_FILE\n')
            script_file_id.write('    echo "    }" >> $MESSAGE_FILE\n')
            script_file_id.write('    echo "}" >> $MESSAGE_FILE\n')
            script_file_id.write(f'    aws ses send-email --from {xconfiguration.get_contact_data()} --destination file://$DESTINATION_FILE --message file://$MESSAGE_FILE\n')
            script_file_id.write('}\n')
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write('function calculate_duration\n')
            script_file_id.write('{\n')
            script_file_id.write('    DURATION=`expr $END_DATETIME - $INIT_DATETIME`\n')
            script_file_id.write('    HH=`expr $DURATION / 3600`\n')
            script_file_id.write('    MM=`expr $DURATION % 3600 / 60`\n')
            script_file_id.write('    SS=`expr $DURATION % 60`\n')
            script_file_id.write('    FORMATTED_DURATION=`printf "%03d:%02d:%02d\\n" $HH $MM $SS`\n')
            script_file_id.write('}\n')
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write('init\n')
            if dataset_structure in [xconfiguration.get_dataset_structure_singlevolume(), xconfiguration.get_dataset_structure_none()]:
                script_file_id.write('create_dataset_structure\n')
            script_file_id.write('install_awscli\n')
            script_file_id.write('setup_aws\n')
            script_file_id.write('fix_source_list\n')
            script_file_id.write('install_xorg\n')
            script_file_id.write('install_libtbb2\n')
            script_file_id.write('install_libxt6\n')
            script_file_id.write('install_parallel\n')
            script_file_id.write('install_texlive\n')
            script_file_id.write('uninstall_mysql\n')
            # -- script_file_id.write( 'create_swapfile\n')
            script_file_id.write('end\n')
    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append(f'*** ERROR: The file {get_infrastructure_software_installation_script()} can not be created')
        OK = False

    # return the control variable and the error list
    return (OK, error_list)
def build_gmap_process_script(cluster_name, current_run_dir):
    '''
    Build the current GMAP process script.

    Parameters:
        cluster_name: name of the cluster where the process will run (only
            used in the start/end messages of the generated Bash script).
        current_run_dir: cluster directory where the process will write its
            output.

    Returns:
        (OK, error_list): control variable and list of error messages.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # get the GMAP option dictionary
    gmap_option_dict = xlib.get_option_dict(get_gmap_config_file())

    # get the options
    experiment_id = gmap_option_dict['identification']['experiment_id']
    reference_dataset_id = gmap_option_dict['identification']['reference_dataset_id']
    reference_file = gmap_option_dict['identification']['reference_file']
    assembly_software = gmap_option_dict['identification']['assembly_software']
    assembly_dataset_id = gmap_option_dict['identification']['assembly_dataset_id']
    assembly_type = gmap_option_dict['identification']['assembly_type']
    threads = gmap_option_dict['GMAP parameters']['threads']
    kmer = gmap_option_dict['GMAP parameters']['kmer']
    sampling = gmap_option_dict['GMAP parameters']['sampling']
    input_buffer_size = gmap_option_dict['GMAP parameters']['input-buffer-size']
    output_buffer_size = gmap_option_dict['GMAP parameters']['output-buffer-size']
    prunelevel = gmap_option_dict['GMAP parameters']['prunelevel']
    # named "output_format" instead of "format" to avoid shadowing the builtin
    output_format = gmap_option_dict['GMAP parameters']['format']
    other_parameters = gmap_option_dict['GMAP parameters']['other_parameters']

    # set the cluster reference dataset directory
    cluster_reference_dataset_dir = xlib.get_cluster_reference_dataset_dir(reference_dataset_id)

    # set the cluster reference file
    cluster_reference_file = xlib.get_cluster_reference_file(reference_dataset_id, reference_file)

    # set the GMAP database name from the reference file name without its extension
    reference_file_name, reference_file_extension = os.path.splitext(reference_file)
    gmap_database = '{0}-gmap_database'.format(reference_file_name)

    # set the transcriptome file path depending on the assembly software
    if assembly_software == xlib.get_soapdenovotrans_code():
        if assembly_type.upper() == 'CONTIGS':
            transcriptome_file = '{0}/{1}-{2}.contig'.format(xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id), experiment_id, assembly_dataset_id)
        elif assembly_type.upper() == 'SCAFFOLDS':
            transcriptome_file = '{0}/{1}-{2}.scafSeq'.format(xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id), experiment_id, assembly_dataset_id)
    elif assembly_software == xlib.get_transabyss_code():
        transcriptome_file = '{0}/transabyss-final.fa'.format(xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id))
    elif assembly_software == xlib.get_trinity_code():
        transcriptome_file = '{0}/Trinity.fasta'.format(xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id))
    elif assembly_software == xlib.get_star_code():
        transcriptome_file = '{0}/Trinity-GG.fasta'.format(xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id))
    elif assembly_software == xlib.get_cd_hit_est_code():
        transcriptome_file = '{0}/clustered-transcriptome.fasta'.format(xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id))
    elif assembly_software == xlib.get_transcript_filter_code():
        transcriptome_file = '{0}/filtered-transcriptome.fasta'.format(xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id))

    # set the output file path
    output_file = 'gmap_output_{0}.txt'.format(output_format.lower())

    # get the GMAP process script name
    gmap_process_script = get_gmap_process_script()

    # write the GMAP process script
    try:
        if not os.path.exists(os.path.dirname(gmap_process_script)):
            os.makedirs(os.path.dirname(gmap_process_script))
        with open(gmap_process_script, mode='w', encoding='utf8', newline='\n') as file_id:
            file_id.write('#!/bin/bash\n')
            file_id.write('#-------------------------------------------------------------------------------\n')
            file_id.write('GMAP_GSNAP_PATH={0}/{1}/envs/{2}/bin\n'.format(xlib.get_cluster_app_dir(), xlib.get_miniconda3_name(), xlib.get_gmap_gsnap_bioconda_code()))
            file_id.write('PATH=$GMAP_GSNAP_PATH:$PATH\n')
            file_id.write('SEP="#########################################"\n')
            file_id.write('cd {0}/{1}/bin\n'.format(xlib.get_cluster_app_dir(), xlib.get_miniconda3_name()))
            file_id.write('source activate {0}\n'.format(xlib.get_gmap_gsnap_bioconda_code()))
            file_id.write('#-------------------------------------------------------------------------------\n')
            file_id.write('function init\n')
            file_id.write('{\n')
            file_id.write('    INIT_DATETIME=`date --utc +%s`\n')
            file_id.write('    FORMATTED_INIT_DATETIME=`date --date="@$INIT_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n')
            file_id.write('    echo "$SEP"\n')
            file_id.write('    echo "Script started in node $HOSTNAME of cluster {0} at $FORMATTED_INIT_DATETIME UTC."\n'.format(cluster_name))
            file_id.write('}\n')
            file_id.write('#-------------------------------------------------------------------------------\n')
            file_id.write('function build_gmap_database\n')
            file_id.write('{\n')
            file_id.write('    cd {0}\n'.format(current_run_dir))
            file_id.write('    echo "$SEP"\n')
            file_id.write('    /usr/bin/time \\\n')
            file_id.write('        --format="$SEP\\nElapsed real time (s): %e\\nCPU time in kernel mode (s): %S\\nCPU time in user mode (s): %U\\nPercentage of CPU: %P\\nMaximum resident set size(Kb): %M\\nAverage total memory use (Kb):%K" \\\n')
            file_id.write('        gmap_build \\\n')
            file_id.write('            --dir={0}\\\n'.format(cluster_reference_dataset_dir))
            file_id.write('            --db={0}\\\n'.format(gmap_database))
            if kmer.upper() != 'NONE':
                file_id.write('            --kmer={0} \\\n'.format(kmer))
            file_id.write('            {0}\n'.format(cluster_reference_file))
            # NOTE(review): unlike run_gmap_process, gmap_build is not followed
            # by an "RC=$?" / manage_error check in the generated script —
            # confirm whether that omission is intentional
            file_id.write('}\n')
            file_id.write('#-------------------------------------------------------------------------------\n')
            file_id.write('function run_gmap_process\n')
            file_id.write('{\n')
            file_id.write('    cd {0}\n'.format(current_run_dir))
            file_id.write('    echo "$SEP"\n')
            file_id.write('    gmap --version\n')
            file_id.write('    echo "$SEP"\n')
            file_id.write('    /usr/bin/time \\\n')
            file_id.write('        --format="$SEP\\nElapsed real time (s): %e\\nCPU time in kernel mode (s): %S\\nCPU time in user mode (s): %U\\nPercentage of CPU: %P\\nMaximum resident set size(Kb): %M\\nAverage total memory use (Kb):%K" \\\n')
            file_id.write('        gmap \\\n')
            file_id.write('            --nthreads={0} \\\n'.format(threads))
            file_id.write('            --dir={0} \\\n'.format(cluster_reference_dataset_dir))
            file_id.write('            --db={0} \\\n'.format(gmap_database))
            if kmer.upper() != 'NONE':
                file_id.write('            --kmer={0} \\\n'.format(kmer))
            if sampling.upper() != 'NONE':
                file_id.write('            --sampling={0} \\\n'.format(sampling))
            file_id.write('            --input-buffer-size={0} \\\n'.format(input_buffer_size))
            file_id.write('            --output-buffer-size={0} \\\n'.format(output_buffer_size))
            file_id.write('            --prunelevel={0} \\\n'.format(prunelevel))
            # COMPRESS/SUMMARY/ALIGN have dedicated flags; any other value is
            # passed through as the --format argument
            if output_format.upper() == 'COMPRESS':
                file_id.write('            --compress \\\n')
            elif output_format.upper() == 'SUMMARY':
                file_id.write('            --summary \\\n')
            elif output_format.upper() == 'ALIGN':
                file_id.write('            --align \\\n')
            else:
                file_id.write('            --format={0} \\\n'.format(output_format.lower()))
            file_id.write('            --ordered \\\n')
            file_id.write('            --nofails \\\n')
            # pass through any additional parameters given as a ";"-separated
            # list of "--name=value" or "--name" items
            if other_parameters.upper() != 'NONE':
                parameter_list = [x.strip() for x in other_parameters.split(';')]
                for parameter in parameter_list:
                    if parameter.find('=') > 0:
                        mo = re.search(r'^--(.+)=(.+)$', parameter)
                        parameter_name = mo.group(1).strip()
                        parameter_value = mo.group(2).strip()
                        file_id.write('            --{0}={1} \\\n'.format(parameter_name, parameter_value))
                    else:
                        mo = re.search(r'^--(.+)$', parameter)
                        parameter_name = mo.group(1).strip()
                        file_id.write('            --{0} \\\n'.format(parameter_name))
            file_id.write('            {0} \\\n'.format(transcriptome_file))
            file_id.write('            > {0}\n'.format(output_file))
            file_id.write('    RC=$?\n')
            file_id.write('    if [ $RC -ne 0 ]; then manage_error gmap $RC; fi\n')
            file_id.write('}\n')
            file_id.write('#-------------------------------------------------------------------------------\n')
            file_id.write('function end\n')
            file_id.write('{\n')
            file_id.write('    END_DATETIME=`date --utc +%s`\n')
            file_id.write('    FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n')
            file_id.write('    calculate_duration\n')
            file_id.write('    echo "$SEP"\n')
            file_id.write('    echo "Script ended OK at $FORMATTED_END_DATETIME UTC with a run duration of $DURATION s ($FORMATTED_DURATION)."\n')
            file_id.write('    echo "$SEP"\n')
            file_id.write('    RECIPIENT={0}\n'.format(xconfiguration.get_contact_data()))
            file_id.write('    SUBJECT="{0}: {1} process"\n'.format(xlib.get_project_name(), xlib.get_gmap_name()))
            file_id.write('    MESSAGE="The {0} process in node $HOSTNAME of cluster {1} ended OK at $FORMATTED_END_DATETIME with a run duration of $DURATION s ($FORMATTED_DURATION). Please review its log.<br/><br/>Regards,<br/>GI Genetica, Fisiologia e Historia Forestal<br/>Dpto. Sistemas y Recursos Naturales<br/>ETSI Montes, Forestal y del Medio Natural<br/>Universidad Politecnica de Madrid<br/>https://github.com/ggfhf/"\n'.format(xlib.get_gmap_name(), cluster_name))
            file_id.write('    mail --append "Content-type: text/html;" --subject "$SUBJECT" "$RECIPIENT" <<< "$MESSAGE"\n')
            file_id.write('    exit 0\n')
            file_id.write('}\n')
            file_id.write('#-------------------------------------------------------------------------------\n')
            file_id.write('function manage_error\n')
            file_id.write('{\n')
            file_id.write('    END_DATETIME=`date --utc +%s`\n')
            file_id.write('    FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n')
            file_id.write('    calculate_duration\n')
            file_id.write('    echo "$SEP"\n')
            file_id.write('    echo "ERROR: $1 returned error $2"\n')
            file_id.write('    echo "Script ended WRONG at $FORMATTED_END_DATETIME UTC with a run duration of $DURATION s ($FORMATTED_DURATION)."\n')
            file_id.write('    echo "$SEP"\n')
            file_id.write('    RECIPIENT={0}\n'.format(xconfiguration.get_contact_data()))
            file_id.write('    SUBJECT="{0}: {1} process"\n'.format(xlib.get_project_name(), xlib.get_gmap_name()))
            file_id.write('    MESSAGE="The {0} process in node $HOSTNAME of cluster {1} ended WRONG at $FORMATTED_END_DATETIME with a run duration of $DURATION s ($FORMATTED_DURATION). Please review its log.<br/><br/>Regards,<br/>GI Genetica, Fisiologia e Historia Forestal<br/>Dpto. Sistemas y Recursos Naturales<br/>ETSI Montes, Forestal y del Medio Natural<br/>Universidad Politecnica de Madrid<br/>https://github.com/ggfhf/"\n'.format(xlib.get_gmap_name(), cluster_name))
            file_id.write('    mail --append "Content-type: text/html;" --subject "$SUBJECT" "$RECIPIENT" <<< "$MESSAGE"\n')
            file_id.write('    exit 3\n')
            file_id.write('}\n')
            file_id.write('#-------------------------------------------------------------------------------\n')
            file_id.write('function calculate_duration\n')
            file_id.write('{\n')
            file_id.write('    DURATION=`expr $END_DATETIME - $INIT_DATETIME`\n')
            file_id.write('    HH=`expr $DURATION / 3600`\n')
            file_id.write('    MM=`expr $DURATION % 3600 / 60`\n')
            file_id.write('    SS=`expr $DURATION % 60`\n')
            file_id.write('    FORMATTED_DURATION=`printf "%03d:%02d:%02d\\n" $HH $MM $SS`\n')
            file_id.write('}\n')
            file_id.write('#-------------------------------------------------------------------------------\n')
            file_id.write('init\n')
            file_id.write('build_gmap_database\n')
            file_id.write('run_gmap_process\n')
            file_id.write('end\n')
    except Exception as e:
        # the original bare "except:" hid the cause and also swallowed
        # SystemExit/KeyboardInterrupt; report the exception detail, matching
        # the style used by the other script builders in this file
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append('*** ERROR: The file {0} can not be created'.format(gmap_process_script))
        OK = False

    # return the control variable and the error list
    return (OK, error_list)
def validate_cd_hit_est_config_file(strict):
    '''
    Validate the CD-HIT-EST config file of a run.

    Parameters:
        strict: unused at present; kept for interface compatibility with the
            other validate_*_config_file functions.

    Returns:
        (OK, error_list): control variable and list of error messages.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # initialize the variable used when a value is not found
    not_found = '***NOTFOUND***'.upper()

    # get the option dictionary
    try:
        cd_hit_est_option_dict = xlib.get_option_dict(get_cd_hit_est_config_file())
    except Exception:
        # narrowed from a bare "except:" so Ctrl-C is not reported as a syntax error
        error_list.append('*** ERROR: The syntax is WRONG.')
        OK = False
    else:

        # get the sorted sections list
        sections_list = sorted(cd_hit_est_option_dict.keys())

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append('*** ERROR: the section "identification" is not found.')
            OK = False
        else:

            # check section "identification" - key "experiment_id"
            experiment_id = cd_hit_est_option_dict.get('identification', {}).get('experiment_id', not_found)
            if experiment_id == not_found:
                error_list.append('*** ERROR: the key "experiment_id" is not found in the section "identification".')
                OK = False

            # check section "identification" - key "assembly_software"
            assembly_software = cd_hit_est_option_dict.get('identification', {}).get('assembly_software', not_found)
            if assembly_software == not_found:
                error_list.append('*** ERROR: the key "assembly_software" is not found in the section "identification".')
                OK = False
            elif assembly_software not in [xlib.get_soapdenovotrans_code(), xlib.get_transabyss_code(), xlib.get_trinity_code(), xlib.get_star_code(), xlib.get_cd_hit_est_code(), xlib.get_transcript_filter_code()]:
                # "OR" lower-cased for consistency with the rest of the message
                error_list.append('*** ERROR: the key "assembly_software" value in the section "identification" must be {0} or {1} or {2} or {3} or {4} or {5}.'.format(xlib.get_soapdenovotrans_code(), xlib.get_transabyss_code(), xlib.get_trinity_code(), xlib.get_star_code(), xlib.get_cd_hit_est_code(), xlib.get_transcript_filter_code()))
                OK = False

            # check section "identification" - key "assembly_dataset_id"
            assembly_dataset_id = cd_hit_est_option_dict.get('identification', {}).get('assembly_dataset_id', not_found)
            if assembly_dataset_id == not_found:
                error_list.append('*** ERROR: the key "assembly_dataset_id" is not found in the section "identification".')
                OK = False
            # str.startswith accepts a tuple of prefixes: one call replaces the
            # six chained "and not ....startswith(...)" tests
            elif not assembly_dataset_id.startswith((xlib.get_soapdenovotrans_code(), xlib.get_transabyss_code(), xlib.get_trinity_code(), xlib.get_star_code(), xlib.get_cd_hit_est_code(), xlib.get_transcript_filter_code())):
                # last argument fixed to get_transcript_filter_name(): the
                # original passed the code where every sibling uses the name
                error_list.append('*** ERROR: the key "assembly_dataset_id" value is not a {0} nor {1} nor {2} nor {3} nor {4} nor {5} assembly.'.format(xlib.get_soapdenovotrans_name(), xlib.get_transabyss_name(), xlib.get_trinity_name(), xlib.get_star_name(), xlib.get_cd_hit_est_name(), xlib.get_transcript_filter_name()))
                OK = False

            # check section "identification" - key "assembly_type"
            assembly_type = cd_hit_est_option_dict.get('identification', {}).get('assembly_type', not_found)
            if assembly_type == not_found:
                error_list.append('*** ERROR: the key "assembly_type" is not found in the section "identification".')
                OK = False
            elif assembly_dataset_id.startswith(xlib.get_soapdenovotrans_code()):
                if assembly_type.upper() not in ['CONTIGS', 'SCAFFOLDS']:
                    error_list.append('*** ERROR: the key "assembly_type" must be "CONTIGS" or "SCAFFOLDS" when {0} is the assembly software.'.format(xlib.get_soapdenovotrans_name()))
                    OK = False
            elif assembly_dataset_id.startswith((xlib.get_transabyss_code(), xlib.get_trinity_code(), xlib.get_star_code(), xlib.get_cd_hit_est_code(), xlib.get_transcript_filter_code())):
                if assembly_type.upper() != 'NONE':
                    error_list.append('*** ERROR: the key "assembly_type" must be "NONE" when {0} or {1} or {2} or {3} or {4} is the assembly software.'.format(xlib.get_transabyss_name(), xlib.get_trinity_name(), xlib.get_star_name(), xlib.get_cd_hit_est_name(), xlib.get_transcript_filter_name()))
                    OK = False

        # check section "CD-HIT-EST parameters"
        if 'CD-HIT-EST parameters' not in sections_list:
            error_list.append('*** ERROR: the section "CD-HIT-EST parameters" is not found.')
            OK = False
        else:

            # check section "CD-HIT-EST parameters" - key "threads"
            threads = cd_hit_est_option_dict.get('CD-HIT-EST parameters', {}).get('threads', not_found)
            if threads == not_found:
                error_list.append('*** ERROR: the key "threads" is not found in the section "CD-HIT-EST parameters".')
                OK = False
            else:
                # the same message covers both a non-integer and a negative value
                try:
                    threads_is_valid = int(threads) >= 0
                except (TypeError, ValueError):
                    threads_is_valid = False
                if not threads_is_valid:
                    error_list.append('*** ERROR: the key "threads" in the section "CD-HIT-EST parameters" must be an integer value greater or equal to 0.')
                    OK = False

            # check section "CD-HIT-EST parameters" - key "memory_limit"
            memory_limit = cd_hit_est_option_dict.get('CD-HIT-EST parameters', {}).get('memory_limit', not_found)
            if memory_limit == not_found:
                error_list.append('*** ERROR: the key "memory_limit" is not found in the section "CD-HIT-EST parameters".')
                OK = False
            else:
                try:
                    memory_limit_is_valid = int(memory_limit) >= 0
                except (TypeError, ValueError):
                    memory_limit_is_valid = False
                if not memory_limit_is_valid:
                    error_list.append('*** ERROR: the key "memory_limit" in the section "CD-HIT-EST parameters" must be an integer value greater or equal to 0.')
                    OK = False

            # check section "CD-HIT-EST parameters" - key "seq_identity_threshold"
            seq_identity_threshold = cd_hit_est_option_dict.get('CD-HIT-EST parameters', {}).get('seq_identity_threshold', not_found)
            if seq_identity_threshold == not_found:
                error_list.append('*** ERROR: the key "seq_identity_threshold" is not found in the section "CD-HIT-EST parameters".')
                OK = False
            else:
                try:
                    seq_identity_threshold_is_valid = 0.0 <= float(seq_identity_threshold) <= 1.0
                except (TypeError, ValueError):
                    seq_identity_threshold_is_valid = False
                if not seq_identity_threshold_is_valid:
                    error_list.append('*** ERROR: the key "seq_identity_threshold" in the section "CD-HIT-EST parameters" must be a float value between 0.0 and 1.0.')
                    OK = False

            # check section "CD-HIT-EST parameters" - key "word_length"
            word_length = cd_hit_est_option_dict.get('CD-HIT-EST parameters', {}).get('word_length', not_found)
            if word_length == not_found:
                error_list.append('*** ERROR: the key "word_length" is not found in the section "CD-HIT-EST parameters".')
                OK = False
            else:
                try:
                    word_length_is_valid = int(word_length) >= 1
                except (TypeError, ValueError):
                    word_length_is_valid = False
                if not word_length_is_valid:
                    error_list.append('*** ERROR: the key "word_length" in the section "CD-HIT-EST parameters" must be an integer value greater or equal to 1.')
                    OK = False

            # check section "CD-HIT-EST parameters" - key "mask"
            mask = cd_hit_est_option_dict.get('CD-HIT-EST parameters', {}).get('mask', not_found).upper()
            if mask == not_found:
                error_list.append('*** ERROR: the key "mask" is not found in the section "CD-HIT-EST parameters".')
                OK = False

            # check section "CD-HIT-EST parameters" - key "match"
            match = cd_hit_est_option_dict.get('CD-HIT-EST parameters', {}).get('match', not_found)
            if match == not_found:
                error_list.append('*** ERROR: the key "match" is not found in the section "CD-HIT-EST parameters".')
                OK = False
            else:
                try:
                    int(match)
                except (TypeError, ValueError):
                    error_list.append('*** ERROR: the key "match" in the section "CD-HIT-EST parameters" must be an integer value.')
                    OK = False

            # check section "CD-HIT-EST parameters" - key "mismatch"
            mismatch = cd_hit_est_option_dict.get('CD-HIT-EST parameters', {}).get('mismatch', not_found)
            if mismatch == not_found:
                error_list.append('*** ERROR: the key "mismatch" is not found in the section "CD-HIT-EST parameters".')
                OK = False
            else:
                try:
                    int(mismatch)
                except (TypeError, ValueError):
                    # message fixed: the original said "match" here (copy-paste error)
                    error_list.append('*** ERROR: the key "mismatch" in the section "CD-HIT-EST parameters" must be an integer value.')
                    OK = False

            # check section "CD-HIT-EST parameters" - key "other_parameters"
            not_allowed_parameters_list = ['T', 'M', 'c', 'n', 'mask', 'match', 'mismatch']
            other_parameters = cd_hit_est_option_dict.get('CD-HIT-EST parameters', {}).get('other_parameters', not_found)
            if other_parameters == not_found:
                error_list.append('*** ERROR: the key "other_parameters" is not found in the section "CD-HIT-EST parameters".')
                OK = False
            else:
                if other_parameters.upper() != 'NONE':
                    parameter_list = [x.strip() for x in other_parameters.split(';')]
                    for parameter in parameter_list:
                        try:
                            if parameter.find('=') > 0:
                                mo = re.search(r'^--(.+)=(.+)$', parameter)
                                parameter_name = mo.group(1).strip()
                                parameter_value = mo.group(2).strip()
                            else:
                                mo = re.search(r'^--(.+)$', parameter)
                                parameter_name = mo.group(1).strip()
                        except Exception:
                            # a non-matching item makes mo None and group() raise
                            error_list.append('*** ERROR: the value of the key "other_parameters" in the section "CD-HIT-EST parameters" must be NONE or a valid parameter list.')
                            OK = False
                            break
                        if parameter_name in not_allowed_parameters_list:
                            error_list.append('*** ERROR: the parameter {0} is not allowed in the key "other_parameters" of the section "CD-HIT-EST parameters" because it is controled by {1}.'.format(parameter_name, xlib.get_project_name()))
                            OK = False

    # warn that the results config file is not valid if there are any errors
    if not OK:
        error_list.append('\nThe {0} config file is not valid. Please, correct this file or recreate it.'.format(xlib.get_cd_hit_est_name()))

    # return the control variable and the error list
    return (OK, error_list)
def check_database_transfer_config_file(strict):
    '''
    Check the database transfer config file.

    Parameters:
        strict: unused at present; kept for interface compatibility with the
            other check_*_config_file functions.

    Returns:
        (OK, error_list): control variable and list of error messages.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # initialize the variable used when a value is not found
    not_found = '***NOTFOUND***'.upper()

    # initialize local_dir so the "file-n" checks below cannot raise a
    # NameError when the section "identification" is missing (bug fix: the
    # original left it unbound in that case)
    local_dir = not_found

    # get the option dictionary
    try:
        database_transfer_options_dict = xlib.get_option_dict(get_database_transfer_config_file())
    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append('*** ERROR: The option dictionary could not be built from the config file')
        OK = False
    else:

        # get the sorted sections list
        sections_list = sorted(database_transfer_options_dict.keys())

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append('*** ERROR: the section "identification" is not found.')
            OK = False
        else:

            # check section "identification" - key "database_dataset_id"
            database_dataset_id = database_transfer_options_dict.get('identification', {}).get('database_dataset_id', not_found)
            if database_dataset_id == not_found:
                error_list.append('*** ERROR: the key "database_dataset_id" is not found in the section "identification".')
                OK = False

            # check section "identification" - key "local_dir"
            local_dir = database_transfer_options_dict.get('identification', {}).get('local_dir', not_found)
            if local_dir == not_found:
                error_list.append('*** ERROR: the key "local_dir" is not found in the section "identification".')
                OK = False
            elif not os.path.isdir(local_dir):
                error_list.append('*** ERROR: {0} is not a directory or does not exist.'.format(local_dir))
                OK = False

        # check section "file-1"
        if 'file-1' not in sections_list:
            error_list.append('*** ERROR: the section "file-1" is not found.')
            OK = False

        # check all sections "file-n"
        for section in sections_list:
            if section not in ['identification']:

                # check that the section identification is like file-n
                if not re.match('^file-[0-9]+$', section):
                    error_list.append(f'*** ERROR: the section "{section}" has a wrong identification.')
                    OK = False
                else:

                    # check section "file-n" - key "file_name"
                    file_name = database_transfer_options_dict.get(section, {}).get('file_name', not_found)
                    if file_name == not_found:
                        error_list.append('*** ERROR: the key "file_name" is not found in the section "{0}".'.format(section))
                        OK = False
                    elif not os.path.isfile(os.path.join(local_dir, file_name)):
                        error_list.append('*** ERROR: the file {0} in the key "file_name" does not exist or is not accessible in the local directory {1}.'.format(file_name, local_dir))
                        OK = False

    # warn that the database config file is not valid if there are any errors
    if not OK:
        error_list.append('\nThe database transfer config file is not valid. Please, correct this file or recreate it.')

    # return the control variable and the error list
    return (OK, error_list)
def build_cd_hit_est_process_script(cluster_name, current_run_dir):
    '''
    Build the current CD-HIT-EST process script.

    Writes a bash script (run later on the cluster node) that activates the
    CD-HIT Bioconda environment, runs cd-hit-est on the transcriptome produced
    by the configured assembly software, and mails a status report.
    Returns a tuple (OK, error_list).
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # get the option dictionary
    cd_hit_est_option_dict = xlib.get_option_dict(get_cd_hit_est_config_file())

    # get the options
    experiment_id = cd_hit_est_option_dict['identification']['experiment_id']
    assembly_software = cd_hit_est_option_dict['identification']['assembly_software']
    assembly_dataset_id = cd_hit_est_option_dict['identification']['assembly_dataset_id']
    assembly_type = cd_hit_est_option_dict['identification']['assembly_type']
    threads = cd_hit_est_option_dict['CD-HIT-EST parameters']['threads']
    memory_limit = cd_hit_est_option_dict['CD-HIT-EST parameters']['memory_limit']
    seq_identity_threshold = cd_hit_est_option_dict['CD-HIT-EST parameters']['seq_identity_threshold']
    word_length = cd_hit_est_option_dict['CD-HIT-EST parameters']['word_length']
    mask = cd_hit_est_option_dict['CD-HIT-EST parameters']['mask']
    match = cd_hit_est_option_dict['CD-HIT-EST parameters']['match']
    mismatch = cd_hit_est_option_dict['CD-HIT-EST parameters']['mismatch']
    other_parameters = cd_hit_est_option_dict['CD-HIT-EST parameters']['other_parameters']

    # set the transcriptome file path
    # NOTE(review): an unrecognized assembly_software leaves transcriptome_file
    # unbound; this presumably cannot happen after config-file checking — confirm
    if assembly_software == xlib.get_soapdenovotrans_code():
        if assembly_type == 'CONTIGS':
            transcriptome_file = '{0}/{1}-{2}.contig'.format(xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id), experiment_id, assembly_dataset_id)
        elif assembly_type == 'SCAFFOLDS':
            transcriptome_file = '{0}/{1}-{2}.scafSeq'.format(xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id), experiment_id, assembly_dataset_id)
    elif assembly_software == xlib.get_transabyss_code():
        transcriptome_file = '{0}/transabyss-final.fa'.format(xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id))
    elif assembly_software == xlib.get_trinity_code():
        transcriptome_file = '{0}/Trinity.fasta'.format(xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id))
    elif assembly_software == xlib.get_star_code():
        transcriptome_file = '{0}/Trinity-GG.fasta'.format(xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id))
    elif assembly_software == xlib.get_cd_hit_est_code():
        transcriptome_file = '{0}/clustered-transcriptome.fasta'.format(xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id))
    elif assembly_software == xlib.get_transcript_filter_code():
        transcriptome_file = '{0}/filtered-transcriptome.fasta'.format(xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id))

    # set the output file path
    if OK:
        output_file = '{0}/clustered-transcriptome.fasta'.format(current_run_dir)

    # write the CD-HIT-EST process script
    try:
        if not os.path.exists(os.path.dirname(get_cd_hit_est_process_script())):
            os.makedirs(os.path.dirname(get_cd_hit_est_process_script()))
        with open(get_cd_hit_est_process_script(), mode='w', encoding='utf8', newline='\n') as file_id:
            file_id.write('{0}\n'.format('#!/bin/bash'))
            file_id.write('{0}\n'.format('#-------------------------------------------------------------------------------'))
            file_id.write('{0}\n'.format('CDHIT_PATH={0}/{1}/envs/{2}/bin'.format(xlib.get_cluster_app_dir(), xlib.get_miniconda3_name(), xlib.get_cd_hit_bioconda_code())))
            file_id.write('{0}\n'.format('PATH=$CDHIT_PATH:$PATH'))
            file_id.write('{0}\n'.format('SEP="#########################################"'))
            file_id.write('{0}\n'.format('cd {0}/{1}/bin'.format(xlib.get_cluster_app_dir(), xlib.get_miniconda3_name())))
            file_id.write('{0}\n'.format('source activate {0}'.format(xlib.get_cd_hit_bioconda_code())))
            file_id.write('{0}\n'.format('#-------------------------------------------------------------------------------'))
            file_id.write('{0}\n'.format('function init'))
            file_id.write('{0}\n'.format('{'))
            file_id.write('{0}\n'.format(' INIT_DATETIME=`date --utc +%s`'))
            file_id.write('{0}\n'.format(' FORMATTED_INIT_DATETIME=`date --date="@$INIT_DATETIME" "+%Y-%m-%d %H:%M:%S"`'))
            file_id.write('{0}\n'.format(' echo "$SEP"'))
            file_id.write('{0}\n'.format(' echo "Script started in node $HOSTNAME of cluster {0} at $FORMATTED_INIT_DATETIME UTC."'.format(cluster_name)))
            file_id.write('{0}\n'.format('}'))
            file_id.write('{0}\n'.format('#-------------------------------------------------------------------------------'))
            file_id.write('{0}\n'.format('function run_cd_hit_est_process'))
            file_id.write('{0}\n'.format('{'))
            file_id.write('{0}\n'.format(' cd {0}'.format(current_run_dir)))
            file_id.write('{0}\n'.format(' echo "$SEP"'))
            file_id.write('{0}\n'.format(' echo "Running {0} process ..."'.format(xlib.get_cd_hit_est_name())))
            file_id.write('{0}\n'.format(' /usr/bin/time \\'))
            file_id.write('{0}\n'.format(' --format="$SEP\\nElapsed real time (s): %e\\nCPU time in kernel mode (s): %S\\nCPU time in user mode (s): %U\\nPercentage of CPU: %P\\nMaximum resident set size(Kb): %M\\nAverage total memory use (Kb):%K" \\'))
            file_id.write('{0}\n'.format(' cd-hit-est \\'))
            file_id.write('{0}\n'.format(' -T {0} \\'.format(threads)))
            file_id.write('{0}\n'.format(' -M {0} \\'.format(memory_limit)))
            file_id.write('{0}\n'.format(' -i {0} \\'.format(transcriptome_file)))
            file_id.write('{0}\n'.format(' -c {0} \\'.format(seq_identity_threshold)))
            file_id.write('{0}\n'.format(' -n {0} \\'.format(word_length)))
            file_id.write('{0}\n'.format(' -mask {0} \\'.format(mask)))
            file_id.write('{0}\n'.format(' -match {0} \\'.format(match)))
            file_id.write('{0}\n'.format(' -mismatch {0} \\'.format(mismatch)))
            if other_parameters.upper() == 'NONE':
                file_id.write('{0}\n'.format(' -o {0}'.format(output_file)))
            else:
                # additional parameters come as "--name=value" items separated by ";";
                # every written option except the last one needs a trailing backslash
                file_id.write('{0}\n'.format(' -o {0} \\'.format(output_file)))
                parameter_list = [x.strip() for x in other_parameters.split(';')]
                for i, parameter in enumerate(parameter_list):
                    is_last = i == len(parameter_list) - 1
                    if parameter.find('=') > 0:
                        pattern = r'^--(.+)=(.+)$'
                        mo = re.search(pattern, parameter)
                        parameter_name = mo.group(1).strip()
                        parameter_value = mo.group(2).strip()
                        if not is_last:
                            file_id.write('{0}\n'.format(' -{0} {1} \\'.format(parameter_name, parameter_value)))
                        else:
                            file_id.write('{0}\n'.format(' -{0} {1}'.format(parameter_name, parameter_value)))
                    else:
                        pattern = r'^--(.+)$'
                        mo = re.search(pattern, parameter)
                        parameter_name = mo.group(1).strip()
                        # BUG FIX: the original tested "i < len(parameter_list)", which is
                        # always true, so a valueless LAST parameter always got a trailing
                        # backslash and corrupted the generated script; the dead "i += 1"
                        # inside the for loop is also removed
                        if not is_last:
                            file_id.write('{0}\n'.format(' -{0} \\'.format(parameter_name)))
                        else:
                            file_id.write('{0}\n'.format(' -{0}'.format(parameter_name)))
            file_id.write('{0}\n'.format(' RC=$?'))
            file_id.write('{0}\n'.format(' if [ $RC -ne 0 ]; then manage_error cd-hit-est $RC; fi'))
            file_id.write('{0}\n'.format('}'))
            file_id.write('{0}\n'.format('#-------------------------------------------------------------------------------'))
            file_id.write('{0}\n'.format('function end'))
            file_id.write('{0}\n'.format('{'))
            file_id.write('{0}\n'.format(' END_DATETIME=`date --utc +%s`'))
            file_id.write('{0}\n'.format(' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`'))
            file_id.write('{0}\n'.format(' calculate_duration'))
            file_id.write('{0}\n'.format(' echo "$SEP"'))
            file_id.write('{0}\n'.format(' echo "Script ended OK at $FORMATTED_END_DATETIME UTC with a run duration of $DURATION s ($FORMATTED_DURATION)."'))
            file_id.write('{0}\n'.format(' echo "$SEP"'))
            file_id.write('{0}\n'.format(' RECIPIENT={0}'.format(xconfiguration.get_contact_data())))
            file_id.write('{0}\n'.format(' SUBJECT="{0}: {1} process"'.format(xlib.get_project_name(), xlib.get_cd_hit_est_name())))
            # BUG FIX: the message was formatted with xlib.get_rsem_eval_name() (a
            # copy/paste from the RSEM-EVAL builder) and used "{0}" twice, so the
            # cluster name never appeared; use the CD-HIT-EST name and "{1}", as
            # the sibling FastQC builder does
            file_id.write('{0}\n'.format(' MESSAGE="The {0} process in node $HOSTNAME of cluster {1} ended OK at $FORMATTED_END_DATETIME with a run duration of $DURATION s ($FORMATTED_DURATION). Please review its log.<br/><br/>Regards,<br/>GI Genetica, Fisiologia e Historia Forestal<br/>Dpto. Sistemas y Recursos Naturales<br/>ETSI Montes, Forestal y del Medio Natural<br/>Universidad Politecnica de Madrid<br/>https://github.com/ggfhf/"'.format(xlib.get_cd_hit_est_name(), cluster_name)))
            file_id.write('{0}\n'.format(' mail --append "Content-type: text/html;" --subject "$SUBJECT" "$RECIPIENT" <<< "$MESSAGE"'))
            file_id.write('{0}\n'.format(' exit 0'))
            file_id.write('{0}\n'.format('}'))
            file_id.write('{0}\n'.format('#-------------------------------------------------------------------------------'))
            file_id.write('{0}\n'.format('function manage_error'))
            file_id.write('{0}\n'.format('{'))
            file_id.write('{0}\n'.format(' END_DATETIME=`date --utc +%s`'))
            file_id.write('{0}\n'.format(' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`'))
            file_id.write('{0}\n'.format(' calculate_duration'))
            file_id.write('{0}\n'.format(' echo "$SEP"'))
            file_id.write('{0}\n'.format(' echo "ERROR: $1 returned error $2"'))
            file_id.write('{0}\n'.format(' echo "Script ended WRONG at $FORMATTED_END_DATETIME UTC with a run duration of $DURATION s ($FORMATTED_DURATION)."'))
            file_id.write('{0}\n'.format(' echo "$SEP"'))
            file_id.write('{0}\n'.format(' RECIPIENT={0}'.format(xconfiguration.get_contact_data())))
            file_id.write('{0}\n'.format(' SUBJECT="{0}: {1} process"'.format(xlib.get_project_name(), xlib.get_cd_hit_est_name())))
            # same fix as in function end (process name and cluster-name placeholder)
            file_id.write('{0}\n'.format(' MESSAGE="The {0} process in node $HOSTNAME of cluster {1} ended WRONG at $FORMATTED_END_DATETIME with a run duration of $DURATION s ($FORMATTED_DURATION). Please review its log.<br/><br/>Regards,<br/>GI Genetica, Fisiologia e Historia Forestal<br/>Dpto. Sistemas y Recursos Naturales<br/>ETSI Montes, Forestal y del Medio Natural<br/>Universidad Politecnica de Madrid<br/>https://github.com/ggfhf/"'.format(xlib.get_cd_hit_est_name(), cluster_name)))
            file_id.write('{0}\n'.format(' mail --append "Content-type: text/html;" --subject "$SUBJECT" "$RECIPIENT" <<< "$MESSAGE"'))
            file_id.write('{0}\n'.format(' exit 3'))
            file_id.write('{0}\n'.format('}'))
            file_id.write('{0}\n'.format('#-------------------------------------------------------------------------------'))
            file_id.write('{0}\n'.format('function calculate_duration'))
            file_id.write('{0}\n'.format('{'))
            file_id.write('{0}\n'.format(' DURATION=`expr $END_DATETIME - $INIT_DATETIME`'))
            file_id.write('{0}\n'.format(' HH=`expr $DURATION / 3600`'))
            file_id.write('{0}\n'.format(' MM=`expr $DURATION % 3600 / 60`'))
            file_id.write('{0}\n'.format(' SS=`expr $DURATION % 60`'))
            file_id.write('{0}\n'.format(' FORMATTED_DURATION=`printf "%03d:%02d:%02d\\n" $HH $MM $SS`'))
            file_id.write('{0}\n'.format('}'))
            file_id.write('{0}\n'.format('#-------------------------------------------------------------------------------'))
            file_id.write('{0}\n'.format('init'))
            file_id.write('{0}\n'.format('run_cd_hit_est_process'))
            file_id.write('{0}\n'.format('end'))
    except Exception:
        error_list.append('*** ERROR: The file {0} can not be created'.format(get_cd_hit_est_process_script()))
        OK = False

    # return the control variable and the error list
    return (OK, error_list)
def check_gzip_config_file(dataset_type, strict):
    '''
    Check the gzip config file of a run.

    dataset_type is one of 'reference', 'database', 'read' or 'result'.
    Returns a tuple (OK, error_list).
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # intitialize variable used when value is not found
    not_found = '***NOTFOUND***'.upper()

    # get the option dictionary
    try:
        gzip_option_dict = xlib.get_option_dict(get_gzip_config_file(dataset_type))
    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append('*** ERROR: The option dictionary could not be built from the config file')
        OK = False
    else:

        # get the sections list
        sections_list = []
        for section in gzip_option_dict.keys():
            sections_list.append(section)
        sections_list.sort()

        # pre-initialize "dataset_type_2" so the "file-n" checks below never touch
        # an unbound name when the section "identification" is missing (the
        # original code raised a NameError in that case); the sentinel value keeps
        # those checks skipped
        dataset_type_2 = not_found

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append('*** ERROR: the section "identification" is not found.')
            OK = False
        else:

            # check section "identification" - key "experiment_id"
            experiment_id = gzip_option_dict.get('identification', {}).get('experiment_id', not_found)
            if experiment_id == not_found:
                error_list.append('*** ERROR: the key "experiment_id" is not found in the section "identification".')
                OK = False
            elif dataset_type == 'reference' and experiment_id.upper() != 'NONE':
                error_list.append('*** ERROR: the key "experiment_id" has to be always NONE')
                OK = False

            # check section "identification" - key "dataset_type"
            dataset_type_2 = gzip_option_dict.get('identification', {}).get('dataset_type', not_found)
            if dataset_type_2 == not_found:
                error_list.append('*** ERROR: the key "dataset_type" is not found in the section "identification".')
                OK = False
            else:
                if dataset_type in ['reference', 'read']:
                    if dataset_type_2.lower() != dataset_type:
                        error_list.append('*** ERROR: the key "dataset_type" has to be {0}.'.format(dataset_type))
                        OK = False
                elif dataset_type == 'result':
                    if dataset_type_2.lower() not in ['result', 'whole-result']:
                        error_list.append('*** ERROR: the key "dataset_type" has to be result or whole-result.')
                        OK = False

            # check section "identification" - key "dataset_id"
            dataset_id = gzip_option_dict.get('identification', {}).get('dataset_id', not_found)
            if dataset_id == not_found:
                error_list.append('*** ERROR: the key "dataset_id" is not found in the section "identification".')
                OK = False

        # check section "gzip parameters"
        if 'gzip parameters' not in sections_list:
            error_list.append('*** ERROR: the section "gzip parameters" is not found.')
            OK = False
        else:

            # check section "gzip parameters" - key "action"
            action = gzip_option_dict.get('gzip parameters', {}).get('action', not_found)
            if action == not_found:
                error_list.append('*** ERROR: the key "action" is not found in the section "gzip parameters".')
                OK = False
            else:
                if action.lower() not in ['compress', 'decompress']:
                    error_list.append('*** ERROR: the key "action" has to be compress or decompress.')
                    OK = False

        # check section "file-1" (whole-result datasets have no per-file sections)
        if dataset_type_2.lower() in ['reference', 'database', 'read', 'result']:
            if 'file-1' not in sections_list:
                error_list.append('*** ERROR: the section "file-1" is not found.')
                OK = False

        # check all sections "file-n"
        if dataset_type_2.lower() in ['reference', 'database', 'read', 'result']:
            for section in sections_list:
                if section not in ['identification', 'gzip parameters']:

                    # check than the section identification is like file-n
                    if not re.match('^file-[0-9]+$', section):
                        error_list.append(f'*** ERROR: the section "{section}" has a wrong identification.')
                        OK = False
                    else:

                        # check section "file-n" - key "dataset_subdirectory"
                        dataset_subdirectory = gzip_option_dict.get(section, {}).get('dataset_subdirectory', not_found)
                        if dataset_subdirectory == not_found:
                            error_list.append('*** ERROR: the key "dataset_subdirectory" is not found in the section "{0}".'.format(section))
                            OK = False
                        elif not xlib.is_valid_path(dataset_subdirectory, 'linux'):
                            error_list.append('*** ERROR: the file {0} in the key "dataset_subdirectory" of the section "{1}" has a non valid file name.'.format(dataset_subdirectory, section))
                            OK = False

                        # check section "file-n" - key "file_name"
                        file_name = gzip_option_dict.get(section, {}).get('file_name', not_found)
                        if file_name == not_found:
                            error_list.append('*** ERROR: the key "file_name" is not found in the section "{0}".'.format(section))
                            OK = False
                        elif not xlib.is_valid_path(file_name, 'linux'):
                            error_list.append('*** ERROR: the file {0} in the key "file_name" of the section "{1}" has a non valid file name.'.format(file_name, section))
                            OK = False

    # warn that the results config file is not valid if there are any errors
    if not OK:
        error_list.append('\nThe {0} config file is not valid. Please, correct this file or recreate it.'.format(xlib.get_gzip_name()))

    # return the control variable and the error list
    return (OK, error_list)
def validate_fastqc_config_file(strict):
    '''
    Validate the FastQC config file of a run.

    Returns a tuple (OK, error_list).
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # intitialize variable used when value is not found
    not_found = '***NOTFOUND***'.upper()

    # get the option dictionary
    try:
        fastqc_option_dict = xlib.get_option_dict(get_fastqc_config_file())
    except Exception as e:
        # report the parse failure detail, consistent with the other config
        # checkers in this module (the original bare "except" discarded it)
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append('*** ERROR: The syntax is WRONG.')
        OK = False
    else:

        # get the sections list
        sections_list = []
        for section in fastqc_option_dict.keys():
            sections_list.append(section)
        sections_list.sort()

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append('*** ERROR: the section "identification" is not found.')
            OK = False
        else:

            # check section "identification" - key "experiment_id"
            experiment_id = fastqc_option_dict.get('identification', {}).get('experiment_id', not_found)
            if experiment_id == not_found:
                error_list.append('*** ERROR: the key "experiment_id" is not found in the section "identification".')
                OK = False

            # check section "identification" - key "read_dataset_id"
            read_dataset_id = fastqc_option_dict.get('identification', {}).get('read_dataset_id', not_found)
            if read_dataset_id == not_found:
                error_list.append('*** ERROR: the key "read_dataset_id" is not found in the section "identification".')
                OK = False

        # check section "FastQC parameters"
        if 'FastQC parameters' not in sections_list:
            error_list.append('*** ERROR: the section "FastQC parameters" is not found.')
            OK = False
        else:

            # check section "FastQC parameters" - key "threads"
            threads = fastqc_option_dict.get('FastQC parameters', {}).get('threads', not_found)
            if threads == not_found:
                error_list.append('*** ERROR: the key "threads" is not found in the section "FastQC parameters".')
                OK = False
            else:
                try:
                    if int(threads) < 1:
                        error_list.append('*** ERROR: the key "threads" in the section "FastQC parameters" must be an integer value greater or equal to 1.')
                        OK = False
                except (TypeError, ValueError):
                    error_list.append('*** ERROR: the key "threads" in the section "FastQC parameters" must be an integer value greater or equal to 1.')
                    OK = False

        # check section "file-1"
        if 'file-1' not in sections_list:
            error_list.append('*** ERROR: the section "file-1" is not found.')
            OK = False

        # check all sections "file-n"
        for section in sections_list:
            if section not in ['identification', 'FastQC parameters']:

                # verify than the section identification is like file-n
                if not re.match('^file-[0-9]+$', section):
                    error_list.append('*** ERROR: the section "{0}" has a wrong identification.'.format(section))
                    OK = False
                else:

                    # check section "file-n" - key "file_name"
                    file_name = fastqc_option_dict.get(section, {}).get('file_name', not_found)
                    if file_name == not_found:
                        error_list.append('*** ERROR: the key "file_name" is not found in the section "{0}".'.format(section))
                        OK = False
                    elif not xlib.is_valid_path(file_name, 'linux'):
                        error_list.append('*** ERROR: the file {0} in the key "file_name" of the section "{1}" has a non valid file name.'.format(file_name, section))
                        OK = False

    # warn that the results config file is not valid if there are any errors
    if not OK:
        error_list.append('\nThe {0} config file is not valid. Please, correct this file or recreate it.'.format(xlib.get_fastqc_name()))

    # return the control variable and the error list
    return (OK, error_list)
def upload_read_dataset(cluster_name, log, function=None):
    '''
    Upload the read dataset to the cluster.

    Validates the read transfer config file, opens SSH/SFTP connections to the
    cluster master node, creates the experiment read/result directories and
    uploads every "file-n" entry. Returns the control variable OK.
    '''

    # initialize the control variable
    OK = True

    # get the read transfer config file
    read_transfer_config_file = get_read_transfer_config_file()

    # warn that the log window must not be closed
    if not isinstance(log, xlib.DevStdOut):
        log.write('This process might take several minutes. Do not close this window, please wait!\n')

    # get and validate the read transfer config file
    log.write('{0}\n'.format(xlib.get_separator()))
    log.write('The read transfer config file is been validating ...\n')
    # BUG FIX: the original tested the return value of the validator directly;
    # validators in this module return a (OK, error_list) tuple, which is always
    # truthy, so an invalid config file was never detected here
    # NOTE(review): assumes validate_read_transfer_config_file returns
    # (OK, error_list) like its siblings — confirm against its definition
    (OK, error_list) = validate_read_transfer_config_file(strict=True)
    if OK:
        log.write('The config file is OK.\n')
    else:
        for error in error_list:
            log.write('{0}\n'.format(error))
        log.write('*** ERROR: The read transfer config file is not valid.\n')
        log.write('Please correct this file or recreate the config files.\n')

    # create the SSH client connection
    if OK:
        (OK, error_list, ssh_client) = xssh.create_ssh_client_connection(cluster_name, 'master')
        for error in error_list:
            log.write('{0}\n'.format(error))

    # create the SSH transport connection
    if OK:
        (OK, error_list, ssh_transport) = xssh.create_ssh_transport_connection(cluster_name, 'master')
        for error in error_list:
            log.write('{0}\n'.format(error))

    # create the SFTP client
    if OK:
        sftp_client = xssh.create_sftp_client(ssh_transport)

    # get the options dictionary
    if OK:
        read_transfer_options_dict = xlib.get_option_dict(read_transfer_config_file)

    # get the experiment identification and create the experiment reads directory
    if OK:

        # get the experiment identification
        experiment_id = read_transfer_options_dict['identification']['experiment_id']

        # Get the directory of read and results datasets of the experiment
        cluster_experiment_reads_dir = xlib.get_cluster_experiment_read_dataset_dir(experiment_id, xlib.get_uploaded_read_dataset_name())
        cluster_experiment_result_dir = xlib.get_cluster_experiment_result_dir(experiment_id)

        # create the experiment reads directory
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('The reads directory {0} in the cluster is being created ...\n'.format(cluster_experiment_reads_dir))
        command = 'mkdir --parents {0}'.format(cluster_experiment_reads_dir)
        (OK, stdout, stderr) = xssh.execute_cluster_command(ssh_client, command)
        if OK:
            log.write('The directory is created.\n')
        else:
            log.write('*** ERROR: Wrong command ---> {0}\n'.format(command))

        # create the experiment run result directory (skipped when the previous
        # command failed; the original code ran it unconditionally)
        if OK:
            log.write('{0}\n'.format(xlib.get_separator()))
            log.write('The run result directory {0} in the cluster is being created ...\n'.format(cluster_experiment_result_dir))
            command = 'mkdir --parents {0}'.format(cluster_experiment_result_dir)
            (OK, stdout, stderr) = xssh.execute_cluster_command(ssh_client, command)
            if OK:
                log.write('The directory is created.\n')
            else:
                log.write('*** ERROR: Wrong command ---> {0}\n'.format(command))

    # upload the read dataset
    if OK:

        # get the sections list
        sections_list = []
        for section in read_transfer_options_dict.keys():
            sections_list.append(section)
        sections_list.sort()

        # for each section "file-n"
        for section in sections_list:

            # verify than the section identification is like file-n
            if re.match('^file-[0-9]+$', section):

                # get local path and cluster directory
                local_path = read_transfer_options_dict[section]['local_path']

                # upload the reference file in the cluster
                log.write('{0}\n'.format(xlib.get_separator()))
                log.write('The file {0} is being uploaded to {1} ...\n'.format(local_path, cluster_experiment_reads_dir))
                cluster_path = '{0}/{1}'.format(cluster_experiment_reads_dir, os.path.basename(local_path))
                (OK, error_list) = xssh.put_file(sftp_client, local_path, cluster_path)
                if OK:
                    log.write('The file has been uploaded.\n')
                else:
                    for error in error_list:
                        log.write('{0}\n'.format(error))
                    break

    # close the SSH transport connection
    if OK:
        xssh.close_ssh_transport_connection(ssh_transport)

    # close the SSH client connection
    if OK:
        xssh.close_ssh_client_connection(ssh_client)

    # warn that the log window can be closed
    if not isinstance(log, xlib.DevStdOut):
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('You can close this window now.\n')

    # execute final function
    if function is not None:
        function()

    # return the control variable
    return OK
def build_fastqc_process_script(cluster_name, current_run_dir):
    '''
    Build the current FastQC process script.

    Writes a bash script (run later on the cluster node) that activates the
    FastQC Bioconda environment and runs fastqc on every file listed in the
    "file-n" sections of the FastQC config file, mailing a status report at the
    end. Returns a tuple (OK, error_list).
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # get the FastQC option dictionary
    fastqc_option_dict = xlib.get_option_dict(get_fastqc_config_file())

    # get the options
    experiment_id = fastqc_option_dict['identification']['experiment_id']
    read_dataset_id = fastqc_option_dict['identification']['read_dataset_id']
    threads = fastqc_option_dict['FastQC parameters']['threads']

    # get the sections list
    sections_list = []
    for section in fastqc_option_dict.keys():
        sections_list.append(section)
    sections_list.sort()

    # build the file name list
    file_name_list = []
    for section in sections_list:
        # if the section identification is like file-n
        if re.match('^file-[0-9]+$', section):
            file_name = fastqc_option_dict[section]['file_name']
            file_name_list.append(file_name)

    # write the FastQC process script
    try:
        # make sure the local directory of the script exists
        if not os.path.exists(os.path.dirname(get_fastqc_process_script())):
            os.makedirs(os.path.dirname(get_fastqc_process_script()))
        # newline='\n' forces Unix line endings: the script is executed on a Linux node
        with open(get_fastqc_process_script(), mode='w', encoding='utf8', newline='\n') as file_id:
            file_id.write('{0}\n'.format('#!/bin/bash'))
            file_id.write('{0}\n'.format('#-------------------------------------------------------------------------------'))
            # put the FastQC Bioconda environment binaries at the front of PATH
            file_id.write('{0}\n'.format('FASTQC_PATH={0}/{1}/envs/{2}/bin'.format(xlib.get_cluster_app_dir(), xlib.get_miniconda3_name(), xlib.get_fastqc_bioconda_code())))
            file_id.write('{0}\n'.format('PATH=$FASTQC_PATH:$PATH'))
            file_id.write('{0}\n'.format('SEP="#########################################"'))
            file_id.write('{0}\n'.format('cd {0}/{1}/bin'.format(xlib.get_cluster_app_dir(), xlib.get_miniconda3_name())))
            file_id.write('{0}\n'.format('source activate {0}'.format(xlib.get_fastqc_bioconda_code())))
            file_id.write('{0}\n'.format('#-------------------------------------------------------------------------------'))
            # bash function: record and print the start time
            file_id.write('{0}\n'.format('function init'))
            file_id.write('{0}\n'.format('{'))
            file_id.write('{0}\n'.format(' INIT_DATETIME=`date --utc +%s`'))
            file_id.write('{0}\n'.format(' FORMATTED_INIT_DATETIME=`date --date="@$INIT_DATETIME" "+%Y-%m-%d %H:%M:%S"`'))
            file_id.write('{0}\n'.format(' echo "$SEP"'))
            file_id.write('{0}\n'.format(' echo "Script started in node $HOSTNAME of cluster {0} at $FORMATTED_INIT_DATETIME UTC."'.format(cluster_name)))
            file_id.write('{0}\n'.format('}'))
            file_id.write('{0}\n'.format('#-------------------------------------------------------------------------------'))
            # bash function: run fastqc once per configured read file
            file_id.write('{0}\n'.format('function run_fastqc_process'))
            file_id.write('{0}\n'.format('{'))
            file_id.write('{0}\n'.format(' cd {0}'.format(current_run_dir)))
            file_id.write('{0}\n'.format(' echo "$SEP"'))
            file_id.write('{0}\n'.format(' fastqc --version'))
            # one timed fastqc invocation per file, each followed by its own
            # return-code check
            for file_name in file_name_list:
                file_id.write('{0}\n'.format(' echo "$SEP"'))
                file_id.write('{0}\n'.format(' /usr/bin/time \\'))
                file_id.write('{0}\n'.format(' --format="$SEP\\nElapsed real time (s): %e\\nCPU time in kernel mode (s): %S\\nCPU time in user mode (s): %U\\nPercentage of CPU: %P\\nMaximum resident set size(Kb): %M\\nAverage total memory use (Kb):%K" \\'))
                file_id.write('{0}\n'.format(' fastqc \\'))
                file_id.write('{0}\n'.format(' {0} \\'.format(xlib.get_cluster_read_file(experiment_id, read_dataset_id, file_name))))
                file_id.write('{0}\n'.format(' --threads={0} \\'.format(threads)))
                file_id.write('{0}\n'.format(' --outdir={0}'.format(current_run_dir)))
                file_id.write('{0}\n'.format(' RC=$?'))
                file_id.write('{0}\n'.format(' if [ $RC -ne 0 ]; then manage_error fastqc $RC; fi'))
            file_id.write('{0}\n'.format('}'))
            file_id.write('{0}\n'.format('#-------------------------------------------------------------------------------'))
            # bash function: report success by mail and exit 0
            file_id.write('{0}\n'.format('function end'))
            file_id.write('{0}\n'.format('{'))
            file_id.write('{0}\n'.format(' END_DATETIME=`date --utc +%s`'))
            file_id.write('{0}\n'.format(' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`'))
            file_id.write('{0}\n'.format(' calculate_duration'))
            file_id.write('{0}\n'.format(' echo "$SEP"'))
            file_id.write('{0}\n'.format(' echo "Script ended OK at $FORMATTED_END_DATETIME UTC with a run duration of $DURATION s ($FORMATTED_DURATION)."'))
            file_id.write('{0}\n'.format(' echo "$SEP"'))
            file_id.write('{0}\n'.format(' RECIPIENT={0}'.format(xconfiguration.get_contact_data())))
            file_id.write('{0}\n'.format(' SUBJECT="{0}: {1} process"'.format(xlib.get_project_name(), xlib.get_fastqc_name())))
            file_id.write('{0}\n'.format(' MESSAGE="The {0} process in node $HOSTNAME of cluster {1} ended OK at $FORMATTED_END_DATETIME with a run duration of $DURATION s ($FORMATTED_DURATION). Please review its log.<br/><br/>Regards,<br/>GI Genetica, Fisiologia e Historia Forestal<br/>Dpto. Sistemas y Recursos Naturales<br/>ETSI Montes, Forestal y del Medio Natural<br/>Universidad Politecnica de Madrid<br/>https://github.com/ggfhf/"'.format(xlib.get_fastqc_name(), cluster_name)))
            file_id.write('{0}\n'.format(' mail --append "Content-type: text/html;" --subject "$SUBJECT" "$RECIPIENT" <<< "$MESSAGE"'))
            file_id.write('{0}\n'.format(' exit 0'))
            file_id.write('{0}\n'.format('}'))
            file_id.write('{0}\n'.format('#-------------------------------------------------------------------------------'))
            # bash function: report the failing command by mail and exit 3
            file_id.write('{0}\n'.format('function manage_error'))
            file_id.write('{0}\n'.format('{'))
            file_id.write('{0}\n'.format(' END_DATETIME=`date --utc +%s`'))
            file_id.write('{0}\n'.format(' FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`'))
            file_id.write('{0}\n'.format(' calculate_duration'))
            file_id.write('{0}\n'.format(' echo "$SEP"'))
            file_id.write('{0}\n'.format(' echo "ERROR: $1 returned error $2"'))
            file_id.write('{0}\n'.format(' echo "Script ended WRONG at $FORMATTED_END_DATETIME UTC with a run duration of $DURATION s ($FORMATTED_DURATION)."'))
            file_id.write('{0}\n'.format(' echo "$SEP"'))
            file_id.write('{0}\n'.format(' RECIPIENT={0}'.format(xconfiguration.get_contact_data())))
            file_id.write('{0}\n'.format(' SUBJECT="{0}: {1} process"'.format(xlib.get_project_name(), xlib.get_fastqc_name())))
            file_id.write('{0}\n'.format(' MESSAGE="The {0} process in node $HOSTNAME of cluster {1} ended WRONG at $FORMATTED_END_DATETIME with a run duration of $DURATION s ($FORMATTED_DURATION). Please review its log.<br/><br/>Regards,<br/>GI Genetica, Fisiologia e Historia Forestal<br/>Dpto. Sistemas y Recursos Naturales<br/>ETSI Montes, Forestal y del Medio Natural<br/>Universidad Politecnica de Madrid<br/>https://github.com/ggfhf/"'.format(xlib.get_fastqc_name(), cluster_name)))
            file_id.write('{0}\n'.format(' mail --append "Content-type: text/html;" --subject "$SUBJECT" "$RECIPIENT" <<< "$MESSAGE"'))
            file_id.write('{0}\n'.format(' exit 3'))
            file_id.write('{0}\n'.format('}'))
            file_id.write('{0}\n'.format('#-------------------------------------------------------------------------------'))
            # bash function: compute DURATION and its HH:MM:SS form from the timestamps
            file_id.write('{0}\n'.format('function calculate_duration'))
            file_id.write('{0}\n'.format('{'))
            file_id.write('{0}\n'.format(' DURATION=`expr $END_DATETIME - $INIT_DATETIME`'))
            file_id.write('{0}\n'.format(' HH=`expr $DURATION / 3600`'))
            file_id.write('{0}\n'.format(' MM=`expr $DURATION % 3600 / 60`'))
            file_id.write('{0}\n'.format(' SS=`expr $DURATION % 60`'))
            file_id.write('{0}\n'.format(' FORMATTED_DURATION=`printf "%03d:%02d:%02d\\n" $HH $MM $SS`'))
            file_id.write('{0}\n'.format('}'))
            file_id.write('{0}\n'.format('#-------------------------------------------------------------------------------'))
            # script entry sequence
            file_id.write('{0}\n'.format('init'))
            file_id.write('{0}\n'.format('run_fastqc_process'))
            file_id.write('{0}\n'.format('end'))
    except:
        error_list.append('*** ERROR: The file {0} can not be created'.format(get_fastqc_process_script()))
        OK = False

    # return the control variable and the error list
    return (OK, error_list)
def run_htseq_count_process(cluster_name, log, function=None):
    '''
    Run a htseq-count process.

    Orchestrate the whole submission: check the config file, connect to the
    cluster via SSH/SFTP, verify the requirements, create the run directory,
    build and upload the process script and starter, and submit the starter.

    Parameters:
        cluster_name -- name of the cluster where the process will run.
        log -- log device; when it is not a xlib.DevStdOut, it is assumed to
            be a closable log window and extra warnings are written.
        function -- optional callable executed after the submission ends.

    Returns the control variable (True when every step succeeded).

    NOTE(review): on a failure part-way through, the SSH client/transport
    connections are not closed (the closing steps run only when OK) —
    preserved as-is, but worth confirming upstream.
    '''

    # initialize the control variable
    OK = True

    # get the htseq-count option dictionary
    htseq_count_option_dict = xlib.get_option_dict(get_htseq_count_config_file())

    # get the experiment identification
    experiment_id = htseq_count_option_dict['identification']['experiment_id']

    # warn that the log window does not have to be closed
    if not isinstance(log, xlib.DevStdOut):
        log.write('This process might take several minutes. Do not close this window, please wait!\n')

    # check the htseq-count config file
    log.write(f'{xlib.get_separator()}\n')
    log.write(f'Checking the {xlib.get_htseq_count_name()} config file ...\n')
    (OK, error_list) = check_htseq_count_config_file(strict=True)
    if OK:
        log.write('The file is OK.\n')
    else:
        log.write('*** ERROR: The config file is not valid.\n')
        log.write('Please correct this file or recreate the config files.\n')

    # create the SSH client connection
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write('Connecting the SSH client ...\n')
        (OK, error_list, ssh_client) = xssh.create_ssh_client_connection(cluster_name)
        if OK:
            log.write('The SSH client is connected.\n')
        else:
            for error in error_list:
                log.write(f'{error}\n')

    # create the SSH transport connection
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write('Connecting the SSH transport ...\n')
        (OK, error_list, ssh_transport) = xssh.create_ssh_transport_connection(cluster_name)
        if OK:
            log.write('The SSH transport is connected.\n')
        else:
            for error in error_list:
                log.write(f'{error}\n')

    # create the SFTP client
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write('Connecting the SFTP client ...\n')
        sftp_client = xssh.create_sftp_client(ssh_transport)
        log.write('The SFTP client is connected.\n')

    # warn that the requirements are being verified
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write('Checking process requirements ...\n')

    # check the master is running (EC2 state code 16 means "running")
    if OK:
        (master_state_code, master_state_name) = xec2.get_node_state(cluster_name)
        if master_state_code != 16:
            log.write(f'*** ERROR: The cluster {cluster_name} is not running. Its state is {master_state_code} ({master_state_name}).\n')
            OK = False

    # check HTSeq is installed
    if OK:
        (OK, error_list, is_installed) = xbioinfoapp.is_installed_anaconda_package(xlib.get_htseq_anaconda_code(), cluster_name, True, ssh_client)
        if OK:
            if not is_installed:
                log.write(f'*** ERROR: {xlib.get_htseq_name()} is not installed.\n')
                OK = False
        else:
            log.write(f'*** ERROR: The verification of {xlib.get_htseq_name()} installation could not be performed.\n')

    # warn that the requirements are OK
    if OK:
        log.write('Process requirements are OK.\n')

    # determine the run directory in the cluster
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write('Determining the run directory in the cluster ...\n')
        current_run_dir = xlib.get_cluster_current_run_dir(experiment_id, xlib.get_htseq_count_code())
        command = f'mkdir --parents {current_run_dir}'
        (OK, _, _) = xssh.execute_cluster_command(ssh_client, command)
        if OK:
            log.write(f'The directory path is {current_run_dir}.\n')
        else:
            log.write(f'*** ERROR: Wrong command ---> {command}\n')

    # build the htseq-count process script
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write(f'Building the process script {get_htseq_count_process_script()} ...\n')
        (OK, error_list) = build_htseq_count_process_script(cluster_name, current_run_dir)
        if OK:
            log.write('The file is built.\n')
        if not OK:
            log.write('*** ERROR: The file could not be built.\n')

    # upload the htseq-count process script to the cluster
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write(f'Uploading the process script {get_htseq_count_process_script()} to the directory {current_run_dir} ...\n')
        cluster_path = f'{current_run_dir}/{os.path.basename(get_htseq_count_process_script())}'
        (OK, error_list) = xssh.put_file(sftp_client, get_htseq_count_process_script(), cluster_path)
        if OK:
            log.write('The file is uploaded.\n')
        else:
            for error in error_list:
                log.write(f'{error}\n')

    # set run permission to the htseq-count process script in the cluster
    # ("permission" fixed: the messages were misspelled "permision")
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write(f'Setting on the run permission of {current_run_dir}/{os.path.basename(get_htseq_count_process_script())} ...\n')
        command = f'chmod u+x {current_run_dir}/{os.path.basename(get_htseq_count_process_script())}'
        (OK, _, _) = xssh.execute_cluster_command(ssh_client, command)
        if OK:
            log.write('The run permission is set.\n')
        else:
            log.write(f'*** ERROR: Wrong command ---> {command}\n')

    # build the htseq-count process starter
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write(f'Building the process starter {get_htseq_count_process_starter()} ...\n')
        (OK, error_list) = build_htseq_count_process_starter(current_run_dir)
        if OK:
            log.write('The file is built.\n')
        if not OK:
            # "*** ERROR" fixed: the original lacked the space after the asterisks
            log.write('*** ERROR: The file could not be built.\n')

    # upload the htseq-count process starter to the cluster
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write(f'Uploading the process starter {get_htseq_count_process_starter()} to the directory {current_run_dir} ...\n')
        cluster_path = f'{current_run_dir}/{os.path.basename(get_htseq_count_process_starter())}'
        (OK, error_list) = xssh.put_file(sftp_client, get_htseq_count_process_starter(), cluster_path)
        if OK:
            log.write('The file is uploaded.\n')
        else:
            for error in error_list:
                log.write(f'{error}\n')

    # set run permission to the htseq-count process starter in the cluster
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write(f'Setting on the run permission of {current_run_dir}/{os.path.basename(get_htseq_count_process_starter())} ...\n')
        command = f'chmod u+x {current_run_dir}/{os.path.basename(get_htseq_count_process_starter())}'
        (OK, _, _) = xssh.execute_cluster_command(ssh_client, command)
        if OK:
            log.write('The run permission is set.\n')
        else:
            log.write(f'*** ERROR: Wrong command ---> {command}\n')

    # submit the htseq-count process
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write(f'Submitting the process script {current_run_dir}/{os.path.basename(get_htseq_count_process_starter())} ...\n')
        OK = xssh.submit_script(cluster_name, ssh_client, current_run_dir, os.path.basename(get_htseq_count_process_starter()), log)

    # close the SSH transport connection
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write('Closing the SSH transport connection ...\n')
        xssh.close_ssh_transport_connection(ssh_transport)
        log.write('The connection is closed.\n')

    # close the SSH client connection
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write('Closing the SSH client connection ...\n')
        xssh.close_ssh_client_connection(ssh_client)
        log.write('The connection is closed.\n')

    # warn that the log window can be closed
    if not isinstance(log, xlib.DevStdOut):
        log.write(f'{xlib.get_separator()}\n')
        log.write('You can close this window now.\n')

    # execute final function
    if function is not None:
        function()

    # return the control variable
    return OK
def check_busco_config_file(strict):
    '''
    Check the BUSCO config file of a run.

    Parameters:
        strict -- flag kept for interface compatibility (not read by the
            checks below).

    Returns a tuple (OK, error_list) where OK is True when every check
    passed and error_list holds one message per detected problem.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # initialize the variable used when a value is not found
    not_found = '***NOTFOUND***'.upper()

    # get the option dictionary
    try:
        busco_option_dict = xlib.get_option_dict(get_busco_config_file())
    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append('*** ERROR: The option dictionary could not be built from the config file')
        OK = False
    else:

        # get the sections list
        sections_list = []
        for section in busco_option_dict.keys():
            sections_list.append(section)
        sections_list.sort()

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append('*** ERROR: the section "identification" is not found.')
            OK = False
        else:

            # check section "identification" - key "experiment_id"
            experiment_id = busco_option_dict.get('identification', {}).get('experiment_id', not_found)
            if experiment_id == not_found:
                error_list.append('*** ERROR: the key "experiment_id" is not found in the section "identification".')
                OK = False

            # check section "identification" - key "assembly_software"
            assembly_software = busco_option_dict.get('identification', {}).get('assembly_software', not_found)
            if assembly_software == not_found:
                error_list.append('*** ERROR: the key "assembly_software" is not found in the section "identification".')
                OK = False
            elif not xlib.check_code(assembly_software, get_assembly_software_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "assembly_software" has to be {get_assembly_software_code_list_text()}.')
                OK = False

            # check section "identification" - key "assembly_dataset_id"
            assembly_dataset_id = busco_option_dict.get('identification', {}).get('assembly_dataset_id', not_found)
            if assembly_dataset_id == not_found:
                error_list.append('*** ERROR: the key "assembly_dataset_id" is not found in the section "identification".')
                OK = False
            elif not xlib.check_startswith(assembly_dataset_id, get_assembly_software_code_list(), case_sensitive=True):
                error_list.append(f'*** ERROR: the key "assembly_dataset_id" has to start with {get_assembly_software_code_list_text()}.')
                OK = False

            # check section "identification" - key "assembly_type"
            # (CONTIGS/SCAFFOLDS only make sense for SOAPdenovo-Trans assemblies)
            assembly_type = busco_option_dict.get('identification', {}).get('assembly_type', not_found)
            if assembly_type == not_found:
                error_list.append('*** ERROR: the key "assembly_type" is not found in the section "identification".')
                OK = False
            elif assembly_dataset_id.startswith(xlib.get_soapdenovotrans_code()) and assembly_type.upper() not in ['CONTIGS', 'SCAFFOLDS'] or \
                not assembly_dataset_id.startswith(xlib.get_soapdenovotrans_code()) and assembly_type.upper() != 'NONE':
                error_list.append(f'*** ERROR: the key "assembly_type" has to be CONTIGS or SCAFFOLDS in {xlib.get_soapdenovotrans_name()} or NONE in any other case.')
                OK = False

        # check section "BUSCO parameters"
        if 'BUSCO parameters' not in sections_list:
            error_list.append('*** ERROR: the section "BUSCO parameters" is not found.')
            OK = False
        else:

            # check section "BUSCO parameters" - key "ncpu"
            ncpu = busco_option_dict.get('BUSCO parameters', {}).get('ncpu', not_found)
            if ncpu == not_found:
                error_list.append('*** ERROR: the key "ncpu" is not found in the section "BUSCO parameters".')
                OK = False
            elif not xlib.check_int(ncpu, minimum=1):
                error_list.append('*** ERROR: the key "ncpu" has to be an integer number greater than or equal to 1.')
                OK = False

            # check section "BUSCO parameters" - key "lineage_data_url"
            lineage_data_url = busco_option_dict.get('BUSCO parameters', {}).get('lineage_data_url', not_found)
            if lineage_data_url == not_found:
                error_list.append('*** ERROR: the key "lineage_data_url" is not found in the section "BUSCO parameters".')
                OK = False
            else:
                try:
                    # probe reachability; the "with" closes the HTTP response
                    # (the original leaked it)
                    with urllib.request.urlopen(lineage_data_url):
                        pass
                except Exception as e:
                    error_list.append(f'*** EXCEPTION: "{e}".')
                    error_list.append('*** ERROR: the key "lineage_data_url" has to be a reachable address.')
                    OK = False

            # check section "BUSCO parameters" - key "mode"
            mode = busco_option_dict.get('BUSCO parameters', {}).get('mode', not_found)
            if mode == not_found:
                error_list.append('*** ERROR: the key "mode" is not found in the section "BUSCO parameters".')
                OK = False
            elif not xlib.check_code(mode, get_mode_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "mode" has to be {get_mode_code_list_text()}.')
                OK = False

            # check section "BUSCO parameters" - key "evalue"
            evalue = busco_option_dict.get('BUSCO parameters', {}).get('evalue', not_found)
            if evalue == not_found:
                error_list.append('*** ERROR: the key "evalue" is not found in the section "BUSCO parameters".')
                OK = False
            elif not xlib.check_float(evalue, minimum=0., mne=1E-12):
                error_list.append('*** ERROR: the key "evalue" has to be a float number greater than 0.')
                OK = False

            # check section "BUSCO parameters" - key "limit"
            limit = busco_option_dict.get('BUSCO parameters', {}).get('limit', not_found)
            if limit == not_found:
                error_list.append('*** ERROR: the key "limit" is not found in the section "BUSCO parameters".')
                OK = False
            elif not xlib.check_int(limit, minimum=1):
                error_list.append('*** ERROR: the key "limit" has to be an integer number greater than or equal to 1.')
                OK = False

            # check section "BUSCO parameters" - key "species"
            species = busco_option_dict.get('BUSCO parameters', {}).get('species', not_found)
            if species == not_found:
                error_list.append('*** ERROR: the key "species" is not found in the section "BUSCO parameters".')
                OK = False

            # check section "BUSCO parameters" - key "long"
            long = busco_option_dict.get('BUSCO parameters', {}).get('long', not_found)
            if long == not_found:
                error_list.append('*** ERROR: the key "long" is not found in the section "BUSCO parameters".')
                OK = False
            elif not xlib.check_code(long, get_long_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "long" has to be {get_long_code_list_text()}.')
                OK = False

            # check section "BUSCO parameters" - key "augustus_options"
            augustus_options = busco_option_dict.get('BUSCO parameters', {}).get('augustus_options', not_found)
            if augustus_options == not_found:
                error_list.append('*** ERROR: the key "augustus_options" is not found in the section "BUSCO parameters".')
                OK = False
            elif augustus_options.upper() != 'NONE':
                # do not overwrite the accumulated OK with the sub-check result
                # (the original assigned the sub-check directly into OK, which
                # could silently reset an earlier failure back to True)
                (is_ok, error_list2) = xlib.check_parameter_list(augustus_options, "augustus_options", [])
                error_list = error_list + error_list2
                if not is_ok:
                    OK = False

    # warn that the results config file is not valid if there are any errors
    if not OK:
        error_list.append(f'\nThe {xlib.get_busco_name()} config file is not valid. Please, correct this file or recreate it.')

    # return the control variable and the error list
    return (OK, error_list)
def build_htseq_count_process_script(cluster_name, current_run_dir):
    '''
    Build the current htseq-count process script.

    The script is a bash file that activates the HTSeq Anaconda environment,
    runs htseq-count over every alignment dataset listed in the config file
    and mails the result via AWS SES.

    Parameters:
        cluster_name -- name of the cluster (embedded in the generated script).
        current_run_dir -- run directory in the cluster used for status files.

    Returns a tuple (OK, error_list).
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # get the htseq-count option dictionary
    htseq_count_option_dict = xlib.get_option_dict(get_htseq_count_config_file())

    # get the options
    experiment_id = htseq_count_option_dict['identification']['experiment_id']
    reference_dataset_id = htseq_count_option_dict['identification']['reference_dataset_id']
    annotation_file = htseq_count_option_dict['identification']['annotation_file']
    nprocesses = htseq_count_option_dict['htseq-count parameters']['nprocesses']
    stranded = htseq_count_option_dict['htseq-count parameters']['stranded']
    minaqual = htseq_count_option_dict['htseq-count parameters']['minaqual']
    # "feature_type" avoids shadowing the builtin "type" (config key unchanged)
    feature_type = htseq_count_option_dict['htseq-count parameters']['type']
    idattr = htseq_count_option_dict['htseq-count parameters']['idattr']
    mode = htseq_count_option_dict['htseq-count parameters']['mode']
    nonunique = htseq_count_option_dict['htseq-count parameters']['nonunique']
    other_parameters = htseq_count_option_dict['htseq-count parameters']['other_parameters']

    # get the sections list
    sections_list = []
    for section in htseq_count_option_dict.keys():
        sections_list.append(section)
    sections_list.sort()

    # build alignment dataset identification list
    alignment_software_list = []
    alignment_dataset_id_list = []
    for section in sections_list:
        # if the section identification is like alignment-dataset-n
        if re.match('^alignment-dataset-[0-9]+$', section):
            alignment_software_list.append(htseq_count_option_dict[section]['alignment_software'])
            alignment_dataset_id_list.append(htseq_count_option_dict[section]['alignment_dataset_id'])

    # set the annotation file path
    annotation_file = xlib.get_cluster_reference_file(reference_dataset_id, annotation_file)

    # write the htseq-count process script
    try:
        if not os.path.exists(os.path.dirname(get_htseq_count_process_script())):
            os.makedirs(os.path.dirname(get_htseq_count_process_script()))
        with open(get_htseq_count_process_script(), mode='w', encoding='iso-8859-1', newline='\n') as script_file_id:
            script_file_id.write('#!/bin/bash\n')
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write('SEP="#########################################"\n')
            script_file_id.write('export HOST_IP=`curl --silent checkip.amazonaws.com`\n')
            script_file_id.write('export HOST_ADDRESS="ec2-${HOST_IP//./-}-compute-1.amazonaws.com"\n')
            script_file_id.write('export AWS_CONFIG_FILE=/home/ubuntu/.aws/config\n')
            script_file_id.write('export AWS_SHARED_CREDENTIALS_FILE=/home/ubuntu/.aws/credentials\n')
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write(f'MINICONDA3_BIN_PATH={xlib.get_cluster_app_dir()}/{xlib.get_miniconda3_name()}/bin\n')
            script_file_id.write('export PATH=$MINICONDA3_BIN_PATH:$PATH\n')
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            # status files let the monitoring side detect how the run ended
            script_file_id.write(f'STATUS_DIR={xlib.get_status_dir(current_run_dir)}\n')
            script_file_id.write(f'SCRIPT_STATUS_OK={xlib.get_status_ok(current_run_dir)}\n')
            script_file_id.write(f'SCRIPT_STATUS_WRONG={xlib.get_status_wrong(current_run_dir)}\n')
            script_file_id.write('mkdir --parents $STATUS_DIR\n')
            script_file_id.write('if [ -f $SCRIPT_STATUS_OK ]; then rm $SCRIPT_STATUS_OK; fi\n')
            script_file_id.write('if [ -f $SCRIPT_STATUS_WRONG ]; then rm $SCRIPT_STATUS_WRONG; fi\n')
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write(f'CURRENT_DIR={current_run_dir}\n')
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write('function init\n')
            script_file_id.write('{\n')
            script_file_id.write('    INIT_DATETIME=`date --utc +%s`\n')
            script_file_id.write('    FORMATTED_INIT_DATETIME=`date --date="@$INIT_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n')
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write('    echo "Script started at $FORMATTED_INIT_DATETIME+00:00."\n')
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write(f'    echo "CLUSTER: {cluster_name}"\n')
            script_file_id.write('    echo "HOST NAME: $HOSTNAME"\n')
            script_file_id.write('    echo "HOST IP: $HOST_IP"\n')
            script_file_id.write('    echo "HOST ADDRESS: $HOST_ADDRESS"\n')
            script_file_id.write('}\n')
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write('function print_htseq_count_version\n')
            script_file_id.write('{\n')
            script_file_id.write(f'    source activate {xlib.get_htseq_anaconda_code()}\n')
            script_file_id.write('    echo "$SEP"\n')
            # the version command is intentionally left commented out in the
            # generated script (as in the original)
            script_file_id.write('    # -- htseq-count --version\n')
            script_file_id.write('    conda deactivate\n')
            script_file_id.write('}\n')
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write('function run_htseq_count_process\n')
            script_file_id.write('{\n')
            script_file_id.write(f'    source activate {xlib.get_htseq_anaconda_code()}\n')
            script_file_id.write('    cd $CURRENT_DIR\n')
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write('    echo "Counting reads ..."\n')
            script_file_id.write('    /usr/bin/time \\\n')
            script_file_id.write(f'        --format="{xlib.get_time_output_format(separator=False)}" \\\n')
            script_file_id.write('        htseq-count \\\n')
            script_file_id.write(f'            --nprocesses={nprocesses} \\\n')
            script_file_id.write('            --format=bam \\\n')
            script_file_id.write(f'            --stranded={stranded.lower()} \\\n')
            script_file_id.write(f'            --minaqual={minaqual} \\\n')
            script_file_id.write(f'            --type={feature_type} \\\n')
            script_file_id.write(f'            --idattr={idattr} \\\n')
            script_file_id.write(f'            --mode={mode.lower()} \\\n')
            script_file_id.write(f'            --nonunique={nonunique.lower()} \\\n')
            script_file_id.write('            --quiet \\\n')
            # append free-form parameters from the config file ("--name=value"
            # or "--name" items separated by ";")
            if other_parameters.upper() != 'NONE':
                parameter_list = [x.strip() for x in other_parameters.split(';')]
                for i in range(len(parameter_list)):
                    if parameter_list[i].find('=') > 0:
                        pattern = r'^--(.+)=(.+)$'
                        mo = re.search(pattern, parameter_list[i])
                        parameter_name = mo.group(1).strip()
                        parameter_value = mo.group(2).strip()
                        script_file_id.write(f'            --{parameter_name}={parameter_value} \\\n')
                    else:
                        pattern = r'^--(.+)$'
                        mo = re.search(pattern, parameter_list[i])
                        parameter_name = mo.group(1).strip()
                        script_file_id.write(f'            --{parameter_name} \\\n')
            # one sorted BAM glob per alignment dataset, then the annotation file
            for i in range(len(alignment_dataset_id_list)):
                alignment_files = f'{xlib.get_cluster_experiment_result_dataset_dir(experiment_id, alignment_dataset_id_list[i])}/*.sorted.bam'
                script_file_id.write(f'            {alignment_files} \\\n')
            script_file_id.write(f'            {annotation_file} \\\n')
            script_file_id.write('            > read-count.txt\n')
            script_file_id.write('    RC=$?\n')
            script_file_id.write('    if [ $RC -ne 0 ]; then manage_error htseq-count $RC; fi\n')
            script_file_id.write('    echo "Reads are counted."\n')
            script_file_id.write('    conda deactivate\n')
            script_file_id.write('}\n')
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write('function end\n')
            script_file_id.write('{\n')
            script_file_id.write('    END_DATETIME=`date --utc +%s`\n')
            script_file_id.write('    FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n')
            script_file_id.write('    calculate_duration\n')
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write('    echo "Script ended OK at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."\n')
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write('    send_mail ok\n')
            script_file_id.write('    touch $SCRIPT_STATUS_OK\n')
            script_file_id.write('    exit 0\n')
            script_file_id.write('}\n')
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write('function manage_error\n')
            script_file_id.write('{\n')
            script_file_id.write('    END_DATETIME=`date --utc +%s`\n')
            script_file_id.write('    FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n')
            script_file_id.write('    calculate_duration\n')
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write('    echo "ERROR: $1 returned error $2"\n')
            script_file_id.write('    echo "Script ended WRONG at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."\n')
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write('    send_mail wrong\n')
            script_file_id.write('    touch $SCRIPT_STATUS_WRONG\n')
            script_file_id.write('    exit 3\n')
            script_file_id.write('}\n')
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            # the mail texts are resolved now, in Python, and embedded verbatim
            process_name = f'{xlib.get_htseq_count_name()} process'
            mail_message_ok = xlib.get_mail_message_ok(process_name, cluster_name)
            mail_message_wrong = xlib.get_mail_message_wrong(process_name, cluster_name)
            script_file_id.write('function send_mail\n')
            script_file_id.write('{\n')
            script_file_id.write(f'    SUBJECT="{xlib.get_project_name()}: {process_name}"\n')
            script_file_id.write('    if [ "$1" == "ok" ]; then\n')
            script_file_id.write(f'        MESSAGE="{mail_message_ok}"\n')
            script_file_id.write('    elif [ "$1" == "wrong" ]; then\n')
            script_file_id.write(f'        MESSAGE="{mail_message_wrong}"\n')
            script_file_id.write('    else\n')
            script_file_id.write('        MESSAGE=""\n')
            script_file_id.write('    fi\n')
            script_file_id.write('    DESTINATION_FILE=mail-destination.json\n')
            script_file_id.write('    echo "{" > $DESTINATION_FILE\n')
            script_file_id.write(f'    echo "    \\"ToAddresses\\": [\\"{xconfiguration.get_contact_data()}\\"]," >> $DESTINATION_FILE\n')
            script_file_id.write('    echo "    \\"CcAddresses\\": []," >> $DESTINATION_FILE\n')
            script_file_id.write('    echo "    \\"BccAddresses\\": []" >> $DESTINATION_FILE\n')
            script_file_id.write('    echo "}" >> $DESTINATION_FILE\n')
            script_file_id.write('    MESSAGE_FILE=mail-message.json\n')
            script_file_id.write('    echo "{" > $MESSAGE_FILE\n')
            script_file_id.write('    echo "    \\"Subject\\": {" >> $MESSAGE_FILE\n')
            script_file_id.write('    echo "        \\"Data\\": \\"$SUBJECT\\"," >> $MESSAGE_FILE\n')
            script_file_id.write('    echo "        \\"Charset\\": \\"UTF-8\\"" >> $MESSAGE_FILE\n')
            script_file_id.write('    echo "    }," >> $MESSAGE_FILE\n')
            script_file_id.write('    echo "    \\"Body\\": {" >> $MESSAGE_FILE\n')
            script_file_id.write('    echo "        \\"Html\\": {" >> $MESSAGE_FILE\n')
            script_file_id.write('    echo "            \\"Data\\": \\"$MESSAGE\\"," >> $MESSAGE_FILE\n')
            script_file_id.write('    echo "            \\"Charset\\": \\"UTF-8\\"" >> $MESSAGE_FILE\n')
            script_file_id.write('    echo "        }" >> $MESSAGE_FILE\n')
            script_file_id.write('    echo "    }" >> $MESSAGE_FILE\n')
            script_file_id.write('    echo "}" >> $MESSAGE_FILE\n')
            script_file_id.write(f'    aws ses send-email --from {xconfiguration.get_contact_data()} --destination file://$DESTINATION_FILE --message file://$MESSAGE_FILE\n')
            script_file_id.write('}\n')
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write('function calculate_duration\n')
            script_file_id.write('{\n')
            script_file_id.write('    DURATION=`expr $END_DATETIME - $INIT_DATETIME`\n')
            script_file_id.write('    HH=`expr $DURATION / 3600`\n')
            script_file_id.write('    MM=`expr $DURATION % 3600 / 60`\n')
            script_file_id.write('    SS=`expr $DURATION % 60`\n')
            script_file_id.write('    FORMATTED_DURATION=`printf "%03d:%02d:%02d\\n" $HH $MM $SS`\n')
            script_file_id.write('}\n')
            script_file_id.write('#-------------------------------------------------------------------------------\n')
            script_file_id.write('init\n')
            script_file_id.write('print_htseq_count_version\n')
            script_file_id.write('run_htseq_count_process\n')
            script_file_id.write('end\n')
    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append(f'*** ERROR: The file {get_htseq_count_process_script()} can not be created')
        OK = False

    # return the control variable and the error list
    return (OK, error_list)
def validate_busco_config_file(strict):
    '''
    Validate the BUSCO config file of a run.

    Parameters:
        strict -- flag kept for interface compatibility (not read by the
            checks below).

    Returns a tuple (OK, error_list) where OK is True when every check
    passed and error_list holds one message per detected problem.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # initialize the variable used when a value is not found
    not_found = '***NOTFOUND***'.upper()

    # get the option dictionary
    try:
        busco_option_dict = xlib.get_option_dict(get_busco_config_file())
    except:
        error_list.append('*** ERROR: The syntax is WRONG.')
        OK = False
    else:

        # get the sections list
        sections_list = []
        for section in busco_option_dict.keys():
            sections_list.append(section)
        sections_list.sort()

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append('*** ERROR: the section "identification" is not found.')
            OK = False
        else:

            # check section "identification" - key "experiment_id"
            experiment_id = busco_option_dict.get('identification', {}).get('experiment_id', not_found)
            is_experiment_id_OK = True
            if experiment_id == not_found:
                error_list.append('*** ERROR: the key "experiment_id" is not found in the section "identification".')
                is_experiment_id_OK = False
                OK = False

            # check section "identification" - key "assembly_software"
            assembly_software = busco_option_dict.get('identification', {}).get('assembly_software', not_found)
            is_assembly_software_OK = True
            if assembly_software == not_found:
                error_list.append('*** ERROR: the key "assembly_software" is not found in the section "identification".')
                is_assembly_software_OK = False
                OK = False
            elif assembly_software not in [xlib.get_soapdenovotrans_code(), xlib.get_transabyss_code(), xlib.get_trinity_code(), xlib.get_star_code(), xlib.get_cd_hit_est_code(), xlib.get_transcript_filter_code()]:
                # lowercase "or {5}" fixed (the original message had "OR {5}")
                error_list.append('*** ERROR: the key "assembly_software" value in the section "identification" must be {0} or {1} or {2} or {3} or {4} or {5}.'.format(xlib.get_soapdenovotrans_code(), xlib.get_transabyss_code(), xlib.get_trinity_code(), xlib.get_star_code(), xlib.get_cd_hit_est_code(), xlib.get_transcript_filter_code()))
                is_assembly_software_OK = False
                OK = False

            # check section "identification" - key "assembly_dataset_id"
            assembly_dataset_id = busco_option_dict.get('identification', {}).get('assembly_dataset_id', not_found)
            is_assembly_dataset_id_OK = True
            if assembly_dataset_id == not_found:
                error_list.append('*** ERROR: the key "assembly_dataset_id" is not found in the section "identification".')
                is_assembly_dataset_id_OK = False
                OK = False
            elif not assembly_dataset_id.startswith(xlib.get_soapdenovotrans_code()) and not assembly_dataset_id.startswith(xlib.get_transabyss_code()) and not assembly_dataset_id.startswith(xlib.get_trinity_code()) and not assembly_dataset_id.startswith(xlib.get_star_code()) and not assembly_dataset_id.startswith(xlib.get_cd_hit_est_code()) and not assembly_dataset_id.startswith(xlib.get_transcript_filter_code()):
                # get_transcript_filter_name() fixed (the original message
                # printed the software code where every other slot used a name)
                error_list.append('*** ERROR: the key "assembly_dataset_id" value is not a {0} nor {1} nor {2} nor {3} nor {4} nor {5} assembly.'.format(xlib.get_soapdenovotrans_name(), xlib.get_transabyss_name(), xlib.get_trinity_name(), xlib.get_star_name(), xlib.get_cd_hit_est_name(), xlib.get_transcript_filter_name()))
                is_assembly_dataset_id_OK = False
                OK = False

            # check section "identification" - key "assembly_type"
            assembly_type = busco_option_dict.get('identification', {}).get('assembly_type', not_found)
            is_assembly_type_OK = True
            if assembly_type == not_found:
                error_list.append('*** ERROR: the key "assembly_type" is not found in the section "identification".')
                is_assembly_type_OK = False
                OK = False
            elif assembly_dataset_id.startswith(xlib.get_soapdenovotrans_code()):
                if assembly_type.upper() not in ['CONTIGS', 'SCAFFOLDS']:
                    error_list.append('*** ERROR: the key "assembly_type" must be "CONTIGS" or "SCAFFOLDS" when {0} is the assembly software.'.format(xlib.get_soapdenovotrans_name()))
                    is_assembly_type_OK = False
                    OK = False
            elif assembly_dataset_id.startswith(xlib.get_transabyss_code()) or assembly_dataset_id.startswith(xlib.get_trinity_code()) or assembly_dataset_id.startswith(xlib.get_star_code()) or assembly_dataset_id.startswith(xlib.get_cd_hit_est_code()) or assembly_dataset_id.startswith(xlib.get_transcript_filter_code()):
                if assembly_type.upper() != 'NONE':
                    error_list.append('*** ERROR: the key "assembly_type" must be "NONE" when {0} or {1} or {2} or {3} or {4} is the assembly software.'.format(xlib.get_transabyss_name(), xlib.get_trinity_name(), xlib.get_star_name(), xlib.get_cd_hit_est_name(), xlib.get_transcript_filter_name()))
                    is_assembly_type_OK = False
                    OK = False

        # check section "BUSCO parameters"
        if 'BUSCO parameters' not in sections_list:
            error_list.append('*** ERROR: the section "BUSCO parameters" is not found.')
            OK = False
        else:

            # check section "BUSCO parameters" - key "ncpu"
            ncpu = busco_option_dict.get('BUSCO parameters', {}).get('ncpu', not_found)
            is_ncpu_OK = True
            if ncpu == not_found:
                error_list.append('*** ERROR: the key "ncpu" is not found in the section "BUSCO parameters".')
                is_ncpu_OK = False
                OK = False
            else:
                try:
                    if int(ncpu) < 1:
                        error_list.append('*** ERROR: the key "ncpu" in the section "BUSCO parameters" must be an integer value greater or equal to 1.')
                        is_ncpu_OK = False
                        OK = False
                except:
                    error_list.append('*** ERROR: the key "ncpu" in the section "BUSCO parameters" must be an integer value greater or equal to 1.')
                    is_ncpu_OK = False
                    OK = False

            # check section "BUSCO parameters" - key "lineage_data"
            lineage_data = busco_option_dict.get('BUSCO parameters', {}).get('lineage_data', not_found)
            is_lineage_data_OK = True
            if lineage_data == not_found:
                error_list.append('*** ERROR: the key "lineage_data" is not found in the section "BUSCO parameters"')
                is_lineage_data_OK = False
                OK = False

            # check section "BUSCO parameters" - key "mode"
            # test the raw value against the sentinel BEFORE lower-casing: the
            # original lower-cased first, so "mode == not_found" never matched
            # and a missing key was misreported as an invalid value
            mode = busco_option_dict.get('BUSCO parameters', {}).get('mode', not_found)
            is_mode_OK = True
            if mode == not_found:
                error_list.append('*** ERROR: the key "mode" is not found in the section "BUSCO parameters".')
                is_mode_OK = False
                OK = False
            elif mode.lower() not in ['geno', 'tran', 'prot']:
                error_list.append('*** ERROR: the key "mode" value in the section "BUSCO parameters" must be geno or tran or prot.')
                is_mode_OK = False
                OK = False

            # check section "BUSCO parameters" - key "evalue"
            evalue = busco_option_dict.get('BUSCO parameters', {}).get('evalue', not_found)
            is_evalue_OK = True
            if evalue == not_found:
                error_list.append('*** ERROR: the key "evalue" is not found in the section "BUSCO parameters".')
                is_evalue_OK = False
                OK = False
            else:
                try:
                    if float(evalue) <= 0:
                        error_list.append('*** ERROR: the key "evalue" in the section "BUSCO parameters" must be a float value greater than 0.')
                        is_evalue_OK = False
                        OK = False
                except:
                    error_list.append('*** ERROR: the key "evalue" in the section "BUSCO parameters" must be a float value greater than 0.')
                    is_evalue_OK = False
                    OK = False

            # check section "BUSCO parameters" - key "limit"
            limit = busco_option_dict.get('BUSCO parameters', {}).get('limit', not_found)
            is_limit_OK = True
            if limit == not_found:
                error_list.append('*** ERROR: the key "limit" is not found in the section "BUSCO parameters".')
                # flag added for consistency (the original forgot to clear it
                # in the not-found branch)
                is_limit_OK = False
                OK = False
            else:
                try:
                    if int(limit) < 1:
                        error_list.append('*** ERROR: the key "limit" in the section "BUSCO parameters" must be an integer value greater or equal to 1.')
                        is_limit_OK = False
                        OK = False
                except:
                    error_list.append('*** ERROR: the key "limit" in the section "BUSCO parameters" must be an integer value greater or equal to 1.')
                    is_limit_OK = False
                    OK = False

            # check section "BUSCO parameters" - key "species"
            species = busco_option_dict.get('BUSCO parameters', {}).get('species', not_found)
            is_species_OK = True
            if species == not_found:
                error_list.append('*** ERROR: the key "species" is not found in the section "BUSCO parameters"')
                is_species_OK = False
                OK = False

            # check section "BUSCO parameters" - key "long"
            # same sentinel-before-case-folding restructuring as "mode"
            # (behavior-identical here since the sentinel is already uppercase)
            long = busco_option_dict.get('BUSCO parameters', {}).get('long', not_found)
            is_long_OK = True
            if long == not_found:
                error_list.append('*** ERROR: the key "long" is not found in the section "BUSCO parameters".')
                is_long_OK = False
                OK = False
            elif long.upper() not in ['YES', 'NO']:
                error_list.append('*** ERROR: the key "long" value in the section "BUSCO parameters" must be YES or NO.')
                is_long_OK = False
                OK = False

            # check section "BUSCO parameters" - key "augustus_options"
            augustus_options = busco_option_dict.get('BUSCO parameters', {}).get('augustus_options', not_found)
            is_augustus_options_OK = True
            if augustus_options == not_found:
                error_list.append('*** ERROR: the key "augustus_options" is not found in the section "BUSCO parameters".')
                is_augustus_options_OK = False
                OK = False
            else:
                if augustus_options.upper() != 'NONE':
                    parameter_list = [x.strip() for x in augustus_options.split(';')]
                    for parameter in parameter_list:
                        try:
                            # a malformed item makes re.search return None and
                            # mo.group raise, which lands in the except below
                            if parameter.find('=') > 0:
                                pattern = r'^--(.+)=(.+)$'
                                mo = re.search(pattern, parameter)
                                parameter_name = mo.group(1).strip()
                                parameter_value = mo.group(2).strip()
                            else:
                                pattern = r'^--(.+)$'
                                mo = re.search(pattern, parameter)
                                parameter_name = mo.group(1).strip()
                        except:
                            # "Augustus" fixed (the original message said "August")
                            error_list.append('*** ERROR: the value of the key "augustus_options" in the section "BUSCO parameters" must be NONE or a valid Augustus parameter list.')
                            is_augustus_options_OK = False
                            OK = False
                            break

    # warn that the results config file is not valid if there are any errors
    if not OK:
        error_list.append('\nThe {0} config file is not valid. Please, correct this file or recreate it.'.format(xlib.get_busco_name()))

    # return the control variable and the error list
    return (OK, error_list)