Esempio n. 1
0
def build_busco_process_script(cluster_name, current_run_dir):
    '''
    Build the current BUSCO process script.

    The options are read from the BUSCO config file and a Bash script is
    written to the path returned by get_busco_process_script(). The script
    downloads the lineage data, runs run_BUSCO.py on the transcriptome file
    of the selected assembly dataset and mails the result.

    Parameters:
        cluster_name: cluster name written in the log and mail messages of
            the script.
        current_run_dir: run directory in the cluster; the script changes to
            it and its base name is passed to run_BUSCO.py as "--out".

    Returns:
        A tuple (OK, error_list): OK is False when the transcriptome file can
        not be determined or the script can not be written; error_list holds
        the corresponding messages.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # get the BUSCO option dictionary
    busco_option_dict = xlib.get_option_dict(get_busco_config_file())

    # get the options
    experiment_id = busco_option_dict['identification']['experiment_id']
    assembly_software = busco_option_dict['identification']['assembly_software']
    assembly_dataset_id = busco_option_dict['identification']['assembly_dataset_id']
    assembly_type = busco_option_dict['identification']['assembly_type']
    ncpu = busco_option_dict['BUSCO parameters']['ncpu']
    lineage_data = busco_option_dict['BUSCO parameters']['lineage_data']
    lineage_data_file = '{0}.tar.gz'.format(lineage_data)
    lineage_data_url = 'http://busco.ezlab.org/v2/datasets/{0}'.format(lineage_data_file)
    busco_mode = busco_option_dict['BUSCO parameters']['mode'].lower()
    evalue = busco_option_dict['BUSCO parameters']['evalue']
    limit = busco_option_dict['BUSCO parameters']['limit']
    species = busco_option_dict['BUSCO parameters']['species']
    long_mode = busco_option_dict['BUSCO parameters']['long'].upper()
    # NOTE(review): the Augustus options are upper-cased before being written
    # to the command line and the flag is spelled "--august_options"; both
    # are kept as found -- confirm against the installed run_BUSCO.py.
    augustus_options = busco_option_dict['BUSCO parameters']['augustus_options'].upper()

    # set the transcriptome file name according to the assembly software
    transcriptome_file_name = None
    if assembly_software == xlib.get_soapdenovotrans_code():
        if assembly_type == 'CONTIGS':
            transcriptome_file_name = '{0}-{1}.contig'.format(experiment_id, assembly_dataset_id)
        elif assembly_type == 'SCAFFOLDS':
            transcriptome_file_name = '{0}-{1}.scafSeq'.format(experiment_id, assembly_dataset_id)
    elif assembly_software == xlib.get_transabyss_code():
        transcriptome_file_name = 'transabyss-final.fa'
    elif assembly_software == xlib.get_trinity_code():
        transcriptome_file_name = 'Trinity.fasta'
    elif assembly_software == xlib.get_star_code():
        transcriptome_file_name = 'Trinity-GG.fasta'
    elif assembly_software == xlib.get_cd_hit_est_code():
        transcriptome_file_name = 'clustered-transcriptome.fasta'
    elif assembly_software == xlib.get_transcript_filter_code():
        transcriptome_file_name = 'filtered-transcriptome.fasta'

    # an unknown assembly software/type leaves no input file for BUSCO:
    # report it explicitly instead of failing later with an unbound variable
    if transcriptome_file_name is None:
        error_list.append('*** ERROR: The transcriptome file can not be determined for the assembly software "{0}" and the assembly type "{1}".'.format(assembly_software, assembly_type))
        return (False, error_list)

    # set the transcriptome file path
    transcriptome_file = '{0}/{1}'.format(xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id), transcriptome_file_name)

    # get the BUSCO process script path
    script_path = get_busco_process_script()

    # line used to separate the sections of the script
    rule = '#-------------------------------------------------------------------------------'

    # write the BUSCO process script
    try:

        # build all the lines before opening the file so a failure while
        # building them does not leave a partially written script
        script_lines = [
            '#!/bin/bash',
            rule,
            'BUSCO_PATH={0}/{1}/envs/{2}/bin'.format(xlib.get_cluster_app_dir(), xlib.get_miniconda3_name(), xlib.get_busco_bioconda_code()),
            'export PATH=$BUSCO_PATH:$PATH',
            'SEP="#########################################"',
            'cd {0}/{1}/bin'.format(xlib.get_cluster_app_dir(), xlib.get_miniconda3_name()),
            'source activate {0}'.format(xlib.get_busco_bioconda_code()),
            rule,
            'function init',
            '{',
            '    INIT_DATETIME=`date --utc +%s`',
            '    FORMATTED_INIT_DATETIME=`date --date="@$INIT_DATETIME" "+%Y-%m-%d %H:%M:%S"`',
            '    echo "$SEP"',
            '    echo "Script started in node $HOSTNAME of cluster {0} at $FORMATTED_INIT_DATETIME UTC."'.format(cluster_name),
            '}',
            rule,
            'function download_lineage_data',
            '{',
            '    cd {0}'.format(current_run_dir),
            '    echo "$SEP"',
            '    echo "Downloading lineage data ..."',
            '    wget --quiet --output-document ./{0} {1}'.format(lineage_data_file, lineage_data_url),
            '    tar -xzvf ./{0}'.format(lineage_data_file),
            '    rm ./{0}'.format(lineage_data_file),
            '}',
            rule,
            'function run_busco_process',
            '{',
            '    cd {0}'.format(current_run_dir),
            '    echo "$SEP"',
            '    run_BUSCO.py --version',
            '    echo "$SEP"',
            '    /usr/bin/time \\',
            '        --format="$SEP\\nElapsed real time (s): %e\\nCPU time in kernel mode (s): %S\\nCPU time in user mode (s): %U\\nPercentage of CPU: %P\\nMaximum resident set size(Kb): %M\\nAverage total memory use (Kb):%K" \\',
            '        run_BUSCO.py \\',
            '            --cpu={0} \\'.format(ncpu),
            '            --lineage_path=./{0} \\'.format(lineage_data),
            '            --mode={0} \\'.format(busco_mode),
            '            --evalue={0} \\'.format(evalue),
            '            --limit={0} \\'.format(limit),
        ]

        # optional run_BUSCO.py arguments
        if species.upper() != 'NONE':
            script_lines.append('            --species={0} \\'.format(species))
        if long_mode == 'YES':
            script_lines.append('            --long \\')
        if augustus_options.upper() != 'NONE':
            script_lines.append("            --august_options='{0}' \\".format(augustus_options))

        script_lines.extend([
            '            --in={0} \\'.format(transcriptome_file),
            '            --out={0}'.format(os.path.basename(current_run_dir)),
            '    RC=$?',
            '    if [ $RC -ne 0 ]; then manage_error run_BUSCO.py $RC; fi',
            '}',
            rule,
            'function end',
            '{',
            '    END_DATETIME=`date --utc +%s`',
            '    FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`',
            '    calculate_duration',
            '    echo "$SEP"',
            '    echo "Script ended OK at $FORMATTED_END_DATETIME UTC with a run duration of $DURATION s ($FORMATTED_DURATION)."',
            '    echo "$SEP"',
            '    RECIPIENT={0}'.format(xconfiguration.get_contact_data()),
            '    SUBJECT="{0}: {1} process"'.format(xlib.get_project_name(), xlib.get_busco_name()),
            '    MESSAGE="The {0} process in node $HOSTNAME of cluster {1} ended OK at $FORMATTED_END_DATETIME with a run duration of $DURATION s ($FORMATTED_DURATION). Please review its log.<br/><br/>Regards,<br/>GI Genetica, Fisiologia e Historia Forestal<br/>Dpto. Sistemas y Recursos Naturales<br/>ETSI Montes, Forestal y del Medio Natural<br/>Universidad Politecnica de Madrid<br/>https://github.com/ggfhf/"'.format(xlib.get_busco_name(), cluster_name),
            '    mail --append "Content-type: text/html;"  --subject "$SUBJECT" "$RECIPIENT" <<< "$MESSAGE"',
            '    exit 0',
            '}',
            rule,
            'function manage_error',
            '{',
            '    END_DATETIME=`date --utc +%s`',
            '    FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`',
            '    calculate_duration',
            '    echo "$SEP"',
            '    echo "ERROR: $1 returned error $2"',
            '    echo "Script ended WRONG at $FORMATTED_END_DATETIME UTC with a run duration of $DURATION s ($FORMATTED_DURATION)."',
            '    echo "$SEP"',
            '    RECIPIENT={0}'.format(xconfiguration.get_contact_data()),
            '    SUBJECT="{0}: {1} process"'.format(xlib.get_project_name(), xlib.get_busco_name()),
            '    MESSAGE="The {0} process in node $HOSTNAME of cluster {1} ended WRONG at $FORMATTED_END_DATETIME with a run duration of $DURATION s ($FORMATTED_DURATION). Please review its log.<br/><br/>Regards,<br/>GI Genetica, Fisiologia e Historia Forestal<br/>Dpto. Sistemas y Recursos Naturales<br/>ETSI Montes, Forestal y del Medio Natural<br/>Universidad Politecnica de Madrid<br/>https://github.com/ggfhf/"'.format(xlib.get_busco_name(), cluster_name),
            '    mail --append "Content-type: text/html;"  --subject "$SUBJECT" "$RECIPIENT" <<< "$MESSAGE"',
            '    exit 3',
            '}',
            rule,
            'function calculate_duration',
            '{',
            '    DURATION=`expr $END_DATETIME - $INIT_DATETIME`',
            '    HH=`expr $DURATION / 3600`',
            '    MM=`expr $DURATION % 3600 / 60`',
            '    SS=`expr $DURATION % 60`',
            '    FORMATTED_DURATION=`printf "%03d:%02d:%02d\\n" $HH $MM $SS`',
            '}',
            rule,
            'init',
            'download_lineage_data',
            'run_busco_process',
            'end',
        ])

        # create the script directory if needed and write the script
        os.makedirs(os.path.dirname(script_path), exist_ok=True)
        with open(script_path, mode='w', encoding='utf8', newline='\n') as file_id:
            file_id.writelines('{0}\n'.format(line) for line in script_lines)

    except Exception as e:
        # report the real cause together with the generic message
        error_list.append('*** EXCEPTION: "{0}".'.format(e))
        error_list.append('*** ERROR: The file {0} can not be created'.format(script_path))
        OK = False

    # return the control variable and the error list
    return (OK, error_list)
Esempio n. 2
0
def validate_read_transfer_config_file(strict):
    '''
    Validate the read transfer config file.

    The file is valid when it has a section "identification" with the key
    "experiment_id", a section "file-1", and every other section is named
    "file-n" and has a key "local_path" whose file can be opened for reading.

    Parameters:
        strict: when true, a "local_path" file that is not found is reported
            as an error and invalidates the config file; otherwise it is
            only reported as a warning.

    Returns:
        A tuple (OK, error_list): OK is False when any error was found;
        error_list holds the error and warning messages.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # initialize the sentinel used when a value is not found
    not_found = '***NOTFOUND***'

    # get the read transfer config file
    read_transfer_config_file = get_read_transfer_config_file()

    # get the options dictionary; the call is made only inside the "try" so
    # that a parse failure is reported as an error instead of propagating
    try:
        read_transfer_options_dict = xlib.get_option_dict(read_transfer_config_file)
    except Exception:
        error_list.append('*** ERROR: The syntax is WRONG.')
        OK = False
    else:

        # get the sorted sections list
        sections_list = sorted(read_transfer_options_dict.keys())

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append('*** ERROR: the section "identification" is not found.')
            OK = False
        else:

            # check section "identification" - key "experiment_id"
            experiment_id = read_transfer_options_dict.get('identification', {}).get('experiment_id', not_found)
            if experiment_id == not_found:
                error_list.append('*** ERROR: the key "experiment_id" is not found in the section "identification".')
                OK = False

        # check section "file-1"
        if 'file-1' not in sections_list:
            error_list.append('*** ERROR: the section "file-1" is not found.')
            OK = False

        # check all sections "file-n"
        for section in sections_list:

            if section == 'identification':
                continue

            # verify that the section identification is like file-n
            if not re.match('^file-[0-9]+$', section):
                error_list.append('*** ERROR: the section "{0}" has a wrong identification.'.format(section))
                OK = False
                continue

            # check section "file-n" - key "local_path"
            local_path = read_transfer_options_dict.get(section, {}).get('local_path', not_found)
            if local_path == not_found:
                error_list.append('*** ERROR: the key "local_path" is not found in the section "{0}".'.format(section))
                OK = False
                continue

            # probe the file by opening it; the handle is closed at once
            try:
                with open(local_path, mode='r'):
                    pass
            except FileNotFoundError:
                # a missing file only invalidates the config in strict mode
                severity = 'ERROR' if strict else 'WARNING'
                error_list.append('*** {0}: the file {1} in the key "local_path" of the section "{2}" does not exist or it is not accessible.'.format(severity, local_path, section))
                if strict:
                    OK = False
            except OSError:
                error_list.append('*** ERROR: the file name "{0}" in the key "local_path" of the section "{1}" is not correct.'.format(local_path, section))
                OK = False

    # warn that the read transfer config file is not valid if there are any errors
    if not OK:
        error_list.append('\nThe read transfer config file is not valid. Please, correct this file or recreate it.')

    # return the control variable and the error list
    return (OK, error_list)
Esempio n. 3
0
def check_htseq_count_config_file(strict):
    '''
    Check the htseq-count config file of a run.

    The sections "identification", "alignment-dataset-n" (at least
    "alignment-dataset-1") and "htseq-count parameters" are checked for the
    presence and, where applicable, the value of their keys.

    Parameters:
        strict: kept for interface compatibility; it is not used by the
            checks of this function.

    Returns:
        A tuple (OK, error_list): OK is False when any error was found;
        error_list holds the error messages.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # initialize the sentinel used when a value is not found
    not_found = '***NOTFOUND***'

    # name of the section holding the htseq-count parameters
    parameters_section = 'htseq-count parameters'

    def missing_key_error(key, section):
        '''Build the message reported when a key is not found in a section.'''
        return f'*** ERROR: the key "{key}" is not found in the section "{section}".'

    # get the option dictionary
    try:
        htseq_count_option_dict = xlib.get_option_dict(get_htseq_count_config_file())
    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append('*** ERROR: The option dictionary could not be built from the config file')
        OK = False
    else:

        # get the sorted sections list
        sections_list = sorted(htseq_count_option_dict.keys())

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append('*** ERROR: the section "identification" is not found.')
            OK = False
        else:
            identification = htseq_count_option_dict.get('identification', {})

            # check section "identification" - keys "experiment_id" and "reference_dataset_id"
            for key in ('experiment_id', 'reference_dataset_id'):
                if identification.get(key, not_found) == not_found:
                    error_list.append(missing_key_error(key, 'identification'))
                    OK = False

            # check section "identification" - key "annotation_file"
            annotation_file = identification.get('annotation_file', not_found)
            if annotation_file == not_found:
                error_list.append(missing_key_error('annotation_file', 'identification'))
                OK = False
            elif os.path.splitext(annotation_file)[1] not in ['.gtf', '.gff']:
                error_list.append('*** ERROR: the key "annotation_file" has to be a file name with .gtf/.gff extension.')
                OK = False

        # check section "alignment-dataset-1"
        if 'alignment-dataset-1' not in sections_list:
            error_list.append('*** ERROR: the section "alignment-dataset-1" is not found.')
            OK = False

        # check all sections "alignment-dataset-n"
        for section in sections_list:

            if section in ['identification', parameters_section]:
                continue

            # check that the section identification is like alignment-dataset-n
            if not re.match('^alignment-dataset-[0-9]+$', section):
                error_list.append(f'*** ERROR: the section "{section}" has a wrong identification.')
                OK = False
                continue

            alignment_dataset = htseq_count_option_dict.get(section, {})

            # check section "alignment-dataset-n" - key "alignment_software"
            alignment_software = alignment_dataset.get('alignment_software', not_found)
            if alignment_software == not_found:
                error_list.append(missing_key_error('alignment_software', section))
                OK = False
            elif not xlib.check_code(alignment_software, get_alignment_software_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "alignment_software" has to be {get_alignment_software_code_list_text()}.')
                OK = False

            # check section "alignment-dataset-n" - key "alignment_dataset_id"
            alignment_dataset_id = alignment_dataset.get('alignment_dataset_id', not_found)
            if alignment_dataset_id == not_found:
                error_list.append(missing_key_error('alignment_dataset_id', section))
                OK = False
            elif not xlib.check_startswith(alignment_dataset_id, get_alignment_software_code_list(), case_sensitive=True):
                error_list.append(f'*** ERROR: the key "alignment_dataset_id" has to start with {get_alignment_software_code_list_text()}.')
                OK = False

        # check section "htseq-count parameters"
        if parameters_section not in sections_list:
            error_list.append('*** ERROR: the section "htseq-count parameters" is not found.')
            OK = False
        else:
            parameters = htseq_count_option_dict.get(parameters_section, {})

            # check section "htseq-count parameters" - key "nprocesses"
            nprocesses = parameters.get('nprocesses', not_found)
            if nprocesses == not_found:
                error_list.append(missing_key_error('nprocesses', parameters_section))
                OK = False
            elif not xlib.check_int(nprocesses, minimum=1):
                error_list.append('*** ERROR: the key "nprocesses" has to be an integer number greater than or equal to 1.')
                OK = False

            # check section "htseq-count parameters" - key "stranded"
            stranded = parameters.get('stranded', not_found)
            if stranded == not_found:
                error_list.append(missing_key_error('stranded', parameters_section))
                OK = False
            elif not xlib.check_code(stranded, get_stranded_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "stranded" has to be {get_stranded_code_list_text()}.')
                OK = False

            # check section "htseq-count parameters" - key "minaqual"
            minaqual = parameters.get('minaqual', not_found)
            if minaqual == not_found:
                error_list.append(missing_key_error('minaqual', parameters_section))
                OK = False
            elif not xlib.check_int(minaqual):
                error_list.append('*** ERROR: the key "minaqual" has to be an integer number.')
                OK = False

            # check section "htseq-count parameters" - keys "type" and "idattr"
            # (only their presence is checked)
            feature_type = parameters.get('type', not_found)
            if feature_type == not_found:
                error_list.append(missing_key_error('type', parameters_section))
                OK = False
            idattr = parameters.get('idattr', not_found)
            if idattr == not_found:
                error_list.append(missing_key_error('idattr', parameters_section))
                OK = False

            # check section "htseq-count parameters" - key "mode"
            mode = parameters.get('mode', not_found)
            if mode == not_found:
                error_list.append(missing_key_error('mode', parameters_section))
                OK = False
            elif not xlib.check_code(mode, get_mode_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "mode" has to be {get_mode_code_list_text()}.')
                OK = False

            # check section "htseq-count parameters" - key "nonunique"
            nonunique = parameters.get('nonunique', not_found)
            if nonunique == not_found:
                error_list.append(missing_key_error('nonunique', parameters_section))
                OK = False
            elif not xlib.check_code(nonunique, get_nonunique_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "nonunique" has to be {get_nonunique_code_list_text()}.')
                OK = False

            # check section "htseq-count parameters" - key "other_parameters"
            not_allowed_parameters_list = [
                'nprocesses', 'format', 'stranded', 'minaqual', 'type',
                'idattr', 'mode', 'nonunique', 'quiet'
            ]
            other_parameters = parameters.get('other_parameters', not_found)
            if other_parameters == not_found:
                error_list.append(missing_key_error('other_parameters', parameters_section))
                OK = False
            elif other_parameters.upper() != 'NONE':
                (other_parameters_OK, other_parameters_error_list) = xlib.check_parameter_list(other_parameters, "other_parameters", not_allowed_parameters_list)
                error_list = error_list + other_parameters_error_list
                # combine with the previous result: a valid "other_parameters"
                # must not hide the errors found in earlier checks
                if not other_parameters_OK:
                    OK = False

    # warn that the config file is not valid if there are any errors
    if not OK:
        error_list.append(f'\nThe {xlib.get_htseq_count_name()} config file is not valid. Please, correct this file or recreate it.')

    # return the control variable and the error list
    return (OK, error_list)
Esempio n. 4
0
def run_gzip_process(cluster_name, dataset_type, log, function=None):
    '''
    Run a gzip process in the cluster.

    The steps are: check the gzip config file, connect to the cluster (SSH
    client, SSH transport and SFTP client), check that the cluster is running,
    create the run directory, build/upload/make executable the process script
    and the process starter, and submit the starter. Each step is executed
    only if all the previous ones were OK. The SSH connections that were
    opened are always closed before returning, even when a step failed.

    Parameters:
        cluster_name: name of the cluster where the process is run.
        dataset_type: dataset type; "reference" and "database" are used as
            owner of the run directory, any other value makes the experiment
            identification of the config file be used instead.
        log: object with a "write" method receiving the progress messages.
        function: optional callable without arguments that is invoked just
            before returning.

    Returns:
        The control variable: True if every executed step was OK (the last
        one being the submission of the starter), False otherwise.
    '''

    def write_errors(errors):
        # write every message of an error list in the log, one per line
        for error in errors:
            log.write(f'{error}\n')

    # initialize the control variable
    OK = True

    # initialize the connections; they remain None until they are successfully created,
    # so the closing steps know what has to be released
    ssh_client = None
    ssh_transport = None

    # get the gzip code and name
    gzip_code = xlib.get_gzip_code()
    gzip_name = xlib.get_gzip_name()

    # get the gzip process script path in the local computer
    gzip_process_script = get_gzip_process_script(dataset_type)
    script_name = os.path.basename(gzip_process_script)

    # get the gzip process starter path in the local computer
    gzip_process_starter = get_gzip_process_starter(dataset_type)
    starter_name = os.path.basename(gzip_process_starter)

    # warn that the log window does not have to be closed
    if not isinstance(log, xlib.DevStdOut):
        log.write('This process might take several minutes. Do not close this window, please wait!\n')

    # check the gzip config file
    log.write(f'{xlib.get_separator()}\n')
    log.write(f'Checking the {gzip_name} config file ...\n')
    (OK, error_list) = check_gzip_config_file(dataset_type, strict=True)
    if OK:
        log.write('The file is OK.\n')
    else:
        write_errors(error_list)
        log.write('*** ERROR: The config file is not valid.\n')
        log.write('Please correct this file or recreate the config files.\n')

    # get the experiment identification; the config file is read only after it has been
    # validated, so a wrong file is reported instead of raising an exception here
    if OK:
        gzip_option_dict = xlib.get_option_dict(get_gzip_config_file(dataset_type))
        experiment_id = gzip_option_dict['identification']['experiment_id']

    # create the SSH client connection
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write('Connecting the SSH client ...\n')
        (OK, error_list, ssh_client) = xssh.create_ssh_client_connection(cluster_name)
        if OK:
            log.write('The SSH client is connected.\n')
        else:
            ssh_client = None
            write_errors(error_list)

    # create the SSH transport connection
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write('Connecting the SSH transport ...\n')
        (OK, error_list, ssh_transport) = xssh.create_ssh_transport_connection(cluster_name)
        if OK:
            log.write('The SSH transport is connected.\n')
        else:
            ssh_transport = None
            write_errors(error_list)

    # create the SFTP client
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write('Connecting the SFTP client ...\n')
        sftp_client = xssh.create_sftp_client(ssh_transport)
        log.write('The SFTP client is connected.\n')

    # warn that the requirements are being verified
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write('Checking process requirements ...\n')

    # check the master is running (16 is the state code accepted as "running")
    if OK:
        (master_state_code, master_state_name) = xec2.get_node_state(cluster_name)
        if master_state_code != 16:
            log.write(f'*** ERROR: The cluster {cluster_name} is not running. Its state is {master_state_code} ({master_state_name}).\n')
            OK = False

    # warn that the requirements are OK
    if OK:
        log.write('Process requirements are OK.\n')

    # determine the run directory in the cluster
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write('Determining the run directory in the cluster ...\n')
        if dataset_type in ('reference', 'database'):
            current_run_dir = xlib.get_cluster_current_run_dir(dataset_type, gzip_code)
        else:
            current_run_dir = xlib.get_cluster_current_run_dir(experiment_id, gzip_code)
        command = f'mkdir --parents {current_run_dir}'
        (OK, stdout, stderr) = xssh.execute_cluster_command(ssh_client, command)
        if OK:
            log.write(f'The directory path is {current_run_dir}.\n')
        else:
            log.write(f'*** ERROR: Wrong command ---> {command}\n')

    # build the gzip process script
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write(f'Building the process script {gzip_process_script} ...\n')
        (OK, error_list) = build_gzip_process_script(cluster_name, dataset_type, current_run_dir)
        if OK:
            log.write('The file is built.\n')
        else:
            write_errors(error_list)
            log.write('*** ERROR: The file could not be built.\n')

    # upload the gzip process script to the cluster
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write(f'Uploading the process script {gzip_process_script} to the directory {current_run_dir} ...\n')
        cluster_path = f'{current_run_dir}/{script_name}'
        (OK, error_list) = xssh.put_file(sftp_client, gzip_process_script, cluster_path)
        if OK:
            log.write('The file is uploaded.\n')
        else:
            write_errors(error_list)

    # set run permision to the gzip process script in the cluster
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write(f'Setting on the run permision of {current_run_dir}/{script_name} ...\n')
        command = f'chmod u+x {current_run_dir}/{script_name}'
        (OK, stdout, stderr) = xssh.execute_cluster_command(ssh_client, command)
        if OK:
            log.write('The run permision is set.\n')
        else:
            log.write(f'*** ERROR: Wrong command ---> {command}\n')

    # build the gzip process starter
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write(f'Building the process starter {gzip_process_starter} ...\n')
        (OK, error_list) = build_gzip_process_starter(dataset_type, current_run_dir)
        if OK:
            log.write('The file is built.\n')
        else:
            write_errors(error_list)
            log.write('***ERROR: The file could not be built.\n')

    # upload the gzip process starter to the cluster
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write(f'Uploading the process starter {gzip_process_starter} to the directory {current_run_dir} ...\n')
        cluster_path = f'{current_run_dir}/{starter_name}'
        (OK, error_list) = xssh.put_file(sftp_client, gzip_process_starter, cluster_path)
        if OK:
            log.write('The file is uploaded.\n')
        else:
            write_errors(error_list)

    # set run permision to the gzip process starter in the cluster
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write(f'Setting on the run permision of {current_run_dir}/{starter_name} ...\n')
        command = f'chmod u+x {current_run_dir}/{starter_name}'
        (OK, stdout, stderr) = xssh.execute_cluster_command(ssh_client, command)
        if OK:
            log.write('The run permision is set.\n')
        else:
            log.write(f'*** ERROR: Wrong command ---> {command}\n')

    # submit the gzip process
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write(f'Submitting the process script {current_run_dir}/{starter_name} ...\n')
        OK = xssh.submit_script(cluster_name, ssh_client, current_run_dir, starter_name, log)

    # close the SSH transport connection; it is done whenever the connection was opened
    # (not only when every step was OK) so that a failed run does not leak it
    if ssh_transport is not None:
        log.write(f'{xlib.get_separator()}\n')
        log.write('Closing the SSH transport connection ...\n')
        xssh.close_ssh_transport_connection(ssh_transport)
        log.write('The connection is closed.\n')

    # close the SSH client connection; same criterion as the SSH transport connection
    if ssh_client is not None:
        log.write(f'{xlib.get_separator()}\n')
        log.write('Closing the SSH client connection ...\n')
        xssh.close_ssh_client_connection(ssh_client)
        log.write('The connection is closed.\n')

    # warn that the log window can be closed
    if not isinstance(log, xlib.DevStdOut):
        log.write(f'{xlib.get_separator()}\n')
        log.write('You can close this window now.\n')

    # execute final function
    if function is not None:
        function()

    # return the control variable
    return OK
Esempio n. 5
0
def build_gzip_process_script(cluster_name, dataset_type, current_run_dir):
    '''
    Build the gzip process script in the local computer.

    The options are read from the gzip config file of "dataset_type". The
    script path, the dataset directory and the commands written depend on the
    dataset type found in the section "identification" of that config file:
    "reference", "database", "read" and "result" datasets get a gzip command
    per section "file-n"; a "whole-result" dataset gets a tar command on the
    whole dataset directory followed by its removal.

    Parameters:
        cluster_name: cluster name echoed by the script and used in the mails.
        dataset_type: dataset type used to locate the gzip config file.
        current_run_dir: run directory in the cluster where the script works.

    Returns:
        A tuple (OK, error_list): OK is False and error_list holds the
        messages when the script file can not be written.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # get the gzip option dictionary and the options
    option_dict = xlib.get_option_dict(get_gzip_config_file(dataset_type))
    identification = option_dict['identification']
    experiment_id = identification['experiment_id']
    dataset_type_2 = identification['dataset_type']
    dataset_id = identification['dataset_id']
    action = option_dict['gzip parameters']['action']

    # get the (dataset subdirectory, file name) pair of every section whose
    # identification is like file-n, following the sorted order of the sections
    file_pair_list = [
        (option_dict[section]['dataset_subdirectory'], option_dict[section]['file_name'])
        for section in sorted(option_dict.keys())
        if re.match('^file-[0-9]+$', section)
    ]

    # get the dataset directory
    if dataset_type_2 == 'reference':
        dataset_dir = xlib.get_cluster_reference_dataset_dir(dataset_id)
    elif dataset_type_2 == 'database':
        dataset_dir = xlib.get_cluster_database_dataset_dir(dataset_id)
    elif dataset_type_2 == 'read':
        dataset_dir = xlib.get_cluster_experiment_read_dataset_dir(experiment_id, dataset_id)
    elif dataset_type_2 in ('result', 'whole-result'):
        dataset_dir = xlib.get_cluster_experiment_result_dataset_dir(experiment_id, dataset_id)

    # script lines that are written several times
    rule = '#-------------------------------------------------------------------------------\n'
    echo_sep = '    echo "$SEP"\n'
    time_command = '    /usr/bin/time \\\n'
    time_format = '        --format="Elapsed real time (s): %e\\nCPU time in kernel mode (s): %S\\nCPU time in user mode (s): %U\\nPercentage of CPU: %P\\nMaximum resident set size(Kb): %M\\nAverage total memory use (Kb):%K" \\\n'
    get_rc = '    RC=$?\n'

    # write the gzip process script
    try:
        script_path = get_gzip_process_script(dataset_type_2)
        script_dir = os.path.dirname(script_path)
        if not os.path.exists(script_dir):
            os.makedirs(script_dir)
        with open(script_path, mode='w', encoding='iso-8859-1', newline='\n') as script:
            write = script.write
            write_lines = script.writelines

            # environment
            write_lines([
                '#!/bin/bash\n',
                rule,
                'SEP="#########################################"\n',
                'export HOST_IP=`curl --silent checkip.amazonaws.com`\n',
                'export HOST_ADDRESS="ec2-${HOST_IP//./-}-compute-1.amazonaws.com"\n',
                'export AWS_CONFIG_FILE=/home/ubuntu/.aws/config\n',
                'export AWS_SHARED_CREDENTIALS_FILE=/home/ubuntu/.aws/credentials\n',
                rule,
            ])

            # status files
            write(f'STATUS_DIR={xlib.get_status_dir(current_run_dir)}\n')
            write(f'SCRIPT_STATUS_OK={xlib.get_status_ok(current_run_dir)}\n')
            write(f'SCRIPT_STATUS_WRONG={xlib.get_status_wrong(current_run_dir)}\n')
            write_lines([
                'mkdir --parents $STATUS_DIR\n',
                'if [ -f $SCRIPT_STATUS_OK ]; then rm $SCRIPT_STATUS_OK; fi\n',
                'if [ -f $SCRIPT_STATUS_WRONG ]; then rm $SCRIPT_STATUS_WRONG; fi\n',
                rule,
            ])

            # function init
            write_lines([
                'function init\n',
                '{\n',
                '    INIT_DATETIME=`date --utc +%s`\n',
                '    FORMATTED_INIT_DATETIME=`date --date="@$INIT_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n',
                echo_sep,
                '    echo "Script started at $FORMATTED_INIT_DATETIME+00:00."\n',
                echo_sep,
            ])
            write(f'    echo "CLUSTER: {cluster_name}"\n')
            write_lines([
                '    echo "HOST NAME: $HOSTNAME"\n',
                '    echo "HOST IP: $HOST_IP"\n',
                '    echo "HOST ADDRESS: $HOST_ADDRESS"\n',
                '}\n',
                rule,
            ])

            # function run_gzip_process
            write('function run_gzip_process\n')
            write('{\n')
            if dataset_type_2 in ['reference', 'database', 'read', 'result']:
                # one gzip command per file
                write(f'    cd {current_run_dir}\n')
                for dataset_subdirectory, file_name in file_pair_list:
                    write(echo_sep)
                    file_path = f'{dataset_dir}/{dataset_subdirectory}/{file_name}'
                    write(f'    echo "Compressing/decompressing {file_path} ..."\n')
                    write(time_command)
                    write(time_format)
                    if action == 'compress':
                        write(f'        gzip {file_path}\n')
                    elif action == 'decompress':
                        write(f'        gzip --decompress {file_path}\n')
                    write(get_rc)
                    write('    if [ $RC -ne 0 ]; then manage_error gzip $RC; fi\n')
            elif dataset_type_2 == 'whole-result':
                # one tar command on the dataset directory and its removal
                write(f'    cd {current_run_dir}\n')
                write(echo_sep)
                write(f'    echo "Compressing/decompressing {dataset_dir} ..."\n')
                write(time_command)
                write(time_format)
                if action == 'compress':
                    write(f'        tar --create --gzip --verbose --file={dataset_dir}.tar.gz {dataset_dir}\n')
                elif action == 'decompress':
                    write(f'        tar --extract --gzip --verbose --file={dataset_dir} --directory=/\n')
                write(get_rc)
                write('    if [ $RC -ne 0 ]; then manage_error tar $RC; fi\n')
                write(echo_sep)
                write(f'    echo "Removing {dataset_dir} ..."\n')
                write(time_command)
                write(time_format)
                write(f'        rm -rf {dataset_dir}\n')
                write(get_rc)
                write('    if [ $RC -ne 0 ]; then manage_error rm $RC; fi\n')
            write('}\n')
            write(rule)

            # function end
            write_lines([
                'function end\n',
                '{\n',
                '    END_DATETIME=`date --utc +%s`\n',
                '    FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n',
                '    calculate_duration\n',
                echo_sep,
                '    echo "Script ended OK at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."\n',
                echo_sep,
                '    send_mail ok\n',
                '    touch $SCRIPT_STATUS_OK\n',
                '    exit 0\n',
                '}\n',
                rule,
            ])

            # function manage_error
            write_lines([
                'function manage_error\n',
                '{\n',
                '    END_DATETIME=`date --utc +%s`\n',
                '    FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n',
                '    calculate_duration\n',
                echo_sep,
                '    echo "ERROR: $1 returned error $2"\n',
                '    echo "Script ended WRONG at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."\n',
                echo_sep,
                '    send_mail wrong\n',
                '    touch $SCRIPT_STATUS_WRONG\n',
                '    exit 3\n',
                '}\n',
                rule,
            ])

            # function send_mail
            process_name = f'{xlib.get_gzip_name()} process'
            mail_message_ok = xlib.get_mail_message_ok(process_name, cluster_name)
            mail_message_wrong = xlib.get_mail_message_wrong(process_name, cluster_name)
            write('function send_mail\n')
            write('{\n')
            write(f'    SUBJECT="{xlib.get_project_name()}: {process_name}"\n')
            write('    if [ "$1" == "ok" ]; then\n')
            write(f'        MESSAGE="{mail_message_ok}"\n')
            write('    elif [ "$1" == "wrong" ]; then\n')
            write(f'        MESSAGE="{mail_message_wrong}"\n')
            write_lines([
                '    else\n',
                '         MESSAGE=""\n',
                '    fi\n',
                '    DESTINATION_FILE=mail-destination.json\n',
                '    echo "{" > $DESTINATION_FILE\n',
            ])
            write(f'    echo "    \\\"ToAddresses\\\":  [\\\"{xconfiguration.get_contact_data()}\\\"]," >> $DESTINATION_FILE\n')
            write_lines([
                '    echo "    \\\"CcAddresses\\\":  []," >> $DESTINATION_FILE\n',
                '    echo "    \\\"BccAddresses\\\":  []" >> $DESTINATION_FILE\n',
                '    echo "}" >> $DESTINATION_FILE\n',
                '    MESSAGE_FILE=mail-message.json\n',
                '    echo "{" > $MESSAGE_FILE\n',
                '    echo "    \\\"Subject\\\": {" >> $MESSAGE_FILE\n',
                '    echo "        \\\"Data\\\":  \\\"$SUBJECT\\\"," >> $MESSAGE_FILE\n',
                '    echo "        \\\"Charset\\\":  \\\"UTF-8\\\"" >> $MESSAGE_FILE\n',
                '    echo "    }," >> $MESSAGE_FILE\n',
                '    echo "    \\\"Body\\\": {" >> $MESSAGE_FILE\n',
                '    echo "        \\\"Html\\\": {" >> $MESSAGE_FILE\n',
                '    echo "            \\\"Data\\\":  \\\"$MESSAGE\\\"," >> $MESSAGE_FILE\n',
                '    echo "            \\\"Charset\\\":  \\\"UTF-8\\\"" >> $MESSAGE_FILE\n',
                '    echo "        }" >> $MESSAGE_FILE\n',
                '    echo "    }" >> $MESSAGE_FILE\n',
                '    echo "}" >> $MESSAGE_FILE\n',
            ])
            write(f'    aws ses send-email --from {xconfiguration.get_contact_data()} --destination file://$DESTINATION_FILE --message file://$MESSAGE_FILE\n')
            write('}\n')
            write(rule)

            # function calculate_duration and main sequence
            write_lines([
                'function calculate_duration\n',
                '{\n',
                '    DURATION=`expr $END_DATETIME - $INIT_DATETIME`\n',
                '    HH=`expr $DURATION / 3600`\n',
                '    MM=`expr $DURATION % 3600 / 60`\n',
                '    SS=`expr $DURATION % 60`\n',
                '    FORMATTED_DURATION=`printf "%03d:%02d:%02d\\n" $HH $MM $SS`\n',
                '}\n',
                rule,
                'init\n',
                'run_gzip_process\n',
                'end\n',
            ])
    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append(f'*** ERROR: The file {get_gzip_process_script(dataset_type_2)} can not be created')
        OK = False

    # return the control variable and the error list
    return (OK, error_list)
Esempio n. 6
0
def validate_gmap_config_file(strict):
    '''
    Validate the GMAP config file of a run.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # intitialize variable used when value is not found
    not_found = '***NOTFOUND***'.upper()

    # get the option dictionary
    try:
        gmap_option_dict = xlib.get_option_dict(get_gmap_config_file())
    except:
        error_list.append('*** ERROR: The syntax is WRONG.')
        OK = False
    else:

        # get the sections list
        sections_list = []
        for section in gmap_option_dict.keys():
            sections_list.append(section)
        sections_list.sort()

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append('*** ERROR: the section "identification" is not found.')
            OK = False
        else:

            # check section "identification" - key "experiment_id"
            experiment_id = gmap_option_dict.get('identification', {}).get('experiment_id', not_found)
            is_experiment_id_OK = True
            if experiment_id == not_found:
                error_list.append('*** ERROR: the key "experiment_id" is not found in the section "identification".')
                is_experiment_id_OK = False
                OK = False

            # check section "identification" - key "reference_dataset_id"
            reference_dataset_id = gmap_option_dict.get('identification', {}).get('reference_dataset_id', not_found)
            is_reference_dataset_id_OK = True
            if reference_dataset_id == not_found:
                error_list.append('*** ERROR: the key "reference_dataset_id" is not found in the section "identification".')
                is_reference_dataset_id_OK = False
                OK = False

            # check section "identification" - key "reference_file"
            reference_file = gmap_option_dict.get('identification', {}).get('reference_file', not_found)
            is_reference_file_OK = True
            if reference_file == not_found:
                error_list.append('*** ERROR: the key "reference_file" is not found in the section "identification".')
                is_reference_file_OK = False
                OK = False

            # check section "identification" - key "assembly_software"
            assembly_software = gmap_option_dict.get('identification', {}).get('assembly_software', not_found)
            is_assembly_software_OK = True
            if assembly_software == not_found:
                error_list.append('*** ERROR: the key "assembly_software" is not found in the section "identification".')
                is_assembly_software_OK = False
                OK = False
            elif assembly_software not in [xlib.get_soapdenovotrans_code(), xlib.get_transabyss_code(), xlib.get_trinity_code(), xlib.get_star_code(), xlib.get_cd_hit_est_code(), xlib.get_transcript_filter_code()]:
                error_list.append('*** ERROR: the key "assembly_software" value in the section "identification" must be {0} or {1} or {2} or {3} or {4} OR {5}.'.format(xlib.get_soapdenovotrans_code(), xlib.get_transabyss_code(), xlib.get_trinity_code(), xlib.get_star_code(), xlib.get_cd_hit_est_code(), xlib.get_transcript_filter_code()))
                is_assembly_software_OK = False
                OK = False

            # check section "identification" - key "assembly_dataset_id"
            assembly_dataset_id = gmap_option_dict.get('identification', {}).get('assembly_dataset_id', not_found)
            is_assembly_dataset_id_OK = True
            if assembly_dataset_id == not_found:
                error_list.append('*** ERROR: the key "assembly_dataset_id" is not found in the section "identification".')
                is_assembly_dataset_id_OK = False
                OK = False
            elif not assembly_dataset_id.startswith(xlib.get_soapdenovotrans_code()) and not assembly_dataset_id.startswith(xlib.get_transabyss_code()) and not assembly_dataset_id.startswith(xlib.get_trinity_code()) and not assembly_dataset_id.startswith(xlib.get_star_code()) and not assembly_dataset_id.startswith(xlib.get_cd_hit_est_code()) and not assembly_dataset_id.startswith(xlib.get_transcript_filter_code()):
                error_list.append('*** ERROR: the key "assembly_dataset_id" value is not a {0} nor {1} nor {2} nor {3} nor {4} nor {5} assembly.'.format(xlib.get_soapdenovotrans_name(), xlib.get_transabyss_name(), xlib.get_trinity_name(), xlib.get_star_name(), xlib.get_cd_hit_est_name(), xlib.get_transcript_filter_code()))
                is_assembly_dataset_id_OK = False
                OK = False

            # check section "identification" - key "assembly_type"
            assembly_type = gmap_option_dict.get('identification', {}).get('assembly_type', not_found)
            is_assembly_type_OK = True
            if assembly_type == not_found:
                error_list.append('*** ERROR: the key "assembly_type" is not found in the section "identification".')
                is_assembly_type_OK = False
                OK = False
            elif assembly_dataset_id.startswith(xlib.get_soapdenovotrans_code()):
                if assembly_type.upper() not in ['CONTIGS', 'SCAFFOLDS']:
                    error_list.append('*** ERROR: the key "assembly_type" must be "CONTIGS" or "SCAFFOLDS" when {0} is the assembly software.'.format(xlib.get_soapdenovotrans_name()))
                    is_assembly_type_OK = False
                    OK = False
            elif assembly_dataset_id.startswith(xlib.get_transabyss_code()) or assembly_dataset_id.startswith(xlib.get_trinity_code()) or assembly_dataset_id.startswith(xlib.get_star_code()) or assembly_dataset_id.startswith(xlib.get_cd_hit_est_code()) or assembly_dataset_id.startswith(xlib.get_transcript_filter_code()):
                if assembly_type.upper() != 'NONE':
                    error_list.append('*** ERROR: the key "assembly_type" must be "NONE" when {0} or {1} or {2} or {3} or {4} is the assembly software.'.format(xlib.get_transabyss_name(), xlib.get_trinity_name(), xlib.get_star_name(), xlib.get_cd_hit_est_name(), xlib.get_transcript_filter_name()))
                    is_assembly_type_OK = False
                    OK = False

        # check section "GMAP parameters"
        if 'GMAP parameters' not in sections_list:
            error_list.append('*** ERROR: the section "GMAP parameters" is not found.')
            OK = False
        else:

            # check section "GMAP parameters" - key "threads"
            threads = gmap_option_dict.get('GMAP parameters', {}).get('threads', not_found)
            is_threads_OK = True
            if threads == not_found:
                error_list.append('*** ERROR: the key "threads" is not found in the section "GMAP parameters".')
                is_threads_OK = False
                OK = False
            else:
                try:
                    if int(threads) < 1:
                        error_list.append('*** ERROR: the key "threads" in the section "GMAP parameters" must be an integer value greater or equal to 1.')
                        is_threads_OK = False
                        OK = False
                except:
                    error_list.append('*** ERROR: the key "threads" in the section "GMAP parameters" must be an integer value greater or equal to 1.')
                    is_threads_OK = False
                    OK = False

            # check section "GMAP parameters" - key "kmer"
            kmer = gmap_option_dict.get('GMAP parameters', {}).get('kmer', not_found)
            is_kmer_OK = True
            if kmer == not_found:
                error_list.append('*** ERROR: the key "kmer" is not found in the section "GMAP parameters".')
                is_kmer_OK = False
                OK = False
            else:
                try:
                    if kmer.upper() != 'NONE' and (int(kmer) < 1 or int(kmer) > 16):
                        error_list.append('*** ERROR: the key "kmer" in the section "GMAP parameters" must be an integer value between 1 and 16 or NONE.')
                        is_kmer_OK = False
                        OK = False
                except:
                    error_list.append('*** ERROR: the key "kmer" in the section "GMAP parameters" must be an integer value between 1 and 16 or NONE.')
                    is_kmer_OK = False
                    OK = False

            # check section "GMAP parameters" - key "sampling"
            sampling = gmap_option_dict.get('GMAP parameters', {}).get('sampling', not_found)
            is_sampling_OK = True
            if sampling == not_found:
                error_list.append('*** ERROR: the key "sampling" is not found in the section "GMAP parameters".')
                is_sampling_OK = False
                OK = False
            else:
                try:
                    if sampling.upper() != 'NONE' and int(sampling) < 1:
                        error_list.append('*** ERROR: the key "sampling" in the section "GMAP parameters" must be an integer value greater or equal to 1 or NONE.')
                        is_sampling_OK = False
                        OK = False
                except:
                    error_list.append('*** ERROR: the key "sampling" in the section "GMAP parameters" must be an integer value greater or equal to 1 or NONE.')
                    is_sampling_OK = False
                    OK = False

            # check section "GMAP parameters" - key "input-buffer-size"
            input_buffer_size = gmap_option_dict.get('GMAP parameters', {}).get('input-buffer-size', not_found)
            is_input_buffer_size_OK = True
            if input_buffer_size == not_found:
                error_list.append('*** ERROR: the key "input-buffer-size" is not found in the section "GMAP parameters".')
                is_input_buffer_size_OK = False
                OK = False
            else:
                try:
                    if int(input_buffer_size) < 1:
                        error_list.append('*** ERROR: the key "input-buffer-size" in the section "GMAP parameters" must be an integer value greater or equal to 1.')
                        is_input_buffer_size_OK = False
                        OK = False
                except:
                    error_list.append('*** ERROR: the key "input-buffer-size" in the section "GMAP parameters" must be an integer value greater or equal to 1.')
                    is_input_buffer_size_OK = False
                    OK = False

            # check section "GMAP parameters" - key "output-buffer-size"
            output_buffer_size = gmap_option_dict.get('GMAP parameters', {}).get('output-buffer-size', not_found)
            is_output_buffer_size_OK = True
            if output_buffer_size == not_found:
                error_list.append('*** ERROR: the key "output-buffer-size" is not found in the section "GMAP parameters".')
                is_output_buffer_size_OK = False
                OK = False
            else:
                try:
                    if int(output_buffer_size) < 1:
                        error_list.append('*** ERROR: the key "output-buffer-size" in the section "GMAP parameters" must be an integer value greater or equal to 1.')
                        is_output_buffer_size_OK = False
                        OK = False
                except:
                    error_list.append('*** ERROR: the key "output-buffer-size" in the section "GMAP parameters" must be an integer value greater or equal to 1.')
                    is_output_buffer_size_OK = False
                    OK = False

            # check section "GMAP parameters" - key "prunelevel"
            prunelevel = gmap_option_dict.get('GMAP parameters', {}).get('prunelevel', not_found)
            is_prunelevel_OK = True
            if prunelevel == not_found:
                error_list.append('*** ERROR: the key "prunelevel" is not found in the section "GMAP parameters".')
                is_prunelevel_OK = False
                OK = False
            else:
                if prunelevel not in ['0', '1', '2', '3']:
                    error_list.append('*** ERROR: the key "prunelevel" in the section "GMAP parameters" must be 0 (no pruning) or 1 (poor seqs) or 2 (repetitive seqs) or 3 (poor and repetitive).')
                    is_prunelevel_OK = False
                    OK = False

            # check section "GMAP parameters" - key "format"
            format = gmap_option_dict.get('GMAP parameters', {}).get('format', not_found)
            is_format_OK = True
            if format == not_found:
                error_list.append('*** ERROR: the key "format" is not found in the section "GMAP parameters".')
                is_format_OK = False
                OK = False
            else:
                if format.upper() not in ['COMPRESS', 'SUMMARY', 'ALIGN', 'PLS', 'GFF3_GENE', 'SPLICESITES', 'INTRONS', 'MAP_EXONS', 'MAP_RANGES', 'COORDS']:
                    error_list.append('*** ERROR: the key "format" in the section "GMAP parameters" must be COMPRESS or SUMMARY or ALIGN or PLS or GFF3_GENE or SPLICESITES or INTRONS or MAP_EXONS or MAP_RANGES or COORDS.')
                    is_format_OK = False
                    OK = False

            # check section "GMAP parameters" - key "other_parameters"
            not_allowed_parameters_list = ['nthreads', 'kmer', 'sampling', 'input-buffer-size', 'output-buffer-size', 'prunelevel', 'compress', 'summary', 'align', 'format' ]
            other_parameters = gmap_option_dict.get('GMAP parameters', {}).get('other_parameters', not_found)
            is_other_parameters_OK = True
            if other_parameters == not_found:
                error_list.append('*** ERROR: the key "other_parameters" is not found in the section "GMAP parameters".')
                is_other_parameters_OK = False
                OK = False
            else:
                if other_parameters.upper() != 'NONE':
                    parameter_list = [x.strip() for x in other_parameters.split(';')]
                    for parameter in parameter_list:
                        try:
                            if parameter.find('=') > 0:
                                pattern = r'^--(.+)=(.+)$'
                                mo = re.search(pattern, parameter)
                                parameter_name = mo.group(1).strip()
                                parameter_value = mo.group(2).strip()
                            else:
                                pattern = r'^--(.+)$'
                                mo = re.search(pattern, parameter)
                                parameter_name = mo.group(1).strip()
                        except:
                            error_list.append('*** ERROR: the value of the key "other_parameters" in the section "GMAP parameters" must be NONE or a valid parameter list.')
                            is_other_parameters_OK = False
                            OK = False
                            break
                        else:
                            if parameter_name in not_allowed_parameters_list:
                                error_list.append('*** ERROR: the parameter {0} is not allowed in the key "other_parameters" of the section "GMAP parameters" because it is controled by NGScloud.'.format(parameter_name))
                                is_other_parameters_OK = False
                                OK = False

    # warn that the results config file is not valid if there are any errors
    if not OK:
        error_list.append('\nThe {0} config file is not valid. Please, correct this file or recreate it.'.format(xlib.get_gmap_name()))

    # return the control variable and the error list
    return (OK, error_list)
Esempio n. 7
0
def upload_database_dataset(cluster_name, log, function=None):
    '''
    Upload the database dataset to the cluster.

    The database transfer config file is checked, the SSH client, the SSH
    transport and the SFTP client are created, the database directory is
    created in the cluster and the file of every section "file-n" of the
    config file is uploaded to that directory. The upload stops at the first
    file that can not be transferred.

    Parameters:
        cluster_name: cluster name passed to the xssh connection functions.
        log: object with a "write" method where the progress is written.
        function: optional callable without arguments executed at the end.

    Returns:
        The control variable: True if every step ended OK, else False.
    '''

    # initialize the control variable
    OK = True

    # initialize the connections; they remain None until they are created OK,
    # so the final clean-up knows what has to be closed
    ssh_client = None
    ssh_transport = None

    # warn that the log window does not have to be closed
    if not isinstance(log, xlib.DevStdOut):
        log.write('This process might take several minutes. Do not close this window, please wait!\n')

    # check the database transfer config file
    log.write(f'{xlib.get_separator()}\n')
    log.write('Checking the database transfer config file ...\n')
    check_result = check_database_transfer_config_file(strict=True)
    # NOTE(review): the other config check functions of this file return a
    # tuple (OK, error_list); a non-empty tuple is always true, so the tuple is
    # unpacked when it is received -- confirm the return type of the function
    if isinstance(check_result, tuple):
        is_config_OK = bool(check_result[0]) if check_result else False
        config_error_list = check_result[1] if len(check_result) > 1 else []
    else:
        is_config_OK = bool(check_result)
        config_error_list = []
    if is_config_OK:
        log.write('The file is OK.\n')
    else:
        log.write('*** ERROR: The database transfer config file is not valid.\n')
        log.write('Please correct this file or recreate the config files.\n')
        for error in config_error_list:
            log.write(f'{error}\n')
        OK = False

    try:

        # create the SSH client connection
        if OK:
            (OK, error_list, new_ssh_client) = xssh.create_ssh_client_connection(cluster_name)
            for error in error_list:
                log.write(f'{error}\n')
            if OK:
                ssh_client = new_ssh_client

        # create the SSH transport connection
        if OK:
            (OK, error_list, new_ssh_transport) = xssh.create_ssh_transport_connection(cluster_name)
            for error in error_list:
                log.write(f'{error}\n')
            if OK:
                ssh_transport = new_ssh_transport

        # create the SFTP client
        if OK:
            sftp_client = xssh.create_sftp_client(ssh_transport)

        # upload the database dataset
        if OK:

            # get the option dictionary
            database_transfer_options_dict = xlib.get_option_dict(get_database_transfer_config_file())

            # get the database dataset identification and the local directory of the database files
            database_dataset_id = database_transfer_options_dict['identification']['database_dataset_id']
            local_dir = database_transfer_options_dict['identification']['local_dir']

            # set the cluster database directory
            cluster_database_dir = f'{xlib.get_cluster_database_dir()}/{database_dataset_id}'

            # create the data directory in the cluster
            log.write(f'{xlib.get_separator()}\n')
            log.write(f'The database directory {cluster_database_dir} in the cluster is being created ...\n')
            command = f'mkdir --parents {cluster_database_dir}'
            (OK, stdout, stderr) = xssh.execute_cluster_command(ssh_client, command)
            if OK:
                log.write('The directory is created.\n')
            else:
                log.write(f'*** ERROR: Wrong command ---> {command}\n')

            # upload the files only if the directory has been created; otherwise
            # the result of the uploads would overwrite the mkdir error
            if OK:

                # for each section "file-n" (in sorted order of the section names)
                for section in sorted(database_transfer_options_dict.keys()):

                    # check that the section identification is like file-n
                    if re.match('^file-[0-9]+$', section):

                        # get the file name
                        file_name = database_transfer_options_dict[section]['file_name']

                        # set the local path and cluster path
                        local_path = f'{local_dir}/{file_name}'
                        cluster_path = f'{cluster_database_dir}/{file_name}'

                        # upload the database file to the cluster
                        log.write(f'{xlib.get_separator()}\n')
                        log.write(f'The file {file_name} is being uploaded to {cluster_database_dir} ...\n')
                        (OK, error_list) = xssh.put_file(sftp_client, local_path, cluster_path)
                        if OK:
                            log.write('The file has been uploaded.\n')
                        else:
                            for error in error_list:
                                log.write(f'{error}\n')
                            break

    finally:

        # close the SSH transport connection (also when a previous step failed,
        # so the connection is not leaked)
        if ssh_transport is not None:
            xssh.close_ssh_transport_connection(ssh_transport)

        # close the SSH client connection (also when a previous step failed)
        if ssh_client is not None:
            xssh.close_ssh_client_connection(ssh_client)

    # warn that the log window can be closed
    if not isinstance(log, xlib.DevStdOut):
        log.write(f'{xlib.get_separator()}\n')
        log.write('You can close this window now.\n')

    # execute final function
    if function is not None:
        function()

    # return the control variable
    return OK
Esempio n. 8
0
def build_express_process_script(cluster_name, current_run_dir):
    '''
    Build the current eXpress process script.

    The options are read from the eXpress config file and a Bash script is
    written to the path returned by get_express_process_script(). The script
    runs "express" on every file "*.sorted.bam" of each alignment dataset
    declared in a section "alignment-dataset-n" of the config file.

    Parameters:
        cluster_name: cluster name echoed by the script and used in the mail messages.
        current_run_dir: run directory used as CURRENT_DIR and to set the status files.

    Returns:
        A tuple (OK, error_list): OK is False and error_list holds the error
        messages if the transcriptome file can not be determined or the script
        can not be written.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # get the eXpress option dictionary
    express_option_dict = xlib.get_option_dict(get_express_config_file())

    # get the options
    experiment_id = express_option_dict['identification']['experiment_id']
    assembly_software = express_option_dict['identification']['assembly_software']
    assembly_dataset_id = express_option_dict['identification']['assembly_dataset_id']
    assembly_type = express_option_dict['identification']['assembly_type']
    frag_len_mean = express_option_dict['eXpress parameters']['frag-len-mean']
    frag_len_stddev = express_option_dict['eXpress parameters']['frag-len-stddev']
    library_type = express_option_dict['eXpress parameters']['library_type']
    max_indel_size = express_option_dict['eXpress parameters']['max-indel-size']
    no_bias_correct = express_option_dict['eXpress parameters']['no-bias-correct']
    no_error_model = express_option_dict['eXpress parameters']['no-error-model']
    other_parameters = express_option_dict['eXpress parameters']['other_parameters']

    # build the alignment dataset identification list with the sections whose
    # identification is like alignment-dataset-n (in sorted order of the section names)
    alignment_dataset_id_list = [
        express_option_dict[section]['alignment_dataset_id']
        for section in sorted(express_option_dict.keys())
        if re.match('^alignment-dataset-[0-9]+$', section)
    ]

    # set the transcriptome file path; it remains None if the assembly software
    # (or the assembly type of SOAPdenovo-Trans) is not one of the expected values
    assembly_dataset_dir = xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id)
    transcriptome_file = None
    if assembly_software == xlib.get_soapdenovotrans_code():
        if assembly_type == 'CONTIGS':
            transcriptome_file = f'{assembly_dataset_dir}/{experiment_id}-{assembly_dataset_id}.contig'
        elif assembly_type == 'SCAFFOLDS':
            transcriptome_file = f'{assembly_dataset_dir}/{experiment_id}-{assembly_dataset_id}.scafSeq'
    elif assembly_software == xlib.get_transabyss_code():
        transcriptome_file = f'{assembly_dataset_dir}/transabyss-final.fa'
    elif assembly_software == xlib.get_trinity_code():
        transcriptome_file = f'{assembly_dataset_dir}/Trinity.fasta'
    elif assembly_software == xlib.get_ggtrinity_code():
        transcriptome_file = f'{assembly_dataset_dir}/Trinity-GG.fasta'
    elif assembly_software == xlib.get_cd_hit_est_code():
        transcriptome_file = f'{assembly_dataset_dir}/clustered-transcriptome.fasta'
    elif assembly_software == xlib.get_transcript_filter_code():
        transcriptome_file = f'{assembly_dataset_dir}/filtered-transcriptome.fasta'

    # stop before writing anything if the transcriptome file is unknown; otherwise
    # the error would arise in the middle of the writing and leave a truncated script
    if transcriptome_file is None:
        error_list.append(f'*** ERROR: The transcriptome file can not be determined for the assembly software "{assembly_software}" and the assembly type "{assembly_type}".')
        return (False, error_list)

    # get the path of the eXpress process script
    script_path = get_express_process_script()

    # write the eXpress process script
    try:
        os.makedirs(os.path.dirname(script_path), exist_ok=True)
        with open(script_path, mode='w', encoding='iso-8859-1', newline='\n') as script_file_id:
            script_file_id.write( '#!/bin/bash\n')
            script_file_id.write( '#-------------------------------------------------------------------------------\n')
            script_file_id.write( 'SEP="#########################################"\n')
            script_file_id.write( 'export HOST_IP=`curl --silent checkip.amazonaws.com`\n')
            script_file_id.write( 'export HOST_ADDRESS="ec2-${HOST_IP//./-}-compute-1.amazonaws.com"\n')
            script_file_id.write( 'export AWS_CONFIG_FILE=/home/ubuntu/.aws/config\n')
            script_file_id.write( 'export AWS_SHARED_CREDENTIALS_FILE=/home/ubuntu/.aws/credentials\n')
            script_file_id.write( '#-------------------------------------------------------------------------------\n')
            script_file_id.write(f'MINICONDA3_BIN_PATH={xlib.get_cluster_app_dir()}/{xlib.get_miniconda3_name()}/bin\n')
            script_file_id.write( 'export PATH=$MINICONDA3_BIN_PATH:$PATH\n')
            script_file_id.write( '#-------------------------------------------------------------------------------\n')
            script_file_id.write(f'CURRENT_DIR={current_run_dir}\n')
            script_file_id.write( '#-------------------------------------------------------------------------------\n')
            script_file_id.write(f'STATUS_DIR={xlib.get_status_dir(current_run_dir)}\n')
            script_file_id.write(f'SCRIPT_STATUS_OK={xlib.get_status_ok(current_run_dir)}\n')
            script_file_id.write(f'SCRIPT_STATUS_WRONG={xlib.get_status_wrong(current_run_dir)}\n')
            script_file_id.write( 'mkdir --parents $STATUS_DIR\n')
            script_file_id.write( 'if [ -f $SCRIPT_STATUS_OK ]; then rm $SCRIPT_STATUS_OK; fi\n')
            script_file_id.write( 'if [ -f $SCRIPT_STATUS_WRONG ]; then rm $SCRIPT_STATUS_WRONG; fi\n')
            script_file_id.write( '#-------------------------------------------------------------------------------\n')
            script_file_id.write( 'function init\n')
            script_file_id.write( '{\n')
            script_file_id.write( '    INIT_DATETIME=`date --utc +%s`\n')
            script_file_id.write( '    FORMATTED_INIT_DATETIME=`date --date="@$INIT_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n')
            script_file_id.write( '    echo "$SEP"\n')
            script_file_id.write( '    echo "Script started at $FORMATTED_INIT_DATETIME+00:00."\n')
            script_file_id.write( '    echo "$SEP"\n')
            script_file_id.write(f'    echo "CLUSTER: {cluster_name}"\n')
            script_file_id.write( '    echo "HOST NAME: $HOSTNAME"\n')
            script_file_id.write( '    echo "HOST IP: $HOST_IP"\n')
            script_file_id.write( '    echo "HOST ADDRESS: $HOST_ADDRESS"\n')
            script_file_id.write( '}\n')
            script_file_id.write( '#-------------------------------------------------------------------------------\n')
            script_file_id.write( 'function run_express_process\n')
            script_file_id.write( '{\n')
            script_file_id.write(f'    source activate {xlib.get_express_anaconda_code()}\n')
            script_file_id.write( '    cd $CURRENT_DIR\n')
            for alignment_dataset_id in alignment_dataset_id_list:
                alignment_files = f'{xlib.get_cluster_experiment_result_dataset_dir(experiment_id, alignment_dataset_id)}/*.sorted.bam'
                script_file_id.write(f'    SORTED_BAM_LIST={alignment_dataset_id}-sorted-bam-files.txt\n')
                script_file_id.write(f'    ls {alignment_files} > $SORTED_BAM_LIST\n')
                script_file_id.write( '    while read FILE_BAM; do\n')
                script_file_id.write( '        NAME=`basename $FILE_BAM`\n')
                # the last 11 characters removed from the name are the suffix ".sorted.bam"
                script_file_id.write( '        NAME=${NAME:0:-11}\n')
                script_file_id.write(f'        SUBDIR={alignment_dataset_id}-$NAME\n')
                script_file_id.write( '        mkdir --parents $CURRENT_DIR/$SUBDIR\n')
                script_file_id.write( '        echo "$SEP"\n')
                script_file_id.write(f'        echo "Quantitating alignment dataset {alignment_dataset_id} - library $SUBDIR ..."\n')
                script_file_id.write( '        /usr/bin/time \\\n')
                script_file_id.write(f'            --format="{xlib.get_time_output_format(separator=False)}" \\\n')
                script_file_id.write( '            express \\\n')
                script_file_id.write( '                --no-update-check \\\n')
                script_file_id.write(f'                --frag-len-mean {frag_len_mean} \\\n')
                script_file_id.write(f'                --frag-len-stddev {frag_len_stddev} \\\n')
                # the stranded library types are written as the flag of the same name;
                # any other library type does not add a flag
                if library_type.lower() in ('fr-stranded', 'rf-stranded', 'f-stranded', 'r-stranded'):
                    script_file_id.write(f'                --{library_type.lower()} \\\n')
                script_file_id.write(f'                --max-indel-size {max_indel_size} \\\n')
                if no_bias_correct.upper() == 'YES':
                    script_file_id.write( '                --no-bias-correct \\\n')
                if no_error_model.upper() == 'YES':
                    script_file_id.write( '                --no-error-model \\\n')
                if other_parameters.upper() != 'NONE':
                    # the parameters are separated by ";" and each one is like
                    # --name=value or --name
                    parameter_list = [x.strip() for x in other_parameters.split(';')]
                    for parameter in parameter_list:
                        if parameter.find('=') > 0:
                            pattern = r'^--(.+)=(.+)$'
                            mo = re.search(pattern, parameter)
                            parameter_name = mo.group(1).strip()
                            parameter_value = mo.group(2).strip()
                            script_file_id.write(f'                --{parameter_name}={parameter_value} \\\n')
                        else:
                            pattern = r'^--(.+)$'
                            mo = re.search(pattern, parameter)
                            parameter_name = mo.group(1).strip()
                            script_file_id.write(f'                --{parameter_name} \\\n')
                script_file_id.write( '                --output-dir $CURRENT_DIR/$SUBDIR \\\n')
                script_file_id.write(f'                {transcriptome_file} \\\n')
                script_file_id.write( '                $FILE_BAM\n')
                script_file_id.write( '        RC=$?\n')
                script_file_id.write( '        if [ $RC -ne 0 ]; then manage_error express $RC; fi\n')
                script_file_id.write( '        echo "Quantitation is done."\n')
                script_file_id.write( '    done < $SORTED_BAM_LIST\n')
            script_file_id.write( '    conda deactivate\n')
            script_file_id.write( '}\n')
            script_file_id.write( '#-------------------------------------------------------------------------------\n')
            script_file_id.write( 'function end\n')
            script_file_id.write( '{\n')
            script_file_id.write( '    END_DATETIME=`date --utc +%s`\n')
            script_file_id.write( '    FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n')
            script_file_id.write( '    calculate_duration\n')
            script_file_id.write( '    echo "$SEP"\n')
            script_file_id.write( '    echo "Script ended OK at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."\n')
            script_file_id.write( '    echo "$SEP"\n')
            script_file_id.write( '    send_mail ok\n')
            script_file_id.write( '    touch $SCRIPT_STATUS_OK\n')
            script_file_id.write( '    exit 0\n')
            script_file_id.write( '}\n')
            script_file_id.write( '#-------------------------------------------------------------------------------\n')
            script_file_id.write( 'function manage_error\n')
            script_file_id.write( '{\n')
            script_file_id.write( '    END_DATETIME=`date --utc +%s`\n')
            script_file_id.write( '    FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n')
            script_file_id.write( '    calculate_duration\n')
            script_file_id.write( '    echo "$SEP"\n')
            script_file_id.write( '    echo "ERROR: $1 returned error $2"\n')
            script_file_id.write( '    echo "Script ended WRONG at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."\n')
            script_file_id.write( '    echo "$SEP"\n')
            script_file_id.write( '    send_mail wrong\n')
            script_file_id.write( '    touch $SCRIPT_STATUS_WRONG\n')
            script_file_id.write( '    exit 3\n')
            script_file_id.write( '}\n')
            script_file_id.write( '#-------------------------------------------------------------------------------\n')
            process_name = f'{xlib.get_express_name()} process'
            mail_message_ok = xlib.get_mail_message_ok(process_name, cluster_name)
            mail_message_wrong = xlib.get_mail_message_wrong(process_name, cluster_name)
            script_file_id.write( 'function send_mail\n')
            script_file_id.write( '{\n')
            script_file_id.write(f'    SUBJECT="{xlib.get_project_name()}: {process_name}"\n')
            script_file_id.write( '    if [ "$1" == "ok" ]; then\n')
            script_file_id.write(f'        MESSAGE="{mail_message_ok}"\n')
            script_file_id.write( '    elif [ "$1" == "wrong" ]; then\n')
            script_file_id.write(f'        MESSAGE="{mail_message_wrong}"\n')
            script_file_id.write( '    else\n')
            script_file_id.write( '         MESSAGE=""\n')
            script_file_id.write( '    fi\n')
            script_file_id.write( '    DESTINATION_FILE=mail-destination.json\n')
            script_file_id.write( '    echo "{" > $DESTINATION_FILE\n')
            script_file_id.write(f'    echo "    \\\"ToAddresses\\\":  [\\\"{xconfiguration.get_contact_data()}\\\"]," >> $DESTINATION_FILE\n')
            script_file_id.write( '    echo "    \\\"CcAddresses\\\":  []," >> $DESTINATION_FILE\n')
            script_file_id.write( '    echo "    \\\"BccAddresses\\\":  []" >> $DESTINATION_FILE\n')
            script_file_id.write( '    echo "}" >> $DESTINATION_FILE\n')
            script_file_id.write( '    MESSAGE_FILE=mail-message.json\n')
            script_file_id.write( '    echo "{" > $MESSAGE_FILE\n')
            script_file_id.write( '    echo "    \\\"Subject\\\": {" >> $MESSAGE_FILE\n')
            script_file_id.write( '    echo "        \\\"Data\\\":  \\\"$SUBJECT\\\"," >> $MESSAGE_FILE\n')
            script_file_id.write( '    echo "        \\\"Charset\\\":  \\\"UTF-8\\\"" >> $MESSAGE_FILE\n')
            script_file_id.write( '    echo "    }," >> $MESSAGE_FILE\n')
            script_file_id.write( '    echo "    \\\"Body\\\": {" >> $MESSAGE_FILE\n')
            script_file_id.write( '    echo "        \\\"Html\\\": {" >> $MESSAGE_FILE\n')
            script_file_id.write( '    echo "            \\\"Data\\\":  \\\"$MESSAGE\\\"," >> $MESSAGE_FILE\n')
            script_file_id.write( '    echo "            \\\"Charset\\\":  \\\"UTF-8\\\"" >> $MESSAGE_FILE\n')
            script_file_id.write( '    echo "        }" >> $MESSAGE_FILE\n')
            script_file_id.write( '    echo "    }" >> $MESSAGE_FILE\n')
            script_file_id.write( '    echo "}" >> $MESSAGE_FILE\n')
            script_file_id.write(f'    aws ses send-email --from {xconfiguration.get_contact_data()} --destination file://$DESTINATION_FILE --message file://$MESSAGE_FILE\n')
            script_file_id.write( '}\n')
            script_file_id.write( '#-------------------------------------------------------------------------------\n')
            script_file_id.write( 'function calculate_duration\n')
            script_file_id.write( '{\n')
            script_file_id.write( '    DURATION=`expr $END_DATETIME - $INIT_DATETIME`\n')
            script_file_id.write( '    HH=`expr $DURATION / 3600`\n')
            script_file_id.write( '    MM=`expr $DURATION % 3600 / 60`\n')
            script_file_id.write( '    SS=`expr $DURATION % 60`\n')
            script_file_id.write( '    FORMATTED_DURATION=`printf "%03d:%02d:%02d\\n" $HH $MM $SS`\n')
            script_file_id.write( '}\n')
            script_file_id.write( '#-------------------------------------------------------------------------------\n')
            script_file_id.write( 'init\n')
            script_file_id.write( 'run_express_process\n')
            script_file_id.write( 'end\n')
    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append(f'*** ERROR: The file {script_path} can not be created')
        OK = False

    # return the control variable and the error list
    return (OK, error_list)
Esempio n. 9
0
def check_cd_hit_est_config_file(strict):
    '''
    Check the CD-HIT-EST config file of a run.

    The option dictionary is built from the file returned by
    get_cd_hit_est_config_file() and the sections "identification" and
    "CD-HIT-EST parameters" are checked key by key.

    Parameters:
        strict: it is not used in the body of this function.

    Returns:
        A tuple (OK, error_list): OK is False when any check has failed and
        error_list holds a message for every problem found.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # initialize the variable used when a value is not found
    not_found = '***NOTFOUND***'.upper()

    # get the option dictionary
    try:
        cd_hit_est_option_dict = xlib.get_option_dict(get_cd_hit_est_config_file())
    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append('*** ERROR: The option dictionary could not be built from the config file')
        OK = False
    else:

        # get the sorted sections list
        sections_list = sorted(cd_hit_est_option_dict.keys())

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append('*** ERROR: the section "identification" is not found.')
            OK = False
        else:

            # get the keys of the section "identification"
            identification_dict = cd_hit_est_option_dict.get('identification', {})

            # check section "identification" - key "experiment_id"
            experiment_id = identification_dict.get('experiment_id', not_found)
            if experiment_id == not_found:
                error_list.append('*** ERROR: the key "experiment_id" is not found in the section "identification".')
                OK = False

            # check section "identification" - key "assembly_software"
            assembly_software = identification_dict.get('assembly_software', not_found)
            if assembly_software == not_found:
                error_list.append('*** ERROR: the key "assembly_software" is not found in the section "identification".')
                OK = False
            elif not xlib.check_code(assembly_software, get_assembly_software_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "assembly_software" has to be {get_assembly_software_code_list_text()}.')
                OK = False

            # check section "identification" - key "assembly_dataset_id"
            assembly_dataset_id = identification_dict.get('assembly_dataset_id', not_found)
            if assembly_dataset_id == not_found:
                error_list.append('*** ERROR: the key "assembly_dataset_id" is not found in the section "identification".')
                OK = False
            elif not xlib.check_startswith(assembly_dataset_id, get_assembly_software_code_list(), case_sensitive=True):
                error_list.append(f'*** ERROR: the key "assembly_dataset_id" has to start with {get_assembly_software_code_list_text()}.')
                OK = False

            # check section "identification" - key "assembly_type"
            # (CONTIGS or SCAFFOLDS when the dataset identification starts with the SOAPdenovo-Trans code; NONE in any other case)
            assembly_type = identification_dict.get('assembly_type', not_found)
            if assembly_type == not_found:
                error_list.append('*** ERROR: the key "assembly_type" is not found in the section "identification".')
                OK = False
            else:
                is_soapdenovotrans_dataset = assembly_dataset_id.startswith(xlib.get_soapdenovotrans_code())
                if is_soapdenovotrans_dataset and assembly_type.upper() not in ['CONTIGS', 'SCAFFOLDS'] or \
                    not is_soapdenovotrans_dataset and assembly_type.upper() != 'NONE':
                    error_list.append(f'*** ERROR: the key "assembly_type" has to be CONTIGS or SCAFFOLDS in {xlib.get_soapdenovotrans_name()} or NONE in any other case.')
                    OK = False

        # check section "CD-HIT-EST parameters"
        if 'CD-HIT-EST parameters' not in sections_list:
            error_list.append('*** ERROR: the section "CD-HIT-EST parameters" is not found.')
            OK = False
        else:

            # get the keys of the section "CD-HIT-EST parameters"
            parameters_dict = cd_hit_est_option_dict.get('CD-HIT-EST parameters', {})

            # check section "CD-HIT-EST parameters" - key "threads"
            threads = parameters_dict.get('threads', not_found)
            if threads == not_found:
                error_list.append('*** ERROR: the key "threads" is not found in the section "CD-HIT-EST parameters".')
                OK = False
            elif not xlib.check_int(threads, minimum=0):
                error_list.append('*** ERROR: the key "threads" has to be an integer number greater than or equal to 0.')
                OK = False

            # check section "CD-HIT-EST parameters" - key "memory_limit"
            memory_limit = parameters_dict.get('memory_limit', not_found)
            if memory_limit == not_found:
                error_list.append('*** ERROR: the key "memory_limit" is not found in the section "CD-HIT-EST parameters".')
                OK = False
            elif not xlib.check_int(memory_limit, minimum=0):
                error_list.append('*** ERROR: the key "memory_limit" has to be an integer number greater than or equal to 0.')
                OK = False

            # check section "CD-HIT-EST parameters" - key "seq_identity_threshold"
            seq_identity_threshold = parameters_dict.get('seq_identity_threshold', not_found)
            if seq_identity_threshold == not_found:
                error_list.append('*** ERROR: the key "seq_identity_threshold" is not found in the section "CD-HIT-EST parameters".')
                OK = False
            elif not xlib.check_float(seq_identity_threshold, minimum=0., maximum=1.):
                error_list.append('*** ERROR: the key "seq_identity_threshold" has to be a float number between 0.0 and 1.0.')
                OK = False

            # check section "CD-HIT-EST parameters" - key "word_length"
            word_length = parameters_dict.get('word_length', not_found)
            if word_length == not_found:
                error_list.append('*** ERROR: the key "word_length" is not found in the section "CD-HIT-EST parameters".')
                OK = False
            elif not xlib.check_int(word_length, minimum=1):
                error_list.append('*** ERROR: the key "word_length" has to be an integer number greater than or equal to 1.')
                OK = False

            # check section "CD-HIT-EST parameters" - key "mask" (only its presence is checked)
            mask = parameters_dict.get('mask', not_found)
            if mask == not_found:
                error_list.append('*** ERROR: the key "mask" is not found in the section "CD-HIT-EST parameters".')
                OK = False

            # check section "CD-HIT-EST parameters" - key "match"
            match = parameters_dict.get('match', not_found)
            if match == not_found:
                error_list.append('*** ERROR: the key "match" is not found in the section "CD-HIT-EST parameters".')
                OK = False
            elif not xlib.check_int(match):
                error_list.append('*** ERROR: the key "match" has to be an integer number.')
                OK = False

            # check section "CD-HIT-EST parameters" - key "mismatch"
            mismatch = parameters_dict.get('mismatch', not_found)
            if mismatch == not_found:
                error_list.append('*** ERROR: the key "mismatch" is not found in the section "CD-HIT-EST parameters".')
                OK = False
            elif not xlib.check_int(mismatch):
                error_list.append('*** ERROR: the key "mismatch" has to be an integer number.')
                OK = False

            # check section "CD-HIT-EST parameters" - key "other_parameters"
            not_allowed_parameters_list = ['T', 'M', 'c', 'n', 'mask', 'match', 'mismatch']
            other_parameters = parameters_dict.get('other_parameters', not_found)
            if other_parameters == not_found:
                error_list.append('*** ERROR: the key "other_parameters" is not found in the section "CD-HIT-EST parameters".')
                OK = False
            elif other_parameters.upper() != 'NONE':
                # the result of this check is merged into OK instead of being assigned to it,
                # so that a valid parameter list can not hide the errors found previously
                (parameters_ok, parameters_error_list) = xlib.check_parameter_list(other_parameters, "other_parameters", not_allowed_parameters_list)
                error_list.extend(parameters_error_list)
                if not parameters_ok:
                    OK = False

    # warn that the results config file is not valid if there are any errors
    if not OK:
        error_list.append(f'\nThe {xlib.get_cd_hit_est_name()} config file is not valid. Please, correct this file or recreate it.')

    # return the control variable and the error list
    return (OK, error_list)
Esempio n. 10
0
def download_result_dataset(cluster_name, log, function=None):
    '''
    Download the result dataset of a run from the cluster.

    The result transfer config file is validated first; then the files listed
    in its sections "file-n" (status "uncompressed") or the whole dataset file
    (status "compressed") are downloaded through SFTP.

    Parameters:
        cluster_name: cluster name passed to the xssh connection functions.
        log: object whose write() method receives the progress messages.
        function: optional callable invoked without arguments when the process ends.

    Returns:
        The control variable: True if the validation, the connections and
        every download were OK; False otherwise.
    '''

    # initialize the control variable
    OK = True

    # the connections are tracked so that they can be closed even if a later step fails
    ssh_client = None
    ssh_transport = None

    # get the result transfer config file
    result_transfer_config_file = get_result_transfer_config_file()

    # warn that the log window must not be closed
    if not isinstance(log, xlib.DevStdOut):
        log.write('This process might take several minutes. Do not close this window, please wait!\n')

    # get and validate the result transfer config file
    # (the validation returns a tuple (OK, error_list); a non-empty tuple is always
    # truthy, so it has to be unpacked before testing the control variable)
    log.write('{0}\n'.format(xlib.get_separator()))
    log.write('The result transfer config file is been validating ...\n')
    (OK, error_list) = validate_result_transfer_config_file(strict=True)
    if OK:
        log.write('The config file is OK.\n')
    else:
        for error in error_list:
            log.write('{0}\n'.format(error))
        log.write('*** ERROR: The result transfer config file is not valid.\n')
        log.write('Please correct this file or recreate the config files.\n')

    try:

        # create the SSH client connection
        if OK:
            (OK, error_list, ssh_client) = xssh.create_ssh_client_connection(cluster_name, 'master')
            for error in error_list:
                log.write('{0}\n'.format(error))
            # a connection that could not be created is not closed later (as in the previous behavior)
            if not OK:
                ssh_client = None

        # create the SSH transport connection
        if OK:
            (OK, error_list, ssh_transport) = xssh.create_ssh_transport_connection(cluster_name, 'master')
            for error in error_list:
                log.write('{0}\n'.format(error))
            if not OK:
                ssh_transport = None

        # create the SFTP client
        if OK:
            sftp_client = xssh.create_sftp_client(ssh_transport)

        # get the options dictionary
        if OK:
            result_transfer_options_dict = xlib.get_option_dict(result_transfer_config_file)

        # download the result dataset
        if OK:

            # get the sorted sections list
            sections_list = sorted(result_transfer_options_dict.keys())

            # get the experiment identification, run identification and local directory from the section "identification"
            experiment_id = result_transfer_options_dict['identification']['experiment_id']
            result_dataset_id = result_transfer_options_dict['identification']['result_dataset_id']
            status = result_transfer_options_dict['identification']['status'].lower()
            local_dir = result_transfer_options_dict['identification']['local_dir']

            # download files when the status is uncompressed
            if status == 'uncompressed':

                # for each section "file-n"
                for section in sections_list:

                    # skip the sections whose identification is not like file-n
                    if not re.match('^file-[0-9]+$', section):
                        continue

                    # get the dataset subdirectory and file name
                    dataset_subdirectory = result_transfer_options_dict[section]['dataset_subdirectory']
                    file_name = result_transfer_options_dict[section]['file_name']

                    # create the local dataset subdirectory if it does not exist
                    pathlib.Path(os.path.normpath('{0}/{1}'.format(local_dir, dataset_subdirectory))).mkdir(parents=True, exist_ok=True)

                    # assign the cluster path and local path
                    cluster_path = '{0}/{1}/{2}/{3}/{4}'.format(xlib.get_cluster_result_dir(), experiment_id, result_dataset_id, dataset_subdirectory, file_name)
                    local_path = os.path.normpath('{0}/{1}/{2}'.format(local_dir, dataset_subdirectory, file_name))

                    # download the result file from the cluster
                    log.write('{0}\n'.format(xlib.get_separator()))
                    log.write('Downloading the file {0} to {1} ...\n'.format(cluster_path, local_dir))
                    (OK, error_list) = xssh.get_file(sftp_client, cluster_path, local_path)
                    if OK:
                        log.write('The file has been downloaded.\n')
                    else:
                        for error in error_list:
                            log.write('{0}\n'.format(error))
                        # the remaining files are not downloaded after a failure
                        break

            # download files when the status is compressed
            elif status == 'compressed':

                # assign the cluster path and local path
                cluster_path = '{0}/{1}/{2}'.format(xlib.get_cluster_result_dir(), experiment_id, result_dataset_id)
                local_path = '{0}/{1}'.format(local_dir, result_dataset_id)

                # download the result file from the cluster
                log.write('{0}\n'.format(xlib.get_separator()))
                log.write('Downloading the file {0} to {1} ...\n'.format(cluster_path, local_dir))
                (OK, error_list) = xssh.get_file(sftp_client, cluster_path, local_path)
                if OK:
                    log.write('The file has been downloaded.\n')
                else:
                    for error in error_list:
                        log.write('{0}\n'.format(error))

    finally:

        # close the SSH transport connection whenever it was opened, even if a download failed
        if ssh_transport is not None:
            xssh.close_ssh_transport_connection(ssh_transport)

        # close the SSH client connection whenever it was opened, even if a download failed
        if ssh_client is not None:
            xssh.close_ssh_client_connection(ssh_client)

    # warn that the log window can be closed
    if not isinstance(log, xlib.DevStdOut):
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('You can close this window now.\n')

    # execute final function
    if function is not None:
        function()

    # return the control variable
    return OK
Esempio n. 11
0
def check_express_config_file(strict):
    '''
    Check the eXpress config file of a run.

    The option dictionary is built from the file returned by
    get_express_config_file() and the sections "identification",
    "alignment-dataset-n" and "eXpress parameters" are checked key by key.

    Parameters:
        strict: it is not used in the body of this function.

    Returns:
        A tuple (OK, error_list): OK is False when any check has failed and
        error_list holds a message for every problem found.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # initialize the variable used when a value is not found
    not_found = '***NOTFOUND***'.upper()

    # get the option dictionary
    try:
        express_option_dict = xlib.get_option_dict(get_express_config_file())
    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append('*** ERROR: The option dictionary could not be built from the config file')
        OK = False
    else:

        # get the sorted sections list
        sections_list = sorted(express_option_dict.keys())

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append('*** ERROR: the section "identification" is not found.')
            OK = False
        else:

            # get the keys of the section "identification"
            identification_dict = express_option_dict.get('identification', {})

            # check section "identification" - key "experiment_id"
            experiment_id = identification_dict.get('experiment_id', not_found)
            if experiment_id == not_found:
                error_list.append('*** ERROR: the key "experiment_id" is not found in the section "identification".')
                OK = False

            # check section "identification" - key "assembly_software"
            assembly_software = identification_dict.get('assembly_software', not_found)
            if assembly_software == not_found:
                error_list.append('*** ERROR: the key "assembly_software" is not found in the section "identification".')
                OK = False
            elif not xlib.check_code(assembly_software, get_assembly_software_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "assembly_software" has to be {get_assembly_software_code_list_text()}.')
                OK = False

            # check section "identification" - key "assembly_dataset_id"
            assembly_dataset_id = identification_dict.get('assembly_dataset_id', not_found)
            if assembly_dataset_id == not_found:
                error_list.append('*** ERROR: the key "assembly_dataset_id" is not found in the section "identification".')
                OK = False
            elif not xlib.check_startswith(assembly_dataset_id, get_assembly_software_code_list(), case_sensitive=True):
                error_list.append(f'*** ERROR: the key "assembly_dataset_id" has to start with {get_assembly_software_code_list_text()}.')
                OK = False

            # check section "identification" - key "assembly_type"
            # (CONTIGS or SCAFFOLDS when the dataset identification starts with the SOAPdenovo-Trans code; NONE in any other case)
            assembly_type = identification_dict.get('assembly_type', not_found)
            if assembly_type == not_found:
                error_list.append('*** ERROR: the key "assembly_type" is not found in the section "identification".')
                OK = False
            else:
                is_soapdenovotrans_dataset = assembly_dataset_id.startswith(xlib.get_soapdenovotrans_code())
                if is_soapdenovotrans_dataset and assembly_type.upper() not in ['CONTIGS', 'SCAFFOLDS'] or \
                    not is_soapdenovotrans_dataset and assembly_type.upper() != 'NONE':
                    error_list.append(f'*** ERROR: the key "assembly_type" has to be CONTIGS or SCAFFOLDS in {xlib.get_soapdenovotrans_name()} or NONE in any other case.')
                    OK = False

        # check section "alignment-dataset-1"
        if 'alignment-dataset-1' not in sections_list:
            error_list.append('*** ERROR: the section "alignment-dataset-1" is not found.')
            OK = False

        # check all sections "alignment-dataset-n"
        for section in sections_list:

            # the fixed sections are checked separately
            if section in ['identification', 'eXpress parameters']:
                continue

            # check that the section identification is like alignment-dataset-n
            if not re.match('^alignment-dataset-[0-9]+$', section):
                error_list.append(f'*** ERROR: the section "{section}" has a wrong identification.')
                OK = False
                continue

            # get the keys of the section "alignment-dataset-n"
            alignment_dict = express_option_dict.get(section, {})

            # check section "alignment-dataset-n" - key "alignment_software"
            alignment_software = alignment_dict.get('alignment_software', not_found)
            if alignment_software == not_found:
                error_list.append(f'*** ERROR: the key "alignment_software" is not found in the section "{section}".')
                OK = False
            elif not xlib.check_code(alignment_software, get_alignment_software_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "alignment_software" has to be {get_alignment_software_code_list_text()}.')
                OK = False

            # check section "alignment-dataset-n" - key "alignment_dataset_id"
            alignment_dataset_id = alignment_dict.get('alignment_dataset_id', not_found)
            if alignment_dataset_id == not_found:
                error_list.append(f'*** ERROR: the key "alignment_dataset_id" is not found in the section "{section}".')
                OK = False
            elif not xlib.check_startswith(alignment_dataset_id, get_alignment_software_code_list(), case_sensitive=True):
                error_list.append(f'*** ERROR: the key "alignment_dataset_id" has to start with {get_alignment_software_code_list_text()}.')
                OK = False

        # check section "eXpress parameters"
        if 'eXpress parameters' not in sections_list:
            error_list.append('*** ERROR: the section "eXpress parameters" is not found.')
            OK = False
        else:

            # get the keys of the section "eXpress parameters"
            parameters_dict = express_option_dict.get('eXpress parameters', {})

            # check section "eXpress parameters" - key "frag-len-mean"
            frag_len_mean = parameters_dict.get('frag-len-mean', not_found)
            if frag_len_mean == not_found:
                error_list.append('*** ERROR: the key "frag-len-mean" is not found in the section "eXpress parameters".')
                OK = False
            elif not xlib.check_int(frag_len_mean, minimum=1):
                error_list.append('*** ERROR: the key "frag-len-mean" has to be an integer number greater than or equal to 1.')
                OK = False

            # check section "eXpress parameters" - key "frag-len-stddev"
            frag_len_stddev = parameters_dict.get('frag-len-stddev', not_found)
            if frag_len_stddev == not_found:
                error_list.append('*** ERROR: the key "frag-len-stddev" is not found in the section "eXpress parameters".')
                OK = False
            elif not xlib.check_int(frag_len_stddev, minimum=1):
                error_list.append('*** ERROR: the key "frag-len-stddev" has to be an integer number greater than or equal to 1.')
                OK = False

            # check section "eXpress parameters" - key "library_type"
            library_type = parameters_dict.get('library_type', not_found)
            if library_type == not_found:
                error_list.append('*** ERROR: the key "library_type" is not found in the section "eXpress parameters".')
                OK = False
            elif not xlib.check_code(library_type, get_library_type_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "library_type" has to be {get_library_type_code_list_text()}.')
                OK = False

            # check section "eXpress parameters" - key "max-indel-size"
            max_indel_size = parameters_dict.get('max-indel-size', not_found)
            if max_indel_size == not_found:
                error_list.append('*** ERROR: the key "max-indel-size" is not found in the section "eXpress parameters".')
                OK = False
            elif not xlib.check_int(max_indel_size, minimum=0):
                error_list.append('*** ERROR: the key "max-indel-size" has to be an integer number greater than or equal to 0.')
                OK = False

            # check section "eXpress parameters" - key "no-bias-correct"
            no_bias_correct = parameters_dict.get('no-bias-correct', not_found)
            if no_bias_correct == not_found:
                error_list.append('*** ERROR: the key "no-bias-correct" is not found in the section "eXpress parameters".')
                OK = False
            elif not xlib.check_code(no_bias_correct, get_no_bias_correct_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "no-bias-correct" has to be {get_no_bias_correct_code_list_text()}.')
                OK = False

            # check section "eXpress parameters" - key "no-error-model"
            no_error_model = parameters_dict.get('no-error-model', not_found)
            if no_error_model == not_found:
                error_list.append('*** ERROR: the key "no-error-model" is not found in the section "eXpress parameters".')
                OK = False
            elif not xlib.check_code(no_error_model, get_no_error_model_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "no-error-model" has to be {get_no_error_model_code_list_text()}.')
                OK = False

            # check section "eXpress parameters" - key "other_parameters"
            not_allowed_parameters_list = ['no-update-check', 'frag-len-mean', 'frag-len-stddev', 'max-indel-size', 'fr-stranded', 'rf-stranded', 'f-stranded', 'r-stranded', 'no-bias-correct', 'no-error-model', 'output-dir']
            other_parameters = parameters_dict.get('other_parameters', not_found)
            if other_parameters == not_found:
                error_list.append('*** ERROR: the key "other_parameters" is not found in the section "eXpress parameters".')
                OK = False
            elif other_parameters.upper() != 'NONE':
                # the result of this check is merged into OK instead of being assigned to it,
                # so that a valid parameter list can not hide the errors found previously
                (parameters_ok, parameters_error_list) = xlib.check_parameter_list(other_parameters, "other_parameters", not_allowed_parameters_list)
                error_list.extend(parameters_error_list)
                if not parameters_ok:
                    OK = False

    # warn that the results config file is not valid if there are any errors
    if not OK:
        error_list.append(f'\nThe {xlib.get_express_name()} config file is not valid. Please, correct this file or recreate it.')

    # return the control variable and the error list
    return (OK, error_list)
Esempio n. 12
0
def validate_result_transfer_config_file(strict):
    '''
    Validate the result transfer config file of a run.

    The option dictionary is built from the file returned by
    get_result_transfer_config_file(). The section "identification" is always
    checked; the sections "file-n" are checked only when the status is
    "uncompressed".

    Parameters:
        strict: it is not used in the body of this function.

    Returns:
        A tuple (OK, error_list): OK is False when any check has failed and
        error_list holds a message for every problem found.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # initialize the variable used when a value is not found
    not_found = '***NOTFOUND***'.upper()

    # the status remains None unless a valid value is read from the section "identification",
    # so the checks of the sections "file-n" are skipped when that section or key is wrong
    status = None

    # get the result transfer config file
    result_transfer_config_file = get_result_transfer_config_file()

    # get the options dictionary
    try:
        result_transfer_options_dict = xlib.get_option_dict(result_transfer_config_file)
    except Exception as e:
        error_list.append('*** EXCEPTION: "{0}".'.format(e))
        error_list.append('*** ERROR: The syntax is WRONG.')
        OK = False
    else:

        # get the sorted sections list
        sections_list = sorted(result_transfer_options_dict.keys())

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append('*** ERROR: the section "identification" is not found.')
            OK = False
        else:

            # get the keys of the section "identification"
            identification_dict = result_transfer_options_dict.get('identification', {})

            # check section "identification" - key "experiment_id"
            experiment_id = identification_dict.get('experiment_id', not_found)
            if experiment_id == not_found:
                error_list.append('*** ERROR: the key "experiment_id" is not found in the section "identification".')
                OK = False
            elif not experiment_id.isidentifier():
                error_list.append('*** ERROR: the key "experiment_id" value in the section "identification" has some non-alphanumeric characters')
                OK = False

            # check section "identification" - key "result_dataset_id"
            result_dataset_id = identification_dict.get('result_dataset_id', not_found)
            if result_dataset_id == not_found:
                error_list.append('*** ERROR: the key "result_dataset_id" is not found in the section "identification".')
                OK = False

            # check section "identification" - key "status"
            # (the raw value is compared with the not-found mark before lowercasing it,
            # otherwise the upper-case mark would never match)
            status_value = identification_dict.get('status', not_found)
            if status_value == not_found:
                error_list.append('*** ERROR: the key "status" is not found in the section "identification".')
                OK = False
            elif status_value.lower() not in ['compressed', 'uncompressed']:
                error_list.append('*** ERROR: the key "status" value in the section "identification" must be uncompressed or compressed.')
                OK = False
            else:
                status = status_value.lower()

            # check section "identification" - key "local_dir"
            local_dir = identification_dict.get('local_dir', not_found)
            if local_dir == not_found:
                error_list.append('*** ERROR: the key "local_dir" is not found in the section "identification".')
                OK = False
            elif not xlib.existing_dir(local_dir):
                error_list.append('*** ERROR: the key "local_dir" value in the section "identification" is a non existing directory path.')
                OK = False

        # the sections "file-n" are only required when the status is uncompressed
        if status == 'uncompressed':

            # check section "file-1"
            if 'file-1' not in sections_list:
                error_list.append('*** ERROR: the section "file-1" is not found.')
                OK = False

            # check all sections "file-n"
            for section in sections_list:

                # the section "identification" has already been checked
                if section == 'identification':
                    continue

                # verify that the section identification is like file-n
                if not re.match('^file-[0-9]+$', section):
                    error_list.append('*** ERROR: the section "{0}" has a wrong identification.'.format(section))
                    OK = False
                    continue

                # get the keys of the section "file-n"
                file_dict = result_transfer_options_dict.get(section, {})

                # check section "file-n" - key "dataset_subdirectory"
                dataset_subdirectory = file_dict.get('dataset_subdirectory', not_found)
                if dataset_subdirectory == not_found:
                    error_list.append('*** ERROR: the key "dataset_subdirectory" is not found in the section "{0}".'.format(section))
                    OK = False
                elif not xlib.is_valid_path(dataset_subdirectory, 'linux'):
                    error_list.append('*** ERROR: the file {0} in the key "dataset_subdirectory" of the section "{1}" has a non valid file name.'.format(dataset_subdirectory, section))
                    OK = False

                # check section "file-n" - key "file_name"
                file_name = file_dict.get('file_name', not_found)
                if file_name == not_found:
                    error_list.append('*** ERROR: the key "file_name" is not found in the section "{0}".'.format(section))
                    OK = False
                elif not xlib.is_valid_path(file_name, 'linux'):
                    error_list.append('*** ERROR: the file {0} in the key "file_name" of the section "{1}" has a non valid file name.'.format(file_name, section))
                    OK = False

    # warn that the results config file is not valid if there are any errors
    if not OK:
        error_list.append('\nThe result transfer config file is not valid. Please, correct this file or recreate it.')

    # return the control variable and the error list
    return (OK, error_list)
Esempio n. 13
0
def build_quast_process_script(cluster_name, current_run_dir):
    '''
    Build the current QUAST process script.

    The options are read from the QUAST config file and a Bash script that
    runs "quast.py" on the selected assembly is written to the path returned
    by get_quast_process_script().

    Parameters:
        cluster_name: cluster name; it is only embedded in the log and mail
            messages of the generated script.
        current_run_dir: run directory; the generated script changes to it
            and passes it to QUAST as output directory.

    Returns:
        A tuple (OK, error_list). OK is False when the script could not be
        built; error_list then holds the error messages.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # get the QUAST option dictionary
    quast_option_dict = xlib.get_option_dict(get_quast_config_file())

    # get the options
    identification_dict = quast_option_dict['identification']
    experiment_id = identification_dict['experiment_id']
    reference_dataset_id = identification_dict['reference_dataset_id']
    reference_file = identification_dict['reference_file']
    assembly_software = identification_dict['assembly_software']
    assembly_dataset_id = identification_dict['assembly_dataset_id']
    assembly_type = identification_dict['assembly_type'].upper()
    threads = quast_option_dict['QUAST parameters']['threads']

    # set the reference file path (the value "NONE" means that no reference is used)
    has_reference = reference_dataset_id.upper() != 'NONE'
    if has_reference:
        reference_file_path = xlib.get_cluster_reference_file(reference_dataset_id, reference_file)

    # set the transcriptome file name; it remains None when the assembly
    # software or the assembly type is not one of the supported values
    transcriptome_file_name = None
    if assembly_software == xlib.get_soapdenovotrans_code():
        if assembly_type == 'CONTIGS':
            transcriptome_file_name = '{0}-{1}.contig'.format(experiment_id, assembly_dataset_id)
        elif assembly_type == 'SCAFFOLDS':
            transcriptome_file_name = '{0}-{1}.scafSeq'.format(experiment_id, assembly_dataset_id)
    elif assembly_software == xlib.get_transabyss_code():
        transcriptome_file_name = 'transabyss-final.fa'
    elif assembly_software == xlib.get_trinity_code():
        transcriptome_file_name = 'Trinity.fasta'
    elif assembly_software == xlib.get_star_code():
        transcriptome_file_name = 'Trinity-GG.fasta'
    elif assembly_software == xlib.get_cd_hit_est_code():
        transcriptome_file_name = 'clustered-transcriptome.fasta'
    elif assembly_software == xlib.get_transcript_filter_code():
        transcriptome_file_name = 'filtered-transcriptome.fasta'

    # report an unsupported assembly explicitly instead of failing later with
    # an unbound variable while the script is half written
    if transcriptome_file_name is None:
        error_list.append('*** ERROR: the transcriptome file can not be determined with the assembly software "{0}" and the assembly type "{1}".'.format(assembly_software, assembly_type))
        return (False, error_list)

    # set the transcriptome file path
    transcriptome_file = '{0}/{1}'.format(xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id), transcriptome_file_name)

    # get the QUAST process script name
    quast_process_script = get_quast_process_script()

    # write the QUAST process script
    try:

        # texts shared by several parts of the script
        separator_line = '#-------------------------------------------------------------------------------'
        recipient_line = '    RECIPIENT={0}'.format(xconfiguration.get_contact_data())
        subject_line = '    SUBJECT="{0}: {1} process"'.format(xlib.get_project_name(), xlib.get_quast_name())
        message_template = '    MESSAGE="The {0} process in node $HOSTNAME of cluster {1} ended {2} at $FORMATTED_END_DATETIME with a run duration of $DURATION s ($FORMATTED_DURATION). Please review its log.<br/><br/>Regards,<br/>GI Genetica, Fisiologia e Historia Forestal<br/>Dpto. Sistemas y Recursos Naturales<br/>ETSI Montes, Forestal y del Medio Natural<br/>Universidad Politecnica de Madrid<br/>https://github.com/ggfhf/"'
        mail_line = '    mail --append "Content-type: text/html;"  --subject "$SUBJECT" "$RECIPIENT" <<< "$MESSAGE"'

        # build all the lines before the file is opened, so a failure in a
        # helper does not leave a truncated script behind
        script_lines = [
            '#!/bin/bash',
            separator_line,
            'QUAST_PATH={0}/{1}/envs/{2}/bin'.format(xlib.get_cluster_app_dir(), xlib.get_miniconda3_name(), xlib.get_quast_bioconda_code()),
            'PATH=$QUAST_PATH:$PATH',
            'SEP="#########################################"',
            'cd {0}/{1}/bin'.format(xlib.get_cluster_app_dir(), xlib.get_miniconda3_name()),
            'source activate {0}'.format(xlib.get_quast_bioconda_code()),
            separator_line,
            'function init',
            '{',
            '    INIT_DATETIME=`date --utc +%s`',
            '    FORMATTED_INIT_DATETIME=`date --date="@$INIT_DATETIME" "+%Y-%m-%d %H:%M:%S"`',
            '    echo "$SEP"',
            '    echo "Script started in node $HOSTNAME of cluster {0} at $FORMATTED_INIT_DATETIME UTC."'.format(cluster_name),
            '}',
            separator_line,
            'function run_quast_process',
            '{',
            '    cd {0}'.format(current_run_dir),
            '    echo "$SEP"',
            '    quast.py --version',
            '    echo "$SEP"',
            '    /usr/bin/time \\',
            '        --format="$SEP\\nElapsed real time (s): %e\\nCPU time in kernel mode (s): %S\\nCPU time in user mode (s): %U\\nPercentage of CPU: %P\\nMaximum resident set size(Kb): %M\\nAverage total memory use (Kb):%K" \\',
            '        quast.py \\',
            '            --threads {0} \\'.format(threads),
            '            --output-dir {0} \\'.format(current_run_dir),
        ]
        if has_reference:
            script_lines.append('            -R {0} \\'.format(reference_file_path))
        if assembly_type == 'SCAFFOLDS':
            script_lines.append('            --scaffolds \\')
        script_lines.extend([
            '            {0}'.format(transcriptome_file),
            '    RC=$?',
            '    if [ $RC -ne 0 ]; then manage_error quast.py $RC; fi',
            '}',
            separator_line,
            'function end',
            '{',
            '    END_DATETIME=`date --utc +%s`',
            '    FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`',
            '    calculate_duration',
            '    echo "$SEP"',
            '    echo "Script ended OK at $FORMATTED_END_DATETIME UTC with a run duration of $DURATION s ($FORMATTED_DURATION)."',
            '    echo "$SEP"',
            recipient_line,
            subject_line,
            message_template.format(xlib.get_quast_name(), cluster_name, 'OK'),
            mail_line,
            '    exit 0',
            '}',
            separator_line,
            'function manage_error',
            '{',
            '    END_DATETIME=`date --utc +%s`',
            '    FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`',
            '    calculate_duration',
            '    echo "$SEP"',
            '    echo "ERROR: $1 returned error $2"',
            '    echo "Script ended WRONG at $FORMATTED_END_DATETIME UTC with a run duration of $DURATION s ($FORMATTED_DURATION)."',
            '    echo "$SEP"',
            recipient_line,
            subject_line,
            message_template.format(xlib.get_quast_name(), cluster_name, 'WRONG'),
            mail_line,
            '    exit 3',
            '}',
            separator_line,
            'function calculate_duration',
            '{',
            '    DURATION=`expr $END_DATETIME - $INIT_DATETIME`',
            '    HH=`expr $DURATION / 3600`',
            '    MM=`expr $DURATION % 3600 / 60`',
            '    SS=`expr $DURATION % 60`',
            '    FORMATTED_DURATION=`printf "%03d:%02d:%02d\\n" $HH $MM $SS`',
            '}',
            separator_line,
            'init',
            'run_quast_process',
            'end',
        ])

        # create the script directory if it is necessary (an empty directory
        # name means that the script is written in the current directory)
        script_dir = os.path.dirname(quast_process_script)
        if script_dir:
            os.makedirs(script_dir, exist_ok=True)

        # write the lines with Linux line endings
        with open(quast_process_script, mode='w', encoding='utf8', newline='\n') as file_id:
            file_id.writelines('{0}\n'.format(line) for line in script_lines)

    except Exception as e:
        # keep the cause in the error list instead of discarding it
        error_list.append('*** EXCEPTION: "{0}".'.format(e))
        error_list.append('*** ERROR: The file {0} can not be created'.format(quast_process_script))
        OK = False

    # return the control variable and the error list
    return (OK, error_list)
Esempio n. 14
0
def validate_quast_config_file(strict):
    '''
    Validate the QUAST config file of a run.

    The sections "identification" and "QUAST parameters" and their keys are
    checked; every problem found is reported as a message in the error list.

    Parameters:
        strict: it is not read by any of the checks done in this function.

    Returns:
        A tuple (OK, error_list). OK is True when no error was found; if it
        is False, error_list holds one message per error plus a final
        summary message.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # initialize the variable used when a value is not found
    not_found = '***NOTFOUND***'

    # get the option dictionary
    try:
        quast_option_dict = xlib.get_option_dict(get_quast_config_file())
    except Exception:
        error_list.append('*** ERROR: The syntax is WRONG.')
        OK = False
    else:

        # get the sections list
        sections_list = sorted(quast_option_dict.keys())

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append('*** ERROR: the section "identification" is not found.')
            OK = False
        else:

            identification_dict = quast_option_dict.get('identification', {})

            # codes of the software whose assemblies can be assessed
            other_software_codes = (xlib.get_transabyss_code(), xlib.get_trinity_code(), xlib.get_star_code(), xlib.get_cd_hit_est_code(), xlib.get_transcript_filter_code())
            assembly_software_codes = (xlib.get_soapdenovotrans_code(),) + other_software_codes

            # check section "identification" - key "experiment_id"
            experiment_id = identification_dict.get('experiment_id', not_found)
            if experiment_id == not_found:
                error_list.append('*** ERROR: the key "experiment_id" is not found in the section "identification".')
                OK = False

            # check section "identification" - key "reference_dataset_id"
            reference_dataset_id = identification_dict.get('reference_dataset_id', not_found)
            if reference_dataset_id == not_found:
                error_list.append('*** ERROR: the key "reference_dataset_id" is not found in the section "identification".')
                OK = False

            # check section "identification" - key "reference_file"
            reference_file = identification_dict.get('reference_file', not_found)
            if reference_file == not_found:
                error_list.append('*** ERROR: the key "reference_file" is not found in the section "identification".')
                OK = False

            # check section "identification" - key "assembly_software"
            assembly_software = identification_dict.get('assembly_software', not_found)
            if assembly_software == not_found:
                error_list.append('*** ERROR: the key "assembly_software" is not found in the section "identification".')
                OK = False
            elif assembly_software not in assembly_software_codes:
                error_list.append('*** ERROR: the key "assembly_software" value in the section "identification" must be {0} or {1} or {2} or {3} or {4} OR {5}.'.format(*assembly_software_codes))
                OK = False

            # check section "identification" - key "assembly_dataset_id"
            assembly_dataset_id = identification_dict.get('assembly_dataset_id', not_found)
            if assembly_dataset_id == not_found:
                error_list.append('*** ERROR: the key "assembly_dataset_id" is not found in the section "identification".')
                OK = False
            elif not assembly_dataset_id.startswith(assembly_software_codes):
                # the message lists the software names (the last one used to be the code)
                error_list.append('*** ERROR: the key "assembly_dataset_id" value is not a {0} nor {1} nor {2} nor {3} nor {4} nor {5} assembly.'.format(xlib.get_soapdenovotrans_name(), xlib.get_transabyss_name(), xlib.get_trinity_name(), xlib.get_star_name(), xlib.get_cd_hit_est_name(), xlib.get_transcript_filter_name()))
                OK = False

            # check section "identification" - key "assembly_type"
            # (the allowed values depend on the prefix of the assembly dataset identification)
            assembly_type = identification_dict.get('assembly_type', not_found)
            if assembly_type == not_found:
                error_list.append('*** ERROR: the key "assembly_type" is not found in the section "identification".')
                OK = False
            elif assembly_dataset_id.startswith(xlib.get_soapdenovotrans_code()):
                if assembly_type.upper() not in ['CONTIGS', 'SCAFFOLDS']:
                    error_list.append('*** ERROR: the key "assembly_type" must be "CONTIGS" or "SCAFFOLDS" when {0} is the assembly software.'.format(xlib.get_soapdenovotrans_name()))
                    OK = False
            elif assembly_dataset_id.startswith(other_software_codes):
                if assembly_type.upper() != 'NONE':
                    error_list.append('*** ERROR: the key "assembly_type" must be "NONE" when {0} or {1} or {2} or {3} or {4} is the assembly software.'.format(xlib.get_transabyss_name(), xlib.get_trinity_name(), xlib.get_star_name(), xlib.get_cd_hit_est_name(), xlib.get_transcript_filter_name()))
                    OK = False

        # check section "QUAST parameters"
        if 'QUAST parameters' not in sections_list:
            error_list.append('*** ERROR: the section "QUAST parameters" is not found.')
            OK = False
        else:

            # check section "QUAST parameters" - key "threads"
            threads = quast_option_dict.get('QUAST parameters', {}).get('threads', not_found)
            if threads == not_found:
                error_list.append('*** ERROR: the key "threads" is not found in the section "QUAST parameters".')
                OK = False
            else:
                # a value that is not an integer is reported like a value lower than 1
                try:
                    is_valid_threads = int(threads) >= 1
                except (TypeError, ValueError):
                    is_valid_threads = False
                if not is_valid_threads:
                    error_list.append('*** ERROR: the key "threads" in the section "QUAST parameters" must be an integer value greater or equal to 1.')
                    OK = False

    # warn that the results config file is not valid if there are any errors
    if not OK:
        error_list.append('\nThe {0} config file is not valid. Please, correct this file or recreate it.'.format(xlib.get_quast_name()))

    # return the control variable and the error list
    return (OK, error_list)
Esempio n. 15
0
def build_busco_process_script(cluster_name, current_run_dir):
    '''
    Build the current BUSCO process script.

    The options are read from the BUSCO config file and a Bash script is
    written to the path returned by get_busco_process_script(). The script
    downloads and unpacks the lineage data, runs "busco" on the selected
    assembly and sends a mail with the final status.

    Parameters:
        cluster_name: cluster name; it is embedded in the log of the
            generated script and passed to the mail message builders.
        current_run_dir: run directory; the generated script works in it and
            its base name is passed to BUSCO as output name.

    Returns:
        A tuple (OK, error_list). OK is False when the script could not be
        built; error_list then holds the error messages.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # get the BUSCO option dictionary
    busco_option_dict = xlib.get_option_dict(get_busco_config_file())

    # get the options
    identification_dict = busco_option_dict['identification']
    parameters_dict = busco_option_dict['BUSCO parameters']
    experiment_id = identification_dict['experiment_id']
    assembly_software = identification_dict['assembly_software']
    assembly_dataset_id = identification_dict['assembly_dataset_id']
    # the assembly type is compared in upper case, as the QUAST script builder does
    assembly_type = identification_dict['assembly_type'].upper()
    ncpu = parameters_dict['ncpu']
    lineage_data_url = parameters_dict['lineage_data_url']
    mode = parameters_dict['mode'].lower()
    evalue = parameters_dict['evalue']
    limit = parameters_dict['limit']
    species = parameters_dict['species']
    long_mode = parameters_dict['long'].upper()
    # NOTE(review): the Augustus options are written in upper case in the
    # script; confirm that this is intended for options other than "NONE"
    augustus_options = parameters_dict['augustus_options'].upper()

    # get the file and name from the lineage data url; the name is the text
    # before the first point of the file name (the whole file name when there
    # is no point, instead of a name without its last character)
    lineage_data_file = lineage_data_url.split('/')[-1]
    lineage_data = lineage_data_file.partition('.')[0]

    # set the transcriptome file name; it remains None when the assembly
    # software or the assembly type is not one of the supported values
    transcriptome_file_name = None
    if assembly_software == xlib.get_soapdenovotrans_code():
        if assembly_type == 'CONTIGS':
            transcriptome_file_name = f'{experiment_id}-{assembly_dataset_id}.contig'
        elif assembly_type == 'SCAFFOLDS':
            transcriptome_file_name = f'{experiment_id}-{assembly_dataset_id}.scafSeq'
    elif assembly_software == xlib.get_transabyss_code():
        transcriptome_file_name = 'transabyss-final.fa'
    elif assembly_software == xlib.get_trinity_code():
        transcriptome_file_name = 'Trinity.fasta'
    elif assembly_software == xlib.get_ggtrinity_code():
        transcriptome_file_name = 'Trinity-GG.fasta'
    elif assembly_software == xlib.get_cd_hit_est_code():
        transcriptome_file_name = 'clustered-transcriptome.fasta'
    elif assembly_software == xlib.get_transcript_filter_code():
        transcriptome_file_name = 'filtered-transcriptome.fasta'

    # report an unsupported assembly explicitly instead of failing later with
    # an unbound variable while the script is half written
    if transcriptome_file_name is None:
        error_list.append(f'*** ERROR: the transcriptome file can not be determined with the assembly software "{assembly_software}" and the assembly type "{assembly_type}".')
        return (False, error_list)

    # set the transcriptome file path
    transcriptome_file = f'{xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id)}/{transcriptome_file_name}'

    # get the BUSCO process script name
    busco_process_script = get_busco_process_script()

    # write the BUSCO process script
    try:

        # texts shared by several parts of the script
        separator_line = '#-------------------------------------------------------------------------------'
        contact_data = xconfiguration.get_contact_data()
        process_name = f'{xlib.get_busco_name()} process'
        mail_message_ok = xlib.get_mail_message_ok(process_name, cluster_name)
        mail_message_wrong = xlib.get_mail_message_wrong(process_name, cluster_name)

        # Python sentence run by the script to download the lineage data
        download_script = f'import requests; r = requests.get(\'{lineage_data_url}\') ; open(\'{lineage_data_file}\' , \'wb\').write(r.content)'

        # build all the lines before the file is opened, so a failure in a
        # helper does not leave a truncated script behind
        script_lines = [
            '#!/bin/bash',
            separator_line,
            'SEP="#########################################"',
            'export HOST_IP=`curl --silent checkip.amazonaws.com`',
            'export HOST_ADDRESS="ec2-${HOST_IP//./-}-compute-1.amazonaws.com"',
            'export AWS_CONFIG_FILE=/home/ubuntu/.aws/config',
            'export AWS_SHARED_CREDENTIALS_FILE=/home/ubuntu/.aws/credentials',
            separator_line,
            f'MINICONDA3_BIN_PATH={xlib.get_cluster_app_dir()}/{xlib.get_miniconda3_name()}/bin',
            'export PATH=$MINICONDA3_BIN_PATH:$PATH',
            separator_line,
            f'STATUS_DIR={xlib.get_status_dir(current_run_dir)}',
            f'SCRIPT_STATUS_OK={xlib.get_status_ok(current_run_dir)}',
            f'SCRIPT_STATUS_WRONG={xlib.get_status_wrong(current_run_dir)}',
            'mkdir --parents $STATUS_DIR',
            'if [ -f $SCRIPT_STATUS_OK ]; then rm $SCRIPT_STATUS_OK; fi',
            'if [ -f $SCRIPT_STATUS_WRONG ]; then rm $SCRIPT_STATUS_WRONG; fi',
            separator_line,
            f'CURRENT_DIR={current_run_dir}',
            separator_line,
            'function init',
            '{',
            '    INIT_DATETIME=`date --utc +%s`',
            '    FORMATTED_INIT_DATETIME=`date --date="@$INIT_DATETIME" "+%Y-%m-%d %H:%M:%S"`',
            '    echo "$SEP"',
            '    echo "Script started at $FORMATTED_INIT_DATETIME+00:00."',
            '    echo "$SEP"',
            f'    echo "CLUSTER: {cluster_name}"',
            '    echo "HOST NAME: $HOSTNAME"',
            '    echo "HOST IP: $HOST_IP"',
            '    echo "HOST ADDRESS: $HOST_ADDRESS"',
            '}',
            separator_line,
            'function download_lineage_data',
            '{',
            '    cd $CURRENT_DIR',
            '    echo "$SEP"',
            '    echo "Downloading lineage data ..."',
            f'    $MINICONDA3_BIN_PATH/python3 -c "{download_script}"',
            '    RC=$?',
            '    if [ $RC -ne 0 ]; then manage_error download_script $RC; fi',
            f'    tar -xzvf ./{lineage_data_file}',
            '    RC=$?',
            '    if [ $RC -ne 0 ]; then manage_error tar $RC; fi',
            f'    rm ./{lineage_data_file}',
            '    RC=$?',
            '    if [ $RC -ne 0 ]; then manage_error rm $RC; fi',
            '    echo "Lineage data are downloaded."',
            '}',
            separator_line,
            'function run_busco_process',
            '{',
            f'    source activate {xlib.get_busco_anaconda_code()}',
            '    cd $CURRENT_DIR',
            '    echo "$SEP"',
            '    echo "Assessing the transcriptome quality ..."',
            '    /usr/bin/time \\',
            f'        --format="{xlib.get_time_output_format(separator=False)}" \\',
            '        busco \\',
            f'            --cpu={ncpu} \\',
            f'            --lineage_dataset=./{lineage_data} \\',
            f'            --mode={mode} \\',
            f'            --evalue={evalue} \\',
            f'            --limit={limit} \\',
        ]

        # the following options are only written when they are set
        if species.upper() != 'NONE':
            script_lines.append(f'            --species={species} \\')
        if long_mode == 'YES':
            script_lines.append('            --long \\')
        if augustus_options != 'NONE':
            script_lines.append(f'            --august_options="{augustus_options}" \\')

        script_lines.extend([
            f'            --in={transcriptome_file} \\',
            f'            --out={os.path.basename(current_run_dir)}',
            '    RC=$?',
            '    if [ $RC -ne 0 ]; then manage_error run_BUSCO.py $RC; fi',
            '    echo "The assessment is done."',
            '    conda deactivate',
            '}',
            separator_line,
            'function end',
            '{',
            '    END_DATETIME=`date --utc +%s`',
            '    FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`',
            '    calculate_duration',
            '    echo "$SEP"',
            '    echo "Script ended OK at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."',
            '    echo "$SEP"',
            '    send_mail ok',
            '    touch $SCRIPT_STATUS_OK',
            '    exit 0',
            '}',
            separator_line,
            'function manage_error',
            '{',
            '    END_DATETIME=`date --utc +%s`',
            '    FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`',
            '    calculate_duration',
            '    echo "$SEP"',
            '    echo "ERROR: $1 returned error $2"',
            '    echo "Script ended WRONG at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."',
            '    echo "$SEP"',
            '    send_mail wrong',
            '    touch $SCRIPT_STATUS_WRONG',
            '    exit 3',
            '}',
            separator_line,
            'function send_mail',
            '{',
            f'    SUBJECT="{xlib.get_project_name()}: {process_name}"',
            '    if [ "$1" == "ok" ]; then',
            f'        MESSAGE="{mail_message_ok}"',
            '    elif [ "$1" == "wrong" ]; then',
            f'        MESSAGE="{mail_message_wrong}"',
            '    else',
            '         MESSAGE=""',
            '    fi',
            '    DESTINATION_FILE=mail-destination.json',
            '    echo "{" > $DESTINATION_FILE',
            f'    echo "    \\\"ToAddresses\\\":  [\\\"{contact_data}\\\"]," >> $DESTINATION_FILE',
            '    echo "    \\\"CcAddresses\\\":  []," >> $DESTINATION_FILE',
            '    echo "    \\\"BccAddresses\\\":  []" >> $DESTINATION_FILE',
            '    echo "}" >> $DESTINATION_FILE',
            '    MESSAGE_FILE=mail-message.json',
            '    echo "{" > $MESSAGE_FILE',
            '    echo "    \\\"Subject\\\": {" >> $MESSAGE_FILE',
            '    echo "        \\\"Data\\\":  \\\"$SUBJECT\\\"," >> $MESSAGE_FILE',
            '    echo "        \\\"Charset\\\":  \\\"UTF-8\\\"" >> $MESSAGE_FILE',
            '    echo "    }," >> $MESSAGE_FILE',
            '    echo "    \\\"Body\\\": {" >> $MESSAGE_FILE',
            '    echo "        \\\"Html\\\": {" >> $MESSAGE_FILE',
            '    echo "            \\\"Data\\\":  \\\"$MESSAGE\\\"," >> $MESSAGE_FILE',
            '    echo "            \\\"Charset\\\":  \\\"UTF-8\\\"" >> $MESSAGE_FILE',
            '    echo "        }" >> $MESSAGE_FILE',
            '    echo "    }" >> $MESSAGE_FILE',
            '    echo "}" >> $MESSAGE_FILE',
            f'    aws ses send-email --from {contact_data} --destination file://$DESTINATION_FILE --message file://$MESSAGE_FILE',
            '}',
            separator_line,
            'function calculate_duration',
            '{',
            '    DURATION=`expr $END_DATETIME - $INIT_DATETIME`',
            '    HH=`expr $DURATION / 3600`',
            '    MM=`expr $DURATION % 3600 / 60`',
            '    SS=`expr $DURATION % 60`',
            '    FORMATTED_DURATION=`printf "%03d:%02d:%02d\\n" $HH $MM $SS`',
            '}',
            separator_line,
            'init',
            'download_lineage_data',
            'run_busco_process',
            'end',
        ])

        # create the script directory if it is necessary (an empty directory
        # name means that the script is written in the current directory)
        script_dir = os.path.dirname(busco_process_script)
        if script_dir:
            os.makedirs(script_dir, exist_ok=True)

        # write the lines with Linux line endings
        with open(busco_process_script, mode='w', encoding='iso-8859-1', newline='\n') as script_file_id:
            script_file_id.writelines(f'{line}\n' for line in script_lines)

    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append(f'*** ERROR: The file {busco_process_script} can not be created')
        OK = False

    # return the control variable and the error list
    return (OK, error_list)
Esempio n. 16
0
def build_cd_hit_est_process_script(cluster_name, current_run_dir):
    '''
    Build the current CD-HIT-EST process script.

    The options are read from the CD-HIT-EST config file and a bash script is
    written to the path given by get_cd_hit_est_process_script(). The script
    runs cd-hit-est on the transcriptome file of the selected assembly and
    writes the result to "clustered-transcriptome.fasta" in current_run_dir.

    Parameters:
        cluster_name: cluster name; it is echoed by the script and passed to
            the mail message builders.
        current_run_dir: run directory in the cluster; it is used as working
            directory, status directory base and output directory.

    Returns:
        A tuple (OK, error_list): OK is False if the script could not be
        built and error_list then holds the messages describing the problem.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # get the option dictionary
    cd_hit_est_option_dict = xlib.get_option_dict(get_cd_hit_est_config_file())

    # get the options
    identification_dict = cd_hit_est_option_dict['identification']
    parameters_dict = cd_hit_est_option_dict['CD-HIT-EST parameters']
    experiment_id = identification_dict['experiment_id']
    assembly_software = identification_dict['assembly_software']
    assembly_dataset_id = identification_dict['assembly_dataset_id']
    assembly_type = identification_dict['assembly_type']
    threads = parameters_dict['threads']
    memory_limit = parameters_dict['memory_limit']
    seq_identity_threshold = parameters_dict['seq_identity_threshold']
    word_length = parameters_dict['word_length']
    mask = parameters_dict['mask']
    match = parameters_dict['match']
    mismatch = parameters_dict['mismatch']
    other_parameters = parameters_dict['other_parameters']

    # set the transcriptome file path; it stays None when the assembly
    # software (or the assembly type of SOAPdenovo-Trans) is not a known one
    assembly_dataset_dir = xlib.get_cluster_experiment_result_dataset_dir(
        experiment_id, assembly_dataset_id)
    transcriptome_file = None
    if assembly_software == xlib.get_soapdenovotrans_code():
        if assembly_type == 'CONTIGS':
            transcriptome_file = f'{assembly_dataset_dir}/{experiment_id}-{assembly_dataset_id}.contig'
        elif assembly_type == 'SCAFFOLDS':
            transcriptome_file = f'{assembly_dataset_dir}/{experiment_id}-{assembly_dataset_id}.scafSeq'
    elif assembly_software == xlib.get_transabyss_code():
        transcriptome_file = f'{assembly_dataset_dir}/transabyss-final.fa'
    elif assembly_software == xlib.get_trinity_code():
        transcriptome_file = f'{assembly_dataset_dir}/Trinity.fasta'
    elif assembly_software == xlib.get_ggtrinity_code():
        transcriptome_file = f'{assembly_dataset_dir}/Trinity-GG.fasta'
    elif assembly_software == xlib.get_cd_hit_est_code():
        transcriptome_file = f'{assembly_dataset_dir}/clustered-transcriptome.fasta'
    elif assembly_software == xlib.get_transcript_filter_code():
        transcriptome_file = f'{assembly_dataset_dir}/filtered-transcriptome.fasta'

    # without an input file the script can not be built; report it explicitly
    # instead of failing with a NameError after a partial script is written
    if transcriptome_file is None:
        error_list.append(
            f'*** ERROR: The transcriptome file can not be determined (assembly software: {assembly_software}; assembly type: {assembly_type}).'
        )
        return (False, error_list)

    # set the output file path
    output_file = f'{current_run_dir}/clustered-transcriptome.fasta'

    # write the CD-HIT-EST process script
    try:
        if not os.path.exists(os.path.dirname(
                get_cd_hit_est_process_script())):
            os.makedirs(os.path.dirname(get_cd_hit_est_process_script()))
        with open(get_cd_hit_est_process_script(),
                  mode='w',
                  encoding='iso-8859-1',
                  newline='\n') as script_file_id:
            script_file_id.write('#!/bin/bash\n')
            script_file_id.write(
                '#-------------------------------------------------------------------------------\n'
            )
            script_file_id.write(
                'SEP="#########################################"\n')
            script_file_id.write(
                'export HOST_IP=`curl --silent checkip.amazonaws.com`\n')
            script_file_id.write(
                'export HOST_ADDRESS="ec2-${HOST_IP//./-}-compute-1.amazonaws.com"\n'
            )
            script_file_id.write(
                'export AWS_CONFIG_FILE=/home/ubuntu/.aws/config\n')
            script_file_id.write(
                'export AWS_SHARED_CREDENTIALS_FILE=/home/ubuntu/.aws/credentials\n'
            )
            script_file_id.write(
                '#-------------------------------------------------------------------------------\n'
            )
            script_file_id.write(
                f'MINICONDA3_BIN_PATH={xlib.get_cluster_app_dir()}/{xlib.get_miniconda3_name()}/bin\n'
            )
            script_file_id.write('export PATH=$MINICONDA3_BIN_PATH:$PATH\n')
            script_file_id.write(
                '#-------------------------------------------------------------------------------\n'
            )
            script_file_id.write(
                f'STATUS_DIR={xlib.get_status_dir(current_run_dir)}\n')
            script_file_id.write(
                f'SCRIPT_STATUS_OK={xlib.get_status_ok(current_run_dir)}\n')
            script_file_id.write(
                f'SCRIPT_STATUS_WRONG={xlib.get_status_wrong(current_run_dir)}\n'
            )
            script_file_id.write('mkdir --parents $STATUS_DIR\n')
            script_file_id.write(
                'if [ -f $SCRIPT_STATUS_OK ]; then rm $SCRIPT_STATUS_OK; fi\n')
            script_file_id.write(
                'if [ -f $SCRIPT_STATUS_WRONG ]; then rm $SCRIPT_STATUS_WRONG; fi\n'
            )
            script_file_id.write(
                '#-------------------------------------------------------------------------------\n'
            )
            script_file_id.write(f'CURRENT_DIR={current_run_dir}\n')
            script_file_id.write(
                '#-------------------------------------------------------------------------------\n'
            )
            script_file_id.write('function init\n')
            script_file_id.write('{\n')
            script_file_id.write('    INIT_DATETIME=`date --utc +%s`\n')
            script_file_id.write(
                '    FORMATTED_INIT_DATETIME=`date --date="@$INIT_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n'
            )
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write(
                '    echo "Script started at $FORMATTED_INIT_DATETIME+00:00."\n'
            )
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write(f'    echo "CLUSTER: {cluster_name}"\n')
            script_file_id.write('    echo "HOST NAME: $HOSTNAME"\n')
            script_file_id.write('    echo "HOST IP: $HOST_IP"\n')
            script_file_id.write('    echo "HOST ADDRESS: $HOST_ADDRESS"\n')
            script_file_id.write('}\n')
            script_file_id.write(
                '#-------------------------------------------------------------------------------\n'
            )
            script_file_id.write('function run_cd_hit_est_process\n')
            script_file_id.write('{\n')
            script_file_id.write(
                f'    source activate {xlib.get_cd_hit_anaconda_code()}\n')
            script_file_id.write('    cd $CURRENT_DIR\n')
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write('    echo "Filtering transcriptome ..."\n')
            script_file_id.write('    /usr/bin/time \\\n')
            script_file_id.write(
                f'        --format="{xlib.get_time_output_format()}" \\\n')
            script_file_id.write('        cd-hit-est \\\n')
            script_file_id.write(f'            -T {threads} \\\n')
            script_file_id.write(f'            -M {memory_limit} \\\n')
            script_file_id.write(f'            -i {transcriptome_file} \\\n')
            script_file_id.write(
                f'            -c {seq_identity_threshold} \\\n')
            script_file_id.write(f'            -n {word_length} \\\n')
            script_file_id.write(f'            -mask {mask} \\\n')
            script_file_id.write(f'            -match {match} \\\n')
            script_file_id.write(f'            -mismatch {mismatch} \\\n')
            if other_parameters.upper() == 'NONE':
                script_file_id.write(f'            -o {output_file}\n')
            else:
                script_file_id.write(f'            -o {output_file} \\\n')
                # the extra parameters come as "--name=value" or "--name"
                # items separated by ";" and are passed with a single dash
                parameter_list = [
                    x.strip() for x in other_parameters.split(';')
                ]
                last_index = len(parameter_list) - 1
                for i, parameter in enumerate(parameter_list):
                    # every line but the last one needs a continuation
                    # backslash; a backslash after the last parameter would
                    # join the following "RC=$?" line to the command
                    line_end = ' \\\n' if i < last_index else '\n'
                    if parameter.find('=') > 0:
                        mo = re.search(r'^--(.+)=(.+)$', parameter)
                    else:
                        mo = re.search(r'^--(.+)$', parameter)
                    if mo is None:
                        raise ValueError(
                            f'the parameter "{parameter}" has not the format --name or --name=value'
                        )
                    parameter_name = mo.group(1).strip()
                    if mo.lastindex == 2:
                        parameter_value = mo.group(2).strip()
                        script_file_id.write(
                            f'            -{parameter_name} {parameter_value}{line_end}'
                        )
                    else:
                        script_file_id.write(
                            f'            -{parameter_name}{line_end}')
            script_file_id.write('    RC=$?\n')
            script_file_id.write(
                '    if [ $RC -ne 0 ]; then manage_error cd-hit-est $RC; fi\n')
            script_file_id.write('    echo "The transcriptome is filtered."\n')
            script_file_id.write('    conda deactivate\n')
            script_file_id.write('}\n')
            script_file_id.write(
                '#-------------------------------------------------------------------------------\n'
            )
            script_file_id.write('function end\n')
            script_file_id.write('{\n')
            script_file_id.write('    END_DATETIME=`date --utc +%s`\n')
            script_file_id.write(
                '    FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n'
            )
            script_file_id.write('    calculate_duration\n')
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write(
                '    echo "Script ended OK at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."\n'
            )
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write('    send_mail ok\n')
            script_file_id.write('    touch $SCRIPT_STATUS_OK\n')
            script_file_id.write('    exit 0\n')
            script_file_id.write('}\n')
            script_file_id.write(
                '#-------------------------------------------------------------------------------\n'
            )
            script_file_id.write('function manage_error\n')
            script_file_id.write('{\n')
            script_file_id.write('    END_DATETIME=`date --utc +%s`\n')
            script_file_id.write(
                '    FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n'
            )
            script_file_id.write('    calculate_duration\n')
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write('    echo "ERROR: $1 returned error $2"\n')
            script_file_id.write(
                '    echo "Script ended WRONG at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."\n'
            )
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write('    send_mail wrong\n')
            script_file_id.write('    touch $SCRIPT_STATUS_WRONG\n')
            script_file_id.write('    exit 3\n')
            script_file_id.write('}\n')
            script_file_id.write(
                '#-------------------------------------------------------------------------------\n'
            )
            process_name = f'{xlib.get_cd_hit_est_name()} process'
            mail_message_ok = xlib.get_mail_message_ok(process_name,
                                                       cluster_name)
            mail_message_wrong = xlib.get_mail_message_wrong(
                process_name, cluster_name)
            script_file_id.write('function send_mail\n')
            script_file_id.write('{\n')
            script_file_id.write(
                f'    SUBJECT="{xlib.get_project_name()}: {process_name}"\n')
            script_file_id.write('    if [ "$1" == "ok" ]; then\n')
            script_file_id.write(f'        MESSAGE="{mail_message_ok}"\n')
            script_file_id.write('    elif [ "$1" == "wrong" ]; then\n')
            script_file_id.write(f'        MESSAGE="{mail_message_wrong}"\n')
            script_file_id.write('    else\n')
            script_file_id.write('         MESSAGE=""\n')
            script_file_id.write('    fi\n')
            script_file_id.write(
                '    DESTINATION_FILE=mail-destination.json\n')
            script_file_id.write('    echo "{" > $DESTINATION_FILE\n')
            script_file_id.write(
                f'    echo "    \\\"ToAddresses\\\":  [\\\"{xconfiguration.get_contact_data()}\\\"]," >> $DESTINATION_FILE\n'
            )
            script_file_id.write(
                '    echo "    \\\"CcAddresses\\\":  []," >> $DESTINATION_FILE\n'
            )
            script_file_id.write(
                '    echo "    \\\"BccAddresses\\\":  []" >> $DESTINATION_FILE\n'
            )
            script_file_id.write('    echo "}" >> $DESTINATION_FILE\n')
            script_file_id.write('    MESSAGE_FILE=mail-message.json\n')
            script_file_id.write('    echo "{" > $MESSAGE_FILE\n')
            script_file_id.write(
                '    echo "    \\\"Subject\\\": {" >> $MESSAGE_FILE\n')
            script_file_id.write(
                '    echo "        \\\"Data\\\":  \\\"$SUBJECT\\\"," >> $MESSAGE_FILE\n'
            )
            script_file_id.write(
                '    echo "        \\\"Charset\\\":  \\\"UTF-8\\\"" >> $MESSAGE_FILE\n'
            )
            script_file_id.write('    echo "    }," >> $MESSAGE_FILE\n')
            script_file_id.write(
                '    echo "    \\\"Body\\\": {" >> $MESSAGE_FILE\n')
            script_file_id.write(
                '    echo "        \\\"Html\\\": {" >> $MESSAGE_FILE\n')
            script_file_id.write(
                '    echo "            \\\"Data\\\":  \\\"$MESSAGE\\\"," >> $MESSAGE_FILE\n'
            )
            script_file_id.write(
                '    echo "            \\\"Charset\\\":  \\\"UTF-8\\\"" >> $MESSAGE_FILE\n'
            )
            script_file_id.write('    echo "        }" >> $MESSAGE_FILE\n')
            script_file_id.write('    echo "    }" >> $MESSAGE_FILE\n')
            script_file_id.write('    echo "}" >> $MESSAGE_FILE\n')
            script_file_id.write(
                f'    aws ses send-email --from {xconfiguration.get_contact_data()} --destination file://$DESTINATION_FILE --message file://$MESSAGE_FILE\n'
            )
            script_file_id.write('}\n')
            script_file_id.write(
                '#-------------------------------------------------------------------------------\n'
            )
            script_file_id.write('function calculate_duration\n')
            script_file_id.write('{\n')
            script_file_id.write(
                '    DURATION=`expr $END_DATETIME - $INIT_DATETIME`\n')
            script_file_id.write('    HH=`expr $DURATION / 3600`\n')
            script_file_id.write('    MM=`expr $DURATION % 3600 / 60`\n')
            script_file_id.write('    SS=`expr $DURATION % 60`\n')
            script_file_id.write(
                '    FORMATTED_DURATION=`printf "%03d:%02d:%02d\\n" $HH $MM $SS`\n'
            )
            script_file_id.write('}\n')
            script_file_id.write(
                '#-------------------------------------------------------------------------------\n'
            )
            script_file_id.write('init\n')
            script_file_id.write('run_cd_hit_est_process\n')
            script_file_id.write('end\n')
    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append(
            f'*** ERROR: The file {get_cd_hit_est_process_script()} can not be created'
        )
        OK = False

    # return the control variable and the error list
    return (OK, error_list)
Esempio n. 17
0
def run_gmap_process(cluster_name, log, function=None):
    '''
    Run a GMAP process.

    The GMAP config file is validated, the cluster master is reached through
    SSH/SFTP, the process requirements are verified, the process script and
    its starter are built, uploaded and made executable in the run directory
    and, finally, the starter is submitted with qsub. Each step is executed
    only if all the previous ones ended OK.

    Parameters:
        cluster_name: name of the cluster where the process is run.
        log: object with a write() method that receives the progress and
            error messages.
        function: optional callable without arguments that is invoked just
            before returning, whatever the result is.

    Returns:
        True if every step ended OK; otherwise, False.
    '''

    # initialize the control variable
    OK = True

    # remember which connections were opened so that they are closed at the
    # end even if a later step fails
    is_ssh_client_connected = False
    is_ssh_transport_connected = False

    # get the GMAP option dictionary
    gmap_option_dict = xlib.get_option_dict(get_gmap_config_file())

    # get the experiment identification
    experiment_id = gmap_option_dict['identification']['experiment_id']

    # warn that the log window must not be closed
    if not isinstance(log, xlib.DevStdOut):
        log.write('This process might take several minutes. Do not close this window, please wait!\n')

    # validate the GMAP config file
    log.write('{0}\n'.format(xlib.get_separator()))
    log.write('Validating the {0} config file ...\n'.format(xlib.get_gmap_name()))
    (OK, error_list) = validate_gmap_config_file(strict=True)
    if OK:
        log.write('The config file is OK.\n')
    else:
        log.write('*** ERROR: The config file is not valid.\n')
        log.write('Please correct this file or recreate the config files.\n')

    # create the SSH client connection
    if OK:
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('Connecting the SSH client ...\n')
        (OK, error_list, ssh_client) = xssh.create_ssh_client_connection(cluster_name, 'master')
        if OK:
            is_ssh_client_connected = True
            log.write('The SSH client is connected.\n')
        else:
            for error in error_list:
                log.write('{0}\n'.format(error))

    # create the SSH transport connection
    if OK:
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('Connecting the SSH transport ...\n')
        (OK, error_list, ssh_transport) = xssh.create_ssh_transport_connection(cluster_name, 'master')
        if OK:
            is_ssh_transport_connected = True
            log.write('The SSH transport is connected.\n')
        else:
            for error in error_list:
                log.write('{0}\n'.format(error))

    # create the SFTP client
    if OK:
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('Connecting the SFTP client ...\n')
        sftp_client = xssh.create_sftp_client(ssh_transport)
        log.write('The SFTP client is connected.\n')

    # warn that the requirements are being verified
    if OK:
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('Verifying process requirements ...\n')

    # verify the master is running (16 is the state code accepted as running)
    if OK:
        (master_state_code, master_state_name) = xec2.get_node_state(cluster_name, 'master')
        if master_state_code != 16:
            log.write('*** ERROR: The cluster {0} is not running. Its state is {1} ({2}).\n'.format(cluster_name, master_state_code, master_state_name))
            OK = False

    # verify the GMAP-GSNAP is setup
    if OK:
        (OK, error_list, is_setup) = xbioinfoapp.is_setup_bioconda_package(xlib.get_gmap_gsnap_bioconda_code(), cluster_name, True, ssh_client)
        if OK:
            if not is_setup:
                log.write('*** ERROR: {0} is not setup.\n'.format(xlib.get_gmap_name()))
                OK = False
        else:
            log.write('*** ERROR: The verification of {0} setup could not be performed.\n'.format(xlib.get_gmap_name()))
            for error in error_list:
                log.write('{0}\n'.format(error))

    # warn that the requirements are OK
    if OK:
        log.write('Process requirements are OK.\n')

    # determine the run directory in the cluster
    if OK:
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('Determining the run directory in the cluster ...\n')
        current_run_dir = xlib.get_cluster_current_run_dir(experiment_id, xlib.get_gmap_code())
        command = 'mkdir --parents {0}'.format(current_run_dir)
        (OK, stdout, stderr) = xssh.execute_cluster_command(ssh_client, command)
        if OK:
            log.write('The directory path is {0}.\n'.format(current_run_dir))
        else:
            log.write('*** ERROR: Wrong command ---> {0}\n'.format(command))

    # build the GMAP process script
    if OK:
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('Building the process script {0} ...\n'.format(get_gmap_process_script()))
        (OK, error_list) = build_gmap_process_script(cluster_name, current_run_dir)
        if OK:
            log.write('The file is built.\n')
        else:
            log.write('*** ERROR: The file could not be built.\n')
            for error in error_list:
                log.write('{0}\n'.format(error))

    # upload the GMAP process script in the cluster
    if OK:
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('Uploading the process script {0} in the directory {1} of the master ...\n'.format(get_gmap_process_script(), current_run_dir))
        cluster_path = '{0}/{1}'.format(current_run_dir, os.path.basename(get_gmap_process_script()))
        (OK, error_list) = xssh.put_file(sftp_client, get_gmap_process_script(), cluster_path)
        if OK:
            log.write('The file is uploaded.\n')
        else:
            for error in error_list:
                log.write('{0}\n'.format(error))

    # set run permision to the GMAP process script in the cluster
    if OK:
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('Setting on the run permision of {0}/{1} ...\n'.format(current_run_dir, os.path.basename(get_gmap_process_script())))
        command = 'chmod u+x {0}/{1}'.format(current_run_dir, os.path.basename(get_gmap_process_script()))
        (OK, stdout, stderr) = xssh.execute_cluster_command(ssh_client, command)
        if OK:
            log.write('The run permision is set.\n')
        else:
            log.write('*** ERROR: Wrong command ---> {0}\n'.format(command))

    # build the GMAP process starter
    if OK:
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('Building the process starter {0} ...\n'.format(get_gmap_process_starter()))
        (OK, error_list) = build_gmap_process_starter(current_run_dir)
        if OK:
            log.write('The file is built.\n')
        else:
            log.write('*** ERROR: The file could not be built.\n')
            for error in error_list:
                log.write('{0}\n'.format(error))

    # upload the GMAP process starter in the cluster
    if OK:
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('Uploading the process starter {0} in the directory {1} of the master ...\n'.format(get_gmap_process_starter(), current_run_dir))
        cluster_path = '{0}/{1}'.format(current_run_dir, os.path.basename(get_gmap_process_starter()))
        (OK, error_list) = xssh.put_file(sftp_client, get_gmap_process_starter(), cluster_path)
        if OK:
            log.write('The file is uploaded.\n')
        else:
            for error in error_list:
                log.write('{0}\n'.format(error))

    # set run permision to the GMAP process starter in the cluster
    if OK:
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('Setting on the run permision of {0}/{1} ...\n'.format(current_run_dir, os.path.basename(get_gmap_process_starter())))
        command = 'chmod u+x {0}/{1}'.format(current_run_dir, os.path.basename(get_gmap_process_starter()))
        (OK, stdout, stderr) = xssh.execute_cluster_command(ssh_client, command)
        if OK:
            log.write('The run permision is set.\n')
        else:
            log.write('*** ERROR: Wrong command ---> {0}\n'.format(command))

    # submit the GMAP process
    if OK:
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('Submitting the process script {0}/{1} ...\n'.format(current_run_dir, os.path.basename(get_gmap_process_starter())))
        sge_env = xcluster.get_sge_env()
        command = '{0}; qsub -V -b n -cwd {1}/{2}'.format(sge_env, current_run_dir, os.path.basename(get_gmap_process_starter()))
        (OK, stdout, stderr) = xssh.execute_cluster_command(ssh_client, command)
        if OK:
            for line in stdout:
                log.write('{0}\n'.format(line))
        else:
            log.write('*** ERROR: Wrong command ---> {0}\n'.format(command))

    # close the SSH transport connection whenever it was opened, so that it is
    # not leaked when one of the previous steps failed
    if is_ssh_transport_connected:
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('Closing the SSH transport connection ...\n')
        xssh.close_ssh_transport_connection(ssh_transport)
        log.write('The connection is closed.\n')

    # close the SSH client connection whenever it was opened
    if is_ssh_client_connected:
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('Closing the SSH client connection ...\n')
        xssh.close_ssh_client_connection(ssh_client)
        log.write('The connection is closed.\n')

    # warn that the log window can be closed
    if not isinstance(log, xlib.DevStdOut):
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('You can close this window now.\n')

    # execute final function
    if function is not None:
        function()

    # return the control variable
    return OK
Esempio n. 18
0
def build_infrastructure_software_installation_script(cluster_name):
    '''
    Build the infrastructure software installation script.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # get the connetion data
    (user_id, access_key_id,
     secret_access_key) = xconfiguration.get_basic_aws_data()

    # get the old region and user identification
    current_region_name = xconfiguration.get_current_region_name()

    # get the NGScloud config file
    ngscloud_config_file = xconfiguration.get_ngscloud_config_file()

    # get the option dictionary corresponding to the NGScloud config file
    ngscloud_options_dict = xlib.get_option_dict(ngscloud_config_file)

    # get the dataset structure and NGScloud_volume
    dataset_structure = ngscloud_options_dict['dataset info'][
        'dataset_structure']

    # write the infrastructure software installation script
    try:
        if not os.path.exists(
                os.path.dirname(
                    get_infrastructure_software_installation_script())):
            os.makedirs(
                os.path.dirname(
                    get_infrastructure_software_installation_script()))
        with open(get_infrastructure_software_installation_script(),
                  mode='w',
                  encoding='iso-8859-1',
                  newline='\n') as script_file_id:
            script_file_id.write('#!/bin/bash\n')
            script_file_id.write(
                '#-------------------------------------------------------------------------------\n'
            )
            script_file_id.write(
                'SEP="#########################################"\n')
            script_file_id.write(
                'export HOST_IP=`curl --silent checkip.amazonaws.com`\n')
            script_file_id.write(
                'export HOST_ADDRESS="ec2-${HOST_IP//./-}-compute-1.amazonaws.com"\n'
            )
            script_file_id.write(
                'export AWS_CONFIG_FILE=/home/ubuntu/.aws/config\n')
            script_file_id.write(
                'export AWS_SHARED_CREDENTIALS_FILE=/home/ubuntu/.aws/credentials\n'
            )
            script_file_id.write(
                '#-------------------------------------------------------------------------------\n'
            )
            script_file_id.write('function init\n')
            script_file_id.write('{\n')
            script_file_id.write('    INIT_DATETIME=`date --utc +%s`\n')
            script_file_id.write(
                '    FORMATTED_INIT_DATETIME=`date --date="@$INIT_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n'
            )
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write(
                '    echo "Script started at $FORMATTED_INIT_DATETIME+00:00."\n'
            )
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write(f'    echo "CLUSTER: {cluster_name}"\n')
            script_file_id.write('    echo "HOST NAME: $HOSTNAME"\n')
            script_file_id.write('    echo "HOST IP: $HOST_IP"\n')
            script_file_id.write('    echo "HOST ADDRESS: $HOST_ADDRESS"\n')
            script_file_id.write('}\n')
            if dataset_structure in [
                    xconfiguration.get_dataset_structure_singlevolume(),
                    xconfiguration.get_dataset_structure_none()
            ]:
                script_file_id.write(
                    '#-------------------------------------------------------------------------------\n'
                )
                script_file_id.write('function create_dataset_structure\n')
                script_file_id.write('{\n')
                script_file_id.write('    echo "$SEP"\n')
                script_file_id.write(
                    '    echo "Creating the dataset structure ..."\n')
                script_file_id.write(
                    f'    sudo mkdir --parents {xlib.get_cluster_app_dir()}\n')
                script_file_id.write(
                    f'    sudo mkdir --parents {xlib.get_cluster_database_dir()}\n'
                )
                script_file_id.write(
                    f'    sudo mkdir --parents {xlib.get_cluster_read_dir()}\n'
                )
                script_file_id.write(
                    f'    sudo mkdir --parents {xlib.get_cluster_reference_dir()}\n'
                )
                script_file_id.write(
                    f'    sudo mkdir --parents {xlib.get_cluster_result_dir()}\n'
                )
                script_file_id.write(
                    '    echo "The dataset structure is created."\n')
                script_file_id.write('}\n')
            script_file_id.write(
                '#-------------------------------------------------------------------------------\n'
            )
            script_file_id.write('function install_awscli\n')
            script_file_id.write('{\n')
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write('    echo "Installing the AWS CLI ..."\n')
            script_file_id.write(f'    unzip {xlib.get_awscli_name()}.zip\n')
            script_file_id.write('    RC=$?\n')
            script_file_id.write('    if [ $RC -ne 0 ]; then unzip $RC; fi\n')
            script_file_id.write('    sudo ./aws/install\n')
            script_file_id.write('    RC=$?\n')
            script_file_id.write(
                '    if [ $RC -ne 0 ]; then install $RC; fi\n')
            script_file_id.write('    rm -rf aws\n')
            script_file_id.write('    RC=$?\n')
            script_file_id.write('    if [ $RC -ne 0 ]; then rm $RC; fi\n')
            script_file_id.write(f'    rm {xlib.get_awscli_name()}.zip\n')
            script_file_id.write('    RC=$?\n')
            script_file_id.write('    if [ $RC -ne 0 ]; then rm $RC; fi\n')
            script_file_id.write('    echo "The package is installed."\n')
            script_file_id.write('}\n')
            script_file_id.write(
                '#-------------------------------------------------------------------------------\n'
            )
            script_file_id.write('function setup_aws\n')
            script_file_id.write('{\n')
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write('    echo "Setting up AWS ..."\n')
            script_file_id.write('    UBUNTU_AWS_DIR=/home/ubuntu/.aws\n')
            script_file_id.write('    mkdir --parents $UBUNTU_AWS_DIR\n')
            script_file_id.write(f'    CONFIG_FILE=$UBUNTU_AWS_DIR/config\n')
            script_file_id.write('    echo "[default]" > $CONFIG_FILE\n')
            script_file_id.write(
                f'    echo "region = {current_region_name}" >> $CONFIG_FILE\n')
            script_file_id.write(
                '    CREDENTIALS_FILE=$UBUNTU_AWS_DIR/credentials\n')
            script_file_id.write('    echo "[default]" > $CREDENTIALS_FILE\n')
            script_file_id.write(
                f'    echo "aws_access_key_id = {access_key_id}" >> $CREDENTIALS_FILE\n'
            )
            script_file_id.write(
                f'    echo "aws_secret_access_key = {secret_access_key}" >> $CREDENTIALS_FILE\n'
            )
            script_file_id.write('    sudo echo "AWS is set up."\n')
            script_file_id.write('}\n')
            script_file_id.write(
                '#-------------------------------------------------------------------------------\n'
            )
            script_file_id.write('function fix_source_list\n')
            script_file_id.write('{\n')
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write(
                '    echo "Fixing file /etc/apt/sources.list ..."\n')
            script_file_id.write(
                '    sed -i "s/us-east-1.ec2.archive.ubuntu.com/old-releases.ubuntu.com/g" /etc/apt/sources.list\n'
            )
            script_file_id.write('    RC=$?\n')
            script_file_id.write(
                '    if [ $RC -ne 0 ]; then manage_error sed $RC; fi\n')
            script_file_id.write(
                '    sed -i "s/security.ubuntu.com/old-releases.ubuntu\.com/g" /etc/apt/sources.list\n'
            )
            script_file_id.write('    RC=$?\n')
            script_file_id.write(
                '    if [ $RC -ne 0 ]; then manage_error sed $RC; fi\n')
            script_file_id.write('    apt-get update\n')
            script_file_id.write('    RC=$?\n')
            script_file_id.write(
                '    if [ $RC -ne 0 ]; then manage_error apt-get $RC; fi\n')
            script_file_id.write('    echo\n')
            script_file_id.write('    echo "The file is fixed."\n')
            script_file_id.write('}\n')
            script_file_id.write(
                '#-------------------------------------------------------------------------------\n'
            )
            script_file_id.write('function install_xorg\n')
            script_file_id.write('{\n')
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write(
                '    echo "Installing the package xorg ..."\n')
            script_file_id.write(
                '    sudo apt-get --assume-yes install xorg\n')
            script_file_id.write('    RC=$?\n')
            script_file_id.write(
                '    if [ $RC -ne 0 ]; then manage_error apt-get $RC; fi\n')
            script_file_id.write('    echo "The package is installed."\n')
            script_file_id.write('}\n')
            script_file_id.write(
                '#-------------------------------------------------------------------------------\n'
            )
            script_file_id.write('function install_libtbb2\n')
            script_file_id.write('{\n')
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write(
                '    echo "Installing the package libtbb2 ..."\n')
            script_file_id.write('    echo\n')
            script_file_id.write('    apt-get --assume-yes install libtbb2\n')
            script_file_id.write('    RC=$?\n')
            script_file_id.write(
                '    if [ $RC -ne 0 ]; then manage_error apt-get $RC; fi\n')
            script_file_id.write('    echo\n')
            script_file_id.write('    echo "The package is installed."\n')
            script_file_id.write('}\n')
            script_file_id.write(
                '#-------------------------------------------------------------------------------\n'
            )
            script_file_id.write('function install_libxt6\n')
            script_file_id.write('{\n')
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write(
                '    echo "Installing the package libxt6 ..."\n')
            script_file_id.write(
                '    sudo apt-get --assume-yes install libxt6\n')
            script_file_id.write('    RC=$?\n')
            script_file_id.write(
                '    if [ $RC -ne 0 ]; then manage_error apt-get $RC; fi\n')
            script_file_id.write('    echo "The package is installed."\n')
            script_file_id.write('}\n')
            script_file_id.write(
                '#-------------------------------------------------------------------------------\n'
            )
            script_file_id.write('function install_parallel\n')
            script_file_id.write('{\n')
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write(
                '    echo "Installing the package parallel ..."\n')
            script_file_id.write(
                '    sudo apt-get --assume-yes install parallel\n')
            script_file_id.write('    RC=$?\n')
            script_file_id.write(
                '    if [ $RC -ne 0 ]; then manage_error apt-get $RC; fi\n')
            script_file_id.write('    echo "The package is installed."\n')
            script_file_id.write('}\n')
            script_file_id.write(
                '#-------------------------------------------------------------------------------\n'
            )
            script_file_id.write('function install_texlive\n')
            script_file_id.write('{\n')
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write(
                '    echo "Installing the package texlive ..."\n')
            script_file_id.write(
                '    sudo apt-get --assume-yes install texlive-latex-base\n')
            script_file_id.write('    RC=$?\n')
            script_file_id.write(
                '    if [ $RC -ne 0 ]; then manage_error apt-get $RC; fi\n')
            script_file_id.write(
                '    sudo apt-get --assume-yes install texlive-fonts-recommended\n'
            )
            script_file_id.write('    RC=$?\n')
            script_file_id.write(
                '    if [ $RC -ne 0 ]; then manage_error apt-get $RC; fi\n')
            script_file_id.write(
                '    sudo apt-get --assume-yes install texlive-fonts-extra\n')
            script_file_id.write('    RC=$?\n')
            script_file_id.write(
                '    if [ $RC -ne 0 ]; then manage_error apt-get $RC; fi\n')
            script_file_id.write(
                '    sudo apt-get --assume-yes install texlive-latex-extra\n')
            script_file_id.write('    RC=$?\n')
            script_file_id.write(
                '    if [ $RC -ne 0 ]; then manage_error apt-get $RC; fi\n')
            script_file_id.write('    echo "The package is installed."\n')
            script_file_id.write('}\n')
            script_file_id.write(
                '#-------------------------------------------------------------------------------\n'
            )
            script_file_id.write('function uninstall_mysql\n')
            script_file_id.write('{\n')
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write('    echo "Uninstalling MySQL ..."\n')
            script_file_id.write(
                '    sudo apt-get purge --auto-remove --assume-yes mysql-client mysql-client-5.5 mysql-client-core-5.5 mysql-common mysql-server mysql-server-5.5 mysql-server-core-5.5\n'
            )
            script_file_id.write('    RC=$?\n')
            script_file_id.write(
                '    if [ $RC -ne 0 ]; then manage_error apt-get $RC; fi\n')
            script_file_id.write('    echo "MySQL is uninstalled."\n')
            script_file_id.write('}\n')
            script_file_id.write(
                '#-------------------------------------------------------------------------------\n'
            )
            script_file_id.write('function create_swapfile\n')
            script_file_id.write('{\n')
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write(
                '    echo "Creating a file which will be used for swap ..."\n')
            script_file_id.write(
                '    sudo dd if=/dev/zero of=/swapfile bs=1024 count=2097152\n'
            )
            script_file_id.write('    RC=$?\n')
            script_file_id.write(
                '    if [ $RC -ne 0 ]; then manage_error dd $RC; fi\n')
            script_file_id.write('    sudo chmod 600 /swapfile\n')
            script_file_id.write('    RC=$?\n')
            script_file_id.write(
                '    if [ $RC -ne 0 ]; then manage_error chmod $RC; fi\n')
            script_file_id.write('    sudo mkswap /swapfile\n')
            script_file_id.write('    RC=$?\n')
            script_file_id.write(
                '    if [ $RC -ne 0 ]; then manage_error mkswap $RC; fi\n')
            script_file_id.write('    sudo swapon /swapfile\n')
            script_file_id.write('    RC=$?\n')
            script_file_id.write(
                '    if [ $RC -ne 0 ]; then manage_error swapon $RC; fi\n')
            script_file_id.write(
                '    sudo echo "/swapfile swap swap defaults 0 0" >> /etc/fstab\n'
            )
            script_file_id.write('    RC=$?\n')
            script_file_id.write(
                '    if [ $RC -ne 0 ]; then manage_error echo $RC; fi\n')
            script_file_id.write('    echo\n')
            script_file_id.write('    echo "The file is created."\n')
            script_file_id.write('}\n')
            script_file_id.write(
                '#-------------------------------------------------------------------------------\n'
            )
            script_file_id.write('function end\n')
            script_file_id.write('{\n')
            script_file_id.write('    END_DATETIME=`date --utc +%s`\n')
            script_file_id.write(
                '    FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n'
            )
            script_file_id.write('    calculate_duration\n')
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write(
                '    echo "Script ended OK at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."\n'
            )
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write('    send_mail ok\n')
            script_file_id.write('    exit 0\n')
            script_file_id.write('}\n')
            script_file_id.write(
                '#-------------------------------------------------------------------------------\n'
            )
            script_file_id.write('function manage_error\n')
            script_file_id.write('{\n')
            script_file_id.write('    END_DATETIME=`date --utc +%s`\n')
            script_file_id.write(
                '    FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n'
            )
            script_file_id.write('    calculate_duration\n')
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write('    echo "ERROR: $1 returned error $2"\n')
            script_file_id.write(
                '    echo "Script ended WRONG at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."\n'
            )
            script_file_id.write('    echo "$SEP"\n')
            script_file_id.write('    send_mail wrong\n')
            script_file_id.write('    exit 3\n')
            script_file_id.write('}\n')
            script_file_id.write(
                '#-------------------------------------------------------------------------------\n'
            )
            process_name = 'Infrastructure software installation'
            mail_message_ok = xlib.get_mail_message_ok(process_name,
                                                       cluster_name)
            mail_message_wrong = xlib.get_mail_message_wrong(
                process_name, cluster_name)
            script_file_id.write('function send_mail\n')
            script_file_id.write('{\n')
            script_file_id.write(
                f'    SUBJECT="{xlib.get_project_name()}: {process_name}"\n')
            script_file_id.write('    if [ "$1" == "ok" ]; then\n')
            script_file_id.write(f'        MESSAGE="{mail_message_ok}"\n')
            script_file_id.write('    elif [ "$1" == "wrong" ]; then\n')
            script_file_id.write(f'        MESSAGE="{mail_message_wrong}"\n')
            script_file_id.write('    else\n')
            script_file_id.write('         MESSAGE=""\n')
            script_file_id.write('    fi\n')
            script_file_id.write(
                '    DESTINATION_FILE=mail-destination.json\n')
            script_file_id.write('    echo "{" > $DESTINATION_FILE\n')
            script_file_id.write(
                f'    echo "    \\\"ToAddresses\\\":  [\\\"{xconfiguration.get_contact_data()}\\\"]," >> $DESTINATION_FILE\n'
            )
            script_file_id.write(
                '    echo "    \\\"CcAddresses\\\":  []," >> $DESTINATION_FILE\n'
            )
            script_file_id.write(
                '    echo "    \\\"BccAddresses\\\":  []" >> $DESTINATION_FILE\n'
            )
            script_file_id.write('    echo "}" >> $DESTINATION_FILE\n')
            script_file_id.write('    MESSAGE_FILE=mail-message.json\n')
            script_file_id.write('    echo "{" > $MESSAGE_FILE\n')
            script_file_id.write(
                '    echo "    \\\"Subject\\\": {" >> $MESSAGE_FILE\n')
            script_file_id.write(
                '    echo "        \\\"Data\\\":  \\\"$SUBJECT\\\"," >> $MESSAGE_FILE\n'
            )
            script_file_id.write(
                '    echo "        \\\"Charset\\\":  \\\"UTF-8\\\"" >> $MESSAGE_FILE\n'
            )
            script_file_id.write('    echo "    }," >> $MESSAGE_FILE\n')
            script_file_id.write(
                '    echo "    \\\"Body\\\": {" >> $MESSAGE_FILE\n')
            script_file_id.write(
                '    echo "        \\\"Html\\\": {" >> $MESSAGE_FILE\n')
            script_file_id.write(
                '    echo "            \\\"Data\\\":  \\\"$MESSAGE\\\"," >> $MESSAGE_FILE\n'
            )
            script_file_id.write(
                '    echo "            \\\"Charset\\\":  \\\"UTF-8\\\"" >> $MESSAGE_FILE\n'
            )
            script_file_id.write('    echo "        }" >> $MESSAGE_FILE\n')
            script_file_id.write('    echo "    }" >> $MESSAGE_FILE\n')
            script_file_id.write('    echo "}" >> $MESSAGE_FILE\n')
            script_file_id.write(
                f'    aws ses send-email --from {xconfiguration.get_contact_data()} --destination file://$DESTINATION_FILE --message file://$MESSAGE_FILE\n'
            )
            script_file_id.write('}\n')
            script_file_id.write(
                '#-------------------------------------------------------------------------------\n'
            )
            script_file_id.write('function calculate_duration\n')
            script_file_id.write('{\n')
            script_file_id.write(
                '    DURATION=`expr $END_DATETIME - $INIT_DATETIME`\n')
            script_file_id.write('    HH=`expr $DURATION / 3600`\n')
            script_file_id.write('    MM=`expr $DURATION % 3600 / 60`\n')
            script_file_id.write('    SS=`expr $DURATION % 60`\n')
            script_file_id.write(
                '    FORMATTED_DURATION=`printf "%03d:%02d:%02d\\n" $HH $MM $SS`\n'
            )
            script_file_id.write('}\n')
            script_file_id.write(
                '#-------------------------------------------------------------------------------\n'
            )
            script_file_id.write('init\n')
            if dataset_structure in [
                    xconfiguration.get_dataset_structure_singlevolume(),
                    xconfiguration.get_dataset_structure_none()
            ]:
                script_file_id.write('create_dataset_structure\n')
            script_file_id.write('install_awscli\n')
            script_file_id.write('setup_aws\n')
            script_file_id.write('fix_source_list\n')
            script_file_id.write('install_xorg\n')
            script_file_id.write('install_libtbb2\n')
            script_file_id.write('install_libxt6\n')
            script_file_id.write('install_parallel\n')
            script_file_id.write('install_texlive\n')
            script_file_id.write('uninstall_mysql\n')
            # -- script_file_id.write( 'create_swapfile\n')
            script_file_id.write('end\n')
    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append(
            f'*** ERROR: The file {get_infrastructure_software_installation_script()} can not be created'
        )
        OK = False

    # return the control variable and the error list
    return (OK, error_list)
Esempio n. 19
0
def build_gmap_process_script(cluster_name, current_run_dir):
    '''
    Build the current GMAP process script.

    The options are read from the GMAP config file and a Bash script is written
    to the path returned by get_gmap_process_script(). The generated script
    builds the GMAP database of the reference file with gmap_build, maps the
    transcriptome file with gmap and mails the final state of the run.

    Parameters:
        cluster_name: cluster name embedded in the log and mail messages.
        current_run_dir: directory the generated script changes to before
            running gmap_build and gmap.

    Returns:
        A tuple (OK, error_list). OK is False when the script can not be built;
        error_list then holds the messages describing the failure.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # get the GMAP option dictionary
    gmap_option_dict = xlib.get_option_dict(get_gmap_config_file())

    # get the options
    identification_dict = gmap_option_dict['identification']
    parameters_dict = gmap_option_dict['GMAP parameters']
    experiment_id = identification_dict['experiment_id']
    reference_dataset_id = identification_dict['reference_dataset_id']
    reference_file = identification_dict['reference_file']
    assembly_software = identification_dict['assembly_software']
    assembly_dataset_id = identification_dict['assembly_dataset_id']
    assembly_type = identification_dict['assembly_type']
    threads = parameters_dict['threads']
    kmer = parameters_dict['kmer']
    sampling = parameters_dict['sampling']
    input_buffer_size = parameters_dict['input-buffer-size']
    output_buffer_size = parameters_dict['output-buffer-size']
    prunelevel = parameters_dict['prunelevel']
    # named "output_format" so that the builtin "format" is not shadowed
    output_format = parameters_dict['format']
    other_parameters = parameters_dict['other_parameters']

    # set the cluster reference dataset directory
    cluster_reference_dataset_dir = xlib.get_cluster_reference_dataset_dir(reference_dataset_id)

    # set the cluster reference file
    cluster_reference_file = xlib.get_cluster_reference_file(reference_dataset_id, reference_file)

    # set the GMAP database name
    reference_file_name = os.path.splitext(reference_file)[0]
    gmap_database = '{0}-gmap_database'.format(reference_file_name)

    # set the transcriptome file name according to the assembly software
    transcriptome_file_name = None
    if assembly_software == xlib.get_soapdenovotrans_code():
        if assembly_type.upper() == 'CONTIGS':
            transcriptome_file_name = '{0}-{1}.contig'.format(experiment_id, assembly_dataset_id)
        elif assembly_type.upper() == 'SCAFFOLDS':
            transcriptome_file_name = '{0}-{1}.scafSeq'.format(experiment_id, assembly_dataset_id)
    elif assembly_software == xlib.get_transabyss_code():
        transcriptome_file_name = 'transabyss-final.fa'
    elif assembly_software == xlib.get_trinity_code():
        transcriptome_file_name = 'Trinity.fasta'
    elif assembly_software == xlib.get_star_code():
        transcriptome_file_name = 'Trinity-GG.fasta'
    elif assembly_software == xlib.get_cd_hit_est_code():
        transcriptome_file_name = 'clustered-transcriptome.fasta'
    elif assembly_software == xlib.get_transcript_filter_code():
        transcriptome_file_name = 'filtered-transcriptome.fasta'

    # an unknown assembly software/type is reported explicitly instead of
    # surfacing later as an unbound variable while the script is being written
    if transcriptome_file_name is None:
        error_list.append('*** ERROR: The transcriptome file can not be determined for the assembly software "{0}" and the assembly type "{1}".'.format(assembly_software, assembly_type))
        return (False, error_list)

    # set the transcriptome file path
    transcriptome_file = '{0}/{1}'.format(xlib.get_cluster_experiment_result_dataset_dir(experiment_id, assembly_dataset_id), transcriptome_file_name)

    # set the output file path
    output_file = 'gmap_output_{0}.txt'.format(output_format.lower())

    # get the GMAP process script name
    gmap_process_script = get_gmap_process_script()

    # set the texts repeated along the script
    separator = '#-------------------------------------------------------------------------------'
    time_format = '        --format="$SEP\\nElapsed real time (s): %e\\nCPU time in kernel mode (s): %S\\nCPU time in user mode (s): %U\\nPercentage of CPU: %P\\nMaximum resident set size(Kb): %M\\nAverage total memory use (Kb):%K" \\'
    message_template = '    MESSAGE="The {0} process in node $HOSTNAME of cluster {1} ended {2} at $FORMATTED_END_DATETIME with a run duration of $DURATION s ($FORMATTED_DURATION). Please review its log.<br/><br/>Regards,<br/>GI Genetica, Fisiologia e Historia Forestal<br/>Dpto. Sistemas y Recursos Naturales<br/>ETSI Montes, Forestal y del Medio Natural<br/>Universidad Politecnica de Madrid<br/>https://github.com/ggfhf/"'
    mail_command = '    mail --append "Content-type: text/html;"  --subject "$SUBJECT" "$RECIPIENT" <<< "$MESSAGE"'

    # write the GMAP process script
    try:

        # the whole script is composed in memory before the file is opened, so
        # a failure while composing it does not leave a truncated script
        script_lines = []

        # header and environment
        script_lines.extend([
            '#!/bin/bash',
            separator,
            'GMAP_GSNAP_PATH={0}/{1}/envs/{2}/bin'.format(xlib.get_cluster_app_dir(), xlib.get_miniconda3_name(), xlib.get_gmap_gsnap_bioconda_code()),
            'PATH=$GMAP_GSNAP_PATH:$PATH',
            'SEP="#########################################"',
            'cd {0}/{1}/bin'.format(xlib.get_cluster_app_dir(), xlib.get_miniconda3_name()),
            'source activate {0}'.format(xlib.get_gmap_gsnap_bioconda_code()),
            separator,
        ])

        # function "init"
        script_lines.extend([
            'function init',
            '{',
            '    INIT_DATETIME=`date --utc +%s`',
            '    FORMATTED_INIT_DATETIME=`date --date="@$INIT_DATETIME" "+%Y-%m-%d %H:%M:%S"`',
            '    echo "$SEP"',
            '    echo "Script started in node $HOSTNAME of cluster {0} at $FORMATTED_INIT_DATETIME UTC."'.format(cluster_name),
            '}',
            separator,
        ])

        # function "build_gmap_database"
        script_lines.extend([
            'function build_gmap_database',
            '{',
            '    cd {0}'.format(current_run_dir),
            '    echo "$SEP"',
            '    /usr/bin/time \\',
            time_format,
            '        gmap_build \\',
            '            --dir={0} \\'.format(cluster_reference_dataset_dir),
            '            --db={0} \\'.format(gmap_database),
        ])
        if kmer.upper() != 'NONE':
            script_lines.append('            --kmer={0} \\'.format(kmer))
        script_lines.extend([
            '            {0}'.format(cluster_reference_file),
            # stop the run when the database can not be built, as it is done
            # for gmap in the function "run_gmap_process"
            '    RC=$?',
            '    if [ $RC -ne 0 ]; then manage_error gmap_build $RC; fi',
            '}',
            separator,
        ])

        # function "run_gmap_process"
        script_lines.extend([
            'function run_gmap_process',
            '{',
            '    cd {0}'.format(current_run_dir),
            '    echo "$SEP"',
            '    gmap --version',
            '    echo "$SEP"',
            '    /usr/bin/time \\',
            time_format,
            '        gmap \\',
            '            --nthreads={0} \\'.format(threads),
            '            --dir={0} \\'.format(cluster_reference_dataset_dir),
            '            --db={0} \\'.format(gmap_database),
        ])
        if kmer.upper() != 'NONE':
            script_lines.append('            --kmer={0} \\'.format(kmer))
        if sampling.upper() != 'NONE':
            script_lines.append('            --sampling={0} \\'.format(sampling))
        script_lines.extend([
            '            --input-buffer-size={0} \\'.format(input_buffer_size),
            '            --output-buffer-size={0} \\'.format(output_buffer_size),
            '            --prunelevel={0} \\'.format(prunelevel),
        ])
        # these three formats have their own flag; any other value is passed
        # in the option "--format"
        flag_dict = {'COMPRESS': '--compress', 'SUMMARY': '--summary', 'ALIGN': '--align'}
        format_option = flag_dict.get(output_format.upper(), '--format={0}'.format(output_format.lower()))
        script_lines.extend([
            '            {0} \\'.format(format_option),
            '            --ordered \\',
            '            --nofails \\',
        ])
        if other_parameters.upper() != 'NONE':
            for parameter in other_parameters.split(';'):
                parameter = parameter.strip()
                # "--name=value" when there is a "=" after the first character; "--name" otherwise
                pattern = r'^--(.+)=(.+)$' if parameter.find('=') > 0 else r'^--(.+)$'
                mo = re.search(pattern, parameter)
                if mo is None:
                    raise ValueError('the value "{0}" of "other_parameters" is not a valid parameter'.format(parameter))
                script_lines.append('            --{0} \\'.format('='.join(x.strip() for x in mo.groups())))
        script_lines.extend([
            '            {0} \\'.format(transcriptome_file),
            '            > {0}'.format(output_file),
            '    RC=$?',
            '    if [ $RC -ne 0 ]; then manage_error gmap $RC; fi',
            '}',
            separator,
        ])

        # mail data shared by the functions "end" and "manage_error"
        recipient_line = '    RECIPIENT={0}'.format(xconfiguration.get_contact_data())
        subject_line = '    SUBJECT="{0}: {1} process"'.format(xlib.get_project_name(), xlib.get_gmap_name())

        # function "end"
        script_lines.extend([
            'function end',
            '{',
            '    END_DATETIME=`date --utc +%s`',
            '    FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`',
            '    calculate_duration',
            '    echo "$SEP"',
            '    echo "Script ended OK at $FORMATTED_END_DATETIME UTC with a run duration of $DURATION s ($FORMATTED_DURATION)."',
            '    echo "$SEP"',
            recipient_line,
            subject_line,
            message_template.format(xlib.get_gmap_name(), cluster_name, 'OK'),
            mail_command,
            '    exit 0',
            '}',
            separator,
        ])

        # function "manage_error"
        script_lines.extend([
            'function manage_error',
            '{',
            '    END_DATETIME=`date --utc +%s`',
            '    FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`',
            '    calculate_duration',
            '    echo "$SEP"',
            '    echo "ERROR: $1 returned error $2"',
            '    echo "Script ended WRONG at $FORMATTED_END_DATETIME UTC with a run duration of $DURATION s ($FORMATTED_DURATION)."',
            '    echo "$SEP"',
            recipient_line,
            subject_line,
            message_template.format(xlib.get_gmap_name(), cluster_name, 'WRONG'),
            mail_command,
            '    exit 3',
            '}',
            separator,
        ])

        # function "calculate_duration"
        script_lines.extend([
            'function calculate_duration',
            '{',
            '    DURATION=`expr $END_DATETIME - $INIT_DATETIME`',
            '    HH=`expr $DURATION / 3600`',
            '    MM=`expr $DURATION % 3600 / 60`',
            '    SS=`expr $DURATION % 60`',
            '    FORMATTED_DURATION=`printf "%03d:%02d:%02d\\n" $HH $MM $SS`',
            '}',
            separator,
        ])

        # main sequence
        script_lines.extend([
            'init',
            'build_gmap_database',
            'run_gmap_process',
            'end',
        ])

        # create the script directory if necessary and write the script
        os.makedirs(os.path.dirname(gmap_process_script), exist_ok=True)
        with open(gmap_process_script, mode='w', encoding='utf8', newline='\n') as file_id:
            file_id.write('\n'.join(script_lines) + '\n')

    except Exception as e:
        # the exception text is reported so that the real cause is not hidden
        error_list.append('*** EXCEPTION: "{0}".'.format(e))
        error_list.append('*** ERROR: The file {0} can not be created'.format(gmap_process_script))
        OK = False

    # return the control variable and the error list
    return (OK, error_list)
Esempio n. 20
0
def _is_integer_not_lower_than(value, minimum=None):
    '''
    Check that a config value is an integer and, optionally, is big enough.

    Parameters:
        value: the value read from the config file.
        minimum: the lowest accepted integer, or None to accept any integer.

    Returns:
        True if int(value) succeeds and the result is not lower than minimum
        (when minimum is given); False otherwise.
    '''

    try:
        number = int(value)
    except (ValueError, TypeError):
        return False
    return minimum is None or number >= minimum


def validate_cd_hit_est_config_file(strict):
    '''
    Validate the CD-HIT-EST config file of a run.

    The option dictionary is built from the file returned by
    get_cd_hit_est_config_file(); then the keys of the sections
    "identification" and "CD-HIT-EST parameters" are checked one by one and
    a message is added to the error list for every problem found.

    Parameters:
        strict: not used by this function.

    Returns:
        A tuple (OK, error_list): OK is True when no error has been found and
        error_list holds the error messages (it is empty when OK is True).
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # initialize the variable used when a value is not found
    not_found = '***NOTFOUND***'.upper()

    # get the option dictionary
    try:
        cd_hit_est_option_dict = xlib.get_option_dict(
            get_cd_hit_est_config_file())
    except Exception:
        error_list.append('*** ERROR: The syntax is WRONG.')
        OK = False
    else:

        # get the sorted sections list
        sections_list = sorted(cd_hit_est_option_dict.keys())

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append(
                '*** ERROR: the section "identification" is not found.')
            OK = False
        else:
            identification = cd_hit_est_option_dict.get('identification', {})

            # codes of the accepted assembly software: SOAPdenovo-Trans is the
            # only one that has several assembly types
            soapdenovotrans_code = xlib.get_soapdenovotrans_code()
            single_type_codes = (
                xlib.get_transabyss_code(),
                xlib.get_trinity_code(),
                xlib.get_star_code(),
                xlib.get_cd_hit_est_code(),
                xlib.get_transcript_filter_code(),
            )
            allowed_codes = (soapdenovotrans_code,) + single_type_codes

            # check section "identification" - key "experiment_id"
            experiment_id = identification.get('experiment_id', not_found)
            if experiment_id == not_found:
                error_list.append(
                    '*** ERROR: the key "experiment_id" is not found in the section "identification".'
                )
                OK = False

            # check section "identification" - key "assembly_software"
            assembly_software = identification.get('assembly_software',
                                                   not_found)
            if assembly_software == not_found:
                error_list.append(
                    '*** ERROR: the key "assembly_software" is not found in the section "identification".'
                )
                OK = False
            elif assembly_software not in allowed_codes:
                error_list.append(
                    '*** ERROR: the key "assembly_software" value in the section "identification" must be {0} or {1} or {2} or {3} or {4} OR {5}.'
                    .format(*allowed_codes))
                OK = False

            # check section "identification" - key "assembly_dataset_id"
            assembly_dataset_id = identification.get('assembly_dataset_id',
                                                     not_found)
            if assembly_dataset_id == not_found:
                error_list.append(
                    '*** ERROR: the key "assembly_dataset_id" is not found in the section "identification".'
                )
                OK = False
            elif not assembly_dataset_id.startswith(allowed_codes):
                # every software is shown by its name in this message
                error_list.append(
                    '*** ERROR: the key "assembly_dataset_id" value is not a {0} nor {1} nor {2} nor {3} nor {4} nor {5} assembly.'
                    .format(xlib.get_soapdenovotrans_name(),
                            xlib.get_transabyss_name(),
                            xlib.get_trinity_name(), xlib.get_star_name(),
                            xlib.get_cd_hit_est_name(),
                            xlib.get_transcript_filter_name()))
                OK = False

            # check section "identification" - key "assembly_type"
            # (the accepted values depend on the prefix of the assembly
            # dataset identification)
            assembly_type = identification.get('assembly_type', not_found)
            if assembly_type == not_found:
                error_list.append(
                    '*** ERROR: the key "assembly_type" is not found in the section "identification".'
                )
                OK = False
            elif assembly_dataset_id.startswith(soapdenovotrans_code):
                if assembly_type.upper() not in ['CONTIGS', 'SCAFFOLDS']:
                    error_list.append(
                        '*** ERROR: the key "assembly_type" must be "CONTIGS" or "SCAFFOLDS" when {0} is the assembly software.'
                        .format(xlib.get_soapdenovotrans_name()))
                    OK = False
            elif assembly_dataset_id.startswith(single_type_codes):
                if assembly_type.upper() != 'NONE':
                    error_list.append(
                        '*** ERROR: the key "assembly_type" must be "NONE" when {0} or {1} or {2} or {3} or {4} is the assembly software.'
                        .format(xlib.get_transabyss_name(),
                                xlib.get_trinity_name(), xlib.get_star_name(),
                                xlib.get_cd_hit_est_name(),
                                xlib.get_transcript_filter_name()))
                    OK = False

        # check section "CD-HIT-EST parameters"
        if 'CD-HIT-EST parameters' not in sections_list:
            error_list.append(
                '*** ERROR: the section "CD-HIT-EST parameters" is not found.')
            OK = False
        else:
            parameters = cd_hit_est_option_dict.get('CD-HIT-EST parameters',
                                                    {})

            # check section "CD-HIT-EST parameters" - key "threads"
            threads = parameters.get('threads', not_found)
            if threads == not_found:
                error_list.append(
                    '*** ERROR: the key "threads" is not found in the section "CD-HIT-EST parameters".'
                )
                OK = False
            elif not _is_integer_not_lower_than(threads, 0):
                error_list.append(
                    '*** ERROR: the key "threads" in the section "CD-HIT-EST parameters" must be an integer value greater or equal to 0.'
                )
                OK = False

            # check section "CD-HIT-EST parameters" - key "memory_limit"
            memory_limit = parameters.get('memory_limit', not_found)
            if memory_limit == not_found:
                error_list.append(
                    '*** ERROR: the key "memory_limit" is not found in the section "CD-HIT-EST parameters".'
                )
                OK = False
            elif not _is_integer_not_lower_than(memory_limit, 0):
                error_list.append(
                    '*** ERROR: the key "memory_limit" in the section "CD-HIT-EST parameters" must be an integer value greater or equal to 0.'
                )
                OK = False

            # check section "CD-HIT-EST parameters" - key "seq_identity_threshold"
            seq_identity_threshold = parameters.get('seq_identity_threshold',
                                                    not_found)
            if seq_identity_threshold == not_found:
                error_list.append(
                    '*** ERROR: the key "seq_identity_threshold" is not found in the section "CD-HIT-EST parameters".'
                )
                OK = False
            else:
                try:
                    threshold = float(seq_identity_threshold)
                except (ValueError, TypeError):
                    threshold = None
                # the chained comparison also rejects NaN
                if threshold is None or not 0.0 <= threshold <= 1.0:
                    error_list.append(
                        '*** ERROR: the key "seq_identity_threshold" in the section "CD-HIT-EST parameters" must be a float value between 0.0 and 1.0.'
                    )
                    OK = False

            # check section "CD-HIT-EST parameters" - key "word_length"
            word_length = parameters.get('word_length', not_found)
            if word_length == not_found:
                error_list.append(
                    '*** ERROR: the key "word_length" is not found in the section "CD-HIT-EST parameters".'
                )
                OK = False
            elif not _is_integer_not_lower_than(word_length, 1):
                error_list.append(
                    '*** ERROR: the key "word_length" in the section "CD-HIT-EST parameters" must be an integer value greater or equal to 1.'
                )
                OK = False

            # check section "CD-HIT-EST parameters" - key "mask"
            # (only its presence is checked)
            mask = parameters.get('mask', not_found).upper()
            if mask == not_found:
                error_list.append(
                    '*** ERROR: the key "mask" is not found in the section "CD-HIT-EST parameters".'
                )
                OK = False

            # check section "CD-HIT-EST parameters" - key "match"
            match = parameters.get('match', not_found)
            if match == not_found:
                error_list.append(
                    '*** ERROR: the key "match" is not found in the section "CD-HIT-EST parameters".'
                )
                OK = False
            elif not _is_integer_not_lower_than(match):
                error_list.append(
                    '*** ERROR: the key "match" in the section "CD-HIT-EST parameters" must be an integer value.'
                )
                OK = False

            # check section "CD-HIT-EST parameters" - key "mismatch"
            mismatch = parameters.get('mismatch', not_found)
            if mismatch == not_found:
                error_list.append(
                    '*** ERROR: the key "mismatch" is not found in the section "CD-HIT-EST parameters".'
                )
                OK = False
            elif not _is_integer_not_lower_than(mismatch):
                error_list.append(
                    '*** ERROR: the key "mismatch" in the section "CD-HIT-EST parameters" must be an integer value.'
                )
                OK = False

            # check section "CD-HIT-EST parameters" - key "other_parameters"
            # (a list of "--name" or "--name=value" items separated by ";"
            # that can not hold the parameters that have their own key)
            not_allowed_parameters_list = [
                'T', 'M', 'c', 'n', 'mask', 'match', 'mismatch'
            ]
            other_parameters = parameters.get('other_parameters', not_found)
            if other_parameters == not_found:
                error_list.append(
                    '*** ERROR: the key "other_parameters" is not found in the section "CD-HIT-EST parameters".'
                )
                OK = False
            elif other_parameters.upper() != 'NONE':
                for parameter in other_parameters.split(';'):
                    parameter = parameter.strip()
                    if parameter.find('=') > 0:
                        pattern = r'^--(.+)=(.+)$'
                    else:
                        pattern = r'^--(.+)$'
                    mo = re.search(pattern, parameter)
                    if mo is None:
                        # the first malformed item ends the check of the list
                        error_list.append(
                            '*** ERROR: the value of the key "other_parameters" in the section "CD-HIT-EST parameters" must be NONE or a valid parameter list.'
                        )
                        OK = False
                        break
                    parameter_name = mo.group(1).strip()
                    if parameter_name in not_allowed_parameters_list:
                        error_list.append(
                            '*** ERROR: the parameter {0} is not allowed in the key "other_parameters" of the section "CD-HIT-EST parameters" because it is controled by {1}.'
                            .format(parameter_name,
                                    xlib.get_project_name()))
                        OK = False

    # warn that the results config file is not valid if there are any errors
    if not OK:
        error_list.append(
            '\nThe {0} config file is not valid. Please, correct this file or recreate it.'
            .format(xlib.get_cd_hit_est_name()))

    # return the control variable and the error list
    return (OK, error_list)
Esempio n. 21
0
def check_database_transfer_config_file(strict):
    '''
    Check the database transfer config file.

    The option dictionary is built from the file returned by
    get_database_transfer_config_file(); then the section "identification"
    and every section "file-n" are checked and a message is added to the
    error list for every problem found.

    Parameters:
        strict: not used by this function.

    Returns:
        A tuple (OK, error_list): OK is True when no error has been found and
        error_list holds the error messages (it is empty when OK is True).
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # initialize the variable used when a value is not found
    not_found = '***NOTFOUND***'.upper()

    # get the option dictionary
    try:
        database_transfer_options_dict = xlib.get_option_dict(
            get_database_transfer_config_file())
    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append(
            '*** ERROR: The option dictionary could not be built from the config file'
        )
        OK = False
    else:

        # get the sorted sections list
        sections_list = sorted(database_transfer_options_dict.keys())

        # the local directory is also needed by the checks of the sections
        # "file-n", so it has to be defined even if the section
        # "identification" is missing
        local_dir = not_found

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append(
                '*** ERROR: the section "identification" is not found.')
            OK = False
        else:
            identification = database_transfer_options_dict.get(
                'identification', {})

            # check section "identification" - key "database_dataset_id"
            database_dataset_id = identification.get('database_dataset_id',
                                                     not_found)
            if database_dataset_id == not_found:
                error_list.append(
                    '*** ERROR: the key "database_dataset_id" is not found in the section "identification".'
                )
                OK = False

            # check section "identification" - key "local_dir"
            local_dir = identification.get('local_dir', not_found)
            if local_dir == not_found:
                error_list.append(
                    '*** ERROR: the key "local_dir" is not found in the section "identification".'
                )
                OK = False
            elif not os.path.isdir(local_dir):
                error_list.append(
                    '*** ERROR: {0} is not a directory or does not exist.'.
                    format(local_dir))
                OK = False

        # check section "file-1"
        if 'file-1' not in sections_list:
            error_list.append('*** ERROR: the section "file-1" is not found.')
            OK = False

        # check all sections "file-n"
        for section in sections_list:

            if section == 'identification':
                continue

            # check that the section identification is like file-n
            if not re.match('^file-[0-9]+$', section):
                error_list.append(
                    f'*** ERROR: the section "{section}" has a wrong identification.'
                )
                OK = False
                continue

            # check section "file-n" - key "file_name"
            file_name = database_transfer_options_dict.get(
                section, {}).get('file_name', not_found)
            if file_name == not_found:
                error_list.append(
                    '*** ERROR: the key "file_name" is not found in the section "{0}".'
                    .format(section))
                OK = False
            elif local_dir == not_found:
                # the missing local directory has already been reported and
                # the file can not be looked for without it
                continue
            elif not os.path.isfile(os.path.join(local_dir, file_name)):
                error_list.append(
                    '*** ERROR: the file {0} in the key "file_name" does not exist or is not accessible in the local directory {1}.'
                    .format(file_name, local_dir))
                OK = False

    # warn that the database config file is not valid if there are any errors
    if not OK:
        error_list.append(
            '\nThe database transfer config file is not valid. Please, correct this file or recreate it.'
        )

    # return the control variable and the error list
    return (OK, error_list)
Esempio n. 22
0
def build_cd_hit_est_process_script(cluster_name, current_run_dir):
    '''
    Build the current CD-HIT-EST process script.

    The options are read from the file returned by
    get_cd_hit_est_config_file() and a Bash script that runs cd-hit-est with
    them is written in the file returned by get_cd_hit_est_process_script().

    Parameters:
        cluster_name: name of the cluster; it is only written in the messages
            of the script.
        current_run_dir: directory where the script changes to before running
            cd-hit-est and where the output file is written.

    Returns:
        A tuple (OK, error_list): OK is True when the script has been written
        and error_list holds the error messages (it is empty when OK is True).
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # get the option dictionary
    cd_hit_est_option_dict = xlib.get_option_dict(get_cd_hit_est_config_file())

    # get the options
    identification = cd_hit_est_option_dict['identification']
    parameters = cd_hit_est_option_dict['CD-HIT-EST parameters']
    experiment_id = identification['experiment_id']
    assembly_software = identification['assembly_software']
    assembly_dataset_id = identification['assembly_dataset_id']
    assembly_type = identification['assembly_type']
    threads = parameters['threads']
    memory_limit = parameters['memory_limit']
    seq_identity_threshold = parameters['seq_identity_threshold']
    word_length = parameters['word_length']
    mask = parameters['mask']
    match = parameters['match']
    mismatch = parameters['mismatch']
    other_parameters = parameters['other_parameters']

    # set the transcriptome file name according to the assembly software
    # (the assembly type is compared in upper case because the validation of
    # the config file accepts it in any case)
    transcriptome_file_name = None
    if assembly_software == xlib.get_soapdenovotrans_code():
        if assembly_type.upper() == 'CONTIGS':
            transcriptome_file_name = '{0}-{1}.contig'.format(
                experiment_id, assembly_dataset_id)
        elif assembly_type.upper() == 'SCAFFOLDS':
            transcriptome_file_name = '{0}-{1}.scafSeq'.format(
                experiment_id, assembly_dataset_id)
    elif assembly_software == xlib.get_transabyss_code():
        transcriptome_file_name = 'transabyss-final.fa'
    elif assembly_software == xlib.get_trinity_code():
        transcriptome_file_name = 'Trinity.fasta'
    elif assembly_software == xlib.get_star_code():
        transcriptome_file_name = 'Trinity-GG.fasta'
    elif assembly_software == xlib.get_cd_hit_est_code():
        transcriptome_file_name = 'clustered-transcriptome.fasta'
    elif assembly_software == xlib.get_transcript_filter_code():
        transcriptome_file_name = 'filtered-transcriptome.fasta'

    # do not write a script without input file
    if transcriptome_file_name is None:
        error_list.append(
            '*** ERROR: The transcriptome file can not be determined from the assembly software "{0}" and the assembly type "{1}".'
            .format(assembly_software, assembly_type))
        return (False, error_list)

    # set the transcriptome file path
    transcriptome_file = '{0}/{1}'.format(
        xlib.get_cluster_experiment_result_dataset_dir(
            experiment_id, assembly_dataset_id), transcriptome_file_name)

    # set the output file path
    output_file = '{0}/clustered-transcriptome.fasta'.format(current_run_dir)

    # get the path of the process script
    process_script = get_cd_hit_est_process_script()

    # write the CD-HIT-EST process script
    try:

        # lines used several times in the script
        separator = '#-------------------------------------------------------------------------------'
        echo_sep = '    echo "$SEP"'
        end_datetime_lines = [
            '    END_DATETIME=`date --utc +%s`',
            '    FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`',
            '    calculate_duration',
        ]
        recipient_line = '    RECIPIENT={0}'.format(
            xconfiguration.get_contact_data())
        subject_line = '    SUBJECT="{0}: {1} process"'.format(
            xlib.get_project_name(), xlib.get_cd_hit_est_name())
        # {0} is the process name, {1} the cluster name and {2} the final
        # status of the process
        message_template = '    MESSAGE="The {0} process in node $HOSTNAME of cluster {1} ended {2} at $FORMATTED_END_DATETIME with a run duration of $DURATION s ($FORMATTED_DURATION). Please review its log.<br/><br/>Regards,<br/>GI Genetica, Fisiologia e Historia Forestal<br/>Dpto. Sistemas y Recursos Naturales<br/>ETSI Montes, Forestal y del Medio Natural<br/>Universidad Politecnica de Madrid<br/>https://github.com/ggfhf/"'
        mail_line = '    mail --append "Content-type: text/html;"  --subject "$SUBJECT" "$RECIPIENT" <<< "$MESSAGE"'

        # build the options of the cd-hit-est command
        command_options = [
            '-T {0}'.format(threads),
            '-M {0}'.format(memory_limit),
            '-i {0}'.format(transcriptome_file),
            '-c {0}'.format(seq_identity_threshold),
            '-n {0}'.format(word_length),
            '-mask {0}'.format(mask),
            '-match {0}'.format(match),
            '-mismatch {0}'.format(mismatch),
            '-o {0}'.format(output_file),
        ]
        if other_parameters.upper() != 'NONE':
            # every item is like "--name=value" or "--name"; a malformed item
            # makes the regular expression fail and the build is aborted
            for parameter in other_parameters.split(';'):
                parameter = parameter.strip()
                if parameter.find('=') > 0:
                    mo = re.search(r'^--(.+)=(.+)$', parameter)
                    command_options.append('-{0} {1}'.format(
                        mo.group(1).strip(), mo.group(2).strip()))
                else:
                    mo = re.search(r'^--(.+)$', parameter)
                    command_options.append('-{0}'.format(
                        mo.group(1).strip()))

        # every option line but the last one ends with a line continuation;
        # a backslash after the last option would join the following line of
        # the script to the command
        option_indent = ' ' * 12
        command_lines = [
            '{0}{1} \\'.format(option_indent, option)
            for option in command_options[:-1]
        ]
        command_lines.append('{0}{1}'.format(option_indent,
                                             command_options[-1]))

        # build the lines of the script
        script_lines = [
            '#!/bin/bash',
            separator,
            'CDHIT_PATH={0}/{1}/envs/{2}/bin'.format(
                xlib.get_cluster_app_dir(), xlib.get_miniconda3_name(),
                xlib.get_cd_hit_bioconda_code()),
            'PATH=$CDHIT_PATH:$PATH',
            'SEP="#########################################"',
            'cd {0}/{1}/bin'.format(xlib.get_cluster_app_dir(),
                                    xlib.get_miniconda3_name()),
            'source activate {0}'.format(xlib.get_cd_hit_bioconda_code()),
            separator,
            'function init',
            '{',
            '    INIT_DATETIME=`date --utc +%s`',
            '    FORMATTED_INIT_DATETIME=`date --date="@$INIT_DATETIME" "+%Y-%m-%d %H:%M:%S"`',
            echo_sep,
            '    echo "Script started in node $HOSTNAME of cluster {0} at $FORMATTED_INIT_DATETIME UTC."'
            .format(cluster_name),
            '}',
            separator,
            'function run_cd_hit_est_process',
            '{',
            '    cd {0}'.format(current_run_dir),
            echo_sep,
            '    echo "Running {0} process ..."'.format(
                xlib.get_cd_hit_est_name()),
            '    /usr/bin/time \\',
            '        --format="$SEP\\nElapsed real time (s): %e\\nCPU time in kernel mode (s): %S\\nCPU time in user mode (s): %U\\nPercentage of CPU: %P\\nMaximum resident set size(Kb): %M\\nAverage total memory use (Kb):%K" \\',
            '        cd-hit-est \\',
        ]
        script_lines.extend(command_lines)
        script_lines.extend([
            '    RC=$?',
            '    if [ $RC -ne 0 ]; then manage_error cd-hit-est $RC; fi',
            '}',
            separator,
            'function end',
            '{',
        ])
        script_lines.extend(end_datetime_lines)
        script_lines.extend([
            echo_sep,
            '    echo "Script ended OK at $FORMATTED_END_DATETIME UTC with a run duration of $DURATION s ($FORMATTED_DURATION)."',
            echo_sep,
            recipient_line,
            subject_line,
            message_template.format(xlib.get_cd_hit_est_name(), cluster_name,
                                    'OK'),
            mail_line,
            '    exit 0',
            '}',
            separator,
            'function manage_error',
            '{',
        ])
        script_lines.extend(end_datetime_lines)
        script_lines.extend([
            echo_sep,
            '    echo "ERROR: $1 returned error $2"',
            '    echo "Script ended WRONG at $FORMATTED_END_DATETIME UTC with a run duration of $DURATION s ($FORMATTED_DURATION)."',
            echo_sep,
            recipient_line,
            subject_line,
            message_template.format(xlib.get_cd_hit_est_name(), cluster_name,
                                    'WRONG'),
            mail_line,
            '    exit 3',
            '}',
            separator,
            'function calculate_duration',
            '{',
            '    DURATION=`expr $END_DATETIME - $INIT_DATETIME`',
            '    HH=`expr $DURATION / 3600`',
            '    MM=`expr $DURATION % 3600 / 60`',
            '    SS=`expr $DURATION % 60`',
            '    FORMATTED_DURATION=`printf "%03d:%02d:%02d\\n" $HH $MM $SS`',
            '}',
            separator,
            'init',
            'run_cd_hit_est_process',
            'end',
        ])

        # write the lines once all of them have been built, so a failure in
        # the build does not leave a truncated script
        if not os.path.exists(os.path.dirname(process_script)):
            os.makedirs(os.path.dirname(process_script))
        with open(process_script, mode='w', encoding='utf8',
                  newline='\n') as file_id:
            file_id.writelines('{0}\n'.format(line) for line in script_lines)

    except Exception:
        error_list.append('*** ERROR: The file {0} can not be created'.format(
            process_script))
        OK = False

    # return the control variable and the error list
    return (OK, error_list)
Esempio n. 23
0
def check_gzip_config_file(dataset_type, strict):
    '''
    Check the gzip config file of a run.

    Parameters:
        dataset_type: dataset type the config file is expected to describe; the
            checks below distinguish 'reference', 'read' and 'result'.
        strict: accepted for interface compatibility; it is not used here.

    Returns:
        A tuple (OK, error_list). OK is False as soon as one check fails and
        error_list holds one message per detected problem, followed by a final
        summary message when the file is not valid.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # initialize the variable used when a value is not found
    not_found = '***NOTFOUND***'.upper()

    # the dataset type declared in the config file drives the "file-n" checks;
    # it is bound up front so that those checks do not fail with an unbound
    # name when the section "identification" is missing
    dataset_type_2 = not_found

    # get the option dictionary
    try:
        gzip_option_dict = xlib.get_option_dict(get_gzip_config_file(dataset_type))
    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append('*** ERROR: The option dictionary could not be built from the config file')
        OK = False
    else:

        # get the sorted sections list
        sections_list = sorted(gzip_option_dict.keys())

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append('*** ERROR: the section "identification" is not found.')
            OK = False
        else:

            identification = gzip_option_dict.get('identification', {})

            # check section "identification" - key "experiment_id"
            experiment_id = identification.get('experiment_id', not_found)
            if experiment_id == not_found:
                error_list.append('*** ERROR: the key "experiment_id" is not found in the section "identification".')
                OK = False
            elif dataset_type == 'reference' and experiment_id.upper() != 'NONE':
                error_list.append('*** ERROR: the key "experiment_id" has to be always NONE')
                OK = False

            # check section "identification" - key "dataset_type"
            dataset_type_2 = identification.get('dataset_type', not_found)
            if dataset_type_2 == not_found:
                error_list.append('*** ERROR: the key "dataset_type" is not found in the section "identification".')
                OK = False
            elif dataset_type in ['reference', 'read']:
                # the config file has to declare exactly the expected type
                if dataset_type_2.lower() != dataset_type:
                    error_list.append('*** ERROR: the key "dataset_type" has to be {0}.'.format(dataset_type))
                    OK = False
            elif dataset_type == 'result':
                if dataset_type_2.lower() not in ['result', 'whole-result']:
                    error_list.append('*** ERROR: the key "dataset_type" has to be result or whole-result.')
                    OK = False

            # check section "identification" - key "dataset_id"
            dataset_id = identification.get('dataset_id', not_found)
            if dataset_id == not_found:
                error_list.append('*** ERROR: the key "dataset_id" is not found in the section "identification".')
                OK = False

        # check section "gzip parameters"
        if 'gzip parameters' not in sections_list:
            error_list.append('*** ERROR: the section "gzip parameters" is not found.')
            OK = False
        else:

            # check section "gzip parameters" - key "action"
            action = gzip_option_dict.get('gzip parameters', {}).get('action', not_found)
            if action == not_found:
                error_list.append('*** ERROR: the key "action" is not found in the section "gzip parameters".')
                OK = False
            elif action.lower() not in ['compress', 'decompress']:
                error_list.append('*** ERROR: the key "action" has to be compress or decompress.')
                OK = False

        # the sections "file-n" are only checked for the dataset types listed
        # here (a missing or unknown declared type skips these checks)
        if dataset_type_2.lower() in ['reference', 'database', 'read', 'result']:

            # check section "file-1"
            if 'file-1' not in sections_list:
                error_list.append('*** ERROR: the section "file-1" is not found.')
                OK = False

            # check all sections "file-n"
            for section in sections_list:

                if section in ['identification', 'gzip parameters']:
                    continue

                # check that the section identification is like file-n
                if not re.match('^file-[0-9]+$', section):
                    error_list.append(f'*** ERROR: the section "{section}" has a wrong identification.')
                    OK = False
                    continue

                file_options = gzip_option_dict.get(section, {})

                # check section "file-n" - key "dataset_subdirectory"
                dataset_subdirectory = file_options.get('dataset_subdirectory', not_found)
                if dataset_subdirectory == not_found:
                    error_list.append('*** ERROR: the key "dataset_subdirectory" is not found in the section "{0}".'.format(section))
                    OK = False
                elif not xlib.is_valid_path(dataset_subdirectory, 'linux'):
                    error_list.append('*** ERROR: the file {0} in the key "dataset_subdirectory" of the section "{1}" has a non valid file name.'.format(dataset_subdirectory, section))
                    OK = False

                # check section "file-n" - key "file_name"
                file_name = file_options.get('file_name', not_found)
                if file_name == not_found:
                    error_list.append('*** ERROR: the key "file_name" is not found in the section "{0}".'.format(section))
                    OK = False
                elif not xlib.is_valid_path(file_name, 'linux'):
                    error_list.append('*** ERROR: the file {0} in the key "file_name" of the section "{1}" has a non valid file name.'.format(file_name, section))
                    OK = False

    # warn that the config file is not valid if there are any errors
    if not OK:
        error_list.append('\nThe {0} config file is not valid. Please, correct this file or recreate it.'.format(xlib.get_gzip_name()))

    # return the control variable and the error list
    return (OK, error_list)
Esempio n. 24
0
def validate_fastqc_config_file(strict):
    '''
    Validate the FastQC config file of a run.

    Parameters:
        strict: accepted for interface compatibility; it is not used here.

    Returns:
        A tuple (OK, error_list). OK is False as soon as one check fails and
        error_list holds one message per detected problem, followed by a final
        summary message when the file is not valid.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # initialize the variable used when a value is not found
    not_found = '***NOTFOUND***'.upper()

    # get the option dictionary
    try:
        fastqc_option_dict = xlib.get_option_dict(get_fastqc_config_file())
    except Exception:
        # any failure while building the dictionary is reported as a syntax
        # error; KeyboardInterrupt and SystemExit are deliberately not caught
        error_list.append('*** ERROR: The syntax is WRONG.')
        OK = False
    else:

        # get the sorted sections list
        sections_list = sorted(fastqc_option_dict.keys())

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append(
                '*** ERROR: the section "identification" is not found.')
            OK = False
        else:

            identification = fastqc_option_dict.get('identification', {})

            # check section "identification" - key "experiment_id"
            experiment_id = identification.get('experiment_id', not_found)
            if experiment_id == not_found:
                error_list.append(
                    '*** ERROR: the key "experiment_id" is not found in the section "identification".'
                )
                OK = False

            # check section "identification" - key "read_dataset_id"
            read_dataset_id = identification.get('read_dataset_id', not_found)
            if read_dataset_id == not_found:
                error_list.append(
                    '*** ERROR: the key "read_dataset_id" is not found in the section "identification".'
                )
                OK = False

        # check section "FastQC parameters"
        if 'FastQC parameters' not in sections_list:
            error_list.append(
                '*** ERROR: the section "FastQC parameters" is not found.')
            OK = False
        else:

            # check section "FastQC parameters" - key "threads"
            threads = fastqc_option_dict.get('FastQC parameters',
                                             {}).get('threads', not_found)
            if threads == not_found:
                error_list.append(
                    '*** ERROR: the key "threads" is not found in the section "FastQC parameters".'
                )
                OK = False
            else:
                # a value that cannot be converted to an integer is reported
                # with the same message as a value lower than 1
                try:
                    threads_is_valid = int(threads) >= 1
                except (TypeError, ValueError):
                    threads_is_valid = False
                if not threads_is_valid:
                    error_list.append(
                        '*** ERROR: the key "threads" in the section "FastQC parameters" must be an integer value greater or equal to 1.'
                    )
                    OK = False

        # check section "file-1"
        if 'file-1' not in sections_list:
            error_list.append('*** ERROR: the section "file-1" is not found.')
            OK = False

        # check all sections "file-n"
        for section in sections_list:

            if section in ['identification', 'FastQC parameters']:
                continue

            # verify that the section identification is like file-n
            if not re.match('^file-[0-9]+$', section):
                error_list.append(
                    '*** ERROR: the section "{0}" has a wrong identification.'
                    .format(section))
                OK = False
                continue

            # check section "file-n" - key "file_name"
            file_name = fastqc_option_dict.get(section, {}).get(
                'file_name', not_found)
            if file_name == not_found:
                error_list.append(
                    '*** ERROR: the key "file_name" is not found in the section "{0}".'
                    .format(section))
                OK = False
            elif not xlib.is_valid_path(file_name, 'linux'):
                error_list.append(
                    '*** ERROR: the file {0} in the key "file_name" of the section "{1}" has a non valid file name.'
                    .format(file_name, section))
                OK = False

    # warn that the config file is not valid if there are any errors
    if not OK:
        error_list.append(
            '\nThe {0} config file is not valid. Please, correct this file or recreate it.'
            .format(xlib.get_fastqc_name()))

    # return the control variable and the error list
    return (OK, error_list)
Esempio n. 25
0
def upload_read_dataset(cluster_name, log, function=None):
    '''
    Upload the read dataset to the cluster.

    Parameters:
        cluster_name: name of the cluster the files are uploaded to.
        log: object with a "write" method that receives the progress messages.
        function: optional callable without arguments that is executed at the
            end of the process, whatever its outcome.

    Returns:
        True when every step ended correctly, False otherwise.
    '''

    # initialize the control variable
    OK = True

    # the connections are tracked so that they can be closed however the
    # process ends, not only when every step succeeded
    ssh_client = None
    ssh_transport = None

    # get the read transfer config file
    read_transfer_config_file = get_read_transfer_config_file()

    # warn that the log window must not be closed
    if not isinstance(log, xlib.DevStdOut):
        log.write(
            'This process might take several minutes. Do not close this window, please wait!\n'
        )

    # get and validate the read transfer config file
    log.write('{0}\n'.format(xlib.get_separator()))
    log.write('The read transfer config file is been validating ...\n')
    validation = validate_read_transfer_config_file(strict=True)
    # NOTE(review): the validators visible in this file return a tuple
    # (OK, error_list); a non-empty tuple is always true, so its first item is
    # used when a tuple is received -- confirm the return type of
    # validate_read_transfer_config_file
    OK = validation[0] if isinstance(validation, tuple) else bool(validation)
    if OK:
        log.write('The config file is OK.\n')
    else:
        log.write('*** ERROR: The read transfer config file is not valid.\n')
        log.write('Please correct this file or recreate the config files.\n')

    # create the SSH client connection
    if OK:
        (OK, error_list, new_ssh_client) = xssh.create_ssh_client_connection(
            cluster_name, 'master')
        for error in error_list:
            log.write('{0}\n'.format(error))
        if OK:
            ssh_client = new_ssh_client

    # create the SSH transport connection
    if OK:
        (OK, error_list,
         new_ssh_transport) = xssh.create_ssh_transport_connection(
             cluster_name, 'master')
        for error in error_list:
            log.write('{0}\n'.format(error))
        if OK:
            ssh_transport = new_ssh_transport

    # create the SFTP client
    if OK:
        sftp_client = xssh.create_sftp_client(ssh_transport)

    # get the options dictionary
    if OK:
        read_transfer_options_dict = xlib.get_option_dict(
            read_transfer_config_file)

    # get the experiment identification and the experiment directories
    if OK:

        # get the experiment identification
        experiment_id = read_transfer_options_dict['identification'][
            'experiment_id']

        # get the directories of read and result datasets of the experiment
        cluster_experiment_reads_dir = xlib.get_cluster_experiment_read_dataset_dir(
            experiment_id, xlib.get_uploaded_read_dataset_name())
        cluster_experiment_result_dir = xlib.get_cluster_experiment_result_dir(
            experiment_id)

    # create the experiment reads directory
    if OK:
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write(
            'The reads directory {0} in the cluster is being created ...\n'.
            format(cluster_experiment_reads_dir))
        command = 'mkdir --parents {0}'.format(cluster_experiment_reads_dir)
        (OK, _, _) = xssh.execute_cluster_command(ssh_client, command)
        if OK:
            log.write('The directory is created.\n')
        else:
            log.write('*** ERROR: Wrong command ---> {0}\n'.format(command))

    # create the experiment run result directory (only when the previous
    # command succeeded, so that its failure is not masked)
    if OK:
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write(
            'The run result directory {0} in the cluster is being created ...\n'
            .format(cluster_experiment_result_dir))
        command = 'mkdir --parents {0}'.format(cluster_experiment_result_dir)
        (OK, _, _) = xssh.execute_cluster_command(ssh_client, command)
        if OK:
            log.write('The directory is created.\n')
        else:
            log.write('*** ERROR: Wrong command ---> {0}\n'.format(command))

    # upload the read dataset
    if OK:

        # for each section "file-n" (in sorted order)
        for section in sorted(read_transfer_options_dict.keys()):

            # skip the sections whose identification is not like file-n
            if not re.match('^file-[0-9]+$', section):
                continue

            # get the local path
            local_path = read_transfer_options_dict[section]['local_path']

            # upload the read file to the cluster
            log.write('{0}\n'.format(xlib.get_separator()))
            log.write('The file {0} is being uploaded to {1} ...\n'.format(
                local_path, cluster_experiment_reads_dir))
            cluster_path = '{0}/{1}'.format(cluster_experiment_reads_dir,
                                            os.path.basename(local_path))
            (OK, error_list) = xssh.put_file(sftp_client, local_path,
                                             cluster_path)
            if OK:
                log.write('The file has been uploaded.\n')
            else:
                # stop at the first file that can not be uploaded
                for error in error_list:
                    log.write('{0}\n'.format(error))
                break

    # close the SSH transport connection if it was opened
    if ssh_transport is not None:
        xssh.close_ssh_transport_connection(ssh_transport)

    # close the SSH client connection if it was opened
    if ssh_client is not None:
        xssh.close_ssh_client_connection(ssh_client)

    # warn that the log window can be closed
    if not isinstance(log, xlib.DevStdOut):
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('You can close this window now.\n')

    # execute final function
    if function is not None:
        function()

    # return the control variable
    return OK
Esempio n. 26
0
def build_fastqc_process_script(cluster_name, current_run_dir):
    '''
    Build the current FastQC process script.

    The script is a Bash file that runs fastqc once per file listed in the
    sections "file-n" of the FastQC config file and mails the outcome.

    Parameters:
        cluster_name: name of the cluster, written in the script messages.
        current_run_dir: run directory in the cluster; the script changes to
            it and uses it as the fastqc output directory.

    Returns:
        A tuple (OK, error_list). OK is False when the script could not be
        written and error_list then holds the error messages.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # get the FastQC option dictionary
    fastqc_option_dict = xlib.get_option_dict(get_fastqc_config_file())

    # get the options
    experiment_id = fastqc_option_dict['identification']['experiment_id']
    read_dataset_id = fastqc_option_dict['identification']['read_dataset_id']
    threads = fastqc_option_dict['FastQC parameters']['threads']

    # build the file name list from the sections like file-n (in sorted order)
    file_name_list = [
        fastqc_option_dict[section]['file_name']
        for section in sorted(fastqc_option_dict.keys())
        if re.match('^file-[0-9]+$', section)
    ]

    # get the path of the FastQC process script
    script_path = get_fastqc_process_script()

    # write the FastQC process script
    try:

        # get the values used to compose the script lines
        app_dir = xlib.get_cluster_app_dir()
        miniconda3_name = xlib.get_miniconda3_name()
        fastqc_env = xlib.get_fastqc_bioconda_code()
        fastqc_name = xlib.get_fastqc_name()
        project_name = xlib.get_project_name()
        contact_data = xconfiguration.get_contact_data()

        # lines that are repeated along the script
        rule = '#-------------------------------------------------------------------------------'
        echo_sep = '    echo "$SEP"'
        end_datetime_lines = [
            '    END_DATETIME=`date --utc +%s`',
            '    FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`',
            '    calculate_duration',
            echo_sep,
        ]
        recipient_line = '    RECIPIENT={0}'.format(contact_data)
        subject_line = '    SUBJECT="{0}: {1} process"'.format(
            project_name, fastqc_name)
        mail_line = '    mail --append "Content-type: text/html;"  --subject "$SUBJECT" "$RECIPIENT" <<< "$MESSAGE"'

        # header and environment set-up
        script_lines = [
            '#!/bin/bash',
            rule,
            'FASTQC_PATH={0}/{1}/envs/{2}/bin'.format(
                app_dir, miniconda3_name, fastqc_env),
            'PATH=$FASTQC_PATH:$PATH',
            'SEP="#########################################"',
            'cd {0}/{1}/bin'.format(app_dir, miniconda3_name),
            'source activate {0}'.format(fastqc_env),
            rule,
        ]

        # function "init"
        script_lines += [
            'function init',
            '{',
            '    INIT_DATETIME=`date --utc +%s`',
            '    FORMATTED_INIT_DATETIME=`date --date="@$INIT_DATETIME" "+%Y-%m-%d %H:%M:%S"`',
            echo_sep,
            '    echo "Script started in node $HOSTNAME of cluster {0} at $FORMATTED_INIT_DATETIME UTC."'
            .format(cluster_name),
            '}',
            rule,
        ]

        # function "run_fastqc_process": one fastqc run per file
        script_lines += [
            'function run_fastqc_process',
            '{',
            '    cd {0}'.format(current_run_dir),
            echo_sep,
            '    fastqc --version',
        ]
        for file_name in file_name_list:
            script_lines += [
                echo_sep,
                '    /usr/bin/time \\',
                '        --format="$SEP\\nElapsed real time (s): %e\\nCPU time in kernel mode (s): %S\\nCPU time in user mode (s): %U\\nPercentage of CPU: %P\\nMaximum resident set size(Kb): %M\\nAverage total memory use (Kb):%K" \\',
                '        fastqc \\',
                '            {0} \\'.format(
                    xlib.get_cluster_read_file(experiment_id, read_dataset_id,
                                               file_name)),
                '            --threads={0} \\'.format(threads),
                '            --outdir={0}'.format(current_run_dir),
                '    RC=$?',
                '    if [ $RC -ne 0 ]; then manage_error fastqc $RC; fi',
            ]
        script_lines += ['}', rule]

        # function "end"
        script_lines += ['function end', '{']
        script_lines += end_datetime_lines
        script_lines += [
            '    echo "Script ended OK at $FORMATTED_END_DATETIME UTC with a run duration of $DURATION s ($FORMATTED_DURATION)."',
            echo_sep,
            recipient_line,
            subject_line,
            '    MESSAGE="The {0} process in node $HOSTNAME of cluster {1} ended OK at $FORMATTED_END_DATETIME with a run duration of $DURATION s ($FORMATTED_DURATION). Please review its log.<br/><br/>Regards,<br/>GI Genetica, Fisiologia e Historia Forestal<br/>Dpto. Sistemas y Recursos Naturales<br/>ETSI Montes, Forestal y del Medio Natural<br/>Universidad Politecnica de Madrid<br/>https://github.com/ggfhf/"'
            .format(fastqc_name, cluster_name),
            mail_line,
            '    exit 0',
            '}',
            rule,
        ]

        # function "manage_error"
        script_lines += ['function manage_error', '{']
        script_lines += end_datetime_lines
        script_lines += [
            '    echo "ERROR: $1 returned error $2"',
            '    echo "Script ended WRONG at $FORMATTED_END_DATETIME UTC with a run duration of $DURATION s ($FORMATTED_DURATION)."',
            echo_sep,
            recipient_line,
            subject_line,
            '    MESSAGE="The {0} process in node $HOSTNAME of cluster {1} ended WRONG at $FORMATTED_END_DATETIME with a run duration of $DURATION s ($FORMATTED_DURATION). Please review its log.<br/><br/>Regards,<br/>GI Genetica, Fisiologia e Historia Forestal<br/>Dpto. Sistemas y Recursos Naturales<br/>ETSI Montes, Forestal y del Medio Natural<br/>Universidad Politecnica de Madrid<br/>https://github.com/ggfhf/"'
            .format(fastqc_name, cluster_name),
            mail_line,
            '    exit 3',
            '}',
            rule,
        ]

        # function "calculate_duration"
        script_lines += [
            'function calculate_duration',
            '{',
            '    DURATION=`expr $END_DATETIME - $INIT_DATETIME`',
            '    HH=`expr $DURATION / 3600`',
            '    MM=`expr $DURATION % 3600 / 60`',
            '    SS=`expr $DURATION % 60`',
            '    FORMATTED_DURATION=`printf "%03d:%02d:%02d\\n" $HH $MM $SS`',
            '}',
            rule,
        ]

        # main sequence of the script
        script_lines += ['init', 'run_fastqc_process', 'end']

        # create the script directory if needed and write every line ended
        # by a line feed; the file is only opened once all lines are built
        os.makedirs(os.path.dirname(script_path), exist_ok=True)
        with open(script_path, mode='w', encoding='utf8',
                  newline='\n') as file_id:
            file_id.writelines(
                '{0}\n'.format(line) for line in script_lines)

    except Exception as e:
        # report the cause next to the generic message instead of hiding it
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append('*** ERROR: The file {0} can not be created'.format(
            script_path))
        OK = False

    # return the control variable and the error list
    return (OK, error_list)
Esempio n. 27
0
def run_htseq_count_process(cluster_name, log, function=None):
    '''
    Run a htseq-count process.

    The config file is checked, the process script and its starter are built,
    uploaded to a new run directory in the cluster and made executable, and
    the starter is finally submitted.

    Parameters:
        cluster_name: name of the cluster where the process is run.
        log: object with a "write" method that receives the progress messages.
        function: optional callable without arguments that is executed at the
            end of the process, whatever its outcome.

    Returns:
        True when every step ended correctly, False otherwise.
    '''

    # initialize the control variable
    OK = True

    # the connections are tracked so that they can be closed however the
    # process ends, not only when every step succeeded
    ssh_client = None
    ssh_transport = None

    # warn that the log window does not have to be closed
    if not isinstance(log, xlib.DevStdOut):
        log.write(
            'This process might take several minutes. Do not close this window, please wait!\n'
        )

    # check the htseq-count config file
    log.write(f'{xlib.get_separator()}\n')
    log.write(f'Checking the {xlib.get_htseq_count_name()} config file ...\n')
    (OK, error_list) = check_htseq_count_config_file(strict=True)
    if OK:
        log.write('The file is OK.\n')
    else:
        for error in error_list:
            log.write(f'{error}\n')
        log.write('*** ERROR: The config file is not valid.\n')
        log.write('Please correct this file or recreate the config files.\n')

    # get the experiment identification; the config file is only read once it
    # has been checked, so a wrong file is reported instead of raising here
    if OK:
        htseq_count_option_dict = xlib.get_option_dict(
            get_htseq_count_config_file())
        experiment_id = htseq_count_option_dict['identification'][
            'experiment_id']

    # create the SSH client connection
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write('Connecting the SSH client ...\n')
        (OK, error_list,
         new_ssh_client) = xssh.create_ssh_client_connection(cluster_name)
        if OK:
            ssh_client = new_ssh_client
            log.write('The SSH client is connected.\n')
        else:
            for error in error_list:
                log.write(f'{error}\n')

    # create the SSH transport connection
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write('Connecting the SSH transport ...\n')
        (OK, error_list, new_ssh_transport
         ) = xssh.create_ssh_transport_connection(cluster_name)
        if OK:
            ssh_transport = new_ssh_transport
            log.write('The SSH transport is connected.\n')
        else:
            for error in error_list:
                log.write(f'{error}\n')

    # create the SFTP client
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write('Connecting the SFTP client ...\n')
        sftp_client = xssh.create_sftp_client(ssh_transport)
        log.write('The SFTP client is connected.\n')

    # warn that the requirements are being verified
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write('Checking process requirements ...\n')

    # check the master is running (the state code has to be 16)
    if OK:
        (master_state_code,
         master_state_name) = xec2.get_node_state(cluster_name)
        if master_state_code != 16:
            log.write(
                f'*** ERROR: The cluster {cluster_name} is not running. Its state is {master_state_code} ({master_state_name}).\n'
            )
            OK = False

    # check HTSeq is installed
    if OK:
        (OK, error_list,
         is_installed) = xbioinfoapp.is_installed_anaconda_package(
             xlib.get_htseq_anaconda_code(), cluster_name, True, ssh_client)
        if OK:
            if not is_installed:
                log.write(
                    f'*** ERROR: {xlib.get_htseq_name()} is not installed.\n')
                OK = False
        else:
            log.write(
                f'*** ERROR: The verification of {xlib.get_htseq_name()} installation could not be performed.\n'
            )

    # warn that the requirements are OK
    if OK:
        log.write('Process requirements are OK.\n')

    # determine the run directory in the cluster
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write('Determining the run directory in the cluster ...\n')
        current_run_dir = xlib.get_cluster_current_run_dir(
            experiment_id, xlib.get_htseq_count_code())
        command = f'mkdir --parents {current_run_dir}'
        (OK, _, _) = xssh.execute_cluster_command(ssh_client, command)
        if OK:
            log.write(f'The directory path is {current_run_dir}.\n')
        else:
            log.write(f'*** ERROR: Wrong command ---> {command}\n')

    # build the htseq-count process script
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write(
            f'Building the process script {get_htseq_count_process_script()} ...\n'
        )
        (OK, error_list) = build_htseq_count_process_script(
            cluster_name, current_run_dir)
        if OK:
            log.write('The file is built.\n')
        else:
            for error in error_list:
                log.write(f'{error}\n')
            log.write('*** ERROR: The file could not be built.\n')

    # upload the htseq-count process script to the cluster
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write(
            f'Uploading the process script {get_htseq_count_process_script()} to the directory {current_run_dir} ...\n'
        )
        cluster_path = f'{current_run_dir}/{os.path.basename(get_htseq_count_process_script())}'
        (OK, error_list) = xssh.put_file(sftp_client,
                                         get_htseq_count_process_script(),
                                         cluster_path)
        if OK:
            log.write('The file is uploaded.\n')
        else:
            for error in error_list:
                log.write(f'{error}\n')

    # set run permission to the htseq-count process script in the cluster
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write(
            f'Setting on the run permision of {current_run_dir}/{os.path.basename(get_htseq_count_process_script())} ...\n'
        )
        command = f'chmod u+x {current_run_dir}/{os.path.basename(get_htseq_count_process_script())}'
        (OK, _, _) = xssh.execute_cluster_command(ssh_client, command)
        if OK:
            log.write('The run permision is set.\n')
        else:
            log.write(f'*** ERROR: Wrong command ---> {command}\n')

    # build the htseq-count process starter
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write(
            f'Building the process starter {get_htseq_count_process_starter()} ...\n'
        )
        (OK, error_list) = build_htseq_count_process_starter(current_run_dir)
        if OK:
            log.write('The file is built.\n')
        else:
            for error in error_list:
                log.write(f'{error}\n')
            log.write('***ERROR: The file could not be built.\n')

    # upload the htseq-count process starter to the cluster
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write(
            f'Uploading the process starter {get_htseq_count_process_starter()} to the directory {current_run_dir} ...\n'
        )
        cluster_path = f'{current_run_dir}/{os.path.basename(get_htseq_count_process_starter())}'
        (OK, error_list) = xssh.put_file(sftp_client,
                                         get_htseq_count_process_starter(),
                                         cluster_path)
        if OK:
            log.write('The file is uploaded.\n')
        else:
            for error in error_list:
                log.write(f'{error}\n')

    # set run permission to the htseq-count process starter in the cluster
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write(
            f'Setting on the run permision of {current_run_dir}/{os.path.basename(get_htseq_count_process_starter())} ...\n'
        )
        command = f'chmod u+x {current_run_dir}/{os.path.basename(get_htseq_count_process_starter())}'
        (OK, _, _) = xssh.execute_cluster_command(ssh_client, command)
        if OK:
            log.write('The run permision is set.\n')
        else:
            log.write(f'*** ERROR: Wrong command ---> {command}\n')

    # submit the htseq-count process
    if OK:
        log.write(f'{xlib.get_separator()}\n')
        log.write(
            f'Submitting the process script {current_run_dir}/{os.path.basename(get_htseq_count_process_starter())} ...\n'
        )
        OK = xssh.submit_script(
            cluster_name, ssh_client, current_run_dir,
            os.path.basename(get_htseq_count_process_starter()), log)

    # close the SSH transport connection if it was opened
    if ssh_transport is not None:
        log.write(f'{xlib.get_separator()}\n')
        log.write('Closing the SSH transport connection ...\n')
        xssh.close_ssh_transport_connection(ssh_transport)
        log.write('The connection is closed.\n')

    # close the SSH client connection if it was opened
    if ssh_client is not None:
        log.write(f'{xlib.get_separator()}\n')
        log.write('Closing the SSH client connection ...\n')
        xssh.close_ssh_client_connection(ssh_client)
        log.write('The connection is closed.\n')

    # warn that the log window can be closed
    if not isinstance(log, xlib.DevStdOut):
        log.write(f'{xlib.get_separator()}\n')
        log.write('You can close this window now.\n')

    # execute final function
    if function is not None:
        function()

    # return the control variable
    return OK
Esempio n. 28
0
def check_busco_config_file(strict):
    '''
    Check the BUSCO config file of a run.

    The sections "identification" and "BUSCO parameters" are checked key by
    key. Every failed check adds a message to the error list, so a single
    call reports all the problems found instead of stopping at the first one.

    Parameters:
        strict: not read by this function.

    Returns:
        A tuple (OK, error_list). OK is True only when no check has failed;
        error_list holds the messages of the failed checks and, when OK is
        False, a closing message saying that the config file is not valid.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # initialize the sentinel used when a key is not found
    not_found = '***NOTFOUND***'.upper()

    # get the option dictionary
    try:
        busco_option_dict = xlib.get_option_dict(get_busco_config_file())
    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append(
            '*** ERROR: The option dictionary could not be built from the config file'
        )
        OK = False
    else:

        # get the sections list
        sections_list = sorted(busco_option_dict.keys())

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append(
                '*** ERROR: the section "identification" is not found.')
            OK = False
        else:
            identification_dict = busco_option_dict.get('identification', {})

            # check section "identification" - key "experiment_id"
            experiment_id = identification_dict.get('experiment_id', not_found)
            if experiment_id == not_found:
                error_list.append(
                    '*** ERROR: the key "experiment_id" is not found in the section "identification".'
                )
                OK = False

            # check section "identification" - key "assembly_software"
            assembly_software = identification_dict.get(
                'assembly_software', not_found)
            if assembly_software == not_found:
                error_list.append(
                    '*** ERROR: the key "assembly_software" is not found in the section "identification".'
                )
                OK = False
            elif not xlib.check_code(assembly_software,
                                     get_assembly_software_code_list(),
                                     case_sensitive=False):
                error_list.append(
                    f'*** ERROR: the key "assembly_software" has to be {get_assembly_software_code_list_text()}.'
                )
                OK = False

            # check section "identification" - key "assembly_dataset_id"
            assembly_dataset_id = identification_dict.get(
                'assembly_dataset_id', not_found)
            if assembly_dataset_id == not_found:
                error_list.append(
                    '*** ERROR: the key "assembly_dataset_id" is not found in the section "identification".'
                )
                OK = False
            elif not xlib.check_startswith(assembly_dataset_id,
                                           get_assembly_software_code_list(),
                                           case_sensitive=True):
                error_list.append(
                    f'*** ERROR: the key "assembly_dataset_id" has to start with {get_assembly_software_code_list_text()}.'
                )
                OK = False

            # check section "identification" - key "assembly_type"
            assembly_type = identification_dict.get('assembly_type', not_found)
            if assembly_type == not_found:
                error_list.append(
                    '*** ERROR: the key "assembly_type" is not found in the section "identification".'
                )
                OK = False
            else:
                # the allowed values depend on whether the assembly dataset
                # identification starts with the SOAPdenovo-Trans code
                is_soapdenovotrans = assembly_dataset_id.startswith(
                    xlib.get_soapdenovotrans_code())
                if is_soapdenovotrans:
                    valid_type_list = ['CONTIGS', 'SCAFFOLDS']
                else:
                    valid_type_list = ['NONE']
                if assembly_type.upper() not in valid_type_list:
                    error_list.append(
                        f'*** ERROR: the key "assembly_type" has to be CONTIGS or SCAFFOLDS in {xlib.get_soapdenovotrans_name()} or NONE in any other case.'
                    )
                    OK = False

        # check section "BUSCO parameters"
        if 'BUSCO parameters' not in sections_list:
            error_list.append(
                '*** ERROR: the section "BUSCO parameters" is not found.')
            OK = False
        else:
            parameters_dict = busco_option_dict.get('BUSCO parameters', {})

            # check section "BUSCO parameters" - key "ncpu"
            ncpu = parameters_dict.get('ncpu', not_found)
            if ncpu == not_found:
                error_list.append(
                    '*** ERROR: the key "ncpu" is not found in the section "BUSCO parameters".'
                )
                OK = False
            elif not xlib.check_int(ncpu, minimum=1):
                error_list.append(
                    '*** ERROR: the key "ncpu" has to be an integer number greater than or equal to 1.'
                )
                OK = False

            # check section "BUSCO parameters" - key "lineage_data_url"
            lineage_data_url = parameters_dict.get('lineage_data_url',
                                                   not_found)
            if lineage_data_url == not_found:
                error_list.append(
                    '*** ERROR: the key "lineage_data_url" is not found in the section "BUSCO parameters"'
                )
                OK = False
            else:
                try:
                    # only reachability matters; the response is closed as
                    # soon as it has been opened so the connection is not
                    # leaked
                    with urllib.request.urlopen(lineage_data_url):
                        pass
                except Exception as e:
                    error_list.append(f'*** EXCEPTION: "{e}".')
                    error_list.append(
                        '*** ERROR: the key "lineage_data_url" has to be a reachable address.'
                    )
                    OK = False

            # check section "BUSCO parameters" - key "mode"
            mode = parameters_dict.get('mode', not_found)
            if mode == not_found:
                error_list.append(
                    '*** ERROR: the key "mode" is not found in the section "BUSCO parameters".'
                )
                OK = False
            elif not xlib.check_code(
                    mode, get_mode_code_list(), case_sensitive=False):
                error_list.append(
                    f'*** ERROR: the key "mode" has to be {get_mode_code_list_text()}.'
                )
                OK = False

            # check section "BUSCO parameters" - key "evalue"
            evalue = parameters_dict.get('evalue', not_found)
            if evalue == not_found:
                error_list.append(
                    '*** ERROR: the key "evalue" is not found in the section "BUSCO parameters".'
                )
                OK = False
            elif not xlib.check_float(evalue, minimum=0., mne=1E-12):
                error_list.append(
                    '*** ERROR: the key "evalue" has to be a float number greater than 0.'
                )
                OK = False

            # check section "BUSCO parameters" - key "limit"
            limit = parameters_dict.get('limit', not_found)
            if limit == not_found:
                error_list.append(
                    '*** ERROR: the key "limit" is not found in the section "BUSCO parameters".'
                )
                OK = False
            elif not xlib.check_int(limit, minimum=1):
                error_list.append(
                    '*** ERROR: the key "limit" has to be an integer number greater than or equal to 1.'
                )
                OK = False

            # check section "BUSCO parameters" - key "species"
            species = parameters_dict.get('species', not_found)
            if species == not_found:
                error_list.append(
                    '*** ERROR: the key "species" is not found in the section "BUSCO parameters"'
                )
                OK = False

            # check section "BUSCO parameters" - key "long"
            long = parameters_dict.get('long', not_found)
            if long == not_found:
                error_list.append(
                    '*** ERROR: the key "long" is not found in the section "BUSCO parameters".'
                )
                OK = False
            elif not xlib.check_code(
                    long, get_long_code_list(), case_sensitive=False):
                error_list.append(
                    f'*** ERROR: the key "long" has to be {get_long_code_list_text()}.'
                )
                OK = False

            # check section "BUSCO parameters" - key "augustus_options"
            augustus_options = parameters_dict.get('augustus_options',
                                                   not_found)
            if augustus_options == not_found:
                error_list.append(
                    '*** ERROR: the key "augustus_options" is not found in the section "BUSCO parameters".'
                )
                OK = False
            elif augustus_options.upper() != 'NONE':
                # keep the result of this check in its own variable: a valid
                # parameter list must not reset OK to True when an earlier
                # check has already failed
                (is_parameter_list_OK, error_list2) = xlib.check_parameter_list(
                    augustus_options, "augustus_options", [])
                error_list = error_list + error_list2
                if not is_parameter_list_OK:
                    OK = False

    # warn that the config file is not valid if there are any errors
    if not OK:
        error_list.append(
            f'\nThe {xlib.get_busco_name()} config file is not valid. Please, correct this file or recreate it.'
        )

    # return the control variable and the error list
    return (OK, error_list)
Esempio n. 29
0
def build_htseq_count_process_script(cluster_name, current_run_dir):
    '''
    Build the current htseq-count process script.

    The options are read from the htseq-count config file and a Bash script
    is written to the path given by get_htseq_count_process_script().

    Returns a tuple (OK, error_list); OK is False and error_list holds the
    messages when the script file can not be written.
    '''

    # control variable and error list returned to the caller
    OK = True
    error_list = []

    # load the htseq-count options
    htseq_count_option_dict = xlib.get_option_dict(
        get_htseq_count_config_file())
    identification_dict = htseq_count_option_dict['identification']
    experiment_id = identification_dict['experiment_id']
    reference_dataset_id = identification_dict['reference_dataset_id']
    annotation_file = htseq_count_option_dict['identification'][
        'annotation_file']
    parameters_dict = htseq_count_option_dict['htseq-count parameters']
    nprocesses = parameters_dict['nprocesses']
    stranded = parameters_dict['stranded']
    minaqual = parameters_dict['minaqual']
    feature_type = parameters_dict['type']
    idattr = parameters_dict['idattr']
    count_mode = parameters_dict['mode']
    nonunique = parameters_dict['nonunique']
    other_parameters = parameters_dict['other_parameters']

    # collect the data of the sections named "alignment-dataset-n", visiting
    # the sections in sorted order
    alignment_software_list = []
    alignment_dataset_id_list = []
    for section in sorted(htseq_count_option_dict.keys()):
        if re.match('^alignment-dataset-[0-9]+$', section):
            section_dict = htseq_count_option_dict[section]
            alignment_software_list.append(section_dict['alignment_software'])
            alignment_dataset_id_list.append(
                section_dict['alignment_dataset_id'])

    # replace the annotation file name by its path in the cluster
    annotation_file = xlib.get_cluster_reference_file(reference_dataset_id,
                                                      annotation_file)

    # separator line written between the parts of the script
    rule = '#-------------------------------------------------------------------------------\n'

    # write the htseq-count process script
    try:
        script_path = get_htseq_count_process_script()
        script_dir = os.path.dirname(script_path)
        if not os.path.exists(script_dir):
            os.makedirs(script_dir)
        with open(script_path, mode='w', encoding='iso-8859-1',
                  newline='\n') as script:

            # "emit" writes one line; "emit_all" writes a run of constant
            # lines; lines holding computed values are always written one by
            # one so the text is produced in the same order as its values
            emit = script.write
            emit_all = script.writelines

            # header and environment
            emit_all([
                '#!/bin/bash\n',
                rule,
                'SEP="#########################################"\n',
                'export HOST_IP=`curl --silent checkip.amazonaws.com`\n',
                'export HOST_ADDRESS="ec2-${HOST_IP//./-}-compute-1.amazonaws.com"\n',
                'export AWS_CONFIG_FILE=/home/ubuntu/.aws/config\n',
                'export AWS_SHARED_CREDENTIALS_FILE=/home/ubuntu/.aws/credentials\n',
                rule,
            ])
            emit(f'MINICONDA3_BIN_PATH={xlib.get_cluster_app_dir()}/{xlib.get_miniconda3_name()}/bin\n')
            emit_all([
                'export PATH=$MINICONDA3_BIN_PATH:$PATH\n',
                rule,
            ])

            # status files
            emit(f'STATUS_DIR={xlib.get_status_dir(current_run_dir)}\n')
            emit(f'SCRIPT_STATUS_OK={xlib.get_status_ok(current_run_dir)}\n')
            emit(f'SCRIPT_STATUS_WRONG={xlib.get_status_wrong(current_run_dir)}\n')
            emit_all([
                'mkdir --parents $STATUS_DIR\n',
                'if [ -f $SCRIPT_STATUS_OK ]; then rm $SCRIPT_STATUS_OK; fi\n',
                'if [ -f $SCRIPT_STATUS_WRONG ]; then rm $SCRIPT_STATUS_WRONG; fi\n',
                rule,
            ])
            emit(f'CURRENT_DIR={current_run_dir}\n')

            # function "init"
            emit_all([
                rule,
                'function init\n',
                '{\n',
                '    INIT_DATETIME=`date --utc +%s`\n',
                '    FORMATTED_INIT_DATETIME=`date --date="@$INIT_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n',
                '    echo "$SEP"\n',
                '    echo "Script started at $FORMATTED_INIT_DATETIME+00:00."\n',
                '    echo "$SEP"\n',
            ])
            emit(f'    echo "CLUSTER: {cluster_name}"\n')
            emit_all([
                '    echo "HOST NAME: $HOSTNAME"\n',
                '    echo "HOST IP: $HOST_IP"\n',
                '    echo "HOST ADDRESS: $HOST_ADDRESS"\n',
                '}\n',
                rule,
            ])

            # function "print_htseq_count_version"
            emit_all([
                'function print_htseq_count_version\n',
                '{\n',
            ])
            emit(f'    source activate {xlib.get_htseq_anaconda_code()}\n')
            emit_all([
                '    echo "$SEP"\n',
                '    # -- htseq-count --version\n',
                '    conda deactivate\n',
                '}\n',
                rule,
            ])

            # function "run_htseq_count_process"
            emit_all([
                'function run_htseq_count_process\n',
                '{\n',
            ])
            emit(f'    source activate {xlib.get_htseq_anaconda_code()}\n')
            emit_all([
                '    cd $CURRENT_DIR\n',
                '    echo "$SEP"\n',
                '    echo "Counting reads ..."\n',
                '    /usr/bin/time \\\n',
            ])
            emit(f'        --format="{xlib.get_time_output_format(separator=False)}" \\\n')
            emit('        htseq-count \\\n')
            emit(f'            --nprocesses={nprocesses} \\\n')
            emit('            --format=bam \\\n')
            emit(f'            --stranded={stranded.lower()} \\\n')
            emit(f'            --minaqual={minaqual} \\\n')
            emit(f'            --type={feature_type} \\\n')
            emit(f'            --idattr={idattr} \\\n')
            emit(f'            --mode={count_mode.lower()} \\\n')
            emit(f'            --nonunique={nonunique.lower()} \\\n')
            emit('            --quiet \\\n')

            # additional parameters: "--name=value" or "--name" items
            # separated by semicolons
            if other_parameters.upper() != 'NONE':
                parameter_list = [
                    item.strip() for item in other_parameters.split(';')
                ]
                for parameter in parameter_list:
                    if parameter.find('=') > 0:
                        mo = re.search(r'^--(.+)=(.+)$', parameter)
                        parameter_name = mo.group(1).strip()
                        parameter_value = mo.group(2).strip()
                        emit(f'            --{parameter_name}={parameter_value} \\\n')
                    else:
                        mo = re.search(r'^--(.+)$', parameter)
                        parameter_name = mo.group(1).strip()
                        emit(f'            --{parameter_name} \\\n')

            # sorted BAM files of every alignment dataset, then the
            # annotation file
            for alignment_dataset_id in alignment_dataset_id_list:
                alignment_dir = xlib.get_cluster_experiment_result_dataset_dir(
                    experiment_id, alignment_dataset_id)
                alignment_files = f'{alignment_dir}/*.sorted.bam'
                emit(f'            {alignment_files} \\\n')
            emit(f'            {annotation_file} \\\n')
            emit_all([
                '            > read-count.txt\n',
                '    RC=$?\n',
                '    if [ $RC -ne 0 ]; then manage_error htseq-count $RC; fi\n',
                '    echo "Reads are counted."\n',
                '    conda deactivate\n',
                '}\n',
                rule,
            ])

            # function "end"
            emit_all([
                'function end\n',
                '{\n',
                '    END_DATETIME=`date --utc +%s`\n',
                '    FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n',
                '    calculate_duration\n',
                '    echo "$SEP"\n',
                '    echo "Script ended OK at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."\n',
                '    echo "$SEP"\n',
                '    send_mail ok\n',
                '    touch $SCRIPT_STATUS_OK\n',
                '    exit 0\n',
                '}\n',
                rule,
            ])

            # function "manage_error"
            emit_all([
                'function manage_error\n',
                '{\n',
                '    END_DATETIME=`date --utc +%s`\n',
                '    FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`\n',
                '    calculate_duration\n',
                '    echo "$SEP"\n',
                '    echo "ERROR: $1 returned error $2"\n',
                '    echo "Script ended WRONG at $FORMATTED_END_DATETIME+00:00 with a run duration of $DURATION s ($FORMATTED_DURATION)."\n',
                '    echo "$SEP"\n',
                '    send_mail wrong\n',
                '    touch $SCRIPT_STATUS_WRONG\n',
                '    exit 3\n',
                '}\n',
                rule,
            ])

            # function "send_mail"
            process_name = f'{xlib.get_htseq_count_name()} process'
            mail_message_ok = xlib.get_mail_message_ok(process_name,
                                                       cluster_name)
            mail_message_wrong = xlib.get_mail_message_wrong(
                process_name, cluster_name)
            emit_all([
                'function send_mail\n',
                '{\n',
            ])
            emit(f'    SUBJECT="{xlib.get_project_name()}: {process_name}"\n')
            emit('    if [ "$1" == "ok" ]; then\n')
            emit(f'        MESSAGE="{mail_message_ok}"\n')
            emit('    elif [ "$1" == "wrong" ]; then\n')
            emit(f'        MESSAGE="{mail_message_wrong}"\n')
            emit_all([
                '    else\n',
                '         MESSAGE=""\n',
                '    fi\n',
                '    DESTINATION_FILE=mail-destination.json\n',
                '    echo "{" > $DESTINATION_FILE\n',
            ])
            emit(f'    echo "    \\\"ToAddresses\\\":  [\\\"{xconfiguration.get_contact_data()}\\\"]," >> $DESTINATION_FILE\n')
            emit_all([
                '    echo "    \\\"CcAddresses\\\":  []," >> $DESTINATION_FILE\n',
                '    echo "    \\\"BccAddresses\\\":  []" >> $DESTINATION_FILE\n',
                '    echo "}" >> $DESTINATION_FILE\n',
                '    MESSAGE_FILE=mail-message.json\n',
                '    echo "{" > $MESSAGE_FILE\n',
                '    echo "    \\\"Subject\\\": {" >> $MESSAGE_FILE\n',
                '    echo "        \\\"Data\\\":  \\\"$SUBJECT\\\"," >> $MESSAGE_FILE\n',
                '    echo "        \\\"Charset\\\":  \\\"UTF-8\\\"" >> $MESSAGE_FILE\n',
                '    echo "    }," >> $MESSAGE_FILE\n',
                '    echo "    \\\"Body\\\": {" >> $MESSAGE_FILE\n',
                '    echo "        \\\"Html\\\": {" >> $MESSAGE_FILE\n',
                '    echo "            \\\"Data\\\":  \\\"$MESSAGE\\\"," >> $MESSAGE_FILE\n',
                '    echo "            \\\"Charset\\\":  \\\"UTF-8\\\"" >> $MESSAGE_FILE\n',
                '    echo "        }" >> $MESSAGE_FILE\n',
                '    echo "    }" >> $MESSAGE_FILE\n',
                '    echo "}" >> $MESSAGE_FILE\n',
            ])
            emit(f'    aws ses send-email --from {xconfiguration.get_contact_data()} --destination file://$DESTINATION_FILE --message file://$MESSAGE_FILE\n')

            # function "calculate_duration" and the calls that run the
            # process
            emit_all([
                '}\n',
                rule,
                'function calculate_duration\n',
                '{\n',
                '    DURATION=`expr $END_DATETIME - $INIT_DATETIME`\n',
                '    HH=`expr $DURATION / 3600`\n',
                '    MM=`expr $DURATION % 3600 / 60`\n',
                '    SS=`expr $DURATION % 60`\n',
                '    FORMATTED_DURATION=`printf "%03d:%02d:%02d\\n" $HH $MM $SS`\n',
                '}\n',
                rule,
                'init\n',
                'print_htseq_count_version\n',
                'run_htseq_count_process\n',
                'end\n',
            ])
    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append(
            f'*** ERROR: The file {get_htseq_count_process_script()} can not be created'
        )
        OK = False

    # return the control variable and the error list
    return (OK, error_list)
Esempio n. 30
0
def validate_busco_config_file(strict):
    '''
    Validate the BUSCO config file of a run.

    The file returned by get_busco_config_file() is loaded with
    xlib.get_option_dict() and the sections "identification" and
    "BUSCO parameters" are checked key by key.

    Parameters:
        strict: kept in the signature for the callers; it is currently not
            used by any check performed in this function.

    Returns:
        A tuple (OK, error_list). OK is True when no problem was detected.
        error_list holds one message per problem found, followed by a final
        summary message when OK is False.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # initialize the sentinel used when a value is not found
    not_found = '***NOTFOUND***'

    # get the option dictionary
    try:
        busco_option_dict = xlib.get_option_dict(get_busco_config_file())
    except Exception:
        error_list.append('*** ERROR: The syntax is WRONG.')
        OK = False
    else:

        # get the sections list
        sections_list = sorted(busco_option_dict.keys())

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append('*** ERROR: the section "identification" is not found.')
            OK = False
        else:

            identification = busco_option_dict.get('identification', {})

            # codes of the assembly software whose assembly type has to be "NONE"
            no_type_codes = (
                xlib.get_transabyss_code(),
                xlib.get_trinity_code(),
                xlib.get_star_code(),
                xlib.get_cd_hit_est_code(),
                xlib.get_transcript_filter_code(),
            )

            # codes of every supported assembly software
            soapdenovotrans_code = xlib.get_soapdenovotrans_code()
            assembly_codes = (soapdenovotrans_code,) + no_type_codes

            # check section "identification" - key "experiment_id"
            experiment_id = identification.get('experiment_id', not_found)
            if experiment_id == not_found:
                error_list.append('*** ERROR: the key "experiment_id" is not found in the section "identification".')
                OK = False

            # check section "identification" - key "assembly_software"
            assembly_software = identification.get('assembly_software', not_found)
            if assembly_software == not_found:
                error_list.append('*** ERROR: the key "assembly_software" is not found in the section "identification".')
                OK = False
            elif assembly_software not in assembly_codes:
                error_list.append('*** ERROR: the key "assembly_software" value in the section "identification" must be {0} or {1} or {2} or {3} or {4} OR {5}.'.format(*assembly_codes))
                OK = False

            # check section "identification" - key "assembly_dataset_id"
            assembly_dataset_id = identification.get('assembly_dataset_id', not_found)
            if assembly_dataset_id == not_found:
                error_list.append('*** ERROR: the key "assembly_dataset_id" is not found in the section "identification".')
                OK = False
            elif not assembly_dataset_id.startswith(assembly_codes):
                # every placeholder is a software name (the last one used to be a code)
                error_list.append('*** ERROR: the key "assembly_dataset_id" value is not a {0} nor {1} nor {2} nor {3} nor {4} nor {5} assembly.'.format(xlib.get_soapdenovotrans_name(), xlib.get_transabyss_name(), xlib.get_trinity_name(), xlib.get_star_name(), xlib.get_cd_hit_est_name(), xlib.get_transcript_filter_name()))
                OK = False

            # check section "identification" - key "assembly_type"
            # (the allowed values depend on the prefix of the assembly dataset identification)
            assembly_type = identification.get('assembly_type', not_found)
            if assembly_type == not_found:
                error_list.append('*** ERROR: the key "assembly_type" is not found in the section "identification".')
                OK = False
            elif assembly_dataset_id.startswith(soapdenovotrans_code):
                if assembly_type.upper() not in ['CONTIGS', 'SCAFFOLDS']:
                    error_list.append('*** ERROR: the key "assembly_type" must be "CONTIGS" or "SCAFFOLDS" when {0} is the assembly software.'.format(xlib.get_soapdenovotrans_name()))
                    OK = False
            elif assembly_dataset_id.startswith(no_type_codes):
                if assembly_type.upper() != 'NONE':
                    error_list.append('*** ERROR: the key "assembly_type" must be "NONE" when {0} or {1} or {2} or {3} or {4} is the assembly software.'.format(xlib.get_transabyss_name(), xlib.get_trinity_name(), xlib.get_star_name(), xlib.get_cd_hit_est_name(), xlib.get_transcript_filter_name()))
                    OK = False

        # check section "BUSCO parameters"
        if 'BUSCO parameters' not in sections_list:
            error_list.append('*** ERROR: the section "BUSCO parameters" is not found.')
            OK = False
        else:

            busco_parameters = busco_option_dict.get('BUSCO parameters', {})

            # check section "BUSCO parameters" - key "ncpu"
            ncpu = busco_parameters.get('ncpu', not_found)
            if ncpu == not_found:
                error_list.append('*** ERROR: the key "ncpu" is not found in the section "BUSCO parameters".')
                OK = False
            else:
                try:
                    is_ncpu_valid = int(ncpu) >= 1
                except (TypeError, ValueError):
                    is_ncpu_valid = False
                if not is_ncpu_valid:
                    error_list.append('*** ERROR: the key "ncpu" in the section "BUSCO parameters" must be an integer value greater or equal to 1.')
                    OK = False

            # check section "BUSCO parameters" - key "lineage_data"
            lineage_data = busco_parameters.get('lineage_data', not_found)
            if lineage_data == not_found:
                error_list.append('*** ERROR: the key "lineage_data" is not found in the section "BUSCO parameters"')
                OK = False

            # check section "BUSCO parameters" - key "mode"
            # (the value is compared with the sentinel BEFORE changing its case;
            # otherwise a missing key would never be detected as not found)
            mode = busco_parameters.get('mode', not_found)
            if mode == not_found:
                error_list.append('*** ERROR: the key "mode" is not found in the section "BUSCO parameters".')
                OK = False
            elif mode.lower() not in ['geno', 'tran', 'prot']:
                error_list.append('*** ERROR: the key "mode" value in the section "BUSCO parameters" must be geno or tran or prot.')
                OK = False

            # check section "BUSCO parameters" - key "evalue"
            evalue = busco_parameters.get('evalue', not_found)
            if evalue == not_found:
                error_list.append('*** ERROR: the key "evalue" is not found in the section "BUSCO parameters".')
                OK = False
            else:
                try:
                    is_evalue_valid = float(evalue) > 0
                except (TypeError, ValueError):
                    is_evalue_valid = False
                if not is_evalue_valid:
                    error_list.append('*** ERROR: the key "evalue" in the section "BUSCO parameters" must be a float value greater than 0.')
                    OK = False

            # check section "BUSCO parameters" - key "limit"
            limit = busco_parameters.get('limit', not_found)
            if limit == not_found:
                error_list.append('*** ERROR: the key "limit" is not found in the section "BUSCO parameters".')
                OK = False
            else:
                try:
                    is_limit_valid = int(limit) >= 1
                except (TypeError, ValueError):
                    is_limit_valid = False
                if not is_limit_valid:
                    error_list.append('*** ERROR: the key "limit" in the section "BUSCO parameters" must be an integer value greater or equal to 1.')
                    OK = False

            # check section "BUSCO parameters" - key "species"
            species = busco_parameters.get('species', not_found)
            if species == not_found:
                error_list.append('*** ERROR: the key "species" is not found in the section "BUSCO parameters"')
                OK = False

            # check section "BUSCO parameters" - key "long"
            long = busco_parameters.get('long', not_found)
            if long == not_found:
                error_list.append('*** ERROR: the key "long" is not found in the section "BUSCO parameters".')
                OK = False
            elif long.upper() not in ['YES', 'NO']:
                error_list.append('*** ERROR: the key "long" value in the section "BUSCO parameters" must be YES or NO.')
                OK = False

            # check section "BUSCO parameters" - key "augustus_options"
            augustus_options = busco_parameters.get('augustus_options', not_found)
            if augustus_options == not_found:
                error_list.append('*** ERROR: the key "augustus_options" is not found in the section "BUSCO parameters".')
                OK = False
            elif augustus_options.upper() != 'NONE':
                # every item separated by ";" has to look like "--name" or "--name=value"
                for parameter in [x.strip() for x in augustus_options.split(';')]:
                    if parameter.find('=') > 0:
                        pattern = r'^--(.+)=(.+)$'
                    else:
                        pattern = r'^--(.+)$'
                    if re.search(pattern, parameter) is None:
                        error_list.append('*** ERROR: the value of the key "augustus_options" in the section "BUSCO parameters" must be NONE or a valid August parameter list.')
                        OK = False
                        # one message is enough for the whole list
                        break

    # warn that the results config file is not valid if there are any errors
    if not OK:
        error_list.append('\nThe {0} config file is not valid. Please, correct this file or recreate it.'.format(xlib.get_busco_name()))

    # return the control variable and the error list
    return (OK, error_list)