Esempio n. 1
0
def create_gmap_config_file(experiment_id='exp001', reference_dataset_id='NONE', reference_file='NONE', assembly_dataset_id='sdnt-170101-235959', assembly_type='CONTIGS'):
    '''
    Create GMAP config file with the default options. It is necessary
    update the options in each run.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # set the app
    if assembly_dataset_id.startswith(xlib.get_soapdenovotrans_code()):
        assembly_software = xlib.get_soapdenovotrans_code()
    elif assembly_dataset_id.startswith(xlib.get_transabyss_code()):
        assembly_software = xlib.get_transabyss_code()
    elif assembly_dataset_id.startswith(xlib.get_trinity_code()):
        assembly_software = xlib.get_trinity_code()
    elif assembly_dataset_id.startswith(xlib.get_star_code()):
        assembly_software = xlib.get_star_code()
    elif assembly_dataset_id.startswith(xlib.get_cd_hit_est_code()):
        assembly_software = xlib.get_cd_hit_est_code()
    elif assembly_dataset_id.startswith(xlib.get_transcript_filter_code()):
        assembly_software = xlib.get_transcript_filter_code()

    # create the GMAP config file and write the default options
    try:
        if not os.path.exists(os.path.dirname(get_gmap_config_file())):
            os.makedirs(os.path.dirname(get_gmap_config_file()))
        with open(get_gmap_config_file(), mode='w', encoding='utf8') as file_id:
            file_id.write('{0}\n'.format('# You must review the information of this file and update the values with the corresponding ones to the current run.'))
            file_id.write('{0}\n'.format('#'))
            file_id.write('{0}\n'.format('# The reference file must be located in the cluster directory {0}/experiment_id/reference_dataset_id'.format(xlib.get_cluster_reference_dir())))
            file_id.write('{0}\n'.format('# The assembly files must be located in the cluster directory {0}/experiment_id/assembly_dataset_id'.format(xlib.get_cluster_result_dir())))
            file_id.write('{0}\n'.format('# The experiment_id, reference_dataset_id, reference_file and assembly_dataset_id are fixed in the identification section.'))
            file_id.write('{0}\n'.format('#'))
            file_id.write('{0}\n'.format('# You can consult the parameters of GMAP and their meaning in http://research-pub.gene.com/gmap/.'))
            file_id.write('{0}\n'.format('#'))
            file_id.write('{0}\n'.format('# In section "GMAP parameters", the key "other_parameters" allows you to input additional parameters in the format:'))
            file_id.write('{0}\n'.format('#'))
            file_id.write('{0}\n'.format('#    other_parameters = --parameter-1[=value-1][; --parameter-2[=value-2][; ...; --parameter-n[=value-n]]]'))
            file_id.write('{0}\n'.format('#'))
            file_id.write('{0}\n'.format('# parameter-i is a parameter name of GMAP and value-i a valid value of parameter-i, e.g.'))
            file_id.write('{0}\n'.format('#'))
            file_id.write('{0}\n'.format('#    other_parameters = --no-chimeras; --canonical-mode=2'))
            file_id.write('{0}\n'.format(''))
            file_id.write('{0}\n'.format('# This section has the information identifies the experiment.'))
            file_id.write('{0}\n'.format('[identification]'))
            file_id.write('{0:<50} {1}\n'.format('experiment_id = {0}'.format(experiment_id), '# experiment identification'))
            file_id.write('{0:<50} {1}\n'.format('reference_dataset_id = {0}'.format(reference_dataset_id), '# reference dataset identification or NONE'))
            file_id.write('{0:<50} {1}\n'.format('reference_file = {0}'.format(reference_file), '# reference file name or NONE'))
            file_id.write('{0:<50} {1}\n'.format('assembly_software = {0}'.format(assembly_software), '# assembly software: {0} ({1}) or {2} ({3}) or {4} ({5}) or {6} ({7}) or {8} ({9}) or {10} ({11})'.format(xlib.get_soapdenovotrans_code(), xlib.get_soapdenovotrans_name(), xlib.get_transabyss_code(), xlib.get_transabyss_name(), xlib.get_trinity_code(), xlib.get_trinity_name(), xlib.get_star_code(), xlib.get_star_name(), xlib.get_cd_hit_est_code(), xlib.get_cd_hit_est_name(), xlib.get_transcript_filter_code(), xlib.get_transcript_filter_name())))
            file_id.write('{0:<50} {1}\n'.format('assembly_dataset_id = {0}'.format(assembly_dataset_id), '# assembly dataset identification'))
            file_id.write('{0:<50} {1}\n'.format('assembly_type = {0}'.format(assembly_type), '# CONTIGS or SCAFFOLDS in {0}; NONE in {1}, {2}, {3}, {4} and {5}'.format(xlib.get_soapdenovotrans_name(),  xlib.get_transabyss_name(),  xlib.get_trinity_name(),  xlib.get_star_name(), xlib.get_cd_hit_est_name(), xlib.get_transcript_filter_name())))
            file_id.write('{0}\n'.format(''))
            file_id.write('{0}\n'.format('# This section has the information to set the GMAP parameters'))
            file_id.write('{0}\n'.format('[GMAP parameters]'))
            file_id.write('{0:<50} {1}\n'.format('threads = 2', '# number of threads for use'))
            file_id.write('{0:<50} {1}\n'.format('kmer = NONE', '# kmer size to use in genome database or NONE (the program will find the highest available kmer size in the genome database)'))
            file_id.write('{0:<50} {1}\n'.format('sampling = NONE', '# Sampling to use in genome database or NONE (the program will find the smallest available sampling value in the genome database within selected k-mer size)'))
            file_id.write('{0:<50} {1}\n'.format('input-buffer-size = 1000', '# size of input buffer'))
            file_id.write('{0:<50} {1}\n'.format('output-buffer-size = 1000', '# size of buffer size in queries for output thread'))
            file_id.write('{0:<50} {1}\n'.format('prunelevel = 0', '# pruning level: 0 (no pruning) or 1 (poor seqs) or 2 (repetitive seqs) or 3 (poor and repetitive)'))
            file_id.write('{0:<50} {1}\n'.format('format = COMPRESS', '# format for output: COMPRESS or SUMMARY or ALIGN or PLS or GFF3_GENE or SPLICESITES or INTRONS or MAP_EXONS or MAP_RANGES or COORDS'))
            file_id.write('{0:<50} {1}\n'.format('other_parameters = NONE', '# additional parameters to the previous ones or NONE'))
    except:
        error_list.append('*** ERROR: The file {0} can not be recreated'.format(get_gmap_config_file()))
        OK = False

    # return the control variable and the error list
    return (OK, error_list)
Esempio n. 2
0
def build_cd_hit_est_process_script(cluster_name, current_run_dir):
    '''
    Build the current CD-HIT-EST process script.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # get the option dictionary
    cd_hit_est_option_dict = xlib.get_option_dict(get_cd_hit_est_config_file())

    # get the options
    experiment_id = cd_hit_est_option_dict['identification']['experiment_id']
    assembly_software = cd_hit_est_option_dict['identification'][
        'assembly_software']
    assembly_dataset_id = cd_hit_est_option_dict['identification'][
        'assembly_dataset_id']
    assembly_type = cd_hit_est_option_dict['identification']['assembly_type']
    threads = cd_hit_est_option_dict['CD-HIT-EST parameters']['threads']
    memory_limit = cd_hit_est_option_dict['CD-HIT-EST parameters'][
        'memory_limit']
    seq_identity_threshold = cd_hit_est_option_dict['CD-HIT-EST parameters'][
        'seq_identity_threshold']
    word_length = cd_hit_est_option_dict['CD-HIT-EST parameters'][
        'word_length']
    mask = cd_hit_est_option_dict['CD-HIT-EST parameters']['mask']
    match = cd_hit_est_option_dict['CD-HIT-EST parameters']['match']
    mismatch = cd_hit_est_option_dict['CD-HIT-EST parameters']['mismatch']
    other_parameters = cd_hit_est_option_dict['CD-HIT-EST parameters'][
        'other_parameters']

    # set the transcriptome file path
    if assembly_software == xlib.get_soapdenovotrans_code():
        if assembly_type == 'CONTIGS':
            transcriptome_file = '{0}/{1}-{2}.contig'.format(
                xlib.get_cluster_experiment_result_dataset_dir(
                    experiment_id, assembly_dataset_id), experiment_id,
                assembly_dataset_id)
        elif assembly_type == 'SCAFFOLDS':
            transcriptome_file = '{0}/{1}-{2}.scafSeq'.format(
                xlib.get_cluster_experiment_result_dataset_dir(
                    experiment_id, assembly_dataset_id), experiment_id,
                assembly_dataset_id)
    elif assembly_software == xlib.get_transabyss_code():
        transcriptome_file = '{0}/transabyss-final.fa'.format(
            xlib.get_cluster_experiment_result_dataset_dir(
                experiment_id, assembly_dataset_id))
    elif assembly_software == xlib.get_trinity_code():
        transcriptome_file = '{0}/Trinity.fasta'.format(
            xlib.get_cluster_experiment_result_dataset_dir(
                experiment_id, assembly_dataset_id))
    elif assembly_software == xlib.get_star_code():
        transcriptome_file = '{0}/Trinity-GG.fasta'.format(
            xlib.get_cluster_experiment_result_dataset_dir(
                experiment_id, assembly_dataset_id))
    elif assembly_software == xlib.get_cd_hit_est_code():
        transcriptome_file = '{0}/clustered-transcriptome.fasta'.format(
            xlib.get_cluster_experiment_result_dataset_dir(
                experiment_id, assembly_dataset_id))
    elif assembly_software == xlib.get_transcript_filter_code():
        transcriptome_file = '{0}/filtered-transcriptome.fasta'.format(
            xlib.get_cluster_experiment_result_dataset_dir(
                experiment_id, assembly_dataset_id))

    # set the output file path
    if OK:
        output_file = '{0}/clustered-transcriptome.fasta'.format(
            current_run_dir)

    # write the CD-HIT-EST process script
    try:
        if not os.path.exists(os.path.dirname(
                get_cd_hit_est_process_script())):
            os.makedirs(os.path.dirname(get_cd_hit_est_process_script()))
        with open(get_cd_hit_est_process_script(),
                  mode='w',
                  encoding='utf8',
                  newline='\n') as file_id:
            file_id.write('{0}\n'.format('#!/bin/bash'))
            file_id.write('{0}\n'.format(
                '#-------------------------------------------------------------------------------'
            ))
            file_id.write('{0}\n'.format(
                'CDHIT_PATH={0}/{1}/envs/{2}/bin'.format(
                    xlib.get_cluster_app_dir(), xlib.get_miniconda3_name(),
                    xlib.get_cd_hit_bioconda_code())))
            file_id.write('{0}\n'.format('PATH=$CDHIT_PATH:$PATH'))
            file_id.write('{0}\n'.format(
                'SEP="#########################################"'))
            file_id.write('{0}\n'.format('cd {0}/{1}/bin'.format(
                xlib.get_cluster_app_dir(), xlib.get_miniconda3_name())))
            file_id.write('{0}\n'.format('source activate {0}'.format(
                xlib.get_cd_hit_bioconda_code())))
            file_id.write('{0}\n'.format(
                '#-------------------------------------------------------------------------------'
            ))
            file_id.write('{0}\n'.format('function init'))
            file_id.write('{0}\n'.format('{'))
            file_id.write('{0}\n'.format('    INIT_DATETIME=`date --utc +%s`'))
            file_id.write('{0}\n'.format(
                '    FORMATTED_INIT_DATETIME=`date --date="@$INIT_DATETIME" "+%Y-%m-%d %H:%M:%S"`'
            ))
            file_id.write('{0}\n'.format('    echo "$SEP"'))
            file_id.write('{0}\n'.format(
                '    echo "Script started in node $HOSTNAME of cluster {0} at $FORMATTED_INIT_DATETIME UTC."'
                .format(cluster_name)))
            file_id.write('{0}\n'.format('}'))
            file_id.write('{0}\n'.format(
                '#-------------------------------------------------------------------------------'
            ))
            file_id.write('{0}\n'.format('function run_cd_hit_est_process'))
            file_id.write('{0}\n'.format('{'))
            file_id.write('{0}\n'.format('    cd {0}'.format(current_run_dir)))
            file_id.write('{0}\n'.format('    echo "$SEP"'))
            file_id.write('{0}\n'.format(
                '    echo "Running {0} process ..."'.format(
                    xlib.get_cd_hit_est_name())))
            file_id.write('{0}\n'.format('    /usr/bin/time \\'))
            file_id.write('{0}\n'.format(
                '        --format="$SEP\\nElapsed real time (s): %e\\nCPU time in kernel mode (s): %S\\nCPU time in user mode (s): %U\\nPercentage of CPU: %P\\nMaximum resident set size(Kb): %M\\nAverage total memory use (Kb):%K" \\'
            ))
            file_id.write('{0}\n'.format('        cd-hit-est \\'))
            file_id.write('{0}\n'.format(
                '            -T {0} \\'.format(threads)))
            file_id.write('{0}\n'.format(
                '            -M {0} \\'.format(memory_limit)))
            file_id.write('{0}\n'.format(
                '            -i {0} \\'.format(transcriptome_file)))
            file_id.write('{0}\n'.format(
                '            -c {0} \\'.format(seq_identity_threshold)))
            file_id.write('{0}\n'.format(
                '            -n {0} \\'.format(word_length)))
            file_id.write('{0}\n'.format(
                '            -mask {0} \\'.format(mask)))
            file_id.write('{0}\n'.format(
                '            -match {0} \\'.format(match)))
            file_id.write('{0}\n'.format(
                '            -mismatch {0} \\'.format(mismatch)))
            if other_parameters.upper() == 'NONE':
                file_id.write('{0}\n'.format(
                    '            -o {0}'.format(output_file)))
            else:
                file_id.write('{0}\n'.format(
                    '            -o {0} \\'.format(output_file)))
                parameter_list = [
                    x.strip() for x in other_parameters.split(';')
                ]
                for i in range(len(parameter_list)):
                    if parameter_list[i].find('=') > 0:
                        pattern = r'^--(.+)=(.+)$'
                        mo = re.search(pattern, parameter_list[i])
                        parameter_name = mo.group(1).strip()
                        parameter_value = mo.group(2).strip()
                        if i < len(parameter_list) - 1:
                            file_id.write('{0}\n'.format(
                                '            -{0} {1} \\'.format(
                                    parameter_name, parameter_value)))
                        else:
                            file_id.write('{0}\n'.format(
                                '            -{0} {1}'.format(
                                    parameter_name, parameter_value)))
                    else:
                        pattern = r'^--(.+)$'
                        mo = re.search(pattern, parameter_list[i])
                        parameter_name = mo.group(1).strip()
                        if i < len(parameter_list):
                            file_id.write('{0}\n'.format(
                                '            -{0} \\'.format(parameter_name)))
                        else:
                            file_id.write('{0}\n'.format(
                                '            -{0}'.format(parameter_name)))
                    i += 1
            file_id.write('{0}\n'.format('    RC=$?'))
            file_id.write('{0}\n'.format(
                '    if [ $RC -ne 0 ]; then manage_error cd-hit-est $RC; fi'))
            file_id.write('{0}\n'.format('}'))
            file_id.write('{0}\n'.format(
                '#-------------------------------------------------------------------------------'
            ))
            file_id.write('{0}\n'.format('function end'))
            file_id.write('{0}\n'.format('{'))
            file_id.write('{0}\n'.format('    END_DATETIME=`date --utc +%s`'))
            file_id.write('{0}\n'.format(
                '    FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`'
            ))
            file_id.write('{0}\n'.format('    calculate_duration'))
            file_id.write('{0}\n'.format('    echo "$SEP"'))
            file_id.write('{0}\n'.format(
                '    echo "Script ended OK at $FORMATTED_END_DATETIME UTC with a run duration of $DURATION s ($FORMATTED_DURATION)."'
            ))
            file_id.write('{0}\n'.format('    echo "$SEP"'))
            file_id.write('{0}\n'.format('    RECIPIENT={0}'.format(
                xconfiguration.get_contact_data())))
            file_id.write('{0}\n'.format(
                '    SUBJECT="{0}: {1} process"'.format(
                    xlib.get_project_name(), xlib.get_cd_hit_est_name())))
            file_id.write('{0}\n'.format(
                '    MESSAGE="The {0} process in node $HOSTNAME of cluster {0} ended OK at $FORMATTED_END_DATETIME with a run duration of $DURATION s ($FORMATTED_DURATION). Please review its log.<br/><br/>Regards,<br/>GI Genetica, Fisiologia e Historia Forestal<br/>Dpto. Sistemas y Recursos Naturales<br/>ETSI Montes, Forestal y del Medio Natural<br/>Universidad Politecnica de Madrid<br/>https://github.com/ggfhf/"'
                .format(xlib.get_rsem_eval_name(), cluster_name)))
            file_id.write('{0}\n'.format(
                '    mail --append "Content-type: text/html;"  --subject "$SUBJECT" "$RECIPIENT" <<< "$MESSAGE"'
            ))
            file_id.write('{0}\n'.format('    exit 0'))
            file_id.write('{0}\n'.format('}'))
            file_id.write('{0}\n'.format(
                '#-------------------------------------------------------------------------------'
            ))
            file_id.write('{0}\n'.format('function manage_error'))
            file_id.write('{0}\n'.format('{'))
            file_id.write('{0}\n'.format('    END_DATETIME=`date --utc +%s`'))
            file_id.write('{0}\n'.format(
                '    FORMATTED_END_DATETIME=`date --date="@$END_DATETIME" "+%Y-%m-%d %H:%M:%S"`'
            ))
            file_id.write('{0}\n'.format('    calculate_duration'))
            file_id.write('{0}\n'.format('    echo "$SEP"'))
            file_id.write(
                '{0}\n'.format('    echo "ERROR: $1 returned error $2"'))
            file_id.write('{0}\n'.format(
                '    echo "Script ended WRONG at $FORMATTED_END_DATETIME UTC with a run duration of $DURATION s ($FORMATTED_DURATION)."'
            ))
            file_id.write('{0}\n'.format('    echo "$SEP"'))
            file_id.write('{0}\n'.format('    RECIPIENT={0}'.format(
                xconfiguration.get_contact_data())))
            file_id.write('{0}\n'.format(
                '    SUBJECT="{0}: {1} process"'.format(
                    xlib.get_project_name(), xlib.get_cd_hit_est_name())))
            file_id.write('{0}\n'.format(
                '    MESSAGE="The {0} process in node $HOSTNAME of cluster {0} ended WRONG at $FORMATTED_END_DATETIME with a run duration of $DURATION s ($FORMATTED_DURATION). Please review its log.<br/><br/>Regards,<br/>GI Genetica, Fisiologia e Historia Forestal<br/>Dpto. Sistemas y Recursos Naturales<br/>ETSI Montes, Forestal y del Medio Natural<br/>Universidad Politecnica de Madrid<br/>https://github.com/ggfhf/"'
                .format(xlib.get_rsem_eval_name(), cluster_name)))
            file_id.write('{0}\n'.format(
                '    mail --append "Content-type: text/html;"  --subject "$SUBJECT" "$RECIPIENT" <<< "$MESSAGE"'
            ))
            file_id.write('{0}\n'.format('    exit 3'))
            file_id.write('{0}\n'.format('}'))
            file_id.write('{0}\n'.format(
                '#-------------------------------------------------------------------------------'
            ))
            file_id.write('{0}\n'.format('function calculate_duration'))
            file_id.write('{0}\n'.format('{'))
            file_id.write('{0}\n'.format(
                '    DURATION=`expr $END_DATETIME - $INIT_DATETIME`'))
            file_id.write('{0}\n'.format('    HH=`expr $DURATION / 3600`'))
            file_id.write(
                '{0}\n'.format('    MM=`expr $DURATION % 3600 / 60`'))
            file_id.write('{0}\n'.format('    SS=`expr $DURATION % 60`'))
            file_id.write('{0}\n'.format(
                '    FORMATTED_DURATION=`printf "%03d:%02d:%02d\\n" $HH $MM $SS`'
            ))
            file_id.write('{0}\n'.format('}'))
            file_id.write('{0}\n'.format(
                '#-------------------------------------------------------------------------------'
            ))
            file_id.write('{0}\n'.format('init'))
            file_id.write('{0}\n'.format('run_cd_hit_est_process'))
            file_id.write('{0}\n'.format('end'))
    except:
        error_list.append('*** ERROR: The file {0} can not be created'.format(
            get_cd_hit_est_process_script()))
        OK = False

    # return the control variable and the error list
    return (OK, error_list)
Esempio n. 3
0
def validate_gmap_config_file(strict):
    '''
    Validate the GMAP config file of a run.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # intitialize variable used when value is not found
    not_found = '***NOTFOUND***'.upper()

    # get the option dictionary
    try:
        gmap_option_dict = xlib.get_option_dict(get_gmap_config_file())
    except:
        error_list.append('*** ERROR: The syntax is WRONG.')
        OK = False
    else:

        # get the sections list
        sections_list = []
        for section in gmap_option_dict.keys():
            sections_list.append(section)
        sections_list.sort()

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append('*** ERROR: the section "identification" is not found.')
            OK = False
        else:

            # check section "identification" - key "experiment_id"
            experiment_id = gmap_option_dict.get('identification', {}).get('experiment_id', not_found)
            is_experiment_id_OK = True
            if experiment_id == not_found:
                error_list.append('*** ERROR: the key "experiment_id" is not found in the section "identification".')
                is_experiment_id_OK = False
                OK = False

            # check section "identification" - key "reference_dataset_id"
            reference_dataset_id = gmap_option_dict.get('identification', {}).get('reference_dataset_id', not_found)
            is_reference_dataset_id_OK = True
            if reference_dataset_id == not_found:
                error_list.append('*** ERROR: the key "reference_dataset_id" is not found in the section "identification".')
                is_reference_dataset_id_OK = False
                OK = False

            # check section "identification" - key "reference_file"
            reference_file = gmap_option_dict.get('identification', {}).get('reference_file', not_found)
            is_reference_file_OK = True
            if reference_file == not_found:
                error_list.append('*** ERROR: the key "reference_file" is not found in the section "identification".')
                is_reference_file_OK = False
                OK = False

            # check section "identification" - key "assembly_software"
            assembly_software = gmap_option_dict.get('identification', {}).get('assembly_software', not_found)
            is_assembly_software_OK = True
            if assembly_software == not_found:
                error_list.append('*** ERROR: the key "assembly_software" is not found in the section "identification".')
                is_assembly_software_OK = False
                OK = False
            elif assembly_software not in [xlib.get_soapdenovotrans_code(), xlib.get_transabyss_code(), xlib.get_trinity_code(), xlib.get_star_code(), xlib.get_cd_hit_est_code(), xlib.get_transcript_filter_code()]:
                error_list.append('*** ERROR: the key "assembly_software" value in the section "identification" must be {0} or {1} or {2} or {3} or {4} OR {5}.'.format(xlib.get_soapdenovotrans_code(), xlib.get_transabyss_code(), xlib.get_trinity_code(), xlib.get_star_code(), xlib.get_cd_hit_est_code(), xlib.get_transcript_filter_code()))
                is_assembly_software_OK = False
                OK = False

            # check section "identification" - key "assembly_dataset_id"
            assembly_dataset_id = gmap_option_dict.get('identification', {}).get('assembly_dataset_id', not_found)
            is_assembly_dataset_id_OK = True
            if assembly_dataset_id == not_found:
                error_list.append('*** ERROR: the key "assembly_dataset_id" is not found in the section "identification".')
                is_assembly_dataset_id_OK = False
                OK = False
            elif not assembly_dataset_id.startswith(xlib.get_soapdenovotrans_code()) and not assembly_dataset_id.startswith(xlib.get_transabyss_code()) and not assembly_dataset_id.startswith(xlib.get_trinity_code()) and not assembly_dataset_id.startswith(xlib.get_star_code()) and not assembly_dataset_id.startswith(xlib.get_cd_hit_est_code()) and not assembly_dataset_id.startswith(xlib.get_transcript_filter_code()):
                error_list.append('*** ERROR: the key "assembly_dataset_id" value is not a {0} nor {1} nor {2} nor {3} nor {4} nor {5} assembly.'.format(xlib.get_soapdenovotrans_name(), xlib.get_transabyss_name(), xlib.get_trinity_name(), xlib.get_star_name(), xlib.get_cd_hit_est_name(), xlib.get_transcript_filter_code()))
                is_assembly_dataset_id_OK = False
                OK = False

            # check section "identification" - key "assembly_type"
            assembly_type = gmap_option_dict.get('identification', {}).get('assembly_type', not_found)
            is_assembly_type_OK = True
            if assembly_type == not_found:
                error_list.append('*** ERROR: the key "assembly_type" is not found in the section "identification".')
                is_assembly_type_OK = False
                OK = False
            elif assembly_dataset_id.startswith(xlib.get_soapdenovotrans_code()):
                if assembly_type.upper() not in ['CONTIGS', 'SCAFFOLDS']:
                    error_list.append('*** ERROR: the key "assembly_type" must be "CONTIGS" or "SCAFFOLDS" when {0} is the assembly software.'.format(xlib.get_soapdenovotrans_name()))
                    is_assembly_type_OK = False
                    OK = False
            elif assembly_dataset_id.startswith(xlib.get_transabyss_code()) or assembly_dataset_id.startswith(xlib.get_trinity_code()) or assembly_dataset_id.startswith(xlib.get_star_code()) or assembly_dataset_id.startswith(xlib.get_cd_hit_est_code()) or assembly_dataset_id.startswith(xlib.get_transcript_filter_code()):
                if assembly_type.upper() != 'NONE':
                    error_list.append('*** ERROR: the key "assembly_type" must be "NONE" when {0} or {1} or {2} or {3} or {4} is the assembly software.'.format(xlib.get_transabyss_name(), xlib.get_trinity_name(), xlib.get_star_name(), xlib.get_cd_hit_est_name(), xlib.get_transcript_filter_name()))
                    is_assembly_type_OK = False
                    OK = False

        # check section "GMAP parameters"
        if 'GMAP parameters' not in sections_list:
            error_list.append('*** ERROR: the section "GMAP parameters" is not found.')
            OK = False
        else:

            # check section "GMAP parameters" - key "threads"
            threads = gmap_option_dict.get('GMAP parameters', {}).get('threads', not_found)
            is_threads_OK = True
            if threads == not_found:
                error_list.append('*** ERROR: the key "threads" is not found in the section "GMAP parameters".')
                is_threads_OK = False
                OK = False
            else:
                try:
                    if int(threads) < 1:
                        error_list.append('*** ERROR: the key "threads" in the section "GMAP parameters" must be an integer value greater or equal to 1.')
                        is_threads_OK = False
                        OK = False
                except:
                    error_list.append('*** ERROR: the key "threads" in the section "GMAP parameters" must be an integer value greater or equal to 1.')
                    is_threads_OK = False
                    OK = False

            # check section "GMAP parameters" - key "kmer"
            kmer = gmap_option_dict.get('GMAP parameters', {}).get('kmer', not_found)
            is_kmer_OK = True
            if kmer == not_found:
                error_list.append('*** ERROR: the key "kmer" is not found in the section "GMAP parameters".')
                is_kmer_OK = False
                OK = False
            else:
                try:
                    if kmer.upper() != 'NONE' and (int(kmer) < 1 or int(kmer) > 16):
                        error_list.append('*** ERROR: the key "kmer" in the section "GMAP parameters" must be an integer value between 1 and 16 or NONE.')
                        is_kmer_OK = False
                        OK = False
                except:
                    error_list.append('*** ERROR: the key "kmer" in the section "GMAP parameters" must be an integer value between 1 and 16 or NONE.')
                    is_kmer_OK = False
                    OK = False

            # check section "GMAP parameters" - key "sampling"
            sampling = gmap_option_dict.get('GMAP parameters', {}).get('sampling', not_found)
            is_sampling_OK = True
            if sampling == not_found:
                error_list.append('*** ERROR: the key "sampling" is not found in the section "GMAP parameters".')
                is_sampling_OK = False
                OK = False
            else:
                try:
                    if sampling.upper() != 'NONE' and int(sampling) < 1:
                        error_list.append('*** ERROR: the key "sampling" in the section "GMAP parameters" must be an integer value greater or equal to 1 or NONE.')
                        is_sampling_OK = False
                        OK = False
                except:
                    error_list.append('*** ERROR: the key "sampling" in the section "GMAP parameters" must be an integer value greater or equal to 1 or NONE.')
                    is_sampling_OK = False
                    OK = False

            # check section "GMAP parameters" - key "input-buffer-size"
            input_buffer_size = gmap_option_dict.get('GMAP parameters', {}).get('input-buffer-size', not_found)
            is_input_buffer_size_OK = True
            if input_buffer_size == not_found:
                error_list.append('*** ERROR: the key "input-buffer-size" is not found in the section "GMAP parameters".')
                is_input_buffer_size_OK = False
                OK = False
            else:
                try:
                    if int(input_buffer_size) < 1:
                        error_list.append('*** ERROR: the key "input-buffer-size" in the section "GMAP parameters" must be an integer value greater or equal to 1.')
                        is_input_buffer_size_OK = False
                        OK = False
                except:
                    error_list.append('*** ERROR: the key "input-buffer-size" in the section "GMAP parameters" must be an integer value greater or equal to 1.')
                    is_input_buffer_size_OK = False
                    OK = False

            # check section "GMAP parameters" - key "output-buffer-size"
            output_buffer_size = gmap_option_dict.get('GMAP parameters', {}).get('output-buffer-size', not_found)
            is_output_buffer_size_OK = True
            if output_buffer_size == not_found:
                error_list.append('*** ERROR: the key "output-buffer-size" is not found in the section "GMAP parameters".')
                is_output_buffer_size_OK = False
                OK = False
            else:
                try:
                    if int(output_buffer_size) < 1:
                        error_list.append('*** ERROR: the key "output-buffer-size" in the section "GMAP parameters" must be an integer value greater or equal to 1.')
                        is_output_buffer_size_OK = False
                        OK = False
                except:
                    error_list.append('*** ERROR: the key "output-buffer-size" in the section "GMAP parameters" must be an integer value greater or equal to 1.')
                    is_output_buffer_size_OK = False
                    OK = False

            # check section "GMAP parameters" - key "prunelevel"
            prunelevel = gmap_option_dict.get('GMAP parameters', {}).get('prunelevel', not_found)
            is_prunelevel_OK = True
            if prunelevel == not_found:
                error_list.append('*** ERROR: the key "prunelevel" is not found in the section "GMAP parameters".')
                is_prunelevel_OK = False
                OK = False
            else:
                if prunelevel not in ['0', '1', '2', '3']:
                    error_list.append('*** ERROR: the key "prunelevel" in the section "GMAP parameters" must be 0 (no pruning) or 1 (poor seqs) or 2 (repetitive seqs) or 3 (poor and repetitive).')
                    is_prunelevel_OK = False
                    OK = False

            # check section "GMAP parameters" - key "format"
            format = gmap_option_dict.get('GMAP parameters', {}).get('format', not_found)
            is_format_OK = True
            if format == not_found:
                error_list.append('*** ERROR: the key "format" is not found in the section "GMAP parameters".')
                is_format_OK = False
                OK = False
            else:
                if format.upper() not in ['COMPRESS', 'SUMMARY', 'ALIGN', 'PLS', 'GFF3_GENE', 'SPLICESITES', 'INTRONS', 'MAP_EXONS', 'MAP_RANGES', 'COORDS']:
                    error_list.append('*** ERROR: the key "format" in the section "GMAP parameters" must be COMPRESS or SUMMARY or ALIGN or PLS or GFF3_GENE or SPLICESITES or INTRONS or MAP_EXONS or MAP_RANGES or COORDS.')
                    is_format_OK = False
                    OK = False

            # check section "GMAP parameters" - key "other_parameters"
            not_allowed_parameters_list = ['nthreads', 'kmer', 'sampling', 'input-buffer-size', 'output-buffer-size', 'prunelevel', 'compress', 'summary', 'align', 'format' ]
            other_parameters = gmap_option_dict.get('GMAP parameters', {}).get('other_parameters', not_found)
            is_other_parameters_OK = True
            if other_parameters == not_found:
                error_list.append('*** ERROR: the key "other_parameters" is not found in the section "GMAP parameters".')
                is_other_parameters_OK = False
                OK = False
            else:
                if other_parameters.upper() != 'NONE':
                    parameter_list = [x.strip() for x in other_parameters.split(';')]
                    for parameter in parameter_list:
                        try:
                            if parameter.find('=') > 0:
                                pattern = r'^--(.+)=(.+)$'
                                mo = re.search(pattern, parameter)
                                parameter_name = mo.group(1).strip()
                                parameter_value = mo.group(2).strip()
                            else:
                                pattern = r'^--(.+)$'
                                mo = re.search(pattern, parameter)
                                parameter_name = mo.group(1).strip()
                        except:
                            error_list.append('*** ERROR: the value of the key "other_parameters" in the section "GMAP parameters" must be NONE or a valid parameter list.')
                            is_other_parameters_OK = False
                            OK = False
                            break
                        else:
                            if parameter_name in not_allowed_parameters_list:
                                error_list.append('*** ERROR: the parameter {0} is not allowed in the key "other_parameters" of the section "GMAP parameters" because it is controled by NGScloud.'.format(parameter_name))
                                is_other_parameters_OK = False
                                OK = False

    # warn that the results config file is not valid if there are any errors
    if not OK:
        error_list.append('\nThe {0} config file is not valid. Please, correct this file or recreate it.'.format(xlib.get_gmap_name()))

    # return the control variable and the error list
    return (OK, error_list)
Esempio n. 4
0
def create_cd_hit_est_config_file(experiment_id='exp001',
                                  assembly_dataset_id='sdnt-170101-235959',
                                  assembly_type='CONTIGS'):
    '''
    Create CD-HIT-EST config file with the default options. It is necessary
    update the options in each run.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # set the assembly software
    if assembly_dataset_id.startswith(xlib.get_soapdenovotrans_code()):
        assembly_software = xlib.get_soapdenovotrans_code()
    elif assembly_dataset_id.startswith(xlib.get_transabyss_code()):
        assembly_software = xlib.get_transabyss_code()
    elif assembly_dataset_id.startswith(xlib.get_trinity_code()):
        assembly_software = xlib.get_trinity_code()
    elif assembly_dataset_id.startswith(xlib.get_star_code()):
        assembly_software = xlib.get_star_code()
    elif assembly_dataset_id.startswith(xlib.get_cd_hit_est_code()):
        assembly_software = xlib.get_cd_hit_est_code()
    elif assembly_dataset_id.startswith(xlib.get_transcript_filter_code()):
        assembly_software = xlib.get_transcript_filter_code()

    # create the CD-HIT-EST config file and write the default options
    try:
        if not os.path.exists(os.path.dirname(get_cd_hit_est_config_file())):
            os.makedirs(os.path.dirname(get_cd_hit_est_config_file()))
        with open(get_cd_hit_est_config_file(), mode='w',
                  encoding='utf8') as file_id:
            file_id.write('{0}\n'.format(
                '# You must review the information of this file and update the values with the corresponding ones to the current run.'
            ))
            file_id.write('{0}\n'.format('#'))
            file_id.write('{0}\n'.format(
                '# The assembly files must be located in the cluster directory {0}/experiment_id/assembly_dataset_id'
                .format(xlib.get_cluster_result_dir())))
            file_id.write('{0}\n'.format(
                '# The experiment_id and assembly_dataset_id names are fixed in the identification section.'
            ))
            file_id.write('{0}\n'.format('#'))
            file_id.write('{0}\n'.format(
                '# You can consult the parameters of CD-HIT-EST (CD-HIT package) and their meaning in http://weizhong-lab.ucsd.edu/cd-hit/.'
            ))
            file_id.write('{0}\n'.format('#'))
            file_id.write('{0}\n'.format(
                '# In section "CD-HIT-EST parameters", the key "other_parameters" allows you to input additional parameters in the format:'
            ))
            file_id.write('{0}\n'.format('#'))
            file_id.write('{0}\n'.format(
                '#    other_parameters = --parameter-1[=value-1][; --parameter-2[=value-2][; ...; --parameter-n[=value-n]]]'
            ))
            file_id.write('{0}\n'.format('#'))
            file_id.write('{0}\n'.format(
                '# parameter-i is a parameter name of CD-HIT-EST and value-i a valid value of parameter-i, e.g.'
            ))
            file_id.write('{0}\n'.format('#'))
            file_id.write(
                '{0}\n'.format('#    other_parameters = --aS=0.9; --U=10'))
            file_id.write('{0}\n'.format(''))
            file_id.write('{0}\n'.format(
                '# This section has the information identifies the assembly result dataset.'
            ))
            file_id.write('{0}\n'.format('[identification]'))
            file_id.write('{0:<50} {1}\n'.format(
                'experiment_id = {0}'.format(experiment_id),
                '# experiment identification'))
            file_id.write('{0:<50} {1}\n'.format(
                'assembly_software = {0}'.format(assembly_software),
                '# assembly software: {0} ({1}) or {2} ({3}) or {4} ({5}) or {6} ({7}) or {8} ({9}) or {10} ({11})'
                .format(xlib.get_soapdenovotrans_code(),
                        xlib.get_soapdenovotrans_name(),
                        xlib.get_transabyss_code(), xlib.get_transabyss_name(),
                        xlib.get_trinity_code(), xlib.get_trinity_name(),
                        xlib.get_star_code(), xlib.get_star_name(),
                        xlib.get_cd_hit_est_code(), xlib.get_cd_hit_est_name(),
                        xlib.get_transcript_filter_code(),
                        xlib.get_transcript_filter_name())))
            file_id.write('{0:<50} {1}\n'.format(
                'assembly_dataset_id = {0}'.format(assembly_dataset_id),
                '# assembly dataset identification'))
            file_id.write('{0:<50} {1}\n'.format(
                'assembly_type = {0}'.format(assembly_type),
                '# CONTIGS or SCAFFOLDS in {0}; NONE in {1}, {2}, {3}, {4} and {5}'
                .format(xlib.get_soapdenovotrans_name(),
                        xlib.get_transabyss_name(), xlib.get_trinity_name(),
                        xlib.get_star_name(), xlib.get_cd_hit_est_name(),
                        xlib.get_transcript_filter_name())))
            file_id.write('{0}\n'.format(''))
            file_id.write('{0}\n'.format(
                '# This section has the information to set the CD-HIT-EST parameters'
            ))
            file_id.write('{0}\n'.format('[CD-HIT-EST parameters]'))
            file_id.write('{0:<50} {1}\n'.format(
                'threads = 2',
                '# number of threads for use; with 0, all CPUs will be used'))
            file_id.write('{0:<50} {1}\n'.format(
                'memory_limit = 800',
                '# memory limit (in MB) for the program; 0 for unlimitted'))
            file_id.write('{0:<50} {1}\n'.format(
                'seq_identity_threshold = 0.9',
                '# sequence identity threshold'))
            file_id.write('{0:<50} {1}\n'.format('word_length = 5',
                                                 '# word length'))
            file_id.write('{0:<50} {1}\n'.format(
                'mask = NX',
                '# masking letters (e.g. -mask NX, to mask out both "N" and "X")'
            ))
            file_id.write('{0:<50} {1}\n'.format(
                'match = 2', '# matching score (1 for T-U and N-N)'))
            file_id.write('{0:<50} {1}\n'.format('mismatch = -2',
                                                 '# mismatching score'))
            file_id.write('{0:<50} {1}\n'.format(
                'other_parameters = NONE',
                '# additional parameters to the previous ones or NONE'))
    except:
        error_list.append(
            '*** ERROR: The file {0} can not be recreated'.format(
                get_cd_hit_est_config_file()))
        OK = False

    # return the control variable and the error list
    return (OK, error_list)
Esempio n. 5
0
def validate_cd_hit_est_config_file(strict):
    '''
    Validate the CD-HIT-EST config file of a run.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # intitialize variable used when value is not found
    not_found = '***NOTFOUND***'.upper()

    # get the option dictionary
    try:
        cd_hit_est_option_dict = xlib.get_option_dict(
            get_cd_hit_est_config_file())
    except:
        error_list.append('*** ERROR: The syntax is WRONG.')
        OK = False
    else:

        # get the sections list
        sections_list = []
        for section in cd_hit_est_option_dict.keys():
            sections_list.append(section)
        sections_list.sort()

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append(
                '*** ERROR: the section "identification" is not found.')
            OK = False
        else:

            # check section "identification" - key "experiment_id"
            experiment_id = cd_hit_est_option_dict.get(
                'identification', {}).get('experiment_id', not_found)
            if experiment_id == not_found:
                error_list.append(
                    '*** ERROR: the key "experiment_id" is not found in the section "identification".'
                )
                OK = False

            # check section "identification" - key "assembly_software"
            assembly_software = cd_hit_est_option_dict.get(
                'identification', {}).get('assembly_software', not_found)
            if assembly_software == not_found:
                error_list.append(
                    '*** ERROR: the key "assembly_software" is not found in the section "identification".'
                )
                OK = False
            elif assembly_software not in [
                    xlib.get_soapdenovotrans_code(),
                    xlib.get_transabyss_code(),
                    xlib.get_trinity_code(),
                    xlib.get_star_code(),
                    xlib.get_cd_hit_est_code(),
                    xlib.get_transcript_filter_code()
            ]:
                error_list.append(
                    '*** ERROR: the key "assembly_software" value in the section "identification" must be {0} or {1} or {2} or {3} or {4} OR {5}.'
                    .format(xlib.get_soapdenovotrans_code(),
                            xlib.get_transabyss_code(),
                            xlib.get_trinity_code(), xlib.get_star_code(),
                            xlib.get_cd_hit_est_code(),
                            xlib.get_transcript_filter_code()))
                OK = False

            # check section "identification" - key "assembly_dataset_id"
            assembly_dataset_id = cd_hit_est_option_dict.get(
                'identification', {}).get('assembly_dataset_id', not_found)
            if assembly_dataset_id == not_found:
                error_list.append(
                    '*** ERROR: the key "assembly_dataset_id" is not found in the section "identification".'
                )
                OK = False
            elif not assembly_dataset_id.startswith(
                    xlib.get_soapdenovotrans_code()
            ) and not assembly_dataset_id.startswith(xlib.get_transabyss_code(
            )) and not assembly_dataset_id.startswith(xlib.get_trinity_code(
            )) and not assembly_dataset_id.startswith(xlib.get_star_code(
            )) and not assembly_dataset_id.startswith(xlib.get_cd_hit_est_code(
            )) and not assembly_dataset_id.startswith(
                    xlib.get_transcript_filter_code()):
                error_list.append(
                    '*** ERROR: the key "assembly_dataset_id" value is not a {0} nor {1} nor {2} nor {3} nor {4} nor {5} assembly.'
                    .format(xlib.get_soapdenovotrans_name(),
                            xlib.get_transabyss_name(),
                            xlib.get_trinity_name(), xlib.get_star_name(),
                            xlib.get_cd_hit_est_name(),
                            xlib.get_transcript_filter_code()))
                OK = False

            # check section "identification" - key "assembly_type"
            assembly_type = cd_hit_est_option_dict.get(
                'identification', {}).get('assembly_type', not_found)
            if assembly_type == not_found:
                error_list.append(
                    '*** ERROR: the key "assembly_type" is not found in the section "identification".'
                )
                OK = False
            elif assembly_dataset_id.startswith(
                    xlib.get_soapdenovotrans_code()):
                if assembly_type.upper() not in ['CONTIGS', 'SCAFFOLDS']:
                    error_list.append(
                        '*** ERROR: the key "assembly_type" must be "CONTIGS" or "SCAFFOLDS" when {0} is the assembly software.'
                        .format(xlib.get_soapdenovotrans_name()))
                    OK = False
            elif assembly_dataset_id.startswith(xlib.get_transabyss_code(
            )) or assembly_dataset_id.startswith(xlib.get_trinity_code(
            )) or assembly_dataset_id.startswith(
                    xlib.get_star_code()) or assembly_dataset_id.startswith(
                        xlib.get_cd_hit_est_code(
                        )) or assembly_dataset_id.startswith(
                            xlib.get_transcript_filter_code()):
                if assembly_type.upper() != 'NONE':
                    error_list.append(
                        '*** ERROR: the key "assembly_type" must be "NONE" when {0} or {1} or {2} or {3} or {4} is the assembly software.'
                        .format(xlib.get_transabyss_name(),
                                xlib.get_trinity_name(), xlib.get_star_name(),
                                xlib.get_cd_hit_est_name(),
                                xlib.get_transcript_filter_name()))
                    OK = False

        # check section "CD-HIT-EST parameters"
        if 'CD-HIT-EST parameters' not in sections_list:
            error_list.append(
                '*** ERROR: the section "CD-HIT-EST parameters" is not found.')
            OK = False
        else:

            # check section "CD-HIT-EST parameters" - key "threads"
            threads = cd_hit_est_option_dict.get('CD-HIT-EST parameters',
                                                 {}).get('threads', not_found)
            if threads == not_found:
                error_list.append(
                    '*** ERROR: the key "threads" is not found in the section "CD-HIT-EST parameters".'
                )
                OK = False
            else:
                try:
                    if int(threads) < 0:
                        error_list.append(
                            '*** ERROR: the key "threads" in the section "CD-HIT-EST parameters" must be an integer value greater or equal to 0.'
                        )
                        OK = False
                except:
                    error_list.append(
                        '*** ERROR: the key "threads" in the section "CD-HIT-EST parameters" must be an integer value greater or equal to 0.'
                    )
                    OK = False

            # check section "CD-HIT-EST parameters" - key "memory_limit"
            memory_limit = cd_hit_est_option_dict.get(
                'CD-HIT-EST parameters', {}).get('memory_limit', not_found)
            if memory_limit == not_found:
                error_list.append(
                    '*** ERROR: the key "memory_limit" is not found in the section "CD-HIT-EST parameters".'
                )
                OK = False
            else:
                try:
                    if int(memory_limit) < 0:
                        error_list.append(
                            '*** ERROR: the key "memory_limit" in the section "CD-HIT-EST parameters" must be an integer value greater or equal to 0.'
                        )
                        OK = False
                except:
                    error_list.append(
                        '*** ERROR: the key "memory_limit" in the section "CD-HIT-EST parameters" must be an integer value greater or equal to 0.'
                    )
                    OK = False

            # check section "CD-HIT-EST parameters" - key "seq_identity_threshold"
            seq_identity_threshold = cd_hit_est_option_dict.get(
                'CD-HIT-EST parameters', {}).get('seq_identity_threshold',
                                                 not_found)
            if seq_identity_threshold == not_found:
                error_list.append(
                    '*** ERROR: the key "seq_identity_threshold" is not found in the section "CD-HIT-EST parameters".'
                )
                OK = False
            else:
                try:
                    if float(seq_identity_threshold) < 0.0 or float(
                            seq_identity_threshold) > 1.0:
                        error_list.append(
                            '*** ERROR: the key "seq_identity_threshold" in the section "CD-HIT-EST parameters" must be a float value between 0.0 and 1.0.'
                        )
                        OK = False
                except:
                    error_list.append(
                        '*** ERROR: the key "seq_identity_threshold" in the section "CD-HIT-EST parameters" must be a float value between 0.0 and 1.0.'
                    )
                    OK = False

            # check section "CD-HIT-EST parameters" - key "word_length"
            word_length = cd_hit_est_option_dict.get(
                'CD-HIT-EST parameters', {}).get('word_length', not_found)
            if word_length == not_found:
                error_list.append(
                    '*** ERROR: the key "word_length" is not found in the section "CD-HIT-EST parameters".'
                )
                OK = False
            else:
                try:
                    if int(word_length) < 1:
                        error_list.append(
                            '*** ERROR: the key "word_length" in the section "CD-HIT-EST parameters" must be an integer value greater or equal to 1.'
                        )
                        OK = False
                except:
                    error_list.append(
                        '*** ERROR: the key "word_length" in the section "CD-HIT-EST parameters" must be an integer value greater or equal to 1.'
                    )
                    OK = False

            # check section "CD-HIT-EST parameters" - key "mask"
            mask = cd_hit_est_option_dict.get('CD-HIT-EST parameters',
                                              {}).get('mask',
                                                      not_found).upper()
            if mask == not_found:
                error_list.append(
                    '*** ERROR: the key "mask" is not found in the section "CD-HIT-EST parameters".'
                )
                OK = False

            # check section "CD-HIT-EST parameters" - key "match"
            match = cd_hit_est_option_dict.get('CD-HIT-EST parameters',
                                               {}).get('match', not_found)
            if match == not_found:
                error_list.append(
                    '*** ERROR: the key "match" is not found in the section "CD-HIT-EST parameters".'
                )
                OK = False
            else:
                try:
                    int(match)
                except:
                    error_list.append(
                        '*** ERROR: the key "match" in the section "CD-HIT-EST parameters" must be an integer value.'
                    )
                    OK = False

            # check section "CD-HIT-EST parameters" - key "mismatch"
            mismatch = cd_hit_est_option_dict.get('CD-HIT-EST parameters',
                                                  {}).get(
                                                      'mismatch', not_found)
            if mismatch == not_found:
                error_list.append(
                    '*** ERROR: the key "mismatch" is not found in the section "CD-HIT-EST parameters".'
                )
                OK = False
            else:
                try:
                    int(mismatch)
                except:
                    error_list.append(
                        '*** ERROR: the key "match" in the section "CD-HIT-EST parameters" must be an integer value.'
                    )
                    OK = False

            # check section "CD-HIT-EST parameters" - key "other_parameters"
            not_allowed_parameters_list = [
                'T', 'M', 'c', 'n', 'mask', 'match', 'mismatch'
            ]
            other_parameters = cd_hit_est_option_dict.get(
                'CD-HIT-EST parameters', {}).get('other_parameters', not_found)
            if other_parameters == not_found:
                error_list.append(
                    '*** ERROR: the key "other_parameters" is not found in the section "CD-HIT-EST parameters".'
                )
                OK = False
            else:
                if other_parameters.upper() != 'NONE':
                    parameter_list = [
                        x.strip() for x in other_parameters.split(';')
                    ]
                    for parameter in parameter_list:
                        try:
                            if parameter.find('=') > 0:
                                pattern = r'^--(.+)=(.+)$'
                                mo = re.search(pattern, parameter)
                                parameter_name = mo.group(1).strip()
                                parameter_value = mo.group(2).strip()
                            else:
                                pattern = r'^--(.+)$'
                                mo = re.search(pattern, parameter)
                                parameter_name = mo.group(1).strip()
                        except:
                            error_list.append(
                                '*** ERROR: the value of the key "other_parameters" in the section "CD-HIT-EST parameters" must be NONE or a valid parameter list.'
                            )
                            OK = False
                            break
                        if parameter_name in not_allowed_parameters_list:
                            error_list.append(
                                '*** ERROR: the parameter {0} is not allowed in the key "other_parameters" of the section "CD-HIT-EST parameters" because it is controled by {1}.'
                                .format(parameter_name,
                                        xlib.get_project_name()))
                            OK = False

    # warn that the results config file is not valid if there are any errors
    if not OK:
        error_list.append(
            '\nThe {0} config file is not valid. Please, correct this file or recreate it.'
            .format(xlib.get_cd_hit_est_name()))

    # return the control variable and the error list
    return (OK, error_list)
Esempio n. 6
0
def get_result_dataset_dict(cluster_name, experiment_id, status, passed_connection, ssh_client):
    '''
    Get a dictionary with the result datasets of an experiment in the cluster.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # get the result directory in the cluster
    cluster_result_dir = xlib.get_cluster_result_dir()

    # initialize the dictionary of the result datasets
    result_dataset_dict = {}

    # create the SSH client connection
    if not passed_connection:
        (OK, error_list, ssh_client) = xssh.create_ssh_client_connection(cluster_name, 'master')

    # verify the result directory is created
    if OK:
        command = '[ -d {0} ] && echo RC=0 || echo RC=1'.format(cluster_result_dir)
        (OK, stdout, stderr) = xssh.execute_cluster_command(ssh_client, command)
        if stdout[len(stdout) - 1] != 'RC=0':
            error_list.append('*** ERROR: There is not any volume mounted in the result directory.\n')
            error_list.append('You must link a volume in the mounting point {0} for the template {1}.\n'.format(cluster_result_dir, cluster_name))
            OK = False

    # get the dictionary of the result datasets
    if OK:
        if status == 'uncompressed':
            command = 'cd  {0}/{1}; for list in `ls`; do ls -ld $list | grep -v ^- > /dev/null && echo $list; done;'.format(cluster_result_dir, experiment_id)
        elif status == 'compressed':
            command = 'cd {0}/{1}; for list in `ls`; do ls -ld $list | grep -v ^d > /dev/null && echo $list; done;'.format(cluster_result_dir, experiment_id)
        (OK, stdout, stderr) = xssh.execute_cluster_command(ssh_client, command)
        if OK:
            if status == 'uncompressed':
                input_pattern = '{0}-(.+)-(.+)'
                output_pattern = '{0} ({1} {2})'
            elif status == 'compressed':
                input_pattern = '{0}-(.+)-(.+).tar.gz'
                output_pattern = '{0} ({1} {2}) [compressed]'
            for line in stdout:
                line = line.rstrip('\n')
                if line != 'lost+found':
                    result_dataset_id = line
                    if result_dataset_id.startswith(xlib.get_cd_hit_est_code()+'-'):
                        mo = re.match(input_pattern.format(xlib.get_cd_hit_est_code()), result_dataset_id)
                        date = mo.group(1)
                        time = mo.group(2)
                        result_dataset_name = output_pattern.format(xlib.get_cd_hit_est_name(), date, time)
                    elif result_dataset_id.startswith(xlib.get_fastqc_code()+'-'):
                        mo = re.match(input_pattern.format(xlib.get_fastqc_code()), result_dataset_id)
                        date = mo.group(1)
                        time = mo.group(2)
                        result_dataset_name = output_pattern.format(xlib.get_fastqc_name(), date, time)
                    elif result_dataset_id.startswith(xlib.get_gzip_code()+'-'):
                        mo = re.match(input_pattern.format(xlib.get_gzip_code()), result_dataset_id)
                        date = mo.group(1)
                        time = mo.group(2)
                        result_dataset_name = output_pattern.format(xlib.get_gzip_name(), date, time)
                    elif result_dataset_id.startswith(xlib.get_insilico_read_normalization_code()+'-'):
                        mo = re.match(input_pattern.format(xlib.get_insilico_read_normalization_code()), result_dataset_id)
                        date = mo.group(1)
                        time = mo.group(2)
                        result_dataset_name = output_pattern.format(xlib.get_insilico_read_normalization_name(), date, time)
                    elif result_dataset_id.startswith(xlib.get_quast_code()+'-'):
                        mo = re.match(input_pattern.format(xlib.get_quast_code()), result_dataset_id)
                        date = mo.group(1)
                        time = mo.group(2)
                        result_dataset_name = output_pattern.format(xlib.get_quast_name(), date, time)
                    elif result_dataset_id.startswith(xlib.get_ref_eval_code()+'-'):
                        mo = re.match(input_pattern.format(xlib.get_ref_eval_code()), result_dataset_id)
                        date = mo.group(1)
                        time = mo.group(2)
                        result_dataset_name = output_pattern.format(xlib.get_ref_eval_name(), date, time)
                    elif result_dataset_id.startswith(xlib.get_rnaquast_code()+'-'):
                        mo = re.match(input_pattern.format(xlib.get_rnaquast_code()), result_dataset_id)
                        date = mo.group(1)
                        time = mo.group(2)
                        result_dataset_name = output_pattern.format(xlib.get_rnaquast_name(), date, time)
                    elif result_dataset_id.startswith(xlib.get_rsem_eval_code()+'-'):
                        mo = re.match(input_pattern.format(xlib.get_rsem_eval_code()), result_dataset_id)
                        date = mo.group(1)
                        time = mo.group(2)
                        result_dataset_name = output_pattern.format(xlib.get_rsem_eval_name(), date, time)
                    elif result_dataset_id.startswith(xlib.get_soapdenovotrans_code()+'-'):
                        mo = re.match(input_pattern.format(xlib.get_soapdenovotrans_code()), result_dataset_id)
                        date = mo.group(1)
                        time = mo.group(2)
                        result_dataset_name = output_pattern.format(xlib.get_soapdenovotrans_name(), date, time)
                    elif result_dataset_id.startswith(xlib.get_star_code()+'-'):
                        mo = re.match(input_pattern.format(xlib.get_star_code()), result_dataset_id)
                        date = mo.group(1)
                        time = mo.group(2)
                        result_dataset_name = output_pattern.format(xlib.get_star_name(), date, time)
                    elif result_dataset_id.startswith(xlib.get_transabyss_code()+'-'):
                        mo = re.match(input_pattern.format(xlib.get_transabyss_code()), result_dataset_id)
                        date = mo.group(1)
                        time = mo.group(2)
                        result_dataset_name = output_pattern.format(xlib.get_transabyss_name(), date, time)
                    elif result_dataset_id.startswith(xlib.get_transcript_filter_code()+'-'):
                        mo = re.match(input_pattern.format(xlib.get_transcript_filter_code()), result_dataset_id)
                        date = mo.group(1)
                        time = mo.group(2)
                        result_dataset_name = output_pattern.format(xlib.get_transcript_filter_name(), date, time)
                    elif result_dataset_id.startswith(xlib.get_transcriptome_blastx_code()+'-'):
                        mo = re.match(input_pattern.format(xlib.get_transcriptome_blastx_code()), result_dataset_id)
                        date = mo.group(1)
                        time = mo.group(2)
                        result_dataset_name = output_pattern.format(xlib.get_transcriptome_blastx_name(), date, time)
                    elif result_dataset_id.startswith(xlib.get_transrate_code()+'-'):
                        mo = re.match(input_pattern.format(xlib.get_transrate_code()), result_dataset_id)
                        date = mo.group(1)
                        time = mo.group(2)
                        result_dataset_name = output_pattern.format(xlib.get_transrate_name(), date, time)
                    elif result_dataset_id.startswith(xlib.get_trimmomatic_code()+'-'):
                        mo = re.match(input_pattern.format(xlib.get_trimmomatic_code()), result_dataset_id)
                        date = mo.group(1)
                        time = mo.group(2)
                        result_dataset_name = output_pattern.format(xlib.get_trimmomatic_name(), date, time)
                    elif result_dataset_id.startswith(xlib.get_trinity_code()+'-'):
                        mo = re.match(input_pattern.format(xlib.get_trinity_code()), result_dataset_id)
                        date = mo.group(1)
                        time = mo.group(2)
                        result_dataset_name = output_pattern.format(xlib.get_trinity_name(), date, time)
                    else:
                        result_dataset_name = result_dataset_id
                    result_dataset_dict[result_dataset_id] = {'result_dataset_id': result_dataset_id, 'result_dataset_name': result_dataset_name}

    # close the SSH client connection
    if OK and not passed_connection:
        xssh.close_ssh_client_connection(ssh_client)

    # return the control variable, error list and dictionary of the result datasets
    return (OK, error_list, result_dataset_dict)
Esempio n. 7
0
def run_cd_hit_est_process(cluster_name, log, function=None):
    '''
    Run a CD-HIT-EST process.
    '''

    # initialize the control variable
    OK = True

    # get the CD-HIT-EST option dictionary
    cd_hit_est_option_dict = xlib.get_option_dict(get_cd_hit_est_config_file())

    # get the experiment identification
    experiment_id = cd_hit_est_option_dict['identification']['experiment_id']

    # warn that the log window must not be closed
    if not isinstance(log, xlib.DevStdOut):
        log.write(
            'This process might take several minutes. Do not close this window, please wait!\n'
        )

    # validate the CD-HIT-EST config file
    log.write('{0}\n'.format(xlib.get_separator()))
    log.write('Validating the {0} config file ...\n'.format(
        xlib.get_cd_hit_est_name()))
    (OK, error_list) = validate_cd_hit_est_config_file(strict=True)
    if OK:
        log.write('The config file is OK.\n')
    else:
        log.write('*** ERROR: The config file is not valid.\n')
        log.write('Please correct this file or recreate the config files.\n')

    # create the SSH client connection
    if OK:
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('Connecting the SSH client ...\n')
        (OK, error_list, ssh_client) = xssh.create_ssh_client_connection(
            cluster_name, 'master')
        if OK:
            log.write('The SSH client is connected.\n')
        else:
            for error in error_list:
                log.write('{0}\n'.format(error))

    # create the SSH transport connection
    if OK:
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('Connecting the SSH transport ...\n')
        (OK, error_list, ssh_transport) = xssh.create_ssh_transport_connection(
            cluster_name, 'master')
        if OK:
            log.write('The SSH transport is connected.\n')
        else:
            for error in error_list:
                log.write('{0}\n'.format(error))

    # create the SFTP client
    if OK:
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('Connecting the SFTP client ...\n')
        sftp_client = xssh.create_sftp_client(ssh_transport)
        log.write('The SFTP client is connected.\n')

    # warn that the requirements are being verified
    if OK:
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('Verifying process requirements ...\n')

    # verify the master is running
    if OK:
        (master_state_code,
         master_state_name) = xec2.get_node_state(cluster_name, 'master')
        if master_state_code != 16:
            log.write(
                '*** ERROR: The cluster {0} is not running. Its state is {1} ({2}).\n'
                .format(cluster_name, master_state_code, master_state_name))
            OK = False

    # verify the CD-HIT is set up
    if OK:
        (OK, error_list, is_setup) = xbioinfoapp.is_setup_bioconda_package(
            xlib.get_cd_hit_bioconda_code(), cluster_name, True, ssh_client)
        if OK:
            if not is_setup:
                log.write('*** ERROR: {0} is not setup.\n'.format(
                    xlib.get_cd_hit_name()))
                OK = False
        else:
            log.write(
                '*** ERROR: The verification of {0} setup could not be performed.\n'
                .format(xlib.get_cd_hit_name()))

    # warn that the requirements are OK
    if OK:
        log.write('Process requirements are OK.\n')

    # determine the run directory in the cluster
    if OK:
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('Determining the run directory in the cluster ...\n')
        current_run_dir = xlib.get_cluster_current_run_dir(
            experiment_id, xlib.get_cd_hit_est_code())
        command = 'mkdir --parents {0}'.format(current_run_dir)
        (OK, stdout,
         stderr) = xssh.execute_cluster_command(ssh_client, command)
        if OK:
            log.write('The directory path is {0}.\n'.format(current_run_dir))
        else:
            log.write('*** ERROR: Wrong command ---> {0}\n'.format(command))

    # build the CD-HIT-EST process script
    if OK:
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('Building the process script {0} ...\n'.format(
            get_cd_hit_est_process_script()))
        (OK,
         error_list) = build_cd_hit_est_process_script(cluster_name,
                                                       current_run_dir)
        if OK:
            log.write('The file is built.\n')
        if not OK:
            log.write('*** ERROR: The file could not be built.\n')

    # upload the CD-HIT-EST process script to the cluster
    if OK:
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write(
            'Uploading the process script {0} to the directory {1} of the master ...\n'
            .format(get_cd_hit_est_process_script(), current_run_dir))
        cluster_path = '{0}/{1}'.format(
            current_run_dir, os.path.basename(get_cd_hit_est_process_script()))
        (OK, error_list) = xssh.put_file(sftp_client,
                                         get_cd_hit_est_process_script(),
                                         cluster_path)
        if OK:
            log.write('The file is uploaded.\n')
        else:
            for error in error_list:
                log.write('{0}\n'.format(error))

    # set run permision to the CD-HIT-EST process script in the cluster
    if OK:
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('Setting on the run permision of {0}/{1} ...\n'.format(
            current_run_dir,
            os.path.basename(get_cd_hit_est_process_script())))
        command = 'chmod u+x {0}/{1}'.format(
            current_run_dir, os.path.basename(get_cd_hit_est_process_script()))
        (OK, stdout,
         stderr) = xssh.execute_cluster_command(ssh_client, command)
        if OK:
            log.write('The run permision is set.\n')
        else:
            log.write('*** ERROR: Wrong command ---> {0}\n'.format(command))

    # build the CD-HIT-EST process starter
    if OK:
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('Building the process starter {0} ...\n'.format(
            get_cd_hit_est_process_starter()))
        (OK, error_list) = build_cd_hit_est_process_starter(current_run_dir)
        if OK:
            log.write('The file is built.\n')
        if not OK:
            log.write('***ERROR: The file could not be built.\n')

    # upload the CD-HIT-EST process starter to the cluster
    if OK:
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write(
            'Uploading the process starter {0} to the directory {1} of the master ...\n'
            .format(get_cd_hit_est_process_starter(), current_run_dir))
        cluster_path = '{0}/{1}'.format(
            current_run_dir,
            os.path.basename(get_cd_hit_est_process_starter()))
        (OK, error_list) = xssh.put_file(sftp_client,
                                         get_cd_hit_est_process_starter(),
                                         cluster_path)
        if OK:
            log.write('The file is uploaded.\n')
        else:
            for error in error_list:
                log.write('{0}\n'.format(error))

    # set run permision to the CD-HIT-EST process starter in the cluster
    if OK:
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('Setting on the run permision of {0}/{1} ...\n'.format(
            current_run_dir,
            os.path.basename(get_cd_hit_est_process_starter())))
        command = 'chmod u+x {0}/{1}'.format(
            current_run_dir,
            os.path.basename(get_cd_hit_est_process_starter()))
        (OK, stdout,
         stderr) = xssh.execute_cluster_command(ssh_client, command)
        if OK:
            log.write('The run permision is set.\n')
        else:
            log.write('*** ERROR: Wrong command ---> {0}\n'.format(command))

    # submit the CD-HIT-EST process
    if OK:
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('Submitting the process script {0}/{1} ...\n'.format(
            current_run_dir,
            os.path.basename(get_cd_hit_est_process_starter())))
        sge_env = xcluster.get_sge_env()
        command = '{0}; qsub -V -b n -cwd {1}/{2}'.format(
            sge_env, current_run_dir,
            os.path.basename(get_cd_hit_est_process_starter()))
        (OK, stdout,
         stderr) = xssh.execute_cluster_command(ssh_client, command)
        if OK:
            for line in stdout:
                log.write('{0}\n'.format(line))
        else:
            log.write('*** ERROR: Wrong command ---> {0}\n'.format(command))

    # close the SSH transport connection
    if OK:
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('Closing the SSH transport connection ...\n')
        xssh.close_ssh_transport_connection(ssh_transport)
        log.write('The connection is closed.\n')

    # close the SSH client connection
    if OK:
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('Closing the SSH client connection ...\n')
        xssh.close_ssh_client_connection(ssh_client)
        log.write('The connection is closed.\n')

    # warn that the log window can be closed
    if not isinstance(log, xlib.DevStdOut):
        log.write('{0}\n'.format(xlib.get_separator()))
        log.write('You can close this window now.\n')

    # execute final function
    if function is not None:
        function()

    # return the control variable
    return OK
Esempio n. 8
0
def validate_quast_config_file(strict):
    '''
    Validate the QUAST config file of a run.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # intitialize variable used when value is not found
    not_found = '***NOTFOUND***'.upper()

    # get the option dictionary
    try:
        quast_option_dict = xlib.get_option_dict(get_quast_config_file())
    except:
        error_list.append('*** ERROR: The syntax is WRONG.')
        OK = False
    else:

        # get the sections list
        sections_list = []
        for section in quast_option_dict.keys():
            sections_list.append(section)
        sections_list.sort()

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append('*** ERROR: the section "identification" is not found.')
            OK = False
        else:

            # check section "identification" - key "experiment_id"
            experiment_id = quast_option_dict.get('identification', {}).get('experiment_id', not_found)
            if experiment_id == not_found:
                error_list.append('*** ERROR: the key "experiment_id" is not found in the section "identification".')
                OK = False

            # check section "identification" - key "reference_dataset_id"
            reference_dataset_id = quast_option_dict.get('identification', {}).get('reference_dataset_id', not_found)
            if reference_dataset_id == not_found:
                error_list.append('*** ERROR: the key "reference_dataset_id" is not found in the section "identification".')
                OK = False

            # check section "identification" - key "reference_file"
            reference_file = quast_option_dict.get('identification', {}).get('reference_file', not_found)
            if reference_file == not_found:
                error_list.append('*** ERROR: the key "reference_file" is not found in the section "identification".')
                OK = False

            # check section "identification" - key "assembly_software"
            assembly_software = quast_option_dict.get('identification', {}).get('assembly_software', not_found)
            if assembly_software == not_found:
                error_list.append('*** ERROR: the key "assembly_software" is not found in the section "identification".')
                OK = False
            elif assembly_software not in [xlib.get_soapdenovotrans_code(), xlib.get_transabyss_code(), xlib.get_trinity_code(), xlib.get_star_code(), xlib.get_cd_hit_est_code(), xlib.get_transcript_filter_code()]:
                error_list.append('*** ERROR: the key "assembly_software" value in the section "identification" must be {0} or {1} or {2} or {3} or {4} OR {5}.'.format(xlib.get_soapdenovotrans_code(), xlib.get_transabyss_code(), xlib.get_trinity_code(), xlib.get_star_code(), xlib.get_cd_hit_est_code(), xlib.get_transcript_filter_code()))
                OK = False

            # check section "identification" - key "assembly_dataset_id"
            assembly_dataset_id = quast_option_dict.get('identification', {}).get('assembly_dataset_id', not_found)
            if assembly_dataset_id == not_found:
                error_list.append('*** ERROR: the key "assembly_dataset_id" is not found in the section "identification".')
                OK = False
            elif not assembly_dataset_id.startswith(xlib.get_soapdenovotrans_code()) and not assembly_dataset_id.startswith(xlib.get_transabyss_code()) and not assembly_dataset_id.startswith(xlib.get_trinity_code()) and not assembly_dataset_id.startswith(xlib.get_star_code()) and not assembly_dataset_id.startswith(xlib.get_cd_hit_est_code()) and not assembly_dataset_id.startswith(xlib.get_transcript_filter_code()):
                error_list.append('*** ERROR: the key "assembly_dataset_id" value is not a {0} nor {1} nor {2} nor {3} nor {4} nor {5} assembly.'.format(xlib.get_soapdenovotrans_name(), xlib.get_transabyss_name(), xlib.get_trinity_name(), xlib.get_star_name(), xlib.get_cd_hit_est_name(), xlib.get_transcript_filter_code()))
                OK = False

            # check section "identification" - key "assembly_type"
            assembly_type = quast_option_dict.get('identification', {}).get('assembly_type', not_found)
            if assembly_type == not_found:
                error_list.append('*** ERROR: the key "assembly_type" is not found in the section "identification".')
                OK = False
            elif assembly_dataset_id.startswith(xlib.get_soapdenovotrans_code()):
                if assembly_type.upper() not in ['CONTIGS', 'SCAFFOLDS']:
                    error_list.append('*** ERROR: the key "assembly_type" must be "CONTIGS" or "SCAFFOLDS" when {0} is the assembly software.'.format(xlib.get_soapdenovotrans_name()))
                    OK = False
            elif assembly_dataset_id.startswith(xlib.get_transabyss_code()) or assembly_dataset_id.startswith(xlib.get_trinity_code()) or assembly_dataset_id.startswith(xlib.get_star_code()) or assembly_dataset_id.startswith(xlib.get_cd_hit_est_code()) or assembly_dataset_id.startswith(xlib.get_transcript_filter_code()):
                if assembly_type.upper() != 'NONE':
                    error_list.append('*** ERROR: the key "assembly_type" must be "NONE" when {0} or {1} or {2} or {3} or {4} is the assembly software.'.format(xlib.get_transabyss_name(), xlib.get_trinity_name(), xlib.get_star_name(), xlib.get_cd_hit_est_name(), xlib.get_transcript_filter_name()))
                    OK = False

        # check section "QUAST parameters"
        if 'QUAST parameters' not in sections_list:
            error_list.append('*** ERROR: the section "QUAST parameters" is not found.')
            OK = False
        else:

            # check section "QUAST parameters" - key "threads"
            threads = quast_option_dict.get('QUAST parameters', {}).get('threads', not_found)
            if threads == not_found:
                error_list.append('*** ERROR: the key "threads" is not found in the section "QUAST parameters".')
                OK = False
            else:
                try:
                    if int(threads) < 1:
                        error_list.append('*** ERROR: the key "threads" in the section "QUAST parameters" must be an integer value greater or equal to 1.')
                        OK = False
                except:
                    error_list.append('*** ERROR: the key "threads" in the section "QUAST parameters" must be an integer value greater or equal to 1.')
                    OK = False

    # warn that the results config file is not valid if there are any errors
    if not OK:
        error_list.append('\nThe {0} config file is not valid. Please, correct this file or recreate it.'.format(xlib.get_quast_name()))

    # return the control variable and the error list
    return (OK, error_list)
Esempio n. 9
0
def create_quast_config_file(experiment_id='exp001', reference_dataset_id='NONE', reference_file='NONE', assembly_dataset_id='sdnt-170101-235959', assembly_type='CONTIGS'):
    '''
    Create QUAST config file with the default options. It is necessary
    update the options in each run.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # set the app
    if assembly_dataset_id.startswith(xlib.get_soapdenovotrans_code()):
        assembly_software = xlib.get_soapdenovotrans_code()
    elif assembly_dataset_id.startswith(xlib.get_transabyss_code()):
        assembly_software = xlib.get_transabyss_code()
    elif assembly_dataset_id.startswith(xlib.get_trinity_code()):
        assembly_software = xlib.get_trinity_code()
    elif assembly_dataset_id.startswith(xlib.get_star_code()):
        assembly_software = xlib.get_star_code()
    elif assembly_dataset_id.startswith(xlib.get_cd_hit_est_code()):
        assembly_software = xlib.get_cd_hit_est_code()
    elif assembly_dataset_id.startswith(xlib.get_transcript_filter_code()):
        assembly_software = xlib.get_transcript_filter_code()

    # create the QUAST config file and write the default options
    try:
        if not os.path.exists(os.path.dirname(get_quast_config_file())):
            os.makedirs(os.path.dirname(get_quast_config_file()))
        with open(get_quast_config_file(), mode='w', encoding='utf8') as file_id:
            file_id.write('{0}\n'.format('# You must review the information of this file and update the values with the corresponding ones to the current run.'))
            file_id.write('{0}\n'.format('#'))
            file_id.write('{0}\n'.format('# The reference file must be located in the cluster directory {0}/experiment_id/reference_dataset_id'.format(xlib.get_cluster_reference_dir())))
            file_id.write('{0}\n'.format('# The assembly files must be located in the cluster directory {0}/experiment_id/assembly_dataset_id'.format(xlib.get_cluster_result_dir())))
            file_id.write('{0}\n'.format('# The experiment_id, reference_dataset_id, reference_file and assembly_dataset_id are fixed in the identification section.'))
            file_id.write('{0}\n'.format('#'))
            file_id.write('{0}\n'.format('# You can consult the parameters of QUAST and their meaning in http://quast.sourceforge.net/quast.html.'))
            file_id.write('{0}\n'.format(''))
            file_id.write('{0}\n'.format('# This section has the information identifies the experiment.'))
            file_id.write('{0}\n'.format('[identification]'))
            file_id.write('{0:<50} {1}\n'.format('experiment_id = {0}'.format(experiment_id), '# experiment identification'))
            file_id.write('{0:<50} {1}\n'.format('reference_dataset_id = {0}'.format(reference_dataset_id), '# reference dataset identification or NONE'))
            file_id.write('{0:<50} {1}\n'.format('reference_file = {0}'.format(reference_file), '# reference file name or NONE'))
            file_id.write('{0:<50} {1}\n'.format('assembly_software = {0}'.format(assembly_software), '# assembly software: {0} ({1}) or {2} ({3}) or {4} ({5}) or {6} ({7}) or {8} ({9}) or {10} ({11})'.format(xlib.get_soapdenovotrans_code(), xlib.get_soapdenovotrans_name(), xlib.get_transabyss_code(), xlib.get_transabyss_name(), xlib.get_trinity_code(), xlib.get_trinity_name(), xlib.get_star_code(), xlib.get_star_name(), xlib.get_cd_hit_est_code(), xlib.get_cd_hit_est_name(), xlib.get_transcript_filter_code(), xlib.get_transcript_filter_name())))
            file_id.write('{0:<50} {1}\n'.format('assembly_dataset_id = {0}'.format(assembly_dataset_id), '# assembly dataset identification'))
            file_id.write('{0:<50} {1}\n'.format('assembly_type = {0}'.format(assembly_type), '# CONTIGS or SCAFFOLDS in {0}; NONE in {1}, {2}, {3}, {4} and {5}'.format(xlib.get_soapdenovotrans_name(),  xlib.get_transabyss_name(),  xlib.get_trinity_name(),  xlib.get_star_name(), xlib.get_cd_hit_est_name(), xlib.get_transcript_filter_name())))
            file_id.write('{0}\n'.format(''))
            file_id.write('{0}\n'.format('# This section has the information to set the QUAST parameters'))
            file_id.write('{0}\n'.format('[QUAST parameters]'))
            file_id.write('{0:<50} {1}\n'.format('threads = 2', '# number of threads for use'))
    except:
        error_list.append('*** ERROR: The file {0} can not be recreated'.format(get_quast_config_file()))
        OK = False

    # return the control variable and the error list
    return (OK, error_list)
Esempio n. 10
0
def form_list_cluster_experiment_processes():
    '''
    List the processes of an experiment in the cluster.
    '''

    # initialize the control variable
    OK = True

    # print the header
    clib.clear_screen()
    clib.print_headers_with_environment('Logs - List experiment processes in the cluster')

    # get the cluster name
    print(xlib.get_separator())
    if xec2.get_running_cluster_list(volume_creator_included=False) != []:
        cluster_name = cinputs.input_cluster_name(volume_creator_included=False, help=True)
    else:
        print('WARNING: There is not any running cluster.')
        OK = False

    # create the SSH client connection
    if OK:
        (OK, error_list, ssh_client) = xssh.create_ssh_client_connection(cluster_name, 'master')
        for error in error_list:
            log.write('{0}\n'.format(error))

    # get experiment identification
    if OK:
        experiment_id = cinputs.input_experiment_id(ssh_client, help=True)
        if experiment_id == '':
            print('WARNING: The cluster {0} has not experiment data.'.format(cluster_name))
            OK = False

    # get the result dataset list of the experiment
    if OK:
        command = 'cd  {0}/{1}; for list in `ls`; do ls -ld $list | grep -v ^- > /dev/null && echo $list; done;'.format(xlib.get_cluster_result_dir(), experiment_id)
        (OK, stdout, stderr) = xssh.execute_cluster_command(ssh_client, command)
        if OK:
            result_dataset_id_list = []
            for line in stdout:
                line = line.rstrip('\n')
                if line != 'lost+found':
                    result_dataset_id_list.append(line)

    # print the result dataset identification list of the experiment
    if OK:
        print(xlib.get_separator())
        if result_dataset_id_list == []:
            print('*** WARNING: There is not any result dataset of the experiment {0}.'.format(experiment_id))
        else:
            result_dataset_id_list.sort()
            # set data width
            result_dataset_width = 25
            bioinfo_app_width = 25
            # set line template
            line_template = '{0:' + str(result_dataset_width) + '}   {1:' + str(bioinfo_app_width) + '}'
            # print header
            print(line_template.format('Result dataset', 'Bioinfo app / Utility'))
            print(line_template.format('=' * result_dataset_width, '=' * bioinfo_app_width))
            # print detail lines
            for result_dataset_id in result_dataset_id_list:
                if result_dataset_id.startswith(xlib.get_bedtools_code()+'-'):
                    bioinfo_app_name = xlib.get_bedtools_name()
                elif result_dataset_id.startswith(xlib.get_blastplus_code()+'-'):
                    bioinfo_app_name = xlib.get_blastplus_name()
                elif result_dataset_id.startswith(xlib.get_bowtie2_code()+'-'):
                    bioinfo_app_name = xlib.get_bowtie2_name()
                elif result_dataset_id.startswith(xlib.get_busco_code()+'-'):
                    bioinfo_app_name = xlib.get_busco_name()
                elif result_dataset_id.startswith(xlib.get_cd_hit_code()+'-'):
                    bioinfo_app_name = xlib.get_cd_hit_est_name()
                elif result_dataset_id.startswith(xlib.get_cd_hit_code()+'-'):
                    bioinfo_app_name = xlib.get_cd_hit_est_name()
                elif result_dataset_id.startswith(xlib.get_detonate_code()+'-'):
                    bioinfo_app_name = xlib.get_detonate_name()
                elif result_dataset_id.startswith(xlib.get_emboss_code()+'-'):
                    bioinfo_app_name = xlib.get_emboss_name()
                elif result_dataset_id.startswith(xlib.get_fastqc_code()+'-'):
                    bioinfo_app_name = xlib.get_fastqc_name()
                elif result_dataset_id.startswith(xlib.get_gmap_code()+'-'):
                    bioinfo_app_name = xlib.get_gmap_name()
                elif result_dataset_id.startswith(xlib.get_gmap_gsnap_code()+'-'):
                    bioinfo_app_name = xlib.get_gmap_gsnap_name()
                elif result_dataset_id.startswith(xlib.get_gzip_code()+'-'):
                    bioinfo_app_name = xlib.get_gzip_name()
                elif result_dataset_id.startswith(xlib.get_insilico_read_normalization_code()+'-'):
                    bioinfo_app_name = xlib.get_insilico_read_normalization_name()
                elif result_dataset_id.startswith(xlib.get_miniconda3_code()+'-'):
                    bioinfo_app_name = xlib.get_miniconda3_name()
                elif result_dataset_id.startswith(xlib.get_ngshelper_code()+'-'):
                    bioinfo_app_name = xlib.get_ngshelper_name()
                elif result_dataset_id.startswith(xlib.get_quast_code()+'-'):
                    bioinfo_app_name = xlib.get_quast_name()
                elif result_dataset_id.startswith(xlib.get_r_code()+'-'):
                    bioinfo_app_name = xlib.get_r_name()
                elif result_dataset_id.startswith(xlib.get_ref_eval_code()+'-'):
                    bioinfo_app_name = xlib.get_ref_eval_name()
                elif result_dataset_id.startswith(xlib.get_rnaquast_code()+'-'):
                    bioinfo_app_name = xlib.get_rnaquast_name()
                elif result_dataset_id.startswith(xlib.get_rsem_code()+'-'):
                    bioinfo_app_name = xlib.get_rsem_name()
                elif result_dataset_id.startswith(xlib.get_rsem_eval_code()+'-'):
                    bioinfo_app_name = xlib.get_rsem_eval_name()
                elif result_dataset_id.startswith(xlib.get_samtools_code()+'-'):
                    bioinfo_app_name = xlib.get_samtools_name()
                elif result_dataset_id.startswith(xlib.get_soapdenovotrans_code()+'-'):
                    bioinfo_app_name = xlib.get_soapdenovotrans_name()
                elif result_dataset_id.startswith(xlib.get_star_code()+'-'):
                    bioinfo_app_name = xlib.get_star_name()
                elif result_dataset_id.startswith(xlib.get_transabyss_code()+'-'):
                    bioinfo_app_name = xlib.get_transabyss_name()
                elif result_dataset_id.startswith(xlib.get_transcript_filter_code()+'-'):
                    bioinfo_app_name = xlib.get_transcript_filter_name()
                elif result_dataset_id.startswith(xlib.get_transcriptome_blastx_code()+'-'):
                    bioinfo_app_name = xlib.get_transcriptome_blastx_name()
                elif result_dataset_id.startswith(xlib.get_transrate_code()+'-'):
                    bioinfo_app_name = xlib.get_transrate_name()
                elif result_dataset_id.startswith(xlib.get_trimmomatic_code()+'-'):
                    bioinfo_app_name = xlib.get_trimmomatic_name()
                elif result_dataset_id.startswith(xlib.get_trinity_code()+'-'):
                    bioinfo_app_name = xlib.get_trinity_name()
                else:
                    bioinfo_app_name = 'xxx'
                print(line_template.format(result_dataset_id, bioinfo_app_name))

    # close the SSH client connection
    if OK:
        xssh.close_ssh_client_connection(ssh_client)

    # show continuation message 
    print(xlib.get_separator())
    input('Press [Intro] to continue ...')
Esempio n. 11
0
def create_busco_config_file(experiment_id='exp001', assembly_dataset_id='sdnt-170101-235959', assembly_type='CONTIGS'):
    '''
    Create BUSCO config file with the default options. It is necessary
    update the options in each run.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # set the assembly software
    if assembly_dataset_id.startswith(xlib.get_soapdenovotrans_code()):
        assembly_software = xlib.get_soapdenovotrans_code()
    elif assembly_dataset_id.startswith(xlib.get_transabyss_code()):
        assembly_software = xlib.get_transabyss_code()
    elif assembly_dataset_id.startswith(xlib.get_trinity_code()):
        assembly_software = xlib.get_trinity_code()
    elif assembly_dataset_id.startswith(xlib.get_star_code()):
        assembly_software = xlib.get_star_code()
    elif assembly_dataset_id.startswith(xlib.get_cd_hit_est_code()):
        assembly_software = xlib.get_cd_hit_est_code()
    elif assembly_dataset_id.startswith(xlib.get_transcript_filter_code()):
        assembly_software = xlib.get_transcript_filter_code()

    # create the BUSCO config file and write the default options
    try:
        if not os.path.exists(os.path.dirname(get_busco_config_file())):
            os.makedirs(os.path.dirname(get_busco_config_file()))
        with open(get_busco_config_file(), mode='w', encoding='utf8') as file_id:
            file_id.write('{0}\n'.format('# You must review the information of this file and update the values with the corresponding ones to the current run.'))
            file_id.write('{0}\n'.format('#'))
            file_id.write('{0}\n'.format('# The reference file must be located in the cluster directory {0}/experiment_id/reference_dataset_id'.format(xlib.get_cluster_reference_dir())))
            file_id.write('{0}\n'.format('# The assembly files must be located in the cluster directory {0}/experiment_id/assembly_dataset_id'.format(xlib.get_cluster_result_dir())))
            file_id.write('{0}\n'.format('# The experiment_id and assembly_dataset_id names are fixed in the identification section.'))
            file_id.write('{0}\n'.format('#'))
            file_id.write('{0}\n'.format('# In section "BUSCO parameters", the key "augustus_options" allows you to input additional August parameters in the format:'))
            file_id.write('{0}\n'.format('#'))
            file_id.write('{0}\n'.format('#    augustus_options = --parameter-1[=value-1][; --parameter-2[=value-2][; ...; --parameter-n[=value-n]]]'))
            file_id.write('{0}\n'.format('#'))
            file_id.write('{0}\n'.format('# parameter-i is a parameter name of Augustus and value-i a valid value of parameter-i, e.g.'))
            file_id.write('{0}\n'.format('#'))
            file_id.write('{0}\n'.format('#    augustus_options = --translation_table=6 --progress=true)'))
            file_id.write('{0}\n'.format('#'))
            file_id.write('{0}\n'.format('# You can consult the parameters of BUSCO and their meaning in http://busco.ezlab.org/.'))
            file_id.write('{0}\n'.format('# and August ones in http://bioinf.uni-greifswald.de/augustus/.'))
            file_id.write('{0}\n'.format(''))
            file_id.write('{0}\n'.format('# This section has the information identifies the experiment.'))
            file_id.write('{0}\n'.format('[identification]'))
            file_id.write('{0:<50} {1}\n'.format('experiment_id = {0}'.format(experiment_id), '# experiment identification'))
            file_id.write('{0:<50} {1}\n'.format('assembly_software = {0}'.format(assembly_software), '# assembly software: {0} ({1}) or {2} ({3}) or {4} ({5}) or {6} ({7}) or {8} ({9}) or {10} ({11})'.format(xlib.get_soapdenovotrans_code(), xlib.get_soapdenovotrans_name(), xlib.get_transabyss_code(), xlib.get_transabyss_name(), xlib.get_trinity_code(), xlib.get_trinity_name(), xlib.get_star_code(), xlib.get_star_name(), xlib.get_cd_hit_est_code(), xlib.get_cd_hit_est_name(), xlib.get_transcript_filter_code(), xlib.get_transcript_filter_name())))
            file_id.write('{0:<50} {1}\n'.format('assembly_dataset_id = {0}'.format(assembly_dataset_id), '# assembly dataset identification'))
            file_id.write('{0:<50} {1}\n'.format('assembly_type = {0}'.format(assembly_type), '# CONTIGS or SCAFFOLDS in {0}; NONE in {1}, {2}, {3}, {4} and {5}'.format(xlib.get_soapdenovotrans_name(),  xlib.get_transabyss_name(),  xlib.get_trinity_name(),  xlib.get_star_name(), xlib.get_cd_hit_est_name(), xlib.get_transcript_filter_name())))
            file_id.write('{0}\n'.format(''))
            file_id.write('{0}\n'.format('# This section has the information to set the BUSCO parameters'))
            file_id.write('{0}\n'.format('[BUSCO parameters]'))
            file_id.write('{0:<50} {1}\n'.format('ncpu = 2', '# number of threads/cores for use'))
            file_id.write('{0:<50} {1}\n'.format('lineage_data = embryophyta_odb9', '# value to find the lineage data url in BUSCO web (e.g. embryophyta -> http://busco.ezlab.org/v2/datasets/embryophyta_odb9.tar.gz)'))
            file_id.write('{0:<50} {1}\n'.format('mode = tran', '# geno (genome assemblies, DNA) or tran (transcriptome assemblies, DNA) or prot (annotated gene sets, proteins)'))
            file_id.write('{0:<50} {1}\n'.format('evalue = 1e-03', '# E-value cutoff for BLAST searches'))
            file_id.write('{0:<50} {1}\n'.format('limit = 3', '# number of candidate regions to consider'))
            file_id.write('{0:<50} {1}\n'.format('species = NONE', '# identifier of existing Augustus species gene finding parameters or NONE'))
            file_id.write('{0:<50} {1}\n'.format('long = NO', '# Augustus optimization mode for self-training: YES or NO'))
            file_id.write('{0:<50} {1}\n'.format('augustus_options = NONE', '# additional parameters to August or NONE'))
    except:
        error_list.append('*** ERROR: The file {0} can not be recreated'.format(get_busco_config_file()))
        OK = False

    # return the control variable and the error list
    return (OK, error_list)
Esempio n. 12
0
def validate_busco_config_file(strict):
    '''
    Validate the BUSCO config file of a run.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # intitialize variable used when value is not found
    not_found = '***NOTFOUND***'.upper()

    # get the option dictionary
    try:
        busco_option_dict = xlib.get_option_dict(get_busco_config_file())
    except:
        error_list.append('*** ERROR: The syntax is WRONG.')
        OK = False
    else:

        # get the sections list
        sections_list = []
        for section in busco_option_dict.keys():
            sections_list.append(section)
        sections_list.sort()

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append('*** ERROR: the section "identification" is not found.')
            OK = False
        else:

            # check section "identification" - key "experiment_id"
            experiment_id = busco_option_dict.get('identification', {}).get('experiment_id', not_found)
            is_experiment_id_OK = True
            if experiment_id == not_found:
                error_list.append('*** ERROR: the key "experiment_id" is not found in the section "identification".')
                is_experiment_id_OK = False
                OK = False

            # check section "identification" - key "assembly_software"
            assembly_software = busco_option_dict.get('identification', {}).get('assembly_software', not_found)
            is_assembly_software_OK = True
            if assembly_software == not_found:
                error_list.append('*** ERROR: the key "assembly_software" is not found in the section "identification".')
                is_assembly_software_OK = False
                OK = False
            elif assembly_software not in [xlib.get_soapdenovotrans_code(), xlib.get_transabyss_code(), xlib.get_trinity_code(), xlib.get_star_code(), xlib.get_cd_hit_est_code(), xlib.get_transcript_filter_code()]:
                error_list.append('*** ERROR: the key "assembly_software" value in the section "identification" must be {0} or {1} or {2} or {3} or {4} OR {5}.'.format(xlib.get_soapdenovotrans_code(), xlib.get_transabyss_code(), xlib.get_trinity_code(), xlib.get_star_code(), xlib.get_cd_hit_est_code(), xlib.get_transcript_filter_code()))
                is_assembly_software_OK = False
                OK = False

            # check section "identification" - key "assembly_dataset_id"
            assembly_dataset_id = busco_option_dict.get('identification', {}).get('assembly_dataset_id', not_found)
            is_assembly_dataset_id_OK = True
            if assembly_dataset_id == not_found:
                error_list.append('*** ERROR: the key "assembly_dataset_id" is not found in the section "identification".')
                is_assembly_dataset_id_OK = False
                OK = False
            elif not assembly_dataset_id.startswith(xlib.get_soapdenovotrans_code()) and not assembly_dataset_id.startswith(xlib.get_transabyss_code()) and not assembly_dataset_id.startswith(xlib.get_trinity_code()) and not assembly_dataset_id.startswith(xlib.get_star_code()) and not assembly_dataset_id.startswith(xlib.get_cd_hit_est_code()) and not assembly_dataset_id.startswith(xlib.get_transcript_filter_code()):
                error_list.append('*** ERROR: the key "assembly_dataset_id" value is not a {0} nor {1} nor {2} nor {3} nor {4} nor {5} assembly.'.format(xlib.get_soapdenovotrans_name(), xlib.get_transabyss_name(), xlib.get_trinity_name(), xlib.get_star_name(), xlib.get_cd_hit_est_name(), xlib.get_transcript_filter_code()))
                is_assembly_dataset_id_OK = False
                OK = False

            # check section "identification" - key "assembly_type"
            assembly_type = busco_option_dict.get('identification', {}).get('assembly_type', not_found)
            is_assembly_type_OK = True
            if assembly_type == not_found:
                error_list.append('*** ERROR: the key "assembly_type" is not found in the section "identification".')
                is_assembly_type_OK = False
                OK = False
            elif assembly_dataset_id.startswith(xlib.get_soapdenovotrans_code()):
                if assembly_type.upper() not in ['CONTIGS', 'SCAFFOLDS']:
                    error_list.append('*** ERROR: the key "assembly_type" must be "CONTIGS" or "SCAFFOLDS" when {0} is the assembly software.'.format(xlib.get_soapdenovotrans_name()))
                    is_assembly_type_OK = False
                    OK = False
            elif assembly_dataset_id.startswith(xlib.get_transabyss_code()) or assembly_dataset_id.startswith(xlib.get_trinity_code()) or assembly_dataset_id.startswith(xlib.get_star_code()) or assembly_dataset_id.startswith(xlib.get_cd_hit_est_code()) or assembly_dataset_id.startswith(xlib.get_transcript_filter_code()):
                if assembly_type.upper() != 'NONE':
                    error_list.append('*** ERROR: the key "assembly_type" must be "NONE" when {0} or {1} or {2} or {3} or {4} is the assembly software.'.format(xlib.get_transabyss_name(), xlib.get_trinity_name(), xlib.get_star_name(), xlib.get_cd_hit_est_name(), xlib.get_transcript_filter_name()))
                    is_assembly_type_OK = False
                    OK = False

        # check section "BUSCO parameters"
        if 'BUSCO parameters' not in sections_list:
            error_list.append('*** ERROR: the section "BUSCO parameters" is not found.')
            OK = False
        else:

            # check section "BUSCO parameters" - key "ncpu"
            ncpu = busco_option_dict.get('BUSCO parameters', {}).get('ncpu', not_found)
            is_ncpu_OK = True
            if ncpu == not_found:
                error_list.append('*** ERROR: the key "ncpu" is not found in the section "BUSCO parameters".')
                is_ncpu_OK = False
                OK = False
            else:
                try:
                    if int(ncpu) < 1:
                        error_list.append('*** ERROR: the key "ncpu" in the section "BUSCO parameters" must be an integer value greater or equal to 1.')
                        is_ncpu_OK = False
                        OK = False
                except:
                    error_list.append('*** ERROR: the key "ncpu" in the section "BUSCO parameters" must be an integer value greater or equal to 1.')
                    is_ncpu_OK = False
                    OK = False

            # check section "BUSCO parameters" - key "lineage_data"
            lineage_data = busco_option_dict.get('BUSCO parameters', {}).get('lineage_data', not_found)
            is_lineage_data_OK = True
            if lineage_data == not_found:
                error_list.append('*** ERROR: the key "lineage_data" is not found in the section "BUSCO parameters"')
                is_lineage_data_OK = False
                OK = False

            # check section "BUSCO parameters" - key "mode"
            mode = busco_option_dict.get('BUSCO parameters', {}).get('mode', not_found).lower()
            is_mode_OK = True
            if mode == not_found:
                error_list.append('*** ERROR: the key "mode" is not found in the section "BUSCO parameters".')
                is_mode_OK = False
                OK = False
            elif mode not in ['geno', 'tran', 'prot']:
                error_list.append('*** ERROR: the key "mode" value in the section "BUSCO parameters" must be geno or tran or prot.')
                is_mode_OK = False
                OK = False

            # check section "BUSCO parameters" - key "evalue"
            evalue = busco_option_dict.get('BUSCO parameters', {}).get('evalue', not_found)
            is_evalue_OK = True
            if evalue == not_found:
                error_list.append('*** ERROR: the key "evalue" is not found in the section "BUSCO parameters".')
                is_evalue_OK = False
                OK = False
            else:
                try:
                    if float(evalue) <= 0:
                        error_list.append('*** ERROR: the key "evalue" in the section "BUSCO parameters" must be a float value greater than 0.')
                        is_evalue_OK = False
                        OK = False
                except:
                    error_list.append('*** ERROR: the key "evalue" in the section "BUSCO parameters" must be a float value greater than 0.')
                    is_evalue_OK = False
                    OK = False

            # check section "BUSCO parameters" - key "limit"
            limit = busco_option_dict.get('BUSCO parameters', {}).get('limit', not_found)
            is_limit_OK = True
            if limit == not_found:
                error_list.append('*** ERROR: the key "limit" is not found in the section "BUSCO parameters".')
                OK = False
            else:
                try:
                    if int(limit) < 1:
                        error_list.append('*** ERROR: the key "limit" in the section "BUSCO parameters" must be an integer value greater or equal to 1.')
                        is_limit_OK = False
                        OK = False
                except:
                    error_list.append('*** ERROR: the key "limit" in the section "BUSCO parameters" must be an integer value greater or equal to 1.')
                    is_limit_OK = False
                    OK = False

            # check section "BUSCO parameters" - key "species"
            species = busco_option_dict.get('BUSCO parameters', {}).get('species', not_found)
            is_species_OK = True
            if species == not_found:
                error_list.append('*** ERROR: the key "species" is not found in the section "BUSCO parameters"')
                is_species_OK = False
                OK = False

            # check section "BUSCO parameters" - key "long"
            long = busco_option_dict.get('BUSCO parameters', {}).get('long', not_found).upper()
            is_long_OK = True
            if long == not_found:
                error_list.append('*** ERROR: the key "long" is not found in the section "BUSCO parameters".')
                is_long_OK = False
                OK = False
            elif long not in ['YES', 'NO']:
                error_list.append('*** ERROR: the key "long" value in the section "BUSCO parameters" must be YES or NO.')
                is_long_OK = False
                OK = False

            # check section "BUSCO parameters" - key "augustus_options"
            augustus_options = busco_option_dict.get('BUSCO parameters', {}).get('augustus_options', not_found)
            is_augustus_options_OK = True
            if augustus_options == not_found:
                error_list.append('*** ERROR: the key "augustus_options" is not found in the section "BUSCO parameters".')
                is_augustus_options_OK = False
                OK = False
            else:
                if augustus_options.upper() != 'NONE':
                    parameter_list = [x.strip() for x in augustus_options.split(';')]
                    for parameter in parameter_list:
                        try:
                            if parameter.find('=') > 0:
                                pattern = r'^--(.+)=(.+)$'
                                mo = re.search(pattern, parameter)
                                parameter_name = mo.group(1).strip()
                                parameter_value = mo.group(2).strip()
                            else:
                                pattern = r'^--(.+)$'
                                mo = re.search(pattern, parameter)
                                parameter_name = mo.group(1).strip()
                        except:
                            error_list.append('*** ERROR: the value of the key "augustus_options" in the section "BUSCO parameters" must be NONE or a valid August parameter list.')
                            is_augustus_options_OK = False
                            OK = False
                            break

    # warn that the results config file is not valid if there are any errors
    if not OK:
        error_list.append('\nThe {0} config file is not valid. Please, correct this file or recreate it.'.format(xlib.get_busco_name()))

    # return the control variable and the error list
    return (OK, error_list)
Esempio n. 13
0
    def execute(self, event=None):
        '''
        Execute the list the result logs in the cluster.
        '''

        # validate inputs
        OK = self.validate_inputs()
        if not OK:
            message = 'Some input values are not OK.'
            tkinter.messagebox.showerror('{0} - {1}'.format(xlib.get_project_name(), self.head), message)

        # get the run dictionary of the experiment
        if OK:
            # -- command = 'ls {0}/{1}'.format(xlib.get_cluster_result_dir(), self.wrapper_experiment_id.get())
            command = 'cd  {0}/{1}; for list in `ls`; do ls -ld $list | grep -v ^- > /dev/null && echo $list; done;'.format(xlib.get_cluster_result_dir(), self.wrapper_experiment_id.get())
            (OK, stdout, stderr) = xssh.execute_cluster_command(self.ssh_client, command)
            if OK:
                result_dataset_dict = {}
                for line in stdout:
                    line = line.rstrip('\n')
                    if line != 'lost+found':
                        result_dataset_id = line
                        try:
                            pattern = r'^(.+)\-(.+)\-(.+)$'
                            mo = re.search(pattern, result_dataset_id)
                            bioinfo_app_code = mo.group(1).strip()
                            yymmdd = mo.group(2)
                            hhmmss = mo.group(3)
                            date = '20{0}-{1}-{2}'.format(yymmdd[:2], yymmdd[2:4], yymmdd[4:])
                            time = '{0}:{1}:{2}'.format(hhmmss[:2], hhmmss[2:4], hhmmss[4:])
                        except:
                            bioinfo_app_code = 'xxx'
                            date = '0000-00-00'
                            time = '00:00:00'
                        if result_dataset_id.startswith(xlib.get_bedtools_code()+'-'):
                            bioinfo_app_name = xlib.get_bedtools_name()
                        elif result_dataset_id.startswith(xlib.get_blastplus_code()+'-'):
                            bioinfo_app_name = xlib.get_blastplus_name()
                        elif result_dataset_id.startswith(xlib.get_bowtie2_code()+'-'):
                            bioinfo_app_name = xlib.get_bowtie2_name()
                        elif result_dataset_id.startswith(xlib.get_busco_code()+'-'):
                            bioinfo_app_name = xlib.get_busco_name()
                        elif result_dataset_id.startswith(xlib.get_cd_hit_code()+'-'):
                            bioinfo_app_name = xlib.get_cd_hit_name()
                        elif result_dataset_id.startswith(xlib.get_cd_hit_est_code()+'-'):
                            bioinfo_app_name = xlib.get_cd_hit_est_name()
                        elif result_dataset_id.startswith(xlib.get_detonate_code()+'-'):
                            bioinfo_app_name = xlib.get_detonate_name()
                        elif result_dataset_id.startswith(xlib.get_emboss_code()+'-'):
                            bioinfo_app_name = xlib.get_emboss_name()
                        elif result_dataset_id.startswith(xlib.get_fastqc_code()+'-'):
                            bioinfo_app_name = xlib.get_fastqc_name()
                        elif result_dataset_id.startswith(xlib.get_gmap_code()+'-'):
                            bioinfo_app_name = xlib.get_gmap_name()
                        elif result_dataset_id.startswith(xlib.get_gmap_gsnap_code()+'-'):
                            bioinfo_app_name = xlib.get_gmap_gsnap_name()
                        elif result_dataset_id.startswith(xlib.get_gzip_code()+'-'):
                            bioinfo_app_name = xlib.get_gzip_name()
                        elif result_dataset_id.startswith(xlib.get_insilico_read_normalization_code()+'-'):
                            bioinfo_app_name = xlib.get_insilico_read_normalization_name()
                        elif result_dataset_id.startswith(xlib.get_miniconda3_code()+'-'):
                            bioinfo_app_name = xlib.get_miniconda3_name()
                        elif result_dataset_id.startswith(xlib.get_ngshelper_code()+'-'):
                            bioinfo_app_name = xlib.get_ngshelper_name()
                        elif result_dataset_id.startswith(xlib.get_quast_code()+'-'):
                            bioinfo_app_name = xlib.get_quast_name()
                        elif result_dataset_id.startswith(xlib.get_r_code()+'-'):
                            bioinfo_app_name = xlib.get_r_name()
                        elif result_dataset_id.startswith(xlib.get_ref_eval_code()+'-'):
                            bioinfo_app_name = xlib.get_ref_eval_name()
                        elif result_dataset_id.startswith(xlib.get_rnaquast_code()+'-'):
                            bioinfo_app_name = xlib.get_rnaquast_name()
                        elif result_dataset_id.startswith(xlib.get_rsem_code()+'-'):
                            bioinfo_app_name = xlib.get_rsem_name()
                        elif result_dataset_id.startswith(xlib.get_rsem_eval_code()+'-'):
                            bioinfo_app_name = xlib.get_rsem_eval_name()
                        elif result_dataset_id.startswith(xlib.get_samtools_code()+'-'):
                            bioinfo_app_name = xlib.get_samtools_name()
                        elif result_dataset_id.startswith(xlib.get_soapdenovotrans_code()+'-'):
                            bioinfo_app_name = xlib.get_soapdenovotrans_name()
                        elif result_dataset_id.startswith(xlib.get_star_code()+'-'):
                            bioinfo_app_name = xlib.get_star_name()
                        elif result_dataset_id.startswith(xlib.get_transabyss_code()+'-'):
                            bioinfo_app_name = xlib.get_transabyss_name()
                        elif result_dataset_id.startswith(xlib.get_transcript_filter_code()+'-'):
                            bioinfo_app_name = xlib.get_transcript_filter_name()
                        elif result_dataset_id.startswith(xlib.get_transcriptome_blastx_code()+'-'):
                            bioinfo_app_name = xlib.get_transcriptome_blastx_name()
                        elif result_dataset_id.startswith(xlib.get_transrate_code()+'-'):
                            bioinfo_app_name = xlib.get_transrate_name()
                        elif result_dataset_id.startswith(xlib.get_trimmomatic_code()+'-'):
                            bioinfo_app_name = xlib.get_trimmomatic_name()
                        elif result_dataset_id.startswith(xlib.get_trinity_code()+'-'):
                            bioinfo_app_name = xlib.get_trinity_name()
                        else:
                            bioinfo_app_name = 'xxx'
                        result_dataset_dict[result_dataset_id] = {'experiment_id': self.wrapper_experiment_id.get(), 'result_dataset_id': result_dataset_id, 'bioinfo_app': bioinfo_app_name, 'date': date, 'time': time}

        # verify if there are any nodes running
        if OK:
            if result_dataset_dict == {}:
                message = 'There is not any run.'
                tkinter.messagebox.showwarning('{0} - {1}'.format(xlib.get_project_name(), self.head), message)

        # build the data list
        if OK:
            data_list = ['experiment_id', 'result_dataset_id', 'bioinfo_app', 'date', 'time']

        # build the data dictionary
        if OK:
            data_dict = {}
            data_dict['experiment_id']= {'text': 'Experiment id. / Process', 'width': 200, 'aligment': 'left'}
            data_dict['result_dataset_id'] = {'text': 'Result dataset', 'width': 200, 'aligment': 'left'}
            data_dict['bioinfo_app'] = {'text': 'Bioinfo app / Utility', 'width': 200, 'aligment': 'left'}
            data_dict['date'] = {'text': 'Date', 'width': 80, 'aligment': 'right'}
            data_dict['time'] = {'text': 'Time', 'width': 80, 'aligment': 'right'}

        # create the dialog Table to show the nodes running
        if OK:
            dialog_table = gdialogs.DialogTable(self, 'Experiment runs in {0}/{1}'.format(xlib.get_cluster_result_dir(), self.wrapper_experiment_id.get()), 400, 900, data_list, data_dict, result_dataset_dict, 'view_result_logs', [self.wrapper_cluster_name.get()])
            self.wait_window(dialog_table)

        # close the form
        if OK:
            self.close()
Esempio n. 14
0
def form_list_cluster_experiment_processes():
    '''
    List the processes of an experiment in the cluster.
    '''

    # initialize the control variable
    OK = True

    # print the header
    clib.clear_screen()
    clib.print_headers_with_environment(
        'Logs - List experiment processes in the cluster')

    # get the cluster name
    print(xlib.get_separator())
    if xec2.get_running_cluster_list(only_environment_cluster=True,
                                     volume_creator_included=False) != []:
        cluster_name = cinputs.input_cluster_name(
            volume_creator_included=False, help=True)
    else:
        print('WARNING: There is not any running cluster.')
        OK = False

    # create the SSH client connection
    if OK:
        (OK, error_list,
         ssh_client) = xssh.create_ssh_client_connection(cluster_name)
        for error in error_list:
            print(error)

    # get experiment identification
    if OK:
        experiment_id = cinputs.input_experiment_id(ssh_client, help=True)
        if experiment_id == '':
            print(
                f'WARNING: The cluster {cluster_name} does not have experiment data.'
            )
            OK = False

    # get the result dataset list of the experiment
    if OK:
        command = f'cd  {xlib.get_cluster_result_dir()}/{experiment_id}; for list in `ls`; do ls -ld $list | grep -v ^- > /dev/null && echo $list; done;'
        (OK, stdout, _) = xssh.execute_cluster_command(ssh_client, command)
        if OK:
            result_dataset_id_list = []
            for line in stdout:
                line = line.rstrip('\n')
                if line != 'lost+found':
                    result_dataset_id_list.append(line)

    # print the result dataset identification list of the experiment
    if OK:
        print(xlib.get_separator())
        if result_dataset_id_list == []:
            print(
                f'*** WARNING: There is not any result dataset of the experiment {experiment_id}.'
            )
        else:
            result_dataset_id_list.sort()
            # set data width
            result_dataset_width = 30
            bioinfo_app_width = 25
            # set line
            line = '{0:' + str(result_dataset_width) + '}   {1:' + str(
                bioinfo_app_width) + '}'
            # print header
            print(line.format('Result dataset', 'Bioinfo app / Utility'))
            print(
                line.format('=' * result_dataset_width,
                            '=' * bioinfo_app_width))
            # print detail lines
            for result_dataset_id in result_dataset_id_list:

                if result_dataset_id.startswith(xlib.get_bedtools_code() +
                                                '-'):
                    bioinfo_app_name = xlib.get_bedtools_name()

                elif result_dataset_id.startswith(xlib.get_blastplus_code() +
                                                  '-'):
                    bioinfo_app_name = xlib.get_blastplus_name()

                elif result_dataset_id.startswith(xlib.get_bcftools_code() +
                                                  '-'):
                    bioinfo_app_name = xlib.get_bcftools_name()

                elif result_dataset_id.startswith(xlib.get_bowtie2_code() +
                                                  '-'):
                    bioinfo_app_name = xlib.get_bowtie2_name()

                elif result_dataset_id.startswith(xlib.get_busco_code() + '-'):
                    bioinfo_app_name = xlib.get_busco_name()

                elif result_dataset_id.startswith(xlib.get_cd_hit_code() +
                                                  '-'):
                    bioinfo_app_name = xlib.get_cd_hit_name()

                elif result_dataset_id.startswith(xlib.get_cd_hit_est_code() +
                                                  '-'):
                    bioinfo_app_name = xlib.get_cd_hit_est_name()

                elif result_dataset_id.startswith(xlib.get_cuffdiff_code() +
                                                  '-'):
                    bioinfo_app_name = xlib.get_cuffdiff_name()

                elif result_dataset_id.startswith(xlib.get_cufflinks_code() +
                                                  '-'):
                    bioinfo_app_name = xlib.get_cufflinks_name()

                elif result_dataset_id.startswith(
                        xlib.get_cufflinks_cuffmerge_code() + '-'):
                    bioinfo_app_name = xlib.get_cufflinks_cuffmerge_name()

                elif result_dataset_id.startswith(xlib.get_cuffnorm_code() +
                                                  '-'):
                    bioinfo_app_name = xlib.get_cuffnorm_name()

                elif result_dataset_id.startswith(xlib.get_cuffquant_code() +
                                                  '-'):
                    bioinfo_app_name = xlib.get_cuffquant_name()

                elif result_dataset_id.startswith(xlib.get_cutadapt_code() +
                                                  '-'):
                    bioinfo_app_name = xlib.get_cutadapt_name()

                elif result_dataset_id.startswith(
                        xlib.get_ddradseq_simulation_code() + '-'):
                    bioinfo_app_name = xlib.get_ddradseq_simulation_name()

                elif result_dataset_id.startswith(
                        xlib.get_ddradseqtools_code() + '-'):
                    bioinfo_app_name = xlib.get_ddradseqtools_name()

                elif result_dataset_id.startswith(xlib.get_detonate_code() +
                                                  '-'):
                    bioinfo_app_name = xlib.get_detonate_name()

                elif result_dataset_id.startswith(xlib.get_diamond_code() +
                                                  '-'):
                    bioinfo_app_name = xlib.get_diamond_name()

                elif result_dataset_id.startswith(xlib.get_emboss_code() +
                                                  '-'):
                    bioinfo_app_name = xlib.get_emboss_name()

                elif result_dataset_id.startswith(
                        xlib.get_entrez_direct_code() + '-'):
                    bioinfo_app_name = xlib.get_entrez_direct_name()

                elif result_dataset_id.startswith(xlib.get_express_code() +
                                                  '-'):
                    bioinfo_app_name = xlib.get_express_name()

                elif result_dataset_id.startswith(xlib.get_fastqc_code() +
                                                  '-'):
                    bioinfo_app_name = xlib.get_fastqc_name()

                elif result_dataset_id.startswith(xlib.get_ggtrinity_code() +
                                                  '-'):
                    bioinfo_app_name = xlib.get_ggtrinity_name()

                elif result_dataset_id.startswith(xlib.get_gmap_gsnap_code() +
                                                  '-'):
                    bioinfo_app_name = xlib.get_gmap_gsnap_name()

                elif result_dataset_id.startswith(xlib.get_gmap_code() + '-'):
                    bioinfo_app_name = xlib.get_gmap_name()

                elif result_dataset_id.startswith(xlib.get_gsnap_code() + '-'):
                    bioinfo_app_name = xlib.get_gsnap_name()

                elif result_dataset_id.startswith(xlib.get_gzip_code() + '-'):
                    bioinfo_app_name = xlib.get_gzip_name()

                elif result_dataset_id.startswith(xlib.get_hisat2_code() +
                                                  '-'):
                    bioinfo_app_name = xlib.get_hisat2_name()

                elif result_dataset_id.startswith(xlib.get_htseq_code() + '-'):
                    bioinfo_app_name = xlib.get_htseq_name()

                elif result_dataset_id.startswith(xlib.get_htseq_count_code() +
                                                  '-'):
                    bioinfo_app_name = xlib.get_htseq_count_name()

                elif result_dataset_id.startswith(
                        xlib.get_insilico_read_normalization_code() + '-'):
                    bioinfo_app_name = xlib.get_insilico_read_normalization_name(
                    )

                elif result_dataset_id.startswith(xlib.get_ipyrad_code() +
                                                  '-'):
                    bioinfo_app_name = xlib.get_ipyrad_name()

                elif result_dataset_id.startswith(xlib.get_kallisto_code() +
                                                  '-'):
                    bioinfo_app_name = xlib.get_kallisto_name()

                elif result_dataset_id.startswith(xlib.get_miniconda3_code() +
                                                  '-'):
                    bioinfo_app_name = xlib.get_miniconda3_name()

                elif result_dataset_id.startswith(xlib.get_ngshelper_code() +
                                                  '-'):
                    bioinfo_app_name = xlib.get_ngshelper_name()

                elif result_dataset_id.startswith(xlib.get_quast_code() + '-'):
                    bioinfo_app_name = xlib.get_quast_name()

                elif result_dataset_id.startswith(xlib.get_r_code() + '-'):
                    bioinfo_app_name = xlib.get_r_name()

                elif result_dataset_id.startswith(xlib.get_raddesigner_code() +
                                                  '-'):
                    bioinfo_app_name = xlib.get_raddesigner_name()

                elif result_dataset_id.startswith(xlib.get_ref_eval_code() +
                                                  '-'):
                    bioinfo_app_name = xlib.get_ref_eval_name()

                elif result_dataset_id.startswith(xlib.get_rnaquast_code() +
                                                  '-'):
                    bioinfo_app_name = xlib.get_rnaquast_name()

                elif result_dataset_id.startswith(xlib.get_rsem_code() + '-'):
                    bioinfo_app_name = xlib.get_rsem_name()

                elif result_dataset_id.startswith(xlib.get_rsem_eval_code() +
                                                  '-'):
                    bioinfo_app_name = xlib.get_rsem_eval_name()

                elif result_dataset_id.startswith(xlib.get_rsitesearch_code() +
                                                  '-'):
                    bioinfo_app_name = xlib.get_rsitesearch_name()

                elif result_dataset_id.startswith(xlib.get_samtools_code() +
                                                  '-'):
                    bioinfo_app_name = xlib.get_samtools_name()

                elif result_dataset_id.startswith(xlib.get_soapdenovo2_code() +
                                                  '-'):
                    bioinfo_app_name = xlib.get_soapdenovo2_name()

                elif result_dataset_id.startswith(
                        xlib.get_soapdenovotrans_code() + '-'):
                    bioinfo_app_name = xlib.get_soapdenovotrans_name()

                elif result_dataset_id.startswith(xlib.get_star_code() + '-'):
                    bioinfo_app_name = xlib.get_star_name()

                elif result_dataset_id.startswith(xlib.get_starcode_code() +
                                                  '-'):
                    bioinfo_app_name = xlib.get_starcode_name()

                elif result_dataset_id.startswith(xlib.get_toa_code() + '-'):
                    bioinfo_app_name = xlib.get_toa_name()

                elif result_dataset_id.startswith(
                        xlib.get_toa_process_download_basic_data_code() + '-'):
                    bioinfo_app_name = xlib.get_toa_process_download_basic_data_name(
                    )

                elif result_dataset_id.startswith(
                        xlib.get_toa_process_download_dicots_04_code() + '-'):
                    bioinfo_app_name = xlib.get_toa_process_download_dicots_04_name(
                    )

                elif result_dataset_id.startswith(
                        xlib.get_toa_process_download_gene_code() + '-'):
                    bioinfo_app_name = xlib.get_toa_process_download_gene_name(
                    )

                elif result_dataset_id.startswith(
                        xlib.get_toa_process_download_go_code() + '-'):
                    bioinfo_app_name = xlib.get_toa_process_download_go_name()

                elif result_dataset_id.startswith(
                        xlib.get_toa_process_download_gymno_01_code() + '-'):
                    bioinfo_app_name = xlib.get_toa_process_download_gymno_01_name(
                    )

                elif result_dataset_id.startswith(
                        xlib.get_toa_process_download_interpro_code() + '-'):
                    bioinfo_app_name = xlib.get_toa_process_download_interpro_name(
                    )

                elif result_dataset_id.startswith(
                        xlib.get_toa_process_download_monocots_04_code() +
                        '-'):
                    bioinfo_app_name = xlib.get_toa_process_download_monocots_04_name(
                    )

                elif result_dataset_id.startswith(
                        xlib.get_toa_process_download_taxonomy_code() + '-'):
                    bioinfo_app_name = xlib.get_toa_process_download_taxonomy_name(
                    )

                elif result_dataset_id.startswith(
                        xlib.
                        get_toa_process_gilist_viridiplantae_nucleotide_gi_code(
                        ) + '-'):
                    bioinfo_app_name = xlib.get_toa_process_gilist_viridiplantae_nucleotide_gi_name(
                    )

                elif result_dataset_id.startswith(
                        xlib.
                        get_toa_process_gilist_viridiplantae_protein_gi_code()
                        + '-'):
                    bioinfo_app_name = xlib.get_toa_process_gilist_viridiplantae_protein_gi_name(
                    )

                elif result_dataset_id.startswith(
                        xlib.get_toa_process_load_basic_data_code() + '-'):
                    bioinfo_app_name = xlib.get_toa_process_load_basic_data_name(
                    )

                elif result_dataset_id.startswith(
                        xlib.get_toa_process_load_dicots_04_code() + '-'):
                    bioinfo_app_name = xlib.get_toa_process_load_dicots_04_name(
                    )

                elif result_dataset_id.startswith(
                        xlib.get_toa_process_load_gene_code() + '-'):
                    bioinfo_app_name = xlib.get_toa_process_load_gene_name()

                elif result_dataset_id.startswith(
                        xlib.get_toa_process_load_go_code() + '-'):
                    bioinfo_app_name = xlib.get_toa_process_load_go_name()

                elif result_dataset_id.startswith(
                        xlib.get_toa_process_load_gymno_01_code() + '-'):
                    bioinfo_app_name = xlib.get_toa_process_load_gymno_01_name(
                    )

                elif result_dataset_id.startswith(
                        xlib.get_toa_process_load_interpro_code() + '-'):
                    bioinfo_app_name = xlib.get_toa_process_load_interpro_name(
                    )

                elif result_dataset_id.startswith(
                        xlib.get_toa_process_load_monocots_04_code() + '-'):
                    bioinfo_app_name = xlib.get_toa_process_load_monocots_04_name(
                    )

                elif result_dataset_id.startswith(
                        xlib.get_toa_process_merge_annotations_code() + '-'):
                    bioinfo_app_name = xlib.get_toa_process_merge_annotations_name(
                    )

                elif result_dataset_id.startswith(
                        xlib.get_toa_process_nr_blastplus_db_code() + '-'):
                    bioinfo_app_name = xlib.get_toa_process_nr_blastplus_db_name(
                    )

                elif result_dataset_id.startswith(
                        xlib.get_toa_process_nr_diamond_db_code() + '-'):
                    bioinfo_app_name = xlib.get_toa_process_nr_diamond_db_name(
                    )

                elif result_dataset_id.startswith(
                        xlib.get_toa_process_nt_blastplus_db_code() + '-'):
                    bioinfo_app_name = xlib.get_toa_process_nt_blastplus_db_name(
                    )

                elif result_dataset_id.startswith(
                        xlib.get_toa_process_pipeline_aminoacid_code() + '-'):
                    bioinfo_app_name = xlib.get_toa_process_pipeline_aminoacid_name(
                    )

                elif result_dataset_id.startswith(
                        xlib.get_toa_process_pipeline_nucleotide_code() + '-'):
                    bioinfo_app_name = xlib.get_toa_process_pipeline_nucleotide_name(
                    )

                elif result_dataset_id.startswith(
                        xlib.get_toa_process_proteome_dicots_04_code() + '-'):
                    bioinfo_app_name = xlib.get_toa_process_proteome_dicots_04_name(
                    )

                elif result_dataset_id.startswith(
                        xlib.get_toa_process_proteome_gymno_01_code() + '-'):
                    bioinfo_app_name = xlib.get_toa_process_proteome_gymno_01_name(
                    )

                elif result_dataset_id.startswith(
                        xlib.get_toa_process_proteome_monocots_04_code() +
                        '-'):
                    bioinfo_app_name = xlib.get_toa_process_proteome_monocots_04_name(
                    )

                elif result_dataset_id.startswith(
                        xlib.get_toa_process_proteome_refseq_plant_code() +
                        '-'):
                    bioinfo_app_name = xlib.get_toa_process_proteome_refseq_plant_name(
                    )

                elif result_dataset_id.startswith(
                        xlib.get_toa_process_rebuild_toa_database_code() +
                        '-'):
                    bioinfo_app_name = xlib.get_get_toa_process_rebuild_toa_database_name(
                    )

                elif result_dataset_id.startswith(
                        xlib.get_toa_process_recreate_toa_database_code() +
                        '-'):
                    bioinfo_app_name = xlib.get_get_toa_process_recreate_toa_database_name(
                    )

                elif result_dataset_id.startswith(xlib.get_tophat_code() +
                                                  '-'):
                    bioinfo_app_name = xlib.get_tophat_name()

                elif result_dataset_id.startswith(xlib.get_transabyss_code() +
                                                  '-'):
                    bioinfo_app_name = xlib.get_transabyss_name()

                elif result_dataset_id.startswith(
                        xlib.get_transcript_filter_code() + '-'):
                    bioinfo_app_name = xlib.get_transcript_filter_name()

                elif result_dataset_id.startswith(
                        xlib.get_transcriptome_blastx_code() + '-'):
                    bioinfo_app_name = xlib.get_transcriptome_blastx_name()

                elif result_dataset_id.startswith(
                        xlib.get_transdecoder_code() + '-'):
                    bioinfo_app_name = xlib.get_transdecoder_name()

                elif result_dataset_id.startswith(xlib.get_transrate_code() +
                                                  '-'):
                    bioinfo_app_name = xlib.get_transrate_name()

                elif result_dataset_id.startswith(xlib.get_trimmomatic_code() +
                                                  '-'):
                    bioinfo_app_name = xlib.get_trimmomatic_name()

                elif result_dataset_id.startswith(xlib.get_trinity_code() +
                                                  '-'):
                    bioinfo_app_name = xlib.get_trinity_name()

                elif result_dataset_id.startswith(
                        xlib.get_variant_calling_code() + '-'):
                    bioinfo_app_name = xlib.get_variant_calling_name()

                elif result_dataset_id.startswith(xlib.get_vcftools_code() +
                                                  '-'):
                    bioinfo_app_name = xlib.get_vcftools_name()

                elif result_dataset_id.startswith(
                        xlib.get_vcftools_perl_libraries_code() + '-'):
                    bioinfo_app_name = xlib.get_vcftools_perl_libraries_name()

                elif result_dataset_id.startswith(xlib.get_vsearch_code() +
                                                  '-'):
                    bioinfo_app_name = xlib.get_vsearch_name()

                else:
                    bioinfo_app_name = 'xxx'

                print(line.format(result_dataset_id, bioinfo_app_name))

    # close the SSH client connection
    if OK:
        xssh.close_ssh_client_connection(ssh_client)

    # show continuation message
    print(xlib.get_separator())
    input('Press [Intro] to continue ...')