コード例 #1
0
ファイル: cinputs.py プロジェクト: GGFHF/TOA
def input_int(text,
              default=None,
              minimum=(-sys.maxsize - 1),
              maximum=sys.maxsize):
    '''
    Ask the user for an integer number until a valid one is typed.

    The prompt is "text" followed, when "default" is given, by the default
    value between square brackets; an empty answer then selects the default.
    An answer is accepted when xlib.check_int validates it against "minimum"
    and "maximum"; otherwise an error message is printed and the question is
    asked again. The accepted answer is returned converted to int.
    '''

    # build the prompt, showing the default value when there is one
    if default is None:
        prompt = f'{text}: '
    else:
        prompt = f'{text} [{default}]: '

    # keep asking until the answer is a valid integer number
    while True:
        answer = input(prompt)

        # an empty answer means "use the default value" (only if there is one)
        if default is not None and answer == '':
            answer = default

        if xlib.check_int(answer, minimum, maximum):
            return int(answer)

        print(f'*** ERROR: {answer} is not a valid value.')
コード例 #2
0
def check_args(args):
    '''
    Validate the input arguments and normalise the optional ones in place.

    Every problem found is reported with xlib.Message.print('error', ...).
    Once all the checks are done, xlib.ProgramException('', 'P001') is raised
    if at least one problem was reported.
    '''

    # number of problems reported so far
    error_counter = 0

    def report(message):
        # print an error message and count it
        nonlocal error_counter
        xlib.Message.print('error', message)
        error_counter += 1

    # check "vcf_file" and "sample_file": both are mandatory and have to exist
    input_file_checks = (
        (args.vcf_file,
         '*** The VCF file is not indicated in the input arguments.'),
        (args.sample_file,
         '*** The sample file is not indicated in the input arguments.'),
    )
    for input_file, missing_message in input_file_checks:
        if input_file is None:
            report(missing_message)
        elif not os.path.isfile(input_file):
            report(f'*** The file {input_file} does not exist.')

    # check "sp1_id" and "sp2_id": both are mandatory
    species_id_checks = (
        (args.sp1_id,
         '*** The identification of the first species is not indicated in the input arguments.'),
        (args.sp2_id,
         '*** The identification of the second species is not indicated in the input arguments.'),
    )
    for species_id, missing_message in species_id_checks:
        if species_id is None:
            report(missing_message)

    # check "hybrid_id": it is optional and defaults to 'NONE'
    if args.hybrid_id is None:
        args.hybrid_id = 'NONE'

    # check "output_dir": it is mandatory and has to exist
    if args.output_dir is None:
        report('*** The output directy is not indicated in the input arguments.')
    elif not os.path.isdir(args.output_dir):
        report('*** The output directy does not exist.')

    # check "variant_number_per_file": default value or an integer >= 1
    if args.variant_number_per_file is None:
        args.variant_number_per_file = xlib.Const.DEFAULT_VARIANT_NUMBER_PER_FILE
    elif xlib.check_int(args.variant_number_per_file, minimum=1):
        args.variant_number_per_file = int(args.variant_number_per_file)
    else:
        report('The variant number per file has to be an integer number greater than 0.')

    # check "allele_transformation": 'NONE' by default, else a known code (stored in upper case)
    if args.allele_transformation is None:
        args.allele_transformation = 'NONE'
    elif xlib.check_code(args.allele_transformation,
                         xlib.get_allele_transformation_code_list(),
                         case_sensitive=False):
        args.allele_transformation = args.allele_transformation.upper()
    else:
        report(f'*** The allele transformation has to be {xlib.get_allele_transformation_code_list_text()}.')

    # check "verbose"; the status is switched on even if other arguments are wrong
    if args.verbose is None:
        args.verbose = xlib.Const.DEFAULT_VERBOSE
    elif not xlib.check_code(args.verbose,
                             xlib.get_verbose_code_list(),
                             case_sensitive=False):
        report(f'*** verbose has to be {xlib.get_verbose_code_list_text()}.')
    if args.verbose.upper() == 'Y':
        xlib.Message.set_verbose_status(True)

    # check "trace"; same treatment as "verbose"
    if args.trace is None:
        args.trace = xlib.Const.DEFAULT_TRACE
    elif not xlib.check_code(args.trace,
                             xlib.get_trace_code_list(),
                             case_sensitive=False):
        report(f'*** trace has to be {xlib.get_trace_code_list_text()}.')
    if args.trace.upper() == 'Y':
        xlib.Message.set_trace_status(True)

    # check "tvi_list": no value or 'NONE' means an empty list
    if args.tvi_list is None or args.tvi_list == 'NONE':
        args.tvi_list = []
    else:
        args.tvi_list = xlib.split_literal_to_string_list(args.tvi_list)

    # check the identification set (only when every previous check passed)
    if error_counter == 0:
        repeated_id = args.sp1_id == args.sp2_id
        if not repeated_id and args.hybrid_id is not None:
            repeated_id = (args.sp1_id == args.hybrid_id
                           or args.sp2_id == args.hybrid_id)
        if repeated_id:
            report('The identifications must be different.')

    # if there are errors, exit with exception
    if error_counter > 0:
        raise xlib.ProgramException('', 'P001')
コード例 #3
0
ファイル: simhyb2structure.py プロジェクト: Chebuu/NGShelper
def check_args(args):
    '''
    Validate the input arguments and normalise the optional ones in place.

    Every problem found is reported with xlib.Message.print('error', ...).
    Once all the checks are done, xlib.ProgramException('', 'P001') is raised
    if at least one problem was reported.
    '''

    # number of problems reported so far
    error_counter = 0

    def report(message):
        # print an error message and count it
        nonlocal error_counter
        xlib.Message.print('error', message)
        error_counter += 1

    # check "simhyb_file": it is mandatory and has to exist
    simhyb_file = args.simhyb_file
    if simhyb_file is None:
        report('*** The SimHyb file is not indicated in the input arguments.')
    elif not os.path.isfile(simhyb_file):
        report(f'*** The file {simhyb_file} does not exist.')

    # check "header_row_number": it is mandatory and has to be an integer >= 0
    if args.header_row_number is None:
        report('*** The header row number in the SimHyb file is not indicated in the input arguments.')
    elif xlib.check_int(args.header_row_number, minimum=0):
        args.header_row_number = int(args.header_row_number)
    else:
        report('The header row number in the SimHyb file has to be an integer number greater than or equalt to 0.')

    # check "structure_file": it is mandatory
    if args.structure_file is None:
        report('*** The converted Structure file is not indicated in the input arguments.')

    # check "verbose"; the status is switched on even if other arguments are wrong
    if args.verbose is None:
        args.verbose = xlib.Const.DEFAULT_VERBOSE
    elif not xlib.check_code(args.verbose,
                             xlib.get_verbose_code_list(),
                             case_sensitive=False):
        report(f'*** verbose has to be {xlib.get_verbose_code_list_text()}.')
    if args.verbose.upper() == 'Y':
        xlib.Message.set_verbose_status(True)

    # check "trace"; same treatment as "verbose"
    if args.trace is None:
        args.trace = xlib.Const.DEFAULT_TRACE
    elif not xlib.check_code(args.trace,
                             xlib.get_trace_code_list(),
                             case_sensitive=False):
        report(f'*** trace has to be {xlib.get_trace_code_list_text()}.')
    if args.trace.upper() == 'Y':
        xlib.Message.set_trace_status(True)

    # if there are errors, exit with exception
    if error_counter > 0:
        raise xlib.ProgramException('', 'P001')
コード例 #4
0
ファイル: vcf2structure.py プロジェクト: Chebuu/NGShelper
def convert_vcf_to_structure(vcf_file, sample_file, sp1_id, sp2_id, hybrid_id,
                             imputed_md_id, new_md_id, allele_transformation,
                             structure_file_type, output_converted_file,
                             tvi_list):
    '''
    Convert a VCF file to the Structure input formats.

    The output is a tab-separated file with a header record ("sample_id",
    "species_id" and one code "chrom-pos" per variant) followed by two records
    per sample: the first one holds the left side allele of every genotype and
    the second one the right side allele. The species is written as 1 (sp1_id),
    2 (sp2_id) or 3 (any other species).

    Parameters:
        vcf_file: path of the VCF file; it is read with gzip if it ends in ".gz".
        sample_file, sp1_id, sp2_id, hybrid_id: passed to xlib.get_sample_data
            to get the species of every sample of the VCF file.
        imputed_md_id: allele value considered imputed missing data; variants
            holding it in any sample are excluded when "structure_file_type"
            is '1'.
        new_md_id: value written instead of the missing data symbol returned
            by xlib.get_md_symbol().
        allele_transformation: if it is 'ADD100', 100 is added to every allele
            accepted by xlib.check_int.
        structure_file_type: '1' activates the exclusion of variants with
            imputed missing data.
        output_converted_file: path of the output file; it is written with
            gzip if it ends in ".gz".
        tvi_list: identifications ("chrom-pos") of the variants whose
            genotypes are traced.

    Raises:
        xlib.ProgramException: a file cannot be opened (F001-F004), a sample
            of the VCF file is not in the sample data (L002), there are no
            samples (L003), the FORMAT field has no GT subfield (L007) or a
            genotype has neither "/" nor "|" separator (L008).
    '''

    # initialize the sample number
    sample_number = 0

    # initialize the sample information list
    sample_info_list = []

    # initialize the variant code list
    variant_code_list = []

    # initialize the matrices (rows: variants; columns: samples) on left and right sides of genotypes
    gt_left_matrix = []
    gt_right_matrix = []

    # get the sample data
    sample_dict = xlib.get_sample_data(sample_file, sp1_id, sp2_id, hybrid_id)

    # open the VCF file
    if vcf_file.endswith('.gz'):
        try:
            vcf_file_id = gzip.open(vcf_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', vcf_file)
    else:
        try:
            vcf_file_id = open(vcf_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', vcf_file)

    # initialize counters
    record_counter = 0
    variant_counter = 0

    def print_counters():
        # print the current values of the counters
        xlib.Message.print(
            'verbose',
            f'\rProcessed VCF records ... {record_counter:8d} - Variants ... {variant_counter:8d}'
        )

    # the try/finally guarantees that the VCF file is closed even if a record is rejected
    try:

        # read the first record of VCF file
        (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

        # while there are records in the VCF file
        while record != '':

            # process metadata records
            while record != '' and record.startswith('##'):

                # add 1 to the VCF record counter
                record_counter += 1

                # print the counters
                print_counters()

                # read the next record of the VCF file
                (record, _,
                 data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

            # process the column description record
            if record.startswith('#CHROM'):

                # add 1 to the VCF record counter
                record_counter += 1

                # build the sample information list (the sample columns start at the tenth one)
                for sample_id in data_dict['record_data_list'][9:]:
                    try:
                        species_id = sample_dict[sample_id]['species_id']
                    except Exception as e:
                        raise xlib.ProgramException(e, 'L002', sample_id)
                    if species_id == sp1_id:
                        numeric_species_id = 1
                    elif species_id == sp2_id:
                        numeric_species_id = 2
                    else:
                        numeric_species_id = 3
                    sample_info_list.append([sample_id, numeric_species_id])

                # check if the sample information list is empty
                if not sample_info_list:
                    raise xlib.ProgramException('', 'L003')

                # set the sample number
                sample_number = len(sample_info_list)

                # print the counters
                print_counters()

                # read the next record of the VCF file
                (record, _,
                 data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

            # process variant records
            while record != '' and not record.startswith(('##', '#CHROM')):

                # set the variant identification
                variant_id = f'{data_dict["chrom"]}-{data_dict["pos"]}'

                # add 1 to the VCF record counter
                record_counter += 1

                # add 1 to the variant counter
                variant_counter += 1

                # append the variant identification to the variant code list
                variant_code_list.append(variant_id)

                # get the position of the genotype (subfield GT) in the field FORMAT
                format_subfield_list = data_dict['format'].upper().split(':')
                try:
                    gt_position = format_subfield_list.index('GT')
                except Exception as e:
                    raise xlib.ProgramException(e, 'L007', 'GT',
                                                data_dict['chrom'],
                                                data_dict['pos'])

                # build the list of sample genotypes of a variant
                sample_gt_list = [
                    data_dict['sample_list'][i].split(':')[gt_position]
                    for i in range(sample_number)
                ]
                if variant_id in tvi_list:
                    xlib.Message.print('trace',
                                       f'(4) sample_gt_list: {sample_gt_list}')

                # build the lists of the left and right side of sample genotypes of a variant
                sample_gt_left_list = []
                sample_gt_right_list = []
                for sample_gt in sample_gt_list:

                    # look for the separator: "/" first and, if it is not found, "|"
                    sep_pos = sample_gt.find('/')
                    if sep_pos == -1:
                        sep_pos = sample_gt.find('|')
                    if sep_pos == -1:
                        raise xlib.ProgramException('', 'L008', 'GT',
                                                    data_dict['chrom'],
                                                    data_dict['pos'])

                    # split the genotype and replace the missing data symbol by the new identification
                    try:
                        md_symbol = xlib.get_md_symbol()
                        allele_left = sample_gt[:sep_pos]
                        allele_right = sample_gt[sep_pos + 1:]
                        sample_gt_left_list.append(
                            new_md_id if allele_left == md_symbol else allele_left)
                        sample_gt_right_list.append(
                            new_md_id if allele_right == md_symbol else allele_right)
                    except Exception as e:
                        raise xlib.ProgramException(e, 'L008', 'GT',
                                                    data_dict['chrom'],
                                                    data_dict['pos'])

                # append a row to the matrices (rows: variant; columns: samples) of left and right sides of genotypes
                gt_left_matrix.append(sample_gt_left_list)
                gt_right_matrix.append(sample_gt_right_list)

                # print the counters
                print_counters()

                # read the next record of the VCF file
                (record, _,
                 data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

        xlib.Message.print('verbose', '\n')

    finally:

        # close the VCF file
        vcf_file_id.close()

    # review the imputed missing data when the type of the converted file is 1
    if structure_file_type == '1':

        # detect variants with any imputed missing data
        excluded_variant_index_list = [
            i for i, (gt_left_row, gt_right_row) in enumerate(
                zip(gt_left_matrix, gt_right_matrix))
            if imputed_md_id in gt_left_row or imputed_md_id in gt_right_row
        ]
        xlib.Message.print(
            'trace',
            f'excluded_variant_index_list: {excluded_variant_index_list}')

        # remove data of variants with any imputed missing data (one pass per list instead of repeated pops)
        if excluded_variant_index_list:
            excluded_variant_index_set = set(excluded_variant_index_list)
            variant_code_list = [
                code for k, code in enumerate(variant_code_list)
                if k not in excluded_variant_index_set
            ]
            gt_left_matrix = [
                row for k, row in enumerate(gt_left_matrix)
                if k not in excluded_variant_index_set
            ]
            gt_right_matrix = [
                row for k, row in enumerate(gt_right_matrix)
                if k not in excluded_variant_index_set
            ]

    # the comparison does not depend on the allele, so it is done only once
    add_100 = allele_transformation == 'ADD100'

    def transform_allele(allele):
        # add 100 to the allele when the transformation is requested and the allele is an integer
        if add_100 and xlib.check_int(allele):
            return str(int(allele) + 100)
        return allele

    # open the output converted file
    if output_converted_file.endswith('.gz'):
        try:
            output_converted_file_id = gzip.open(output_converted_file,
                                                 mode='wt',
                                                 encoding='iso-8859-1',
                                                 newline='\n')
        except Exception as e:
            raise xlib.ProgramException(e, 'F004', output_converted_file)
    else:
        try:
            output_converted_file_id = open(output_converted_file,
                                            mode='w',
                                            encoding='iso-8859-1',
                                            newline='\n')
        except Exception as e:
            raise xlib.ProgramException(e, 'F003', output_converted_file)

    # the context manager closes the output file even if a write fails
    with output_converted_file_id:

        # write header record
        variant_code_list_text = '\t'.join(variant_code_list)
        output_converted_file_id.write(
            f'sample_id\tspecies_id\t{variant_code_list_text}\n')

        # write sample records
        for i, (sample_id, numeric_species_id) in enumerate(sample_info_list):

            # build left and right side lists of variants of a sample
            sample_variant_gt_left_list = [
                transform_allele(gt_left_row[i])
                for gt_left_row in gt_left_matrix
            ]
            sample_variant_gt_right_list = [
                transform_allele(gt_right_row[i])
                for gt_right_row in gt_right_matrix
            ]

            # write the first record of the sample
            sample_variant_gt_left_list_text = '\t'.join(
                sample_variant_gt_left_list)
            output_converted_file_id.write(
                f'{sample_id}\t{numeric_species_id}\t{sample_variant_gt_left_list_text}\n'
            )

            # write the second record of the sample
            sample_variant_gt_right_list_text = '\t'.join(
                sample_variant_gt_right_list)
            output_converted_file_id.write(
                f'{sample_id}\t{numeric_species_id}\t{sample_variant_gt_right_list_text}\n'
            )

    # print OK message
    xlib.Message.print(
        'info',
        f'The converted file {os.path.basename(output_converted_file)} is created.'
    )
コード例 #5
0
ファイル: xbusco.py プロジェクト: GGFHF/NGScloud2
def check_busco_config_file(strict):
    '''
    Check the BUSCO config file of a run.

    The option dictionary is built with xlib.get_option_dict from the file
    returned by get_busco_config_file(), and the sections "identification" and
    "BUSCO parameters" are validated key by key.

    Parameters:
        strict: not used by the checks done in this function.

    Returns:
        A tuple (OK, error_list): OK is False if any check failed and
        error_list holds the messages of all the problems found (plus a final
        summary message when OK is False).
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # initialize variable used when value is not found
    not_found = '***NOTFOUND***'

    # get the option dictionary
    try:
        busco_option_dict = xlib.get_option_dict(get_busco_config_file())
    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append(
            '*** ERROR: The option dictionary could not be built from the config file'
        )
        OK = False
    else:

        # get the sections list
        sections_list = sorted(busco_option_dict.keys())

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append(
                '*** ERROR: the section "identification" is not found.')
            OK = False
        else:

            identification_dict = busco_option_dict.get('identification', {})

            # check section "identification" - key "experiment_id"
            experiment_id = identification_dict.get('experiment_id', not_found)
            if experiment_id == not_found:
                error_list.append(
                    '*** ERROR: the key "experiment_id" is not found in the section "identification".'
                )
                OK = False

            # check section "identification" - key "assembly_software"
            assembly_software = identification_dict.get(
                'assembly_software', not_found)
            if assembly_software == not_found:
                error_list.append(
                    '*** ERROR: the key "assembly_software" is not found in the section "identification".'
                )
                OK = False
            elif not xlib.check_code(assembly_software,
                                     get_assembly_software_code_list(),
                                     case_sensitive=False):
                error_list.append(
                    f'*** ERROR: the key "assembly_software" has to be {get_assembly_software_code_list_text()}.'
                )
                OK = False

            # check section "identification" - key "assembly_dataset_id"
            assembly_dataset_id = identification_dict.get(
                'assembly_dataset_id', not_found)
            if assembly_dataset_id == not_found:
                error_list.append(
                    '*** ERROR: the key "assembly_dataset_id" is not found in the section "identification".'
                )
                OK = False
            elif not xlib.check_startswith(assembly_dataset_id,
                                           get_assembly_software_code_list(),
                                           case_sensitive=True):
                error_list.append(
                    f'*** ERROR: the key "assembly_dataset_id" has to start with {get_assembly_software_code_list_text()}.'
                )
                OK = False

            # check section "identification" - key "assembly_type"
            assembly_type = identification_dict.get('assembly_type', not_found)
            if assembly_type == not_found:
                error_list.append(
                    '*** ERROR: the key "assembly_type" is not found in the section "identification".'
                )
                OK = False
            else:
                # the valid values depend on the software that built the assembly dataset
                is_soapdenovotrans = assembly_dataset_id.startswith(
                    xlib.get_soapdenovotrans_code())
                if is_soapdenovotrans:
                    valid_assembly_type = assembly_type.upper() in [
                        'CONTIGS', 'SCAFFOLDS'
                    ]
                else:
                    valid_assembly_type = assembly_type.upper() == 'NONE'
                if not valid_assembly_type:
                    error_list.append(
                        f'*** ERROR: the key "assembly_type" has to be CONTIGS or SCAFFOLDS in {xlib.get_soapdenovotrans_name()} or NONE in any other case.'
                    )
                    OK = False

        # check section "BUSCO parameters"
        if 'BUSCO parameters' not in sections_list:
            error_list.append(
                '*** ERROR: the section "BUSCO parameters" is not found.')
            OK = False
        else:

            parameters_dict = busco_option_dict.get('BUSCO parameters', {})

            # check section "BUSCO parameters" - key "ncpu"
            ncpu = parameters_dict.get('ncpu', not_found)
            if ncpu == not_found:
                error_list.append(
                    '*** ERROR: the key "ncpu" is not found in the section "BUSCO parameters".'
                )
                OK = False
            elif not xlib.check_int(ncpu, minimum=1):
                error_list.append(
                    '*** ERROR: the key "ncpu" has to be an integer number greater than or equal to 1.'
                )
                OK = False

            # check section "BUSCO parameters" - key "lineage_data_url"
            lineage_data_url = parameters_dict.get('lineage_data_url',
                                                   not_found)
            if lineage_data_url == not_found:
                error_list.append(
                    '*** ERROR: the key "lineage_data_url" is not found in the section "BUSCO parameters"'
                )
                OK = False
            else:
                try:
                    # only reachability matters; the response is closed at once to release the connection
                    with urllib.request.urlopen(lineage_data_url):
                        pass
                except Exception as e:
                    error_list.append(f'*** EXCEPTION: "{e}".')
                    error_list.append(
                        '*** ERROR: the key "lineage_data_url" has to be a reachable address.'
                    )
                    OK = False

            # check section "BUSCO parameters" - key "mode"
            mode = parameters_dict.get('mode', not_found)
            if mode == not_found:
                error_list.append(
                    '*** ERROR: the key "mode" is not found in the section "BUSCO parameters".'
                )
                OK = False
            elif not xlib.check_code(
                    mode, get_mode_code_list(), case_sensitive=False):
                error_list.append(
                    f'*** ERROR: the key "mode" has to be {get_mode_code_list_text()}.'
                )
                OK = False

            # check section "BUSCO parameters" - key "evalue"
            evalue = parameters_dict.get('evalue', not_found)
            if evalue == not_found:
                error_list.append(
                    '*** ERROR: the key "evalue" is not found in the section "BUSCO parameters".'
                )
                OK = False
            elif not xlib.check_float(evalue, minimum=0., mne=1E-12):
                error_list.append(
                    '*** ERROR: the key "evalue" has to be a float number greater than 0.'
                )
                OK = False

            # check section "BUSCO parameters" - key "limit"
            limit = parameters_dict.get('limit', not_found)
            if limit == not_found:
                error_list.append(
                    '*** ERROR: the key "limit" is not found in the section "BUSCO parameters".'
                )
                OK = False
            elif not xlib.check_int(limit, minimum=1):
                error_list.append(
                    '*** ERROR: the key "limit" has to be an integer number greater than or equal to 1.'
                )
                OK = False

            # check section "BUSCO parameters" - key "species"
            species = parameters_dict.get('species', not_found)
            if species == not_found:
                error_list.append(
                    '*** ERROR: the key "species" is not found in the section "BUSCO parameters"'
                )
                OK = False

            # check section "BUSCO parameters" - key "long"
            long = parameters_dict.get('long', not_found)
            if long == not_found:
                error_list.append(
                    '*** ERROR: the key "long" is not found in the section "BUSCO parameters".'
                )
                OK = False
            elif not xlib.check_code(
                    long, get_long_code_list(), case_sensitive=False):
                error_list.append(
                    f'*** ERROR: the key "long" has to be {get_long_code_list_text()}.'
                )
                OK = False

            # check section "BUSCO parameters" - key "augustus_options"
            augustus_options = parameters_dict.get('augustus_options',
                                                   not_found)
            if augustus_options == not_found:
                error_list.append(
                    '*** ERROR: the key "augustus_options" is not found in the section "BUSCO parameters".'
                )
                OK = False
            elif augustus_options.upper() != 'NONE':
                # keep the result in its own variable: assigning it to OK would
                # hide the errors detected by the previous checks
                (augustus_options_ok,
                 augustus_options_error_list) = xlib.check_parameter_list(
                     augustus_options, "augustus_options", [])
                error_list = error_list + augustus_options_error_list
                if not augustus_options_ok:
                    OK = False

    # warn that the results config file is not valid if there are any errors
    if not OK:
        error_list.append(
            f'\nThe {xlib.get_busco_name()} config file is not valid. Please, correct this file or recreate it.'
        )

    # return the control variable and the error list
    return (OK, error_list)
コード例 #6
0
def _get_phase_allele(allele, multiallelic_status, allele_transformation):
    '''
    Get the PHASE representation of one allele of a sample genotype.
    '''

    # missing data are coded "?" in non-multiallelic variants and "-1" in multiallelic variants
    if allele == '.' and multiallelic_status == 'S':
        return '?'
    if allele == '.' and multiallelic_status == 'M':
        return '-1'

    # integer alleles are shifted by 100 when the transformation "ADD100" is requested
    if xlib.check_int(allele) and allele_transformation == 'ADD100':
        return str(int(allele) + 100)

    # any other allele is kept as it is
    return allele


def convert_vcf_to_phase_input(vcf_file, sample_file, sp1_id, sp2_id,
                               hybrid_id, imputed_md_id, allele_transformation,
                               output_dir, tvi_list):
    '''
    Convert a VCF file to the PHASE input format.

    A converted file "<output_dir>/<VCF base name>-2phase-<seq>.txt" is written
    for every run of consecutive variant records that share the same "chrom" value.

    Parameters:
        vcf_file: path of the VCF file; it is read with gzip when it ends with ".gz".
        sample_file, sp1_id, sp2_id, hybrid_id: passed to xlib.get_sample_data to
            get the species identification of every sample.
        imputed_md_id: not used by this function.
        allele_transformation: when it is "ADD100", 100 is added to integer alleles.
        output_dir: directory where the converted files are written.
        tvi_list: variant identifications ("<chrom>-<pos>") whose intermediate
            data are printed as trace messages.

    Raises:
        xlib.ProgramException: F001/F002 when the VCF file cannot be opened, F003
            when a converted file cannot be opened, L002 when a sample of the VCF
            file is not in the sample data, L003 when there are no samples, L007
            when the field FORMAT has no subfield GT and L008 when a genotype has
            neither "/" nor "|".
    '''

    # initialize the sample number
    sample_number = 0

    # initialize the sample information list
    sample_info_list = []

    # get the sample data
    sample_dict = xlib.get_sample_data(sample_file, sp1_id, sp2_id, hybrid_id)

    # initialize the sample species identification list per variant
    species_id_list = []

    # set the base name of the converted files (it does not change between sequences)
    if vcf_file.endswith('.gz'):
        file_name = os.path.splitext(os.path.basename(vcf_file[:-3]))[0]
    else:
        file_name = os.path.splitext(os.path.basename(vcf_file))[0]

    # open the VCF file
    if vcf_file.endswith('.gz'):
        try:
            vcf_file_id = gzip.open(vcf_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', vcf_file)
    else:
        try:
            vcf_file_id = open(vcf_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', vcf_file)

    # the VCF file is closed even if an exception is raised while it is processed
    try:

        # initialize counters
        seq_counter = 0
        variant_counter = 0
        record_counter = 0

        # read the first record of VCF file
        (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

        # while there are records in the VCF file
        while record != '':

            # process metadata records
            while record != '' and record.startswith('##'):

                # add 1 to the VCF record counter
                record_counter += 1

                # print the counters
                xlib.Message.print(
                    'verbose',
                    f'\rProcessed VCF records ... {record_counter:8d} - Seqs ... {seq_counter:8d} - Variants ... {variant_counter:8d}'
                )

                # read the next record of the VCF file
                (record, _,
                 data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

            # process the column description record
            if record.startswith('#CHROM'):

                # add 1 to the VCF record counter
                record_counter += 1

                # get the record data list
                record_data_list = data_dict['record_data_list']

                # build the sample information list and the sample species list
                # (the sample identifications start at the tenth column)
                for sample_id in record_data_list[9:]:
                    try:
                        species_id = sample_dict[sample_id]['species_id']
                    except Exception as e:
                        raise xlib.ProgramException(e, 'L002', sample_id)
                    if species_id == sp1_id:
                        numeric_species_id = 1
                    elif species_id == sp2_id:
                        numeric_species_id = 2
                    else:
                        numeric_species_id = 3
                    sample_info_list.append([sample_id, numeric_species_id])
                    species_id_list.append(species_id)

                # check if the sample species list is empty
                if species_id_list == []:
                    raise xlib.ProgramException('', 'L003')

                # set the sample number
                sample_number = len(species_id_list)

                # print the counters
                xlib.Message.print(
                    'verbose',
                    f'\rProcessed VCF records ... {record_counter:8d} - Seqs ... {seq_counter:8d} - Variants ... {variant_counter:8d}'
                )

                # read the next record of the VCF file
                (record, _,
                 data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

            # process variant records
            while record != '' and not record.startswith(
                    '##') and not record.startswith('#CHROM'):

                # add 1 to the sequence counter
                seq_counter += 1

                # initialize the variant counter of the sequence
                variant_counter = 0

                # save the sequence
                old_seq = data_dict['chrom']

                # initialize the list of variant positions
                variant_position_list = []

                # initialize the matrices (rows: variants; columns: samples) on left and right sides of genotypes
                gt_left_matrix = []
                gt_right_matrix = []

                # initialize the list of the variant multiallelic status
                variant_multiallelic_status_list = []

                # process variant records of the same sequence
                while record != '' and not record.startswith(
                        '##') and not record.startswith(
                            '#CHROM') and data_dict['chrom'] == old_seq:

                    # set the variant identification (per variant, so that the trace
                    # messages are printed for the variants listed in "tvi_list")
                    variant_id = f'{data_dict["chrom"]}-{data_dict["pos"]}'

                    # add 1 to the VCF record counter
                    record_counter += 1

                    # add 1 to the variant counter of the sequence
                    variant_counter += 1

                    # append position to the list of variant positions
                    variant_position_list.append(data_dict['pos'])

                    # get the position of the genotype (subfield GT) in the field FORMAT
                    format_subfield_list = data_dict['format'].upper().split(':')
                    try:
                        gt_position = format_subfield_list.index('GT')
                    except ValueError as e:
                        raise xlib.ProgramException(e, 'L007', 'GT',
                                                    data_dict['chrom'],
                                                    data_dict['pos'])

                    # build the list of sample genotypes of a variant
                    sample_gt_list = [
                        data_dict['sample_list'][i].split(':')[gt_position]
                        for i in range(sample_number)
                    ]
                    if variant_id in tvi_list:
                        xlib.Message.print('trace',
                                           f'sample_gt_list: {sample_gt_list}')

                    # build the lists of the left and right side of sample genotypes of a variant
                    sample_gt_left_list = []
                    sample_gt_right_list = []
                    for sample_gt in sample_gt_list:
                        # the genotype separator is "/" or, failing that, "|"
                        sep_pos = sample_gt.find('/')
                        if sep_pos == -1:
                            sep_pos = sample_gt.find('|')
                        if sep_pos == -1:
                            raise xlib.ProgramException('', 'L008', 'GT',
                                                        data_dict['chrom'],
                                                        data_dict['pos'])
                        sample_gt_left_list.append(sample_gt[:sep_pos])
                        sample_gt_right_list.append(sample_gt[sep_pos + 1:])

                    # get the allele counters (missing data are not counted)
                    md_symbol = xlib.get_md_symbol()
                    allele_counter_dict = {}
                    for (allele_left, allele_right) in zip(sample_gt_left_list,
                                                           sample_gt_right_list):
                        if allele_left != md_symbol:
                            allele_counter_dict[
                                allele_left] = allele_counter_dict.get(
                                    allele_left, 0) + 1
                        if allele_right != md_symbol:
                            allele_counter_dict[
                                allele_right] = allele_counter_dict.get(
                                    allele_right, 0) + 1
                    if variant_id in tvi_list:
                        xlib.Message.print(
                            'trace',
                            f'allele_counter_dict: {allele_counter_dict}')

                    # check if the variant is multiallelic (more than two different alleles)
                    if len(allele_counter_dict) > 2:
                        variant_multiallelic_status = 'M'
                    else:
                        variant_multiallelic_status = 'S'
                    if variant_id in tvi_list:
                        xlib.Message.print(
                            'trace',
                            f'variant_multiallelic_status: {variant_multiallelic_status}.'
                        )

                    # append a row to the matrices (rows: variant; columns: samples) of left and right sides of genotypes
                    gt_left_matrix.append(sample_gt_left_list)
                    gt_right_matrix.append(sample_gt_right_list)

                    # append to the list of the variant multiallelic status
                    variant_multiallelic_status_list.append(
                        variant_multiallelic_status)

                    # print the counters
                    xlib.Message.print(
                        'verbose',
                        f'\rProcessed VCF records ... {record_counter:8d} - Seqs ... {seq_counter:8d} - Variants ... {variant_counter:8d}'
                    )

                    # read the next record of the VCF file
                    (record, _,
                     data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

                # set output converted file of the sequence
                seq_output_converted_file = f'{output_dir}/{file_name}-2phase-{old_seq}.txt'

                # open the output converted file (its name always ends with ".txt",
                # so it is always written as a plain text file)
                try:
                    seq_output_converted_file_id = open(
                        seq_output_converted_file,
                        mode='w',
                        encoding='iso-8859-1',
                        newline='\n')
                except Exception as e:
                    raise xlib.ProgramException(e, 'F003',
                                                seq_output_converted_file)

                # the output converted file is closed when the block is left
                with seq_output_converted_file_id:

                    # write header records
                    seq_output_converted_file_id.write(f'{sample_number}\n')
                    seq_output_converted_file_id.write(
                        f'{len(variant_position_list)}\n')
                    seq_output_converted_file_id.write(
                        f'P {" ".join(variant_position_list)}\n')
                    seq_output_converted_file_id.write(
                        f'{"".join(variant_multiallelic_status_list)}\n')

                    # write sample records
                    for i in range(sample_number):

                        # build left and right side lists of variants of a sample
                        sample_variant_gt_left_list = [
                            _get_phase_allele(gt_row[i], status,
                                              allele_transformation)
                            for (gt_row, status) in zip(
                                gt_left_matrix,
                                variant_multiallelic_status_list)
                        ]
                        sample_variant_gt_right_list = [
                            _get_phase_allele(gt_row[i], status,
                                              allele_transformation)
                            for (gt_row, status) in zip(
                                gt_right_matrix,
                                variant_multiallelic_status_list)
                        ]

                        # write the first record of the sample
                        seq_output_converted_file_id.write(
                            f'#{sample_info_list[i][0]}\n')

                        # write the second record of the sample
                        seq_output_converted_file_id.write(
                            f'{" ".join(sample_variant_gt_left_list)}\n')

                        # write the third record of the sample
                        seq_output_converted_file_id.write(
                            f'{" ".join(sample_variant_gt_right_list)}\n')

                xlib.Message.print('verbose', '\n')

                # print OK message
                xlib.Message.print(
                    'info',
                    f'The converted file {os.path.basename(seq_output_converted_file)} is created.'
                )

    finally:

        # close VCF file
        vcf_file_id.close()
コード例 #7
0
ファイル: xcutadapt.py プロジェクト: GGFHF/NGScloud2
def check_cutadapt_config_file(strict):
    '''
    Check the cutadapt config file of a run.

    The config file has to contain the sections "identification", "cutadapt parameters",
    "library" and "library-1"; any other section has to be named "library-n".

    Parameters:
        strict: not used by this function.

    Returns:
        A tuple (OK, error_list) where OK is False when any error was found and
        error_list contains the error messages.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # initialize variable used when value is not found
    not_found = '***NOTFOUND***'.upper()

    # initialize the flags of the paired-end keys; they are set in the section "cutadapt parameters"
    # and read in the section "library", so they have to exist even if the first section is missing
    is_ok_adapter_pe = False
    is_ok_front_pe = False
    is_ok_anywhere_pe = False

    # get the option dictionary
    try:
        cutadapt_option_dict = xlib.get_option_dict(get_cutadapt_config_file())
    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append('*** ERROR: The option dictionary could not be built from the config file')
        OK = False
    else:

        # get the sections list
        sections_list = sorted(cutadapt_option_dict.keys())

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append('*** ERROR: the section "identification" is not found.')
            OK = False
        else:

            # check section "identification" - key "experiment_id"
            experiment_id = cutadapt_option_dict.get('identification', {}).get('experiment_id', not_found)
            if experiment_id == not_found:
                error_list.append('*** ERROR: the key "experiment_id" is not found in the section "identification".')
                OK = False

            # check section "identification" - key "read_dataset_id"
            read_dataset_id = cutadapt_option_dict.get('identification', {}).get('read_dataset_id', not_found)
            if read_dataset_id == not_found:
                error_list.append('*** ERROR: the key "read_dataset_id" is not found in the section "identification".')
                OK = False

        # check section "cutadapt parameters"
        if 'cutadapt parameters' not in sections_list:
            error_list.append('*** ERROR: the section "cutadapt parameters" is not found.')
            OK = False
        else:

            # check section "cutadapt parameters" - key "cores"
            cores = cutadapt_option_dict.get('cutadapt parameters', {}).get('cores', not_found)
            if cores == not_found:
                error_list.append('*** ERROR: the key "cores" is not found in the section "cutadapt parameters".')
                OK = False
            elif not xlib.check_int(cores, minimum=0):
                error_list.append('*** ERROR: the key "cores" has to be an integer number greater than or equal to 0.')
                OK = False

            # check section "cutadapt parameters" - key "adapter"
            adapter = cutadapt_option_dict.get('cutadapt parameters', {}).get('adapter', not_found)
            if adapter == not_found:
                error_list.append('*** ERROR: the key "adapter" is not found in the section "cutadapt parameters".')
                OK = False
            elif adapter.upper() == 'NONE':
                error_list.append('*** ERROR: the key "adapter" has to be different from NONE.')
                OK = False

            # check section "cutadapt parameters" - key "adapter_pe"
            adapter_pe = cutadapt_option_dict.get('cutadapt parameters', {}).get('adapter_pe', not_found)
            if adapter_pe == not_found:
                error_list.append('*** ERROR: the key "adapter_pe" is not found in the section "cutadapt parameters".')
                OK = False
            else:
                is_ok_adapter_pe = True

            # check section "cutadapt parameters" - key "front"
            front = cutadapt_option_dict.get('cutadapt parameters', {}).get('front', not_found)
            if front == not_found:
                error_list.append('*** ERROR: the key "front" is not found in the section "cutadapt parameters".')
                OK = False

            # check section "cutadapt parameters" - key "front_pe"
            front_pe = cutadapt_option_dict.get('cutadapt parameters', {}).get('front_pe', not_found)
            if front_pe == not_found:
                error_list.append('*** ERROR: the key "front_pe" is not found in the section "cutadapt parameters".')
                OK = False
            else:
                is_ok_front_pe = True

            # check section "cutadapt parameters" - key "anywhere"
            anywhere = cutadapt_option_dict.get('cutadapt parameters', {}).get('anywhere', not_found)
            if anywhere == not_found:
                error_list.append('*** ERROR: the key "anywhere" is not found in the section "cutadapt parameters".')
                OK = False

            # check section "cutadapt parameters" - key "anywhere_pe"
            anywhere_pe = cutadapt_option_dict.get('cutadapt parameters', {}).get('anywhere_pe', not_found)
            if anywhere_pe == not_found:
                error_list.append('*** ERROR: the key "anywhere_pe" is not found in the section "cutadapt parameters".')
                OK = False
            else:
                is_ok_anywhere_pe = True

            # check section "cutadapt parameters" - key "other_parameters"
            not_allowed_parameters_list = ['cores', 'adapter', 'front', 'anywhere']
            other_parameters = cutadapt_option_dict.get('cutadapt parameters', {}).get('other_parameters', not_found)
            if other_parameters == not_found:
                error_list.append('*** ERROR: the key "other_parameters" is not found in the section "cutadapt parameters".')
                OK = False
            elif other_parameters.upper() != 'NONE':
                # the result is kept apart so that a valid parameter list does not hide previous errors
                (is_ok_other_parameters, error_list2) = xlib.check_parameter_list(other_parameters, "other_parameters", not_allowed_parameters_list)
                error_list = error_list + error_list2
                if not is_ok_other_parameters:
                    OK = False

        # check section "library"
        if 'library' not in sections_list:
            error_list.append('*** ERROR: the section "library" is not found.')
            OK = False
        else:

            # check section "library" - key "format"
            format_code = cutadapt_option_dict.get('library', {}).get('format', not_found)
            if format_code == not_found:
                error_list.append('*** ERROR: the key "format" is not found in the section "library".')
                OK = False
            elif not xlib.check_code(format_code, get_format_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "format" has to be {get_format_code_list_text()}.')
                OK = False

            # check section "library" - key "read_type"
            read_type = cutadapt_option_dict.get('library', {}).get('read_type', not_found)
            is_ok_read_type = False
            if read_type == not_found:
                error_list.append('*** ERROR: the key "read_type" is not found in the section "library".')
                OK = False
            elif not xlib.check_code(read_type, get_read_type_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "read_type" has to be {get_read_type_code_list_text()}.')
                OK = False
            else:
                is_ok_read_type = True

            # check "adapter_pe" is NONE if the read type is SE
            if is_ok_read_type and is_ok_adapter_pe and read_type.upper() == 'SE' and adapter_pe.upper() != 'NONE':
                error_list.append('*** ERROR: the key "adapter_pe" has to be NONE when de read type is SE.')
                OK = False

            # check "front_pe" is NONE if the read type is SE
            if is_ok_read_type and is_ok_front_pe and read_type.upper() == 'SE' and front_pe.upper() != 'NONE':
                error_list.append('*** ERROR: the key "front_pe" has to be NONE when de read type is SE.')
                OK = False

            # check "anywhere_pe" is NONE if the read type is SE
            if is_ok_read_type and is_ok_anywhere_pe and read_type.upper() == 'SE' and anywhere_pe.upper() != 'NONE':
                error_list.append('*** ERROR: the key "anywhere_pe" has to be NONE when de read type is SE.')
                OK = False

        # check section "library-1"
        if 'library-1' not in sections_list:
            error_list.append('*** ERROR: the section "library-1" is not found.')
            OK = False

        # check all sections "library-n"
        for section in sections_list:

            if section not in ['identification', 'cutadapt parameters', 'library']:

                # check that the section identification is like library-n
                if not re.match('^library-[0-9]+$', section):
                    error_list.append(f'*** ERROR: the section "{section}" has a wrong identification.')
                    OK = False

                else:

                    # check section "library-n" - key "read_file_1"
                    read_file_1 = cutadapt_option_dict.get(section, {}).get('read_file_1', not_found)
                    if read_file_1 == not_found:
                        error_list.append(f'*** ERROR: the key "read_file_1" is not found in the section "{section}"')
                        OK = False

                    # check section "library-n" - key "read_file_2"
                    read_file_2 = cutadapt_option_dict.get(section, {}).get('read_file_2', not_found)
                    if read_file_2 == not_found:
                        error_list.append(f'*** ERROR: the key "read_file_2" is not found in the section "{section}"')
                        OK = False

    # warn that the config file is not valid if there are any errors
    # NOTE(review): assumes xlib provides get_cutadapt_name() like the other application name getters - confirm
    if not OK:
        error_list.append(f'\nThe {xlib.get_cutadapt_name()} config file is not valid. Please, correct this file or recreate it.')

    # return the control variable and the error list
    return (OK, error_list)
コード例 #8
0
def check_args(args):
    '''
    Verify the input arguments data.

    The arguments "minlen", "maxlen", "minFPKM", "minTPM", "verbose" and "trace"
    are set to their default values when they are None; "minlen" and "maxlen" are
    converted to int and "minFPKM" and "minTPM" to float when they are valid. The
    directory of the output file is created when it does not exist.

    Parameters:
        args: object with the attributes assembly_software_code, transcriptome_file,
            score_file, output_file, minlen, maxlen, minFPKM, minTPM, verbose and
            trace; it is modified in place.

    Raises:
        xlib.ProgramException: with the code P001 when any argument is not valid.
    '''

    # initialize the control variable
    OK = True

    # check the assembly_software_code value
    if args.assembly_software_code is None:
        xlib.Message.print(
            'error',
            '*** The assembly software that generated the transcritpme file is not indicated in the input arguments.'
        )
        OK = False
    elif args.assembly_software_code not in [
            xlib.Const.AS_TRINITY_CODE, xlib.Const.AS_SOAPDENOVOTRANS_CODE,
            xlib.Const.AS_GENERATED_BY_NGSCLOUD
    ]:
        xlib.Message.print(
            'error',
            f'*** {args.assembly_software_code} is not a valid code of assembly software.'
        )
        OK = False

    # check the transcriptome_file value
    if args.transcriptome_file is None:
        xlib.Message.print(
            'error',
            '*** A transcritpme file in Fasta format is not indicated in the input arguments.'
        )
        OK = False
    elif not os.path.isfile(args.transcriptome_file):
        xlib.Message.print(
            'error', f'*** The file {args.transcriptome_file} does not exist.')
        OK = False

    # check the score_file value
    if args.score_file is None:
        xlib.Message.print(
            'error',
            '*** A score file where RSEM-EVAL (DETONATE package) saved the score of the transcriptome file is not indicated in the input arguments.'
        )
        OK = False
    elif not os.path.isfile(args.score_file):
        xlib.Message.print('error',
                           f'*** The file {args.score_file} does not exist.')
        OK = False

    # check the output_file value
    if args.output_file is None:
        xlib.Message.print(
            'error',
            '*** A output file where filtered transcripts will be saved is not indicated in the input arguments.'
        )
        OK = False
    else:
        output_dir = os.path.dirname(args.output_file)
        try:
            # an empty directory name means the current directory, which does not have to be created
            if output_dir != '' and not os.path.exists(output_dir):
                os.makedirs(output_dir)
        except (OSError, ValueError):
            xlib.Message.print(
                'error',
                f'*** The directory {output_dir} of the file {args.output_file} is not valid.'
            )
            OK = False

    # check the minlen value
    if args.minlen is None:
        args.minlen = xlib.Const.DEFAULT_MINLEN
    elif not xlib.check_int(args.minlen, minimum=1):
        xlib.Message.print(
            'error',
            '*** The minlen has to be a integer number greater than 0.')
        OK = False
    else:
        args.minlen = int(args.minlen)

    # check the maxlen value
    if args.maxlen is None:
        args.maxlen = xlib.Const.DEFAULT_MAXLEN
    elif not xlib.check_int(args.maxlen, minimum=1):
        xlib.Message.print(
            'error',
            '*** The maxlen has to be a integer number greater than 0.')
        OK = False
    else:
        args.maxlen = int(args.maxlen)

    # check the minFPKM value
    if args.minFPKM is None:
        args.minFPKM = xlib.Const.DEFAULT_MINFPKM
    elif not xlib.check_float(args.minFPKM, minimum=0.0):
        xlib.Message.print(
            'error',
            '*** FPKM has to be a float number greater than or equal to 0.0.')
        OK = False
    else:
        args.minFPKM = float(args.minFPKM)

    # check the minTPM value
    if args.minTPM is None:
        args.minTPM = xlib.Const.DEFAULT_MINTPM
    elif not xlib.check_float(args.minTPM, minimum=0.0):
        xlib.Message.print(
            'error',
            '*** TPM has to be a float number greater than or equal to 0.0.')
        OK = False
    else:
        args.minTPM = float(args.minTPM)

    # check "verbose"
    if args.verbose is None:
        args.verbose = xlib.Const.DEFAULT_VERBOSE
    elif not xlib.check_code(
            args.verbose, xlib.get_verbose_code_list(), case_sensitive=False):
        xlib.Message.print(
            'error',
            f'*** verbose has to be {xlib.get_verbose_code_list_text()}.')
        OK = False
    if args.verbose.upper() == 'Y':
        xlib.Message.set_verbose_status(True)

    # check "trace"
    if args.trace is None:
        args.trace = xlib.Const.DEFAULT_TRACE
    elif not xlib.check_code(
            args.trace, xlib.get_trace_code_list(), case_sensitive=False):
        xlib.Message.print(
            'error', f'*** trace has to be {xlib.get_trace_code_list_text()}.')
        OK = False
    if args.trace.upper() == 'Y':
        xlib.Message.set_trace_status(True)

    # check if maxlen value is greater than or equal to minlen value
    # (only when there are no previous errors, so both values are known to be valid)
    if OK and args.maxlen < args.minlen:
        xlib.Message.print(
            'error',
            '*** The maxlen value has to be greater than or equal to minlen.')
        OK = False

    # if there are errors, exit with exception
    if not OK:
        raise xlib.ProgramException('', 'P001')
コード例 #9
0
def check_cd_hit_est_config_file(strict):
    '''
    Check the CD-HIT-EST config file of a run.

    The option dictionary is built from the file given by
    get_cd_hit_est_config_file() and the keys of the sections
    "identification" and "CD-HIT-EST parameters" are validated one by one.

    Parameters:
        strict: not referenced by any check in this function.

    Returns:
        A tuple (OK, error_list). OK is False when at least one check fails;
        error_list holds one message per detected error and, when the file
        is not valid, a closing message asking to correct or recreate it.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # initialize the variable used when a value is not found
    not_found = '***NOTFOUND***'

    # get the option dictionary
    try:
        cd_hit_est_option_dict = xlib.get_option_dict(
            get_cd_hit_est_config_file())
    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append(
            '*** ERROR: The option dictionary could not be built from the config file'
        )
        OK = False
    else:

        # get the sections list
        sections_list = sorted(cd_hit_est_option_dict.keys())

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append(
                '*** ERROR: the section "identification" is not found.')
            OK = False
        else:
            identification_dict = cd_hit_est_option_dict.get(
                'identification', {})

            # check section "identification" - key "experiment_id"
            experiment_id = identification_dict.get('experiment_id', not_found)
            if experiment_id == not_found:
                error_list.append(
                    '*** ERROR: the key "experiment_id" is not found in the section "identification".'
                )
                OK = False

            # check section "identification" - key "assembly_software"
            assembly_software = identification_dict.get(
                'assembly_software', not_found)
            if assembly_software == not_found:
                error_list.append(
                    '*** ERROR: the key "assembly_software" is not found in the section "identification".'
                )
                OK = False
            elif not xlib.check_code(assembly_software,
                                     get_assembly_software_code_list(),
                                     case_sensitive=False):
                error_list.append(
                    f'*** ERROR: the key "assembly_software" has to be {get_assembly_software_code_list_text()}.'
                )
                OK = False

            # check section "identification" - key "assembly_dataset_id"
            assembly_dataset_id = identification_dict.get(
                'assembly_dataset_id', not_found)
            if assembly_dataset_id == not_found:
                error_list.append(
                    '*** ERROR: the key "assembly_dataset_id" is not found in the section "identification".'
                )
                OK = False
            elif not xlib.check_startswith(assembly_dataset_id,
                                           get_assembly_software_code_list(),
                                           case_sensitive=True):
                error_list.append(
                    f'*** ERROR: the key "assembly_dataset_id" has to start with {get_assembly_software_code_list_text()}.'
                )
                OK = False

            # check section "identification" - key "assembly_type"
            assembly_type = identification_dict.get('assembly_type', not_found)
            if assembly_type == not_found:
                error_list.append(
                    '*** ERROR: the key "assembly_type" is not found in the section "identification".'
                )
                OK = False
            else:
                # the allowed values depend on whether the assembly dataset
                # identification starts with the SOAPdenovo-Trans code
                if assembly_dataset_id.startswith(
                        xlib.get_soapdenovotrans_code()):
                    valid_assembly_type_list = ['CONTIGS', 'SCAFFOLDS']
                else:
                    valid_assembly_type_list = ['NONE']
                if assembly_type.upper() not in valid_assembly_type_list:
                    error_list.append(
                        f'*** ERROR: the key "assembly_type" has to be CONTIGS or SCAFFOLDS in {xlib.get_soapdenovotrans_name()} or NONE in any other case.'
                    )
                    OK = False

        # check section "CD-HIT-EST parameters"
        if 'CD-HIT-EST parameters' not in sections_list:
            error_list.append(
                '*** ERROR: the section "CD-HIT-EST parameters" is not found.')
            OK = False
        else:
            parameters_dict = cd_hit_est_option_dict.get(
                'CD-HIT-EST parameters', {})

            # check section "CD-HIT-EST parameters" - key "threads"
            threads = parameters_dict.get('threads', not_found)
            if threads == not_found:
                error_list.append(
                    '*** ERROR: the key "threads" is not found in the section "CD-HIT-EST parameters".'
                )
                OK = False
            elif not xlib.check_int(threads, minimum=0):
                error_list.append(
                    '*** ERROR: the key "threads" has to be an integer number greater than or equal to 0.'
                )
                OK = False

            # check section "CD-HIT-EST parameters" - key "memory_limit"
            memory_limit = parameters_dict.get('memory_limit', not_found)
            if memory_limit == not_found:
                error_list.append(
                    '*** ERROR: the key "memory_limit" is not found in the section "CD-HIT-EST parameters".'
                )
                OK = False
            elif not xlib.check_int(memory_limit, minimum=0):
                error_list.append(
                    '*** ERROR: the key "memory_limit" has to be an integer number greater than or equal to 0.'
                )
                OK = False

            # check section "CD-HIT-EST parameters" - key "seq_identity_threshold"
            seq_identity_threshold = parameters_dict.get(
                'seq_identity_threshold', not_found)
            if seq_identity_threshold == not_found:
                error_list.append(
                    '*** ERROR: the key "seq_identity_threshold" is not found in the section "CD-HIT-EST parameters".'
                )
                OK = False
            elif not xlib.check_float(
                    seq_identity_threshold, minimum=0., maximum=1.):
                error_list.append(
                    '*** ERROR: the key "seq_identity_threshold" has to be a float number between 0.0 and 1.0.'
                )
                OK = False

            # check section "CD-HIT-EST parameters" - key "word_length"
            word_length = parameters_dict.get('word_length', not_found)
            if word_length == not_found:
                error_list.append(
                    '*** ERROR: the key "word_length" is not found in the section "CD-HIT-EST parameters".'
                )
                OK = False
            elif not xlib.check_int(word_length, minimum=1):
                error_list.append(
                    '*** ERROR: the key "word_length" has to be an integer number greater than or equal to 1.'
                )
                OK = False

            # check section "CD-HIT-EST parameters" - key "mask"
            # (only its presence is verified)
            mask = parameters_dict.get('mask', not_found)
            if mask == not_found:
                error_list.append(
                    '*** ERROR: the key "mask" is not found in the section "CD-HIT-EST parameters".'
                )
                OK = False

            # check section "CD-HIT-EST parameters" - key "match"
            match = parameters_dict.get('match', not_found)
            if match == not_found:
                error_list.append(
                    '*** ERROR: the key "match" is not found in the section "CD-HIT-EST parameters".'
                )
                OK = False
            elif not xlib.check_int(match):
                error_list.append(
                    '*** ERROR: the key "match" has to be an integer number.')
                OK = False

            # check section "CD-HIT-EST parameters" - key "mismatch"
            mismatch = parameters_dict.get('mismatch', not_found)
            if mismatch == not_found:
                error_list.append(
                    '*** ERROR: the key "mismatch" is not found in the section "CD-HIT-EST parameters".'
                )
                OK = False
            elif not xlib.check_int(mismatch):
                error_list.append(
                    '*** ERROR: the key "mismatch" has to be an integer number.'
                )
                OK = False

            # check section "CD-HIT-EST parameters" - key "other_parameters"
            not_allowed_parameters_list = [
                'T', 'M', 'c', 'n', 'mask', 'match', 'mismatch'
            ]
            other_parameters = parameters_dict.get(
                'other_parameters', not_found)
            if other_parameters == not_found:
                error_list.append(
                    '*** ERROR: the key "other_parameters" is not found in the section "CD-HIT-EST parameters".'
                )
                OK = False
            elif other_parameters.upper() != 'NONE':
                (OK2, error_list2) = xlib.check_parameter_list(
                    other_parameters, "other_parameters",
                    not_allowed_parameters_list)
                # only lower the control variable: a valid parameter list
                # must not hide the errors detected by the previous checks
                if not OK2:
                    OK = False
                error_list = error_list + error_list2

    # warn that the results config file is not valid if there are any errors
    if not OK:
        error_list.append(
            f'\nThe {xlib.get_cd_hit_est_name()} config file is not valid. Please, correct this file or recreate it.'
        )

    # return the control variable and the error list
    return (OK, error_list)
コード例 #10
0
def check_express_config_file(strict):
    '''
    Check the eXpress config file of a run.

    The option dictionary is built from the file given by
    get_express_config_file(). The section "identification", every section
    "alignment-dataset-n" and the section "eXpress parameters" are validated
    key by key.

    Parameters:
        strict: not referenced by any check in this function.

    Returns:
        A tuple (OK, error_list). OK is False when at least one check fails;
        error_list holds one message per detected error and, when the file
        is not valid, a closing message asking to correct or recreate it.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # initialize the variable used when a value is not found
    not_found = '***NOTFOUND***'

    # get the option dictionary
    try:
        express_option_dict = xlib.get_option_dict(get_express_config_file())
    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append('*** ERROR: The option dictionary could not be built from the config file')
        OK = False
    else:

        # get the sections list
        sections_list = sorted(express_option_dict.keys())

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append('*** ERROR: the section "identification" is not found.')
            OK = False
        else:
            identification_dict = express_option_dict.get('identification', {})

            # check section "identification" - key "experiment_id"
            experiment_id = identification_dict.get('experiment_id', not_found)
            if experiment_id == not_found:
                error_list.append('*** ERROR: the key "experiment_id" is not found in the section "identification".')
                OK = False

            # check section "identification" - key "assembly_software"
            assembly_software = identification_dict.get('assembly_software', not_found)
            if assembly_software == not_found:
                error_list.append('*** ERROR: the key "assembly_software" is not found in the section "identification".')
                OK = False
            elif not xlib.check_code(assembly_software, get_assembly_software_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "assembly_software" has to be {get_assembly_software_code_list_text()}.')
                OK = False

            # check section "identification" - key "assembly_dataset_id"
            assembly_dataset_id = identification_dict.get('assembly_dataset_id', not_found)
            if assembly_dataset_id == not_found:
                error_list.append('*** ERROR: the key "assembly_dataset_id" is not found in the section "identification".')
                OK = False
            elif not xlib.check_startswith(assembly_dataset_id, get_assembly_software_code_list(), case_sensitive=True):
                error_list.append(f'*** ERROR: the key "assembly_dataset_id" has to start with {get_assembly_software_code_list_text()}.')
                OK = False

            # check section "identification" - key "assembly_type"
            assembly_type = identification_dict.get('assembly_type', not_found)
            if assembly_type == not_found:
                error_list.append('*** ERROR: the key "assembly_type" is not found in the section "identification".')
                OK = False
            else:
                # the allowed values depend on whether the assembly dataset
                # identification starts with the SOAPdenovo-Trans code
                if assembly_dataset_id.startswith(xlib.get_soapdenovotrans_code()):
                    valid_assembly_type_list = ['CONTIGS', 'SCAFFOLDS']
                else:
                    valid_assembly_type_list = ['NONE']
                if assembly_type.upper() not in valid_assembly_type_list:
                    error_list.append(f'*** ERROR: the key "assembly_type" has to be CONTIGS or SCAFFOLDS in {xlib.get_soapdenovotrans_name()} or NONE in any other case.')
                    OK = False

        # check section "alignment-dataset-1"
        if 'alignment-dataset-1' not in sections_list:
            error_list.append('*** ERROR: the section "alignment-dataset-1" is not found.')
            OK = False

        # check all sections "alignment-dataset-n"
        for section in sections_list:

            # the fixed sections are checked separately
            if section in ['identification', 'eXpress parameters']:
                continue

            # check that the section identification is like alignment-dataset-n
            if not re.match('^alignment-dataset-[0-9]+$', section):
                error_list.append(f'*** ERROR: the section "{section}" has a wrong identification.')
                OK = False
                continue

            section_dict = express_option_dict.get(section, {})

            # check section "alignment-dataset-n" - key "alignment_software"
            alignment_software = section_dict.get('alignment_software', not_found)
            if alignment_software == not_found:
                error_list.append(f'*** ERROR: the key "alignment_software" is not found in the section "{section}".')
                OK = False
            elif not xlib.check_code(alignment_software, get_alignment_software_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "alignment_software" has to be {get_alignment_software_code_list_text()}.')
                OK = False

            # check section "alignment-dataset-n" - key "alignment_dataset_id"
            alignment_dataset_id = section_dict.get('alignment_dataset_id', not_found)
            if alignment_dataset_id == not_found:
                error_list.append(f'*** ERROR: the key "alignment_dataset_id" is not found in the section "{section}".')
                OK = False
            elif not xlib.check_startswith(alignment_dataset_id, get_alignment_software_code_list(), case_sensitive=True):
                error_list.append(f'*** ERROR: the key "alignment_dataset_id" has to start with {get_alignment_software_code_list_text()}.')
                OK = False

        # check section "eXpress parameters"
        if 'eXpress parameters' not in sections_list:
            error_list.append('*** ERROR: the section "eXpress parameters" is not found.')
            OK = False
        else:
            parameters_dict = express_option_dict.get('eXpress parameters', {})

            # check section "eXpress parameters" - key "frag-len-mean"
            frag_len_mean = parameters_dict.get('frag-len-mean', not_found)
            if frag_len_mean == not_found:
                error_list.append('*** ERROR: the key "frag-len-mean" is not found in the section "eXpress parameters".')
                OK = False
            elif not xlib.check_int(frag_len_mean, minimum=1):
                error_list.append('*** ERROR: the key "frag-len-mean" has to be an integer number greater than or equal to 1.')
                OK = False

            # check section "eXpress parameters" - key "frag-len-stddev"
            frag_len_stddev = parameters_dict.get('frag-len-stddev', not_found)
            if frag_len_stddev == not_found:
                error_list.append('*** ERROR: the key "frag-len-stddev" is not found in the section "eXpress parameters".')
                OK = False
            elif not xlib.check_int(frag_len_stddev, minimum=1):
                error_list.append('*** ERROR: the key "frag-len-stddev" has to be an integer number greater than or equal to 1.')
                OK = False

            # check section "eXpress parameters" - key "library_type"
            library_type = parameters_dict.get('library_type', not_found)
            if library_type == not_found:
                error_list.append('*** ERROR: the key "library_type" is not found in the section "eXpress parameters".')
                OK = False
            elif not xlib.check_code(library_type, get_library_type_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "library_type" has to be {get_library_type_code_list_text()}.')
                OK = False

            # check section "eXpress parameters" - key "max-indel-size"
            max_indel_size = parameters_dict.get('max-indel-size', not_found)
            if max_indel_size == not_found:
                error_list.append('*** ERROR: the key "max-indel-size" is not found in the section "eXpress parameters".')
                OK = False
            elif not xlib.check_int(max_indel_size, minimum=0):
                error_list.append('*** ERROR: the key "max-indel-size" has to be an integer number greater than or equal to 0.')
                OK = False

            # check section "eXpress parameters" - key "no-bias-correct"
            no_bias_correct = parameters_dict.get('no-bias-correct', not_found)
            if no_bias_correct == not_found:
                error_list.append('*** ERROR: the key "no-bias-correct" is not found in the section "eXpress parameters".')
                OK = False
            elif not xlib.check_code(no_bias_correct, get_no_bias_correct_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "no-bias-correct" has to be {get_no_bias_correct_code_list_text()}.')
                OK = False

            # check section "eXpress parameters" - key "no-error-model"
            no_error_model = parameters_dict.get('no-error-model', not_found)
            if no_error_model == not_found:
                error_list.append('*** ERROR: the key "no-error-model" is not found in the section "eXpress parameters".')
                OK = False
            elif not xlib.check_code(no_error_model, get_no_error_model_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "no-error-model" has to be {get_no_error_model_code_list_text()}.')
                OK = False

            # check section "eXpress parameters" - key "other_parameters"
            not_allowed_parameters_list = ['no-update-check', 'frag-len-mean', 'frag-len-stddev', 'max-indel-size', 'fr-stranded', 'rf-stranded', 'f-stranded', 'r-stranded', 'no-bias-correct', 'no-error-model', 'output-dir']
            other_parameters = parameters_dict.get('other_parameters', not_found)
            if other_parameters == not_found:
                error_list.append('*** ERROR: the key "other_parameters" is not found in the section "eXpress parameters".')
                OK = False
            elif other_parameters.upper() != 'NONE':
                (OK2, error_list2) = xlib.check_parameter_list(other_parameters, "other_parameters", not_allowed_parameters_list)
                # only lower the control variable: a valid parameter list
                # must not hide the errors detected by the previous checks
                if not OK2:
                    OK = False
                error_list = error_list + error_list2

    # warn that the results config file is not valid if there are any errors
    if not OK:
        error_list.append(f'\nThe {xlib.get_express_name()} config file is not valid. Please, correct this file or recreate it.')

    # return the control variable and the error list
    return (OK, error_list)
コード例 #11
0
def check_args(args):
    '''
    Verify the input arguments data.

    Missing optional arguments (minlen, maxlen, verbose, trace) are replaced
    by the defaults in xlib.Const; valid minlen and maxlen values are
    converted to int in place. The directory of the output file is created
    if it does not exist.

    Parameters:
        args: object holding the attributes fasta_file, output_file, minlen,
            maxlen, verbose and trace; it is updated in place.

    Raises:
        xlib.ProgramException: with code 'P001' if any argument is not valid.
    '''

    # initialize the control variable
    OK = True

    # check "fasta_file"
    if args.fasta_file is None:
        xlib.Message.print(
            'error',
            '*** The input FASTA file is not indicated in the input arguments.'
        )
        OK = False
    elif not os.path.isfile(args.fasta_file):
        xlib.Message.print('error',
                           f'*** The file {args.fasta_file} does not exist.')
        OK = False

    # check the output_file value
    if args.output_file is None:
        xlib.Message.print(
            'error',
            '*** A output file where filtered transcripts will be saved is not indicated in the input arguments.'
        )
        OK = False
    else:
        try:
            output_dir = os.path.dirname(args.output_file)
            # an empty directory name means the file goes to the current
            # directory, which needs no creation (os.makedirs('') would raise)
            if output_dir != '' and not os.path.exists(output_dir):
                os.makedirs(output_dir)
        except Exception as e:
            xlib.Message.print('error', f'*** EXCEPTION: "{e}".')
            xlib.Message.print(
                'error',
                f'*** The directory {os.path.dirname(args.output_file)} of the file {args.output_file} is not valid.'
            )
            OK = False

    # check the minlen value
    if args.minlen is None:
        args.minlen = xlib.Const.DEFAULT_MINLEN
    elif not xlib.check_int(args.minlen, minimum=1):
        xlib.Message.print(
            'error',
            '*** The minlen has to be a integer number greater than 0.')
        OK = False
    else:
        args.minlen = int(args.minlen)

    # check the maxlen value
    if args.maxlen is None:
        args.maxlen = xlib.Const.DEFAULT_MAXLEN
    elif not xlib.check_int(args.maxlen, minimum=1):
        xlib.Message.print(
            'error',
            '*** The maxlen has to be a integer number greater than 0.')
        OK = False
    else:
        args.maxlen = int(args.maxlen)

    # check "verbose"
    if args.verbose is None:
        args.verbose = xlib.Const.DEFAULT_VERBOSE
    elif not xlib.check_code(
            args.verbose, xlib.get_verbose_code_list(), case_sensitive=False):
        xlib.Message.print(
            'error',
            f'*** verbose has to be {xlib.get_verbose_code_list_text()}.')
        OK = False
    if args.verbose.upper() == 'Y':
        xlib.Message.set_verbose_status(True)

    # check "trace"
    if args.trace is None:
        args.trace = xlib.Const.DEFAULT_TRACE
    elif not xlib.check_code(
            args.trace, xlib.get_trace_code_list(), case_sensitive=False):
        xlib.Message.print(
            'error', f'*** trace has to be {xlib.get_trace_code_list_text()}.')
        OK = False
    if args.trace.upper() == 'Y':
        xlib.Message.set_trace_status(True)

    # check if maxlen value is greater or equal than minlen value; this is
    # done only when there are no previous errors, so that both values are
    # known to have passed their own checks
    if OK and args.maxlen < args.minlen:
        xlib.Message.print(
            'error',
            '*** The maxlen value has to be greater than or equal to minlen.'
        )
        OK = False

    # if there are errors, exit with exception
    if not OK:
        raise xlib.ProgramException('', 'P001')
コード例 #12
0
def check_fastqc_config_file(strict):
    '''
    Check the FastQC config file of a run.

    The option dictionary is built from the file given by
    get_fastqc_config_file(). The sections "identification" and
    "FastQC parameters" and every section "file-n" are validated key by key.

    Parameters:
        strict: not referenced by any check in this function.

    Returns:
        A tuple (OK, error_list). OK is False when at least one check fails;
        error_list holds one message per detected error and, when the file
        is not valid, a closing message asking to correct or recreate it.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # initialize the variable used when a value is not found
    not_found = '***NOTFOUND***'

    # get the option dictionary
    try:
        fastqc_option_dict = xlib.get_option_dict(get_fastqc_config_file())
    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append(
            '*** ERROR: The option dictionary could not be built from the config file'
        )
        OK = False
    else:

        # get the sections list
        sections_list = sorted(fastqc_option_dict.keys())

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append(
                '*** ERROR: the section "identification" is not found.')
            OK = False
        else:
            identification_dict = fastqc_option_dict.get('identification', {})

            # check section "identification" - key "experiment_id"
            experiment_id = identification_dict.get('experiment_id', not_found)
            if experiment_id == not_found:
                error_list.append(
                    '*** ERROR: the key "experiment_id" is not found in the section "identification".'
                )
                OK = False

            # check section "identification" - key "read_dataset_id"
            read_dataset_id = identification_dict.get(
                'read_dataset_id', not_found)
            if read_dataset_id == not_found:
                error_list.append(
                    '*** ERROR: the key "read_dataset_id" is not found in the section "identification".'
                )
                OK = False

        # check section "FastQC parameters"
        if 'FastQC parameters' not in sections_list:
            error_list.append(
                '*** ERROR: the section "FastQC parameters" is not found.')
            OK = False
        else:

            # check section "FastQC parameters" - key "threads"
            threads = fastqc_option_dict.get('FastQC parameters',
                                             {}).get('threads', not_found)
            if threads == not_found:
                # the message names the section that is actually checked
                error_list.append(
                    '*** ERROR: the key "threads" is not found in the section "FastQC parameters".'
                )
                OK = False
            elif not xlib.check_int(threads, minimum=1):
                error_list.append(
                    '*** ERROR: the key "threads" has to be an integer number greater than or equal to 1.'
                )
                OK = False

        # check section "file-1"
        if 'file-1' not in sections_list:
            error_list.append('*** ERROR: the section "file-1" is not found.')
            OK = False

        # check all sections "file-n"
        for section in sections_list:

            # the fixed sections are checked separately
            if section in ['identification', 'FastQC parameters']:
                continue

            # check that the section identification is like file-n
            if not re.match('^file-[0-9]+$', section):
                error_list.append(
                    f'*** ERROR: the section "{section}" has a wrong identification.'
                )
                OK = False
                continue

            # check section "file-n" - key "file_name"
            file_name = fastqc_option_dict.get(section, {}).get(
                'file_name', not_found)
            if file_name == not_found:
                error_list.append(
                    f'*** ERROR: the key "file_name" is not found in the section "{section}".'
                )
                OK = False
            elif not xlib.is_valid_path(file_name, 'linux'):
                error_list.append(
                    f'*** ERROR: the file {file_name} in the key "file_name" of the section "{section}" has a non valid file name.'
                )
                OK = False

    # warn that the results config file is not valid if there are any errors
    if not OK:
        error_list.append(
            f'\nThe {xlib.get_fastqc_name()} config file is not valid. Please, correct this file or recreate it.'
        )

    # return the control variable and the error list
    return (OK, error_list)
コード例 #13
0
def check_args(args):
    '''
    Validate the input arguments, set the defaults and convert the values.

    All the arguments are reviewed before failing, so every problem found is
    shown (with xlib.Message.print) in a single run. Valid values are
    normalized in place in "args": "fix" is upper-cased, the percentages are
    converted to float, "min_depth" to int and "tvi_list" is split with
    xlib.split_literal_to_string_list. The verbose and trace statuses are
    switched on when they are requested.

    Raises xlib.ProgramException (code P001) if any argument is wrong.
    '''

    # becomes True as soon as one argument is rejected
    error_found = False

    def report(message):
        # show an error message and remember that the arguments are not valid
        nonlocal error_found
        xlib.Message.print('error', message)
        error_found = True

    def check_percentage(attr_name, missing_text, invalid_text):
        # a percentage is mandatory and has to be a float in [0.0, 100.0]
        value = getattr(args, attr_name)
        if value is None:
            report(missing_text)
        elif not xlib.check_float(value, minimum=0.0, maximum=100.0):
            report(invalid_text)
        else:
            setattr(args, attr_name, float(value))

    # "input_vcf_file": mandatory and it has to exist
    if args.input_vcf_file is None:
        report('*** The VCF file is not indicated in the input arguments.')
    elif not os.path.isfile(args.input_vcf_file):
        report(f'*** The file {args.input_vcf_file} does not exist.')

    # "sample_file": mandatory and it has to exist
    if args.sample_file is None:
        report('*** The sample file is not indicated in the input arguments.')
    elif not os.path.isfile(args.sample_file):
        report(f'*** The file {args.sample_file} does not exist.')

    # "fix": mandatory code, stored in upper case
    if args.fix is None:
        report('*** Fix is not indicated in the input arguments.')
    elif not xlib.check_code(
            args.fix, xlib.get_fix_code_list(), case_sensitive=False):
        report(f'*** fix has to be {xlib.get_fix_code_list_text()}.')
    else:
        args.fix = args.fix.upper()

    # "scenario": mandatory code
    if args.scenario is None:
        report('*** The scenario is not indicated in the input arguments.')
    elif not xlib.check_code(
            args.scenario, xlib.get_scenario_code_list(),
            case_sensitive=False):
        report(
            f'*** The scenario has to be {xlib.get_scenario_code_list_text()}.'
        )

    # "min_aa_percentage"
    check_percentage(
        'min_aa_percentage',
        '*** The minimum percent of alternative alleles per species is not indicated in the input arguments.',
        'The minimum percent of alternative alleles per species has to be a float number between 0.0 and 100.0.'
    )

    # "min_md_imputation_percentage"
    check_percentage(
        'min_md_imputation_percentage',
        '*** The minimum percentage of missing data imputation to a new alternative allele per species is not indicated in the input arguments.',
        'The minimum percentage of missing data imputation to a new alternative allele per species has to be a float number between 0.0 and 100.0.'
    )

    # "imputed_md_id": optional, with default
    if args.imputed_md_id is None:
        args.imputed_md_id = xlib.Const.DEFAULT_IMPUTED_MD_ID

    # "sp1_id": mandatory
    if args.sp1_id is None:
        report(
            '*** The identification of the first species is not indicated in the input arguments.'
        )

    # "sp1_max_md_percentage"
    check_percentage(
        'sp1_max_md_percentage',
        '*** The maximum percentage of missing data of the first species is not indicated in the input arguments.',
        'The maximum percentage of missing data of the first species has to be a float number between 0.0 and 100.0.'
    )

    # "sp2_id": mandatory
    if args.sp2_id is None:
        report(
            '*** The identification of the second species is not indicated in the input arguments.'
        )

    # "sp2_max_md_percentage"
    check_percentage(
        'sp2_max_md_percentage',
        '*** The maximum percentage of missing data of the second species is not indicated in the input arguments.',
        'The maximum percentage of missing data of the second species has to be a float number between 0.0 and 100.0.'
    )

    # "hybrid_id": optional, 'NONE' stands for "no hybrid"
    if args.hybrid_id is None:
        args.hybrid_id = 'NONE'

    # "min_afr_percentage"
    check_percentage(
        'min_afr_percentage',
        '*** The minimum percentage of allele frequency per species is not indicated in the input arguments.',
        'The minimum percentage of allele frequency per species has to be a float number between 0.0 and 100.0.'
    )

    # "min_depth": optional integer >= 1, with default
    if args.min_depth is None:
        args.min_depth = xlib.Const.DEFAULT_MIN_DEPTH
    elif not xlib.check_int(args.min_depth, minimum=1):
        report(
            'The minimum combined depth across samples has to be an integer number greater than  or equal to 1.'
        )
    else:
        args.min_depth = int(args.min_depth)

    # "output_vcf_file": mandatory
    if args.output_vcf_file is None:
        report(
            '*** The output VCF file is not indicated in the input arguments.')

    # "verbose": optional code, with default; the status is set even when
    # other arguments are wrong
    if args.verbose is None:
        args.verbose = xlib.Const.DEFAULT_VERBOSE
    elif not xlib.check_code(
            args.verbose, xlib.get_verbose_code_list(), case_sensitive=False):
        report(f'*** verbose has to be {xlib.get_verbose_code_list_text()}.')
    if args.verbose.upper() == 'Y':
        xlib.Message.set_verbose_status(True)

    # "trace": optional code, with default
    if args.trace is None:
        args.trace = xlib.Const.DEFAULT_TRACE
    elif not xlib.check_code(
            args.trace, xlib.get_trace_code_list(), case_sensitive=False):
        report(f'*** trace has to be {xlib.get_trace_code_list_text()}.')
    if args.trace.upper() == 'Y':
        xlib.Message.set_trace_status(True)

    # "tvi_list": optional list of variant identifications to be traced
    if args.tvi_list is None or args.tvi_list == 'NONE':
        args.tvi_list = []
    else:
        args.tvi_list = xlib.split_literal_to_string_list(args.tvi_list)

    # the identifications are compared only when every argument is right
    if not error_found:
        same_species = args.sp1_id == args.sp2_id
        if same_species or (args.hybrid_id is not None and
                            (args.sp1_id == args.hybrid_id
                             or args.sp2_id == args.hybrid_id)):
            report('The identifications must be different.')

    # if there are errors, exit with exception
    if error_found:
        raise xlib.ProgramException('', 'P001')
コード例 #14
0
def check_args(args):
    '''
    Check the input arguments.

    Every argument is reviewed before failing, so all the problems found are
    shown (with xlib.Message.print) in a single run. Valid values are
    normalized in place in "args": "hybrid_id", "verbose" and "trace" receive
    their defaults when they are not indicated, "max_separation" is converted
    to int and "tsi_list" is split with xlib.split_literal_to_string_list.
    The verbose and trace statuses are switched on when they are requested.

    Raises xlib.ProgramException (code P001) if any argument is wrong.
    '''

    # initialize the control variable
    OK = True

    # check "ngshelper_database"
    if args.ngshelper_database is None:
        xlib.Message.print(
            'error',
            '*** The NGShelper database is not indicated in the input arguments.'
        )
        OK = False

    # check "sp1_id"
    if args.sp1_id is None:
        xlib.Message.print(
            'error',
            '*** The identification of the first species is not indicated in the input arguments.'
        )
        OK = False

    # check "sp2_id"
    if args.sp2_id is None:
        xlib.Message.print(
            'error',
            '*** The identification of the second species is not indicated in the input arguments.'
        )
        OK = False

    # check "hybrid_id" ('NONE' stands for "no hybrid")
    if args.hybrid_id is None:
        args.hybrid_id = 'NONE'

    # check "max_separation"
    if args.max_separation is None:
        xlib.Message.print(
            'error',
            '*** The maximum separation between variants of the same intergenic fragment is not indicated in the input arguments.'
        )
        OK = False
    elif not xlib.check_int(args.max_separation, minimum=1):
        # the value 1 is accepted by the check, so the message has to say
        # "greater than or equal to 1"
        xlib.Message.print(
            'error',
            'The maximum separation between variants of the same intergenic fragment has to be an integer number greater than or equal to 1.'
        )
        OK = False
    else:
        args.max_separation = int(args.max_separation)

    # check "output_dir"
    if args.output_dir is None:
        xlib.Message.print(
            'error',
            '*** The output directory is not indicated in the input arguments.'
        )
        OK = False
    elif not os.path.isdir(args.output_dir):
        xlib.Message.print('error',
                           '*** The output directory does not exist.')
        OK = False

    # check "verbose" (the status is set even when other arguments are wrong)
    if args.verbose is None:
        args.verbose = xlib.Const.DEFAULT_VERBOSE
    elif not xlib.check_code(
            args.verbose, xlib.get_verbose_code_list(), case_sensitive=False):
        xlib.Message.print(
            'error',
            f'*** verbose has to be {xlib.get_verbose_code_list_text()}.')
        OK = False
    if args.verbose.upper() == 'Y':
        xlib.Message.set_verbose_status(True)

    # check "trace"
    if args.trace is None:
        args.trace = xlib.Const.DEFAULT_TRACE
    elif not xlib.check_code(
            args.trace, xlib.get_trace_code_list(), case_sensitive=False):
        xlib.Message.print(
            'error', f'*** trace has to be {xlib.get_trace_code_list_text()}.')
        OK = False
    if args.trace.upper() == 'Y':
        xlib.Message.set_trace_status(True)

    # check "tsi_list"
    if args.tsi_list is None or args.tsi_list == 'NONE':
        args.tsi_list = []
    else:
        args.tsi_list = xlib.split_literal_to_string_list(args.tsi_list)

    # check the identification set (only when every argument is right)
    if OK:
        if args.sp1_id == args.sp2_id or (args.sp1_id == args.hybrid_id
                                          or args.sp2_id == args.hybrid_id):
            xlib.Message.print('error',
                               'The identifications must be different.')
            OK = False

    # if there are errors, exit with exception
    if not OK:
        raise xlib.ProgramException('', 'P001')
コード例 #15
0
def check_args(args):
    '''
    Check the input arguments.

    Every argument is reviewed before failing, so all the problems found are
    shown (with xlib.Message.print) in a single run. Valid values are
    normalized in place in "args": "type" and "header" are upper-cased,
    "record_number_per_file" is converted to int, and "header",
    "record_number_per_file", "verbose" and "trace" receive their defaults
    when they are not indicated. The verbose and trace statuses are switched
    on when they are requested.

    Raises xlib.ProgramException (code P001) if any argument is wrong.
    '''

    # initialize the control variable
    OK = True

    # check "annotation_file"
    if args.annotation_file is None:
        xlib.Message.print(
            'error',
            '*** The annotation file is not indicated in the input arguments.')
        OK = False
    elif not os.path.isfile(args.annotation_file):
        xlib.Message.print(
            'error', f'*** The file {args.annotation_file} does not exist.')
        OK = False

    # check "type"
    if args.type is None:
        xlib.Message.print(
            'error',
            '*** The type of annotation file is not indicated in the input arguments.'
        )
        OK = False
    elif not xlib.check_code(
            args.type, xlib.get_type_code_list(), case_sensitive=False):
        xlib.Message.print(
            'error',
            f'*** The type of annotation file has to be {xlib.get_type_code_list_text()}.'
        )
        OK = False
    else:
        args.type = args.type.upper()

    # check "header"
    if args.header is None:
        args.header = xlib.Const.DEFAULT_HEADER
    elif not xlib.check_code(
            args.header, xlib.get_header_code_list(), case_sensitive=False):
        xlib.Message.print(
            'error',
            f'*** header has to be {xlib.get_header_code_list_text()}.')
        OK = False
    else:
        args.header = args.header.upper()

    # check "record_number_per_file"
    if args.record_number_per_file is None:
        args.record_number_per_file = xlib.Const.DEFAULT_RNUM
    elif not xlib.check_int(args.record_number_per_file, minimum=1):
        xlib.Message.print(
            'error',
            '*** The record number per splitted file has to be an integer number greater than 0.'
        )
        OK = False
    else:
        args.record_number_per_file = int(args.record_number_per_file)

    # check "verbose" (the status is set even when other arguments are wrong)
    if args.verbose is None:
        args.verbose = xlib.Const.DEFAULT_VERBOSE
    elif not xlib.check_code(
            args.verbose, xlib.get_verbose_code_list(), case_sensitive=False):
        xlib.Message.print(
            'error',
            f'*** verbose has to be {xlib.get_verbose_code_list_text()}.')
        OK = False
    if args.verbose.upper() == 'Y':
        xlib.Message.set_verbose_status(True)

    # check "trace"
    if args.trace is None:
        args.trace = xlib.Const.DEFAULT_TRACE
    elif not xlib.check_code(
            args.trace, xlib.get_trace_code_list(), case_sensitive=False):
        xlib.Message.print(
            'error', f'*** trace has to be {xlib.get_trace_code_list_text()}.')
        OK = False
    if args.trace.upper() == 'Y':
        xlib.Message.set_trace_status(True)

    # if there are errors, exit with exception; the first argument is the
    # originating exception ('' when there is none) and the second one is the
    # exception code, as in the other ProgramException raises of this file
    if not OK:
        raise xlib.ProgramException('', 'P001')
コード例 #16
0
def _select_allele_data(variant_data_dict, allele_index, variant_number,
                        allele_transformation, atcg_conversion_dict):
    '''
    Get the allele placed in the position "allele_index" of the sorted allele
    list of a variant and its frequency, applying the allele transformation.

    Returns the tuple (allele, allele_frequency); it is (0, 0) when the
    variant has no allele in that position.
    '''

    allele_list = sorted(variant_data_dict.keys())
    if allele_index >= len(allele_list):
        return (0, 0)

    allele = allele_list[allele_index]
    allele_frequency = variant_data_dict[allele]
    if allele_transformation == 'ADD100' and xlib.check_int(allele):
        allele = int(allele) + 100
    elif allele_transformation == 'ATCG':
        allele = atcg_conversion_dict[variant_number][int(allele)]

    return (allele, allele_frequency)


def build_allele_frequency(vcf_file, sample_file, sp1_id, sp2_id, hybrid_id,
                           output_dir, variant_number_per_file,
                           allele_transformation, tvi_list):
    '''
    Build the SimHyb allele frequency files from the variants of a VCF file.

    The genotypes (subfield GT) of the adult samples (the ones whose mother
    identification is 'NONE') are counted per species, and the allele
    frequencies of the first and second species are calculated for every
    variant. The frequencies are written in CSV files in "output_dir", each
    one with "variant_number_per_file" variants at most. Every record of a
    file corresponds to a position in the sorted allele list of the variants
    and holds the pairs "allele;frequency" of the first species variants
    followed by the ones of the second species.

    Parameters:
        vcf_file: path of the input VCF file (gzip compressed if it ends
            with '.gz').
        sample_file: path of the sample file read by xlib.get_sample_data.
        sp1_id, sp2_id, hybrid_id: identifications of the first species, the
            second species and the hybrids.
        output_dir: directory where the SimHyb files are written.
        variant_number_per_file: maximum number of variants per SimHyb file.
        allele_transformation: 'ADD100' adds 100 to the integer alleles,
            'ATCG' replaces the allele by the code of its base
            (A -> 1; T -> 2; C -> 3; G -> 4); other values leave the allele
            as it is in the VCF file.
        tvi_list: identifications ("chrom-pos") of the variants to be traced.

    Raises xlib.ProgramException when a file can not be opened (F001 to
    F004), a sample of the VCF file is not in the sample file (L002), the VCF
    file has no samples (L003), GT is not in the field FORMAT (L007), a
    genotype has no separator (L008) or an allele is not found in 'ATCG'
    (L016).
    '''

    # initialize the sample number
    sample_number = 0

    # initialize counters
    input_record_counter = 0
    total_variant_counter = 0

    # get the sample data
    sample_dict = xlib.get_sample_data(sample_file, sp1_id, sp2_id, hybrid_id)

    # initialize the sample species and mother identification lists per variant
    species_id_list = []
    mother_id_list = []

    # initialize the maximum allele number per variant
    maximum_allele_number = 0

    # initialize allele frequency dictionaries (key: variant number)
    allele_frequency_dict_1 = {}
    allele_frequency_dict_2 = {}

    # initialize ATCG conversion dictionary (key: variant number)
    # A -> 1; T -> 2; C -> 3; G -> 4
    atcg = 'ATCG'
    atcg_conversion_dict = {}

    # open the input VCF file
    if vcf_file.endswith('.gz'):
        try:
            vcf_file_id = gzip.open(vcf_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', vcf_file)
    else:
        try:
            vcf_file_id = open(vcf_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', vcf_file)

    # the VCF file is closed on exit, even when an exception is raised
    with vcf_file_id:

        # read the first record of input VCF file
        (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id,
                                                    sample_number)

        # while there are records in input VCF file
        while record != '':

            # process metadata records
            while record != '' and record.startswith('##'):

                # add 1 to the read sequence counter
                input_record_counter += 1

                # print the counters
                xlib.Message.print(
                    'verbose',
                    f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d}'
                )

                # read the next record of the input VCF file
                (record, _,
                 data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

            # process the column description record
            if record.startswith('#CHROM'):

                # add 1 to the read sequence counter
                input_record_counter += 1

                # get the record data list
                record_data_list = data_dict['record_data_list']

                # build the sample species and mother identification lists
                # per variant (the sample columns start at index 9)
                for i in range(9, len(record_data_list)):
                    try:
                        species_id = sample_dict[
                            record_data_list[i]]['species_id']
                        mother_id = sample_dict[
                            record_data_list[i]]['mother_id']
                    except Exception as e:
                        raise xlib.ProgramException(e, 'L002',
                                                    record_data_list[i])
                    species_id_list.append(species_id)
                    mother_id_list.append(mother_id)

                # check if the sample species list is empty; there is no
                # originating exception here, so '' is passed
                if not species_id_list:
                    raise xlib.ProgramException('', 'L003')

                # set the sample number
                sample_number = len(species_id_list)

                # print the counters
                xlib.Message.print(
                    'verbose',
                    f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d}'
                )

                # read the next record of the input VCF file
                (record, _,
                 data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

            # process variant record
            while record != '' and not record.startswith(
                    '##') and not record.startswith('#CHROM'):

                # set the variant identification and check if it is traced
                variant_id = f'{data_dict["chrom"]}-{data_dict["pos"]}'
                is_traced = variant_id in tvi_list

                # add 1 to the read sequence counter
                input_record_counter += 1

                # add 1 to the total variant counter
                total_variant_counter += 1

                if is_traced:
                    xlib.Message.print(
                        'trace',
                        f'\n\n\n\nseq_id: {data_dict["chrom"]} - position {data_dict["pos"]}'
                    )
                    xlib.Message.print(
                        'trace',
                        f'total_variant_counter: {total_variant_counter}')

                # get the reference bases (field REF)
                reference_bases = data_dict['ref']

                # build the alternative alleles list from field ALT
                alternative_allele_list = data_dict['alt'].split(',')

                # build ATCG conversion list (reference first, then the
                # alternative alleles, so the list is indexed by GT allele)
                # NOTE(review): str.find also matches substrings, so an empty
                # or multi-base allele such as 'AT' is accepted with the code
                # of its first base - confirm whether only single bases are
                # expected here
                atcg_conversion_list = []
                for bases in [reference_bases] + alternative_allele_list:
                    index = atcg.find(bases.upper())
                    if index == -1:
                        raise xlib.ProgramException('', 'L016')
                    atcg_conversion_list.append(index + 1)
                atcg_conversion_dict[
                    total_variant_counter] = atcg_conversion_list

                # get the position of the genotype (subfield GT) in the field FORMAT
                format_subfield_list = data_dict['format'].upper().split(':')
                try:
                    gt_position = format_subfield_list.index('GT')
                except Exception as e:
                    raise xlib.ProgramException(e, 'L007', 'GT',
                                                data_dict['chrom'],
                                                data_dict['pos'])

                # build the list of sample genotypes of a variant
                sample_gt_list = [
                    data_dict['sample_list'][i].split(':')[gt_position]
                    for i in range(sample_number)
                ]

                # build the lists of the left and right side of sample
                # genotypes of a variant ('/' is looked for before '|')
                sample_gt_left_list = []
                sample_gt_right_list = []
                for sample_gt in sample_gt_list:
                    sep_pos = sample_gt.find('/')
                    if sep_pos == -1:
                        sep_pos = sample_gt.find('|')
                    if sep_pos == -1:
                        # there is no originating exception, so '' is passed
                        raise xlib.ProgramException('', 'L008', 'GT',
                                                    data_dict['chrom'],
                                                    data_dict['pos'])
                    sample_gt_left_list.append(sample_gt[:sep_pos])
                    sample_gt_right_list.append(sample_gt[sep_pos + 1:])

                if is_traced:
                    xlib.Message.print('trace',
                                       f'reference_bases: {reference_bases}')
                    xlib.Message.print(
                        'trace',
                        f'alternative_allele_list: {alternative_allele_list}')
                    xlib.Message.print('trace',
                                       f'sample_gt_list: {sample_gt_list}')

                # get the allele counters per species (the hybrid counters
                # are only used in the trace)
                allele_counter_dict_1 = {}
                allele_counter_dict_2 = {}
                allele_counter_dict_h = {}
                md_symbol = xlib.get_md_symbol()
                for i in range(sample_number):
                    # only when the sample is an adult
                    if mother_id_list[i] != 'NONE':
                        continue
                    if species_id_list[i] == sp1_id:
                        allele_counter_dict = allele_counter_dict_1
                    elif species_id_list[i] == sp2_id:
                        allele_counter_dict = allele_counter_dict_2
                    else:
                        allele_counter_dict = allele_counter_dict_h
                    # the missing data are not counted
                    for allele in (sample_gt_left_list[i],
                                   sample_gt_right_list[i]):
                        if allele != md_symbol:
                            allele_counter_dict[
                                allele] = allele_counter_dict.get(allele,
                                                                  0) + 1
                if is_traced:
                    xlib.Message.print(
                        'trace',
                        f'allele_counter_dict_1: {allele_counter_dict_1}')
                    xlib.Message.print(
                        'trace',
                        f'allele_counter_dict_2: {allele_counter_dict_2}')
                    xlib.Message.print(
                        'trace',
                        f'allele_counter_dict_h: {allele_counter_dict_h}')

                # calculate the maximum allele number
                maximum_allele_number = max(maximum_allele_number,
                                            len(allele_counter_dict_1),
                                            len(allele_counter_dict_2))

                # calculate the variant allele frequencies per species (the
                # totals are greater than 0 whenever there is an allele)
                allele_frequency_dict_1[total_variant_counter] = {}
                sp1_allele_total = sum(allele_counter_dict_1.values())
                for allele, counter in allele_counter_dict_1.items():
                    allele_frequency_dict_1[total_variant_counter][
                        allele] = counter / sp1_allele_total
                    if is_traced:
                        xlib.Message.print(
                            'trace',
                            f'allele_frequency_dict_1[{total_variant_counter}][{allele}]: {allele_frequency_dict_1[total_variant_counter][allele]}'
                        )
                allele_frequency_dict_2[total_variant_counter] = {}
                sp2_allele_total = sum(allele_counter_dict_2.values())
                for allele, counter in allele_counter_dict_2.items():
                    allele_frequency_dict_2[total_variant_counter][
                        allele] = counter / sp2_allele_total
                    if is_traced:
                        xlib.Message.print(
                            'trace',
                            f'allele_frequency_dict_2[{total_variant_counter}][{allele}]: {allele_frequency_dict_2[total_variant_counter][allele]}'
                        )

                # print the counters
                xlib.Message.print(
                    'verbose',
                    f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d}'
                )

                # read the next record of the input VCF file
                (record, _,
                 data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

        xlib.Message.print('verbose', '\n')

    # calculate the output SimHyb file number
    simhyb_file_num = math.ceil(total_variant_counter /
                                variant_number_per_file)

    # get the base name of the SimHyb files from the VCF file name
    if vcf_file.endswith('.gz'):
        file_name = os.path.splitext(os.path.basename(vcf_file[:-3]))[0]
    else:
        file_name = os.path.splitext(os.path.basename(vcf_file))[0]

    # initialize the begin and end variant
    begin_variant = 1
    end_variant = min(variant_number_per_file, total_variant_counter)

    # write the variant allele frequencies per species in the output SimHyb files
    for file_index in range(simhyb_file_num):

        xlib.Message.print(
            'trace',
            f'\n\n\n\nbegin_variant: {begin_variant} - end_variant: {end_variant}'
        )

        # set the SimHyb file name
        if simhyb_file_num == 1:
            current_simhyb_file = f'{output_dir}/{file_name}-allelefreq.csv'
        else:
            current_simhyb_file = f'{output_dir}/{file_name}-allelefreq-{file_index:03d}.csv'

        # open the output SimHyb file
        # NOTE: the names built above always end with '.csv', so the gzip
        # branch is not reached with the current naming
        if current_simhyb_file.endswith('.gz'):
            try:
                current_simhyb_file_id = gzip.open(current_simhyb_file,
                                                   mode='wt',
                                                   encoding='iso-8859-1',
                                                   newline='\n')
            except Exception as e:
                raise xlib.ProgramException(e, 'F004', current_simhyb_file)
        else:
            try:
                current_simhyb_file_id = open(current_simhyb_file,
                                              mode='w',
                                              encoding='iso-8859-1',
                                              newline='\n')
            except Exception as e:
                raise xlib.ProgramException(e, 'F003', current_simhyb_file)

        # the SimHyb file is closed on exit, even when an exception is raised
        with current_simhyb_file_id:

            # write allele frequency records (one per position in the sorted
            # allele lists)
            for allele_index in range(maximum_allele_number):

                xlib.Message.print('trace', f'i: {allele_index}')

                # initialize the variable to control the record begin
                is_begin = True

                # species 1
                for j in range(begin_variant, end_variant + 1):

                    xlib.Message.print('trace', f'j: {j}')

                    # get the allele and its frequency
                    variant_data_dict = allele_frequency_dict_1.get(j, {})

                    xlib.Message.print(
                        'trace', f'variant_data_dict: {variant_data_dict}')

                    (allele, allele_frequency) = _select_allele_data(
                        variant_data_dict, allele_index, j,
                        allele_transformation, atcg_conversion_dict)

                    # write the part of this record corresponding with the
                    # variant (the first one has no leading separator)
                    if is_begin:
                        record_part = f'{allele};{allele_frequency}'
                        is_begin = False
                    else:
                        record_part = f';{allele};{allele_frequency}'
                    current_simhyb_file_id.write(record_part)

                # species 2
                for j in range(begin_variant, end_variant + 1):

                    # get the allele and its frequency
                    variant_data_dict = allele_frequency_dict_2.get(j, {})
                    (allele, allele_frequency) = _select_allele_data(
                        variant_data_dict, allele_index, j,
                        allele_transformation, atcg_conversion_dict)

                    # write the part of this record corresponding with the variant
                    current_simhyb_file_id.write(
                        f';{allele};{allele_frequency}')

                # write the end of the record
                current_simhyb_file_id.write('\n')

        # print OK message
        xlib.Message.print(
            'info',
            f'The SimHyb file {os.path.basename(current_simhyb_file)} is created.'
        )

        # set the new begin and end variant
        begin_variant = end_variant + 1
        end_variant = min(begin_variant + variant_number_per_file - 1,
                          total_variant_counter)
コード例 #17
0
def load_vcf_data(conn, vcf_file, sample_file, sp1_id, sp2_id, hybrid_id, imputed_md_id, new_md_id, allele_transformation, tvi_list):
    '''
    Load data of a VCF file into the tables "vcf_samples", "vcf_variants",
    "vcf_alleles" and "vcf_samples_alleles".

    The four tables are dropped and created again before the load. The changes
    are committed once all the records are processed and the indexes on the
    tables are created. The VCF file is always closed, even when the load
    fails while its records are being processed.

    Parameters:
        conn: database connection; it is passed to the "xsqlite" functions and
            its method "commit" is called.
        vcf_file: path of the VCF file; a name ending in ".gz" is read with gzip.
        sample_file, sp1_id, sp2_id, hybrid_id: passed to "xlib.get_sample_data"
            to get the sample data.
        imputed_md_id: genotype value that identifies imputed missing data.
        new_md_id: value saved as "structure_allele_id" of the missing data allele.
        allele_transformation: if it is "ADD100", 100 is added to the integer
            allele identifications saved as "structure_allele_id".
        tvi_list: identifications ("chrom-pos") of the variants whose data are
            printed as trace messages.

    Raises:
        xlib.ProgramException: the VCF file cannot be opened (F001, F002), a
            sample of the VCF file cannot be resolved in the sample data (L002),
            the VCF file has no sample columns (L003), the subfield GT is not in
            the field FORMAT (L007) or a genotype has neither "/" nor "|" (L008).
    '''

    # get the sample data
    sample_dict = xlib.get_sample_data(sample_file, sp1_id, sp2_id, hybrid_id)

    # drop table "vcf_samples" (if it exists)
    xlib.Message.print('verbose', 'Droping the table "vcf_samples" ...\n')
    xsqlite.drop_vcf_samples(conn)
    xlib.Message.print('verbose', 'The table is droped.\n')

    # create table "vcf_samples"
    xlib.Message.print('verbose', 'Creating the table "vcf_samples" ...\n')
    xsqlite.create_vcf_samples(conn)
    xlib.Message.print('verbose', 'The table is created.\n')

    # insert samples data into table "vcf_samples"; the type is unknown at this
    # point, so a placeholder is saved and it is updated below
    xlib.Message.print('verbose', 'Inserting sample data into the table "vcf_samples" ...\n')
    for sample_data in sample_dict.values():
        sample_data['type'] = 'N/A'
        xsqlite.insert_vcf_samples_row(conn, sample_data)
    xlib.Message.print('verbose', 'Data are inserted.\n')

    # create the index on the table "vcf_samples" (if it does not exist)
    xlib.Message.print('verbose', 'Creating the index on the table "vcf_samples" (if it does not exist) ...\n')
    xsqlite.create_vcf_samples_index(conn)
    xlib.Message.print('verbose', 'The index is created.\n')

    # get the sample type dictionary
    sample_type_dict = xsqlite.get_sample_type_dict(conn)

    # update the type of each sample
    for sample_type_data in sample_type_dict.values():
        xsqlite.update_vcf_samples_row(conn, sample_type_data['sample_id'], sample_type_data['type'])

    # drop table "vcf_variants" (if it exists)
    xlib.Message.print('verbose', 'Droping the table "vcf_variants" ...\n')
    xsqlite.drop_vcf_variants(conn)
    xlib.Message.print('verbose', 'The table is droped.\n')

    # create table "vcf_variants"
    xlib.Message.print('verbose', 'Creating the table "vcf_variants" ...\n')
    xsqlite.create_vcf_variants(conn)
    xlib.Message.print('verbose', 'The table is created.\n')

    # drop table "vcf_alleles" (if it exists)
    xlib.Message.print('verbose', 'Droping the table "vcf_alleles" ...\n')
    xsqlite.drop_vcf_alleles(conn)
    xlib.Message.print('verbose', 'The table is droped.\n')

    # create table "vcf_alleles"
    xlib.Message.print('verbose', 'Creating the table "vcf_alleles" ...\n')
    xsqlite.create_vcf_alleles(conn)
    xlib.Message.print('verbose', 'The table is created.\n')

    # drop table "vcf_samples_alleles" (if it exists)
    xlib.Message.print('verbose', 'Droping the table "vcf_samples_alleles" ...\n')
    xsqlite.drop_vcf_samples_alleles(conn)
    xlib.Message.print('verbose', 'The table is droped.\n')

    # create table "vcf_samples_alleles"
    xlib.Message.print('verbose', 'Creating the table "vcf_samples_alleles" ...\n')
    xsqlite.create_vcf_samples_alleles(conn)
    xlib.Message.print('verbose', 'The table is created.\n')

    # initialize the row data dictionaries corresponding to the tables "vcf_variants", "vcf_alleles" and "vcf_samples_alleles"
    # (the same dictionaries are reused for every inserted row)
    vcf_variants_row_dict = {}
    vcf_alleles_row_dict = {}
    vcf_samples_alleles_row_dict = {}

    # get the missing data symbol once; it is used in every variant
    md_symbol = xlib.get_md_symbol()

    # build the list of imputed and missing data alleles
    M_I_list = [imputed_md_id, md_symbol]

    # initialize the sample number (it is unknown until the column description record is read)
    sample_number = 0

    # initialize counters
    input_record_counter = 0
    total_variant_counter = 0
    vcf_variants_inserted_row_counter = 0
    vcf_alleles_inserted_row_counter = 0
    vcf_samples_alleles_inserted_row_counter = 0

    # initialize the sample, species and mother identification lists (one item per sample column)
    sample_id_list = []
    species_id_list = []
    mother_id_list = []

    # open the input VCF file
    if vcf_file.endswith('.gz'):
        try:
            vcf_file_id = gzip.open(vcf_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', vcf_file)
    else:
        try:
            vcf_file_id = open(vcf_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', vcf_file)

    # process the records; the VCF file is closed when this block is left,
    # also when an exception interrupts the load
    with vcf_file_id:

        # read the first record of input VCF file
        (record, _key, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

        # while there are records in input VCF file
        while record != '':

            # process metadata records
            while record != '' and record.startswith('##'):

                # add 1 to the read sequence counter
                input_record_counter += 1

                # print the counters
                xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d}')

                # read the next record of the input VCF file
                (record, _key, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

            # process the column description record
            if record.startswith('#CHROM'):

                # add 1 to the read sequence counter
                input_record_counter += 1

                # get the record data list
                record_data_list = data_dict['record_data_list']

                # build the sample, species and mother identification lists;
                # the sample columns start at the tenth field of the record
                for sample_id in record_data_list[9:]:
                    try:
                        species_id = sample_dict[sample_id]['species_id']
                        mother_id = sample_dict[sample_id]['mother_id']
                    except Exception as e:
                        raise xlib.ProgramException(e, 'L002', sample_id)
                    sample_id_list.append(sample_id)
                    species_id_list.append(species_id)
                    mother_id_list.append(mother_id)

                # check if the sample species list is empty
                if not species_id_list:
                    raise xlib.ProgramException('', 'L003')

                # set the sample number
                sample_number = len(species_id_list)

                # print the counters
                xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d}')

                # read the next record of the input VCF file
                (record, _key, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

            # process variant records
            while record != '' and not record.startswith(('##', '#CHROM')):

                # set the variant identification
                variant_id = f'{data_dict["chrom"]}-{data_dict["pos"]}'

                # trace messages are only printed for the variants included in "tvi_list"
                is_traced = variant_id in tvi_list

                # add 1 to the read sequence counter
                input_record_counter += 1

                # add 1 to the total variant counter
                total_variant_counter += 1

                if is_traced:
                    xlib.Message.print('trace', f'\n\n\n\nseq_id: {data_dict["chrom"]} - position {data_dict["pos"]}')
                    xlib.Message.print('trace', f'total_variant_counter: {total_variant_counter}')

                # get the reference bases (field REF) and alternative alleles (field ALT)
                reference_bases = data_dict['ref']
                alternative_alleles = data_dict['alt']

                # build the alternative alleles list from field ALT
                alternative_allele_list = alternative_alleles.split(',')

                # the field ALT holds the missing data symbol when there are no alternative alleles
                has_alternative_alleles = alternative_alleles != md_symbol

                # build the alleles list from reference bases and alternative alleles list;
                # the position of an allele in this list is the number used in the genotypes
                if has_alternative_alleles:
                    alleles_list = [reference_bases] + alternative_allele_list
                else:
                    alleles_list = [reference_bases]

                # check if the variant is an indel (any allele longer than one base) or SNP or multiallelic or N/A
                if not has_alternative_alleles:
                    variant_type = 'N/A'
                elif len(reference_bases) > 1 or any(len(alternative_allele) > 1 for alternative_allele in alternative_allele_list):
                    variant_type = 'INDEL'
                elif len(alternative_allele_list) > 1:
                    variant_type = 'MULTIALLELIC'
                else:
                    variant_type = 'SNP'

                # get the position of the genotype (subfield GT) in the field FORMAT
                format_subfield_list = data_dict['format'].upper().split(':')
                try:
                    gt_position = format_subfield_list.index('GT')
                except ValueError as e:
                    raise xlib.ProgramException(e, 'L007', 'GT', data_dict['chrom'], data_dict['pos'])

                # build the list of sample genotypes of a variant
                sample_gt_list = [data_dict['sample_list'][i].split(':')[gt_position] for i in range(sample_number)]

                # build the lists of the left and right side of sample genotypes of a variant;
                # the separator "/" is searched first and "|" is the fallback
                sample_gt_left_list = []
                sample_gt_right_list = []
                for sample_gt in sample_gt_list:
                    sep_pos = sample_gt.find('/')
                    if sep_pos == -1:
                        sep_pos = sample_gt.find('|')
                    if sep_pos == -1:
                        raise xlib.ProgramException('', 'L008', 'GT', data_dict['chrom'], data_dict['pos'])
                    sample_gt_left_list.append(sample_gt[:sep_pos])
                    sample_gt_right_list.append(sample_gt[sep_pos + 1:])

                if is_traced:
                    xlib.Message.print('trace', f'reference_bases: {reference_bases}')
                    xlib.Message.print('trace', f'alternative_allele_list: {alternative_allele_list}')
                    xlib.Message.print('trace', f'sample_gt_list: {sample_gt_list}')

                # set data and insert row into the table "vcf_variants"
                vcf_variants_row_dict['variant_id'] = variant_id
                vcf_variants_row_dict['seq_id'] = data_dict['chrom']
                vcf_variants_row_dict['position'] = data_dict['pos']
                vcf_variants_row_dict['reference_bases'] = reference_bases
                vcf_variants_row_dict['alternative_alleles'] = alternative_alleles
                vcf_variants_row_dict['variant_type'] = variant_type
                xsqlite.insert_vcf_variants_row(conn, vcf_variants_row_dict)
                vcf_variants_inserted_row_counter += 1

                # set data and insert rows into the table "vcf_alleles"
                vcf_alleles_row_dict['variant_id'] = variant_id
                # reference bases and alternative alleles
                for j, bases in enumerate(alleles_list):
                    vcf_alleles_row_dict['allele_id'] = str(j)
                    vcf_alleles_row_dict['bases'] = bases
                    # NOTE(review): the transformed identification is a string and the
                    # untransformed one is an integer -- confirm that the table expects both
                    if xlib.check_int(j) and allele_transformation == 'ADD100':
                        structure_allele_id = str(j + 100)
                    else:
                        structure_allele_id = j
                    vcf_alleles_row_dict['structure_allele_id'] = structure_allele_id
                    xsqlite.insert_vcf_alleles_row(conn, vcf_alleles_row_dict)
                    vcf_alleles_inserted_row_counter += 1
                # missing data
                vcf_alleles_row_dict['allele_id'] = md_symbol
                vcf_alleles_row_dict['bases'] = 'N/D'
                if xlib.check_int(new_md_id) and allele_transformation == 'ADD100':
                    structure_allele_id = str(int(new_md_id) + 100)
                else:
                    structure_allele_id = new_md_id
                vcf_alleles_row_dict['structure_allele_id'] = structure_allele_id
                xsqlite.insert_vcf_alleles_row(conn, vcf_alleles_row_dict)
                vcf_alleles_inserted_row_counter += 1
                # imputed missing data
                vcf_alleles_row_dict['allele_id'] = imputed_md_id
                vcf_alleles_row_dict['bases'] = 'N/D'
                if xlib.check_int(imputed_md_id) and allele_transformation == 'ADD100':
                    structure_allele_id = str(int(imputed_md_id) + 100)
                else:
                    structure_allele_id = imputed_md_id
                vcf_alleles_row_dict['structure_allele_id'] = structure_allele_id
                xsqlite.insert_vcf_alleles_row(conn, vcf_alleles_row_dict)
                vcf_alleles_inserted_row_counter += 1

                # set data and insert rows into the table "vcf_samples_alleles"
                vcf_samples_alleles_row_dict['variant_id'] = variant_id
                for i in range(sample_number):
                    vcf_samples_alleles_row_dict['sample_id'] = sample_id_list[i]

                    # initialize genotype distribution dictionary
                    # (keys: bases of every allele plus the imputed and missing data symbols)
                    genotype_distribution_dict = dict.fromkeys(alleles_list + M_I_list, 0)

                    # calculate genotype distribution dictionary counting both sides of the genotype
                    for gt_side in (sample_gt_left_list[i], sample_gt_right_list[i]):
                        if gt_side in M_I_list:
                            genotype_distribution_dict[gt_side] += 1
                        else:
                            genotype_distribution_dict[alleles_list[int(gt_side)]] += 1

                    # calculate frequency and insert rows for reference bases and alternative alleles
                    # (the count is divided by 2 because the genotype has two sides)
                    for j, bases in enumerate(alleles_list):
                        if genotype_distribution_dict[bases] > 0:
                            vcf_samples_alleles_row_dict['allele_id'] = j
                            vcf_samples_alleles_row_dict['frecuency'] = genotype_distribution_dict[bases] / 2
                            xsqlite.insert_vcf_samples_alleles_row(conn, vcf_samples_alleles_row_dict)
                            vcf_samples_alleles_inserted_row_counter += 1

                    # calculate frequency and insert rows for imputed missing data and then for missing data
                    for md_id in M_I_list:
                        if genotype_distribution_dict[md_id] > 0:
                            vcf_samples_alleles_row_dict['allele_id'] = md_id
                            vcf_samples_alleles_row_dict['frecuency'] = genotype_distribution_dict[md_id] / 2
                            xsqlite.insert_vcf_samples_alleles_row(conn, vcf_samples_alleles_row_dict)
                            vcf_samples_alleles_inserted_row_counter += 1

                # print the counters
                xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d} - vcf_variants ... {vcf_variants_inserted_row_counter:8d} - vcf_alleles ... {vcf_alleles_inserted_row_counter:8d} - vcf_samples_alleles ... {vcf_samples_alleles_inserted_row_counter:8d}')

                # read the next record of the input VCF file
                (record, _key, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

    xlib.Message.print('verbose', '\n')

    # create the index "vcf_variants_index" on the table "vcf_variants"
    xlib.Message.print('verbose', 'Creating the index on the table "vcf_variants" ...\n')
    xsqlite.create_vcf_variants_index(conn)
    xlib.Message.print('verbose', 'The index is created.\n')

    # create the index "vcf_alleles_index" on the table "vcf_alleles"
    xlib.Message.print('verbose', 'Creating the index on the table "vcf_alleles" ...\n')
    xsqlite.create_vcf_alleles_index(conn)
    xlib.Message.print('verbose', 'The index is created.\n')

    # create the index "vcf_samples_alleles_index" on the table "vcf_samples_alleles"
    xlib.Message.print('verbose', 'Creating the index on the table "vcf_samples_alleles" ...\n')
    xsqlite.create_vcf_samples_alleles_index(conn)
    xlib.Message.print('verbose', 'The index is created.\n')

    # save changes into NGShelper database
    xlib.Message.print('verbose', 'Saving changes into NGShelper database ...\n')
    conn.commit()
    xlib.Message.print('verbose', 'Changes are saved.\n')
コード例 #18
0
def check_args(args):
    '''
    Verify the input arguments.

    Every problem found is printed as an error message. The numeric arguments
    that pass their check are converted to "float" or "int" in "args", the
    verbose and trace arguments get their default value when they are not
    indicated, and the verbose and trace status are enabled when the
    corresponding argument is "Y" (in upper or lower case).

    If any problem is found, the exception "xlib.ProgramException" with the
    code "P001" is raised once all the arguments have been checked.
    '''

    # number of problems found (0 means that the arguments are right)
    problem_counter = 0

    def report(message):
        # print the problem now, so the messages keep the order of the checks,
        # and take it into account for the final exception
        nonlocal problem_counter
        xlib.Message.print('error', message)
        problem_counter += 1

    # "fasta_file": it is mandatory and it has to be an existing file
    if args.fasta_file is None:
        report(
            '*** The input FASTA file is not indicated in the input arguments.'
        )
    elif not os.path.isfile(args.fasta_file):
        report(f'*** The file {args.fasta_file} does not exist.')

    # "output_file": it is mandatory
    if args.output_file is None:
        report(
            '*** The FASTA file with debased sequences is not indicated in the input arguments.'
        )

    # "fragmentation_probability": mandatory float number in the allowed range
    if args.fragmentation_probability is None:
        report(
            '*** The fragmentation probability is not indicated in the input arguments.'
        )
    elif xlib.check_float(args.fragmentation_probability,
                          minimum=xlib.Const.FRAGPROB_LOWEST,
                          maximum=xlib.Const.FRAGPROB_UPPEST):
        args.fragmentation_probability = float(args.fragmentation_probability)
    else:
        report(
            f'The fragmentation probability has to be a float number between {xlib.Const.FRAGPROB_LOWEST} and {xlib.Const.FRAGPROB_UPPEST}.'
        )

    # "max_fragment_number": mandatory integer number in the allowed range
    if args.max_fragment_number is None:
        report(
            '*** The maximum fragment number is not indicated in the input arguments.'
        )
    elif xlib.check_int(args.max_fragment_number,
                        minimum=xlib.Const.MAXFRAGNUM_LOWEST,
                        maximum=xlib.Const.MAXFRAGNUM_UPPEST):
        args.max_fragment_number = int(args.max_fragment_number)
    else:
        report(
            f'The maximum fragment number has to be a integer number between {xlib.Const.MAXFRAGNUM_LOWEST} and {xlib.Const.MAXFRAGNUM_UPPEST}.'
        )

    # "max_end_shortening": mandatory integer number in the allowed range
    if args.max_end_shortening is None:
        report(
            '*** The maximum shortening of a fragment end is not indicated in the input arguments.'
        )
    elif xlib.check_int(args.max_end_shortening,
                        minimum=xlib.Const.MAXSHORTENING_LOWEST,
                        maximum=xlib.Const.MAXSHORTENING_UPPEST):
        args.max_end_shortening = int(args.max_end_shortening)
    else:
        report(
            f'The maximum shortening of a fragment end has to be a integer number between {xlib.Const.MAXSHORTENING_LOWEST} and {xlib.Const.MAXSHORTENING_UPPEST}.'
        )

    # "min_fragment_length": mandatory integer number with a minimum of 1
    if args.min_fragment_length is None:
        report(
            '*** The minimum fragment length is not indicated in the input arguments.'
        )
    elif xlib.check_int(args.min_fragment_length, minimum=1):
        args.min_fragment_length = int(args.min_fragment_length)
    else:
        report(
            'The minimum fragment length has to be a integer number greater than 0.'
        )

    # "mutation_probability": mandatory float number in the allowed range
    if args.mutation_probability is None:
        report(
            '*** The mutation probability is not indicated in the input arguments.'
        )
    elif xlib.check_float(args.mutation_probability,
                          minimum=xlib.Const.MUTPROB_LOWEST,
                          maximum=xlib.Const.MUTPROB_UPPEST):
        args.mutation_probability = float(args.mutation_probability)
    else:
        report(
            f'The mutation probability has to be a float number between {xlib.Const.MUTPROB_LOWEST} and {xlib.Const.MUTPROB_UPPEST}'
        )

    # "max_mutation_number": mandatory integer number in the allowed range
    if args.max_mutation_number is None:
        report(
            '*** The maximum mutation number is not indicated in the input arguments.'
        )
    elif xlib.check_int(args.max_mutation_number,
                        minimum=xlib.Const.MAXMUTNUM_LOWEST,
                        maximum=xlib.Const.MAXMUTNUM_UPPEST):
        args.max_mutation_number = int(args.max_mutation_number)
    else:
        report(
            f'The maximum mutation number has to be a integer number between {xlib.Const.MAXMUTNUM_LOWEST} and {xlib.Const.MAXMUTNUM_UPPEST}.'
        )

    # "indel_probability": mandatory float number in the allowed range
    if args.indel_probability is None:
        report(
            '*** The insertion/deletion probability is not indicated in the input arguments.'
        )
    elif xlib.check_float(args.indel_probability,
                          minimum=xlib.Const.INDELPROB_LOWEST,
                          maximum=xlib.Const.INDELPROB_UPPEST):
        args.indel_probability = float(args.indel_probability)
    else:
        report(
            f'The insertion/deletion probability has to be a float number between {xlib.Const.INDELPROB_LOWEST} and {xlib.Const.INDELPROB_UPPEST}.'
        )

    # "max_mutation_size": mandatory integer number in the allowed range
    if args.max_mutation_size is None:
        report(
            '*** The maximum mutation size size is not indicated in the input arguments.'
        )
    elif xlib.check_int(args.max_mutation_size,
                        minimum=xlib.Const.MAXMUTSIZE_LOWEST,
                        maximum=xlib.Const.MAXMUTSIZE_UPPEST):
        args.max_mutation_size = int(args.max_mutation_size)
    else:
        report(
            f'The maximum mutation size size has to be a integer number between {xlib.Const.MAXMUTSIZE_LOWEST} and {xlib.Const.MAXMUTSIZE_UPPEST}.'
        )

    # "verbose": optional code; the default is used when it is not indicated
    if args.verbose is None:
        args.verbose = xlib.Const.DEFAULT_VERBOSE
    elif args.verbose.upper() not in get_verbose_code_list():
        report(f'The verbose has to be {get_verbose_code_list_text()}.')
    # the status is enabled before the next check, so it applies to the messages that follow
    if args.verbose.upper() == 'Y':
        xlib.Message.set_verbose_status(True)

    # "trace": optional code; the default is used when it is not indicated
    if args.trace is None:
        args.trace = xlib.Const.DEFAULT_TRACE
    elif args.trace.upper() not in get_trace_code_list():
        report(f'The trace has to be {get_trace_code_list_text()}.')
    if args.trace.upper() == 'Y':
        xlib.Message.set_trace_status(True)

    # if there are errors, exit with exception
    if problem_counter > 0:
        raise xlib.ProgramException('', 'P001')
コード例 #19
0
ファイル: xhtseq.py プロジェクト: GGFHF/NGScloud2
def check_htseq_count_config_file(strict):
    '''
    Check the htseq-count config file of a run.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # intitialize variable used when value is not found
    not_found = '***NOTFOUND***'.upper()

    # get the option dictionary
    try:
        htseq_count_option_dict = xlib.get_option_dict(
            get_htseq_count_config_file())
    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append(
            '*** ERROR: The option dictionary could not be built from the config file'
        )
        OK = False
    else:

        # get the sections list
        sections_list = []
        for section in htseq_count_option_dict.keys():
            sections_list.append(section)
        sections_list.sort()

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append(
                '*** ERROR: the section "identification" is not found.')
            OK = False
        else:

            # check section "identification" - key "experiment_id"
            experiment_id = htseq_count_option_dict.get(
                'identification', {}).get('experiment_id', not_found)
            if experiment_id == not_found:
                error_list.append(
                    '*** ERROR: the key "experiment_id" is not found in the section "identification".'
                )
                OK = False

            # check section "identification" - key "reference_dataset_id"
            reference_dataset_id = htseq_count_option_dict.get(
                'identification', {}).get('reference_dataset_id', not_found)
            if reference_dataset_id == not_found:
                error_list.append(
                    '*** ERROR: the key "reference_dataset_id" is not found in the section "identification".'
                )
                OK = False

            # check section "identification" - key "annotation_file"
            annotation_file = htseq_count_option_dict.get(
                'identification', {}).get('annotation_file', not_found)
            if annotation_file == not_found:
                error_list.append(
                    '*** ERROR: the key "annotation_file" is not found in the section "identification".'
                )
                OK = False
            elif os.path.splitext(annotation_file)[1] not in ['.gtf', '.gff']:
                error_list.append(
                    '*** ERROR: the key "annotation_file" has to be a file name with .gtf/.gff extension.'
                )
                OK = False

        # check section "alignment-dataset-1"
        if 'alignment-dataset-1' not in sections_list:
            error_list.append(
                '*** ERROR: the section "alignment-dataset-1" is not found.')
            OK = False

        # check all sections "alignment-dataset-n"
        for section in sections_list:

            if section not in ['identification', 'htseq-count parameters']:

                # check than the section identification is like alignment-dataset-n
                if not re.match('^alignment-dataset-[0-9]+$', section):
                    error_list.append(
                        f'*** ERROR: the section "{section}" has a wrong identification.'
                    )
                    OK = False

                else:

                    # check section "alignment-dataset-n" - key "alignment_software"
                    alignment_software = htseq_count_option_dict.get(
                        section, {}).get('alignment_software', not_found)
                    if alignment_software == not_found:
                        error_list.append(
                            f'*** ERROR: the key "alignment_software" is not found in the section "{section}".'
                        )
                        OK = False
                    elif not xlib.check_code(
                            alignment_software,
                            get_alignment_software_code_list(),
                            case_sensitive=False):
                        error_list.append(
                            f'*** ERROR: the key "alignment_software" has to be {get_alignment_software_code_list_text()}.'
                        )
                        OK = False

                    # check section "alignment-dataset-n" - key "alignment_dataset_id"
                    alignment_dataset_id = htseq_count_option_dict.get(
                        section, {}).get('alignment_dataset_id', not_found)
                    if alignment_dataset_id == not_found:
                        error_list.append(
                            f'*** ERROR: the key "alignment_dataset_id" is not found in the section "{section}".'
                        )
                        OK = False
                    elif not xlib.check_startswith(
                            alignment_dataset_id,
                            get_alignment_software_code_list(),
                            case_sensitive=True):
                        error_list.append(
                            f'*** ERROR: the key "alignment_dataset_id" has to start with {get_alignment_software_code_list_text()}.'
                        )
                        OK = False

        # check section "htseq-count parameters"
        if 'htseq-count parameters' not in sections_list:
            error_list.append(
                '*** ERROR: the section "htseq-count parameters" is not found.'
            )
            OK = False
        else:

            # check section "htseq-count parameters" - key "nprocesses"
            nprocesses = htseq_count_option_dict.get(
                'htseq-count parameters', {}).get('nprocesses', not_found)
            if nprocesses == not_found:
                error_list.append(
                    '*** ERROR: the key "nprocesses" is not found in the section "htseq-count parameters".'
                )
                OK = False
            elif not xlib.check_int(nprocesses, minimum=1):
                error_list.append(
                    '*** ERROR: the key "nprocesses" has to be an integer number greater than or equal to 1.'
                )
                OK = False

            # check section "htseq-count parameters" - key "stranded"
            stranded = htseq_count_option_dict.get('htseq-count parameters',
                                                   {}).get(
                                                       'stranded', not_found)
            if stranded == not_found:
                error_list.append(
                    '*** ERROR: the key "stranded" is not found in the section "htseq-count parameters".'
                )
                OK = False
            elif not xlib.check_code(
                    stranded, get_stranded_code_list(), case_sensitive=False):
                error_list.append(
                    f'*** ERROR: the key "stranded" has to be {get_stranded_code_list_text()}.'
                )
                OK = False

            # check section "htseq-count parameters" - key "minaqual"
            minaqual = htseq_count_option_dict.get('htseq-count parameters',
                                                   {}).get(
                                                       'minaqual', not_found)
            if minaqual == not_found:
                error_list.append(
                    '*** ERROR: the key "minaqual" is not found in the section "htseq-count parameters".'
                )
                OK = False
            elif not xlib.check_int(minaqual):
                error_list.append(
                    '*** ERROR: the key "minaqual" has to be an integer number.'
                )
                OK = False

            # check section "htseq-count parameters" - key "type"
            type = htseq_count_option_dict.get('htseq-count parameters',
                                               {}).get('type', not_found)
            if type == not_found:
                error_list.append(
                    '*** ERROR: the key "type" is not found in the section "htseq-count parameters".'
                )
                OK = False

            # check section "htseq-count parameters" - key "idattr"
            idattr = htseq_count_option_dict.get('htseq-count parameters',
                                                 {}).get('idattr', not_found)
            if idattr == not_found:
                error_list.append(
                    '*** ERROR: the key "idattr" is not found in the section "htseq-count parameters".'
                )
                OK = False

            # check section "htseq-count parameters" - key "mode"
            mode = htseq_count_option_dict.get('htseq-count parameters',
                                               {}).get('mode', not_found)
            if mode == not_found:
                error_list.append(
                    '*** ERROR: the key "mode" is not found in the section "htseq-count parameters".'
                )
                OK = False
            elif not xlib.check_code(
                    mode, get_mode_code_list(), case_sensitive=False):
                error_list.append(
                    f'*** ERROR: the key "mode" has to be {get_mode_code_list_text()}.'
                )
                OK = False

            # check section "htseq-count parameters" - key "nonunique"
            nonunique = htseq_count_option_dict.get('htseq-count parameters',
                                                    {}).get(
                                                        'nonunique', not_found)
            if nonunique == not_found:
                error_list.append(
                    '*** ERROR: the key "nonunique" is not found in the section "htseq-count parameters".'
                )
                OK = False
            elif not xlib.check_code(nonunique,
                                     get_nonunique_code_list(),
                                     case_sensitive=False):
                error_list.append(
                    f'*** ERROR: the key "nonunique" has to be {get_nonunique_code_list_text()}.'
                )
                OK = False

            # check section "htseq-count parameters" - key "other_parameters"
            not_allowed_parameters_list = [
                'nprocesses', 'format', 'stranded', 'minaqual', 'type',
                'idattr', 'mode', 'nonunique', 'quiet'
            ]
            other_parameters = htseq_count_option_dict.get(
                'htseq-count parameters', {}).get('other_parameters',
                                                  not_found)
            if other_parameters == not_found:
                error_list.append(
                    '*** ERROR: the key "other_parameters" is not found in the section "htseq-count parameters".'
                )
                OK = False
            elif other_parameters.upper() != 'NONE':
                (OK, error_list2) = xlib.check_parameter_list(
                    other_parameters, "other_parameters",
                    not_allowed_parameters_list)
                error_list = error_list + error_list2

    # warn that the results config file is not valid if there are any errors
    if not OK:
        error_list.append(
            f'\nThe {xlib.get_htseq_count_name()} config file is not valid. Please, correct this file or recreate it.'
        )

    # return the control variable and the error list
    return (OK, error_list)
コード例 #20
0
def check_args(args):
    '''
    Check the input arguments.

    Every argument is validated and every problem found is reported through
    xlib.Message.print() before failing, so the user sees all the errors in a
    single run.

    Side effects on "args":
      - "nucleotide_number" is set to xlib.Const.DEFAULT_NUCLEOTIDE_NUMBER when
        it is not indicated; otherwise, when valid, it is converted to int.
      - "verbose" and "trace" are set to their xlib.Const defaults when they
        are not indicated.

    Side effects on xlib.Message:
      - the verbose status is enabled when "verbose" is "Y" (any case).
      - the trace status is enabled when "trace" is "Y" (any case).

    Raises xlib.ProgramException (code "P001") when any check fails.
    '''

    # initialize the control variable
    OK = True

    # check "vcf_file" (it has to be indicated and it has to exist)
    if args.vcf_file is None:
        xlib.Message.print(
            'error',
            '*** The input VCF file is not indicated in the input arguments.')
        OK = False
    elif not os.path.isfile(args.vcf_file):
        xlib.Message.print('error',
                           f'*** The file {args.vcf_file} does not exist.')
        OK = False

    # check "genome_file" (it has to be indicated and it has to exist)
    if args.genome_file is None:
        xlib.Message.print(
            'error',
            '*** The FASTA genome file is not indicated in the input arguments.'
        )
        OK = False
    elif not os.path.isfile(args.genome_file):
        xlib.Message.print('error',
                           f'*** The file {args.genome_file} does not exist.')
        OK = False

    # check "flanking_region_file" (only its presence is verified, not its
    # existence in the file system)
    if args.flanking_region_file is None:
        xlib.Message.print(
            'error',
            '*** The FASTA file with flanking regions is not indicated in the input arguments.'
        )
        OK = False

    # check "nucleotide_number" (default when missing; otherwise an integer >= 1)
    if args.nucleotide_number is None:
        args.nucleotide_number = xlib.Const.DEFAULT_NUCLEOTIDE_NUMBER
    elif not xlib.check_int(args.nucleotide_number, minimum=1):
        # the message names the argument really being checked
        xlib.Message.print(
            'error',
            '*** The nucleotide number has to be an integer number greater than or equal to 1.'
        )
        OK = False
    else:
        args.nucleotide_number = int(args.nucleotide_number)

    # check "verbose" (default when missing; otherwise one of the allowed codes)
    if args.verbose is None:
        args.verbose = xlib.Const.DEFAULT_VERBOSE
    elif not xlib.check_code(
            args.verbose, xlib.get_verbose_code_list(), case_sensitive=False):
        xlib.Message.print(
            'error',
            f'*** verbose has to be {xlib.get_verbose_code_list_text()}.')
        OK = False
    if args.verbose.upper() == 'Y':
        xlib.Message.set_verbose_status(True)

    # check "trace" (default when missing; otherwise one of the allowed codes)
    if args.trace is None:
        args.trace = xlib.Const.DEFAULT_TRACE
    elif not xlib.check_code(
            args.trace, xlib.get_trace_code_list(), case_sensitive=False):
        xlib.Message.print(
            'error', f'*** trace has to be {xlib.get_trace_code_list_text()}.')
        OK = False
    if args.trace.upper() == 'Y':
        xlib.Message.set_trace_status(True)

    # if there are errors, exit with exception
    if not OK:
        raise xlib.ProgramException('', 'P001')