def input_int(text, default=None, minimum=(-sys.maxsize - 1), maximum=sys.maxsize):
    '''
    Ask the user for an integer number on the console until a valid one is typed.

    The prompt is "text: " or, when a default value is given, "text [default]: ".
    With a default value, an empty answer selects the default. Every answer is
    validated with xlib.check_int against the range [minimum, maximum]; an
    invalid answer prints an error message and the question is asked again.

    Returns the accepted answer converted to int.
    '''

    # the prompt only mentions the default value when there is one
    if default is None:
        prompt = f'{text}: '
    else:
        prompt = f'{text} [{default}]: '

    # keep asking until the answer passes the validation
    while True:
        answer = input(prompt)

        # an empty answer means "use the default" (only when a default exists)
        if default is not None and answer == '':
            answer = default

        if xlib.check_int(answer, minimum, maximum):
            return int(answer)

        print(f'*** ERROR: {answer} is not a valid value.')
def check_args(args):
    '''
    Validate the input arguments and replace the missing optional ones by their defaults.

    Every problem found is printed as an 'error' message through
    xlib.Message.print. Optional arguments are normalized in place in "args"
    ("hybrid_id", "variant_number_per_file", "allele_transformation", "verbose",
    "trace" and "tvi_list"). If at least one problem was found,
    xlib.ProgramException with code 'P001' is raised at the end.
    '''

    # number of problems detected so far
    error_count = 0

    def report(message):
        ''' Print an error message and take note that the arguments are not valid. '''
        nonlocal error_count
        xlib.Message.print('error', message)
        error_count += 1

    # "vcf_file": mandatory and it has to be an existing file
    if args.vcf_file is None:
        report('*** The VCF file is not indicated in the input arguments.')
    elif not os.path.isfile(args.vcf_file):
        report(f'*** The file {args.vcf_file} does not exist.')

    # "sample_file": mandatory and it has to be an existing file
    if args.sample_file is None:
        report('*** The sample file is not indicated in the input arguments.')
    elif not os.path.isfile(args.sample_file):
        report(f'*** The file {args.sample_file} does not exist.')

    # "sp1_id": mandatory
    if args.sp1_id is None:
        report('*** The identification of the first species is not indicated in the input arguments.')

    # "sp2_id": mandatory
    if args.sp2_id is None:
        report('*** The identification of the second species is not indicated in the input arguments.')

    # "hybrid_id": optional
    if args.hybrid_id is None:
        args.hybrid_id = 'NONE'

    # "output_dir": mandatory and it has to be an existing directory
    if args.output_dir is None:
        report('*** The output directy is not indicated in the input arguments.')
    elif not os.path.isdir(args.output_dir):
        report('*** The output directy does not exist.')

    # "variant_number_per_file": optional; when present it is converted to int
    if args.variant_number_per_file is None:
        args.variant_number_per_file = xlib.Const.DEFAULT_VARIANT_NUMBER_PER_FILE
    elif xlib.check_int(args.variant_number_per_file, minimum=1):
        args.variant_number_per_file = int(args.variant_number_per_file)
    else:
        report('The variant number per file has to be an integer number greater than 0.')

    # "allele_transformation": optional; when present it is stored in upper case
    if args.allele_transformation is None:
        args.allele_transformation = 'NONE'
    elif xlib.check_code(args.allele_transformation, xlib.get_allele_transformation_code_list(), case_sensitive=False):
        args.allele_transformation = args.allele_transformation.upper()
    else:
        report(f'*** The allele transformation has to be {xlib.get_allele_transformation_code_list_text()}.')

    # "verbose": optional; the verbose status is switched on when its value is "Y"
    if args.verbose is None:
        args.verbose = xlib.Const.DEFAULT_VERBOSE
    elif not xlib.check_code(args.verbose, xlib.get_verbose_code_list(), case_sensitive=False):
        report(f'*** verbose has to be {xlib.get_verbose_code_list_text()}.')
    if args.verbose.upper() == 'Y':
        xlib.Message.set_verbose_status(True)

    # "trace": optional; the trace status is switched on when its value is "Y"
    if args.trace is None:
        args.trace = xlib.Const.DEFAULT_TRACE
    elif not xlib.check_code(args.trace, xlib.get_trace_code_list(), case_sensitive=False):
        report(f'*** trace has to be {xlib.get_trace_code_list_text()}.')
    if args.trace.upper() == 'Y':
        xlib.Message.set_trace_status(True)

    # "tvi_list": optional; the literal is converted to a list of strings
    if args.tvi_list is None or args.tvi_list == 'NONE':
        args.tvi_list = []
    else:
        args.tvi_list = xlib.split_literal_to_string_list(args.tvi_list)

    # the identifications are compared only when every previous check was passed
    if error_count == 0:
        same_species = args.sp1_id == args.sp2_id
        hybrid_clash = args.hybrid_id is not None and \
            (args.sp1_id == args.hybrid_id or args.sp2_id == args.hybrid_id)
        if same_species or hybrid_clash:
            report('The identifications must be different.')

    # any error ends with an exception
    if error_count > 0:
        raise xlib.ProgramException('', 'P001')
def check_args(args):
    '''
    Validate the input arguments and replace the missing optional ones by their defaults.

    Every problem found is printed as an 'error' message through
    xlib.Message.print. "header_row_number" is converted to int and the
    missing "verbose" and "trace" arguments get their default values in place
    in "args". If at least one problem was found, xlib.ProgramException with
    code 'P001' is raised at the end.
    '''

    # number of problems detected so far
    error_count = 0

    def report(message):
        ''' Print an error message and take note that the arguments are not valid. '''
        nonlocal error_count
        xlib.Message.print('error', message)
        error_count += 1

    # "simhyb_file": mandatory and it has to be an existing file
    if args.simhyb_file is None:
        report('*** The SimHyb file is not indicated in the input arguments.')
    elif not os.path.isfile(args.simhyb_file):
        report(f'*** The file {args.simhyb_file} does not exist.')

    # "header_row_number": mandatory; when valid it is converted to int
    if args.header_row_number is None:
        report('*** The header row number in the SimHyb file is not indicated in the input arguments.')
    elif xlib.check_int(args.header_row_number, minimum=0):
        args.header_row_number = int(args.header_row_number)
    else:
        report('The header row number in the SimHyb file has to be an integer number greater than or equalt to 0.')

    # "structure_file": mandatory
    if args.structure_file is None:
        report('*** The converted Structure file is not indicated in the input arguments.')

    # "verbose": optional; the verbose status is switched on when its value is "Y"
    if args.verbose is None:
        args.verbose = xlib.Const.DEFAULT_VERBOSE
    elif not xlib.check_code(args.verbose, xlib.get_verbose_code_list(), case_sensitive=False):
        report(f'*** verbose has to be {xlib.get_verbose_code_list_text()}.')
    if args.verbose.upper() == 'Y':
        xlib.Message.set_verbose_status(True)

    # "trace": optional; the trace status is switched on when its value is "Y"
    if args.trace is None:
        args.trace = xlib.Const.DEFAULT_TRACE
    elif not xlib.check_code(args.trace, xlib.get_trace_code_list(), case_sensitive=False):
        report(f'*** trace has to be {xlib.get_trace_code_list_text()}.')
    if args.trace.upper() == 'Y':
        xlib.Message.set_trace_status(True)

    # any error ends with an exception
    if error_count > 0:
        raise xlib.ProgramException('', 'P001')
def convert_vcf_to_structure(vcf_file, sample_file, sp1_id, sp2_id, hybrid_id, imputed_md_id, new_md_id, allele_transformation, structure_file_type, output_converted_file, tvi_list):
    '''
    Convert a VCF file to the Structure input format.

    The whole VCF file is loaded first (it is read as gzip text when its name
    ends in ".gz"). Then a tab-separated file is written with a header record
    ("sample_id", "species_id" and one column per variant identified as
    "<chrom>-<pos>") followed by two records per sample: the first one with the
    left alleles of its genotypes and the second one with the right alleles.
    The species column holds 1 for sp1_id, 2 for sp2_id and 3 in any other case.

    Parameters:
        vcf_file: path of the input VCF file.
        sample_file, sp1_id, sp2_id, hybrid_id: passed to xlib.get_sample_data
            to get the species identification of every sample.
        imputed_md_id: allele value of imputed missing data; when
            structure_file_type is '1', the variants with any allele equal to
            this value are left out of the output.
        new_md_id: text written instead of the alleles equal to
            xlib.get_md_symbol().
        allele_transformation: when it is 'ADD100', 100 is added to every
            allele accepted by xlib.check_int.
        structure_file_type: type of the converted file (only '1' is handled
            in a special way here).
        output_converted_file: path of the output file (written as gzip text
            when its name ends in ".gz").
        tvi_list: identifications of the variants whose genotypes are traced.

    Raises:
        xlib.ProgramException: 'F001'/'F002' when the VCF file cannot be
            opened, 'F003'/'F004' when the output file cannot be opened, 'L002'
            when a sample of the VCF file is not in the sample data, 'L003'
            when the column description record has no samples, 'L007' when
            FORMAT has no GT subfield and 'L008' when a genotype has neither
            "/" nor "|".
    '''

    # initialize the sample number and the sample information list
    sample_number = 0
    sample_info_list = []

    # initialize the variant code list
    variant_code_list = []

    # initialize the matrices (rows: variants; columns: samples) on left and right sides of genotypes
    gt_left_matrix = []
    gt_right_matrix = []

    # get the sample data
    sample_dict = xlib.get_sample_data(sample_file, sp1_id, sp2_id, hybrid_id)

    # get the missing data symbol once (it was requested for every allele before)
    md_symbol = xlib.get_md_symbol()

    # open the VCF file
    if vcf_file.endswith('.gz'):
        try:
            vcf_file_id = gzip.open(vcf_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', vcf_file)
    else:
        try:
            vcf_file_id = open(vcf_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', vcf_file)

    # initialize counters
    record_counter = 0
    variant_counter = 0

    def print_counters():
        ''' Print the current value of the counters. '''
        xlib.Message.print('verbose', f'\rProcessed VCF records ... {record_counter:8d} - Variants ... {variant_counter:8d}')

    # the "finally" clause guarantees that the VCF file is closed even if a record is wrong
    try:

        # read the first record of VCF file
        (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

        # while there are records in the VCF file
        while record != '':

            # process metadata records
            while record != '' and record.startswith('##'):
                record_counter += 1
                print_counters()
                (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

            # process the column description record
            if record.startswith('#CHROM'):

                record_counter += 1

                # build the sample information list (the samples start at the tenth column)
                for sample_id in data_dict['record_data_list'][9:]:
                    try:
                        species_id = sample_dict[sample_id]['species_id']
                    except Exception as e:
                        raise xlib.ProgramException(e, 'L002', sample_id)
                    if species_id == sp1_id:
                        numeric_species_id = 1
                    elif species_id == sp2_id:
                        numeric_species_id = 2
                    else:
                        numeric_species_id = 3
                    sample_info_list.append([sample_id, numeric_species_id])

                # check if the sample information list is empty
                if not sample_info_list:
                    raise xlib.ProgramException('', 'L003')

                # set the sample number
                sample_number = len(sample_info_list)

                print_counters()
                (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

            # process variant records
            while record != '' and not record.startswith(('##', '#CHROM')):

                # set the variant identification and append it to the variant code list
                variant_id = f'{data_dict["chrom"]}-{data_dict["pos"]}'
                variant_code_list.append(variant_id)

                record_counter += 1
                variant_counter += 1

                # get the position of the genotype (subfield GT) in the field FORMAT
                format_subfield_list = data_dict['format'].upper().split(':')
                try:
                    gt_position = format_subfield_list.index('GT')
                except Exception as e:
                    raise xlib.ProgramException(e, 'L007', 'GT', data_dict['chrom'], data_dict['pos'])

                # build the list of sample genotypes of a variant
                sample_gt_list = [data_dict['sample_list'][i].split(':')[gt_position] for i in range(sample_number)]
                if variant_id in tvi_list:
                    xlib.Message.print('trace', f'(4) sample_gt_list: {sample_gt_list}')

                # build the lists of the left and right side of sample genotypes of a variant
                sample_gt_left_list = []
                sample_gt_right_list = []
                for sample_gt in sample_gt_list:
                    # "/" has priority over "|" as separator, wherever each of them is
                    sep = '/' if '/' in sample_gt else '|'
                    (allele_left, found_sep, allele_right) = sample_gt.partition(sep)
                    if not found_sep:
                        raise xlib.ProgramException('', 'L008', 'GT', data_dict['chrom'], data_dict['pos'])
                    sample_gt_left_list.append(new_md_id if allele_left == md_symbol else allele_left)
                    sample_gt_right_list.append(new_md_id if allele_right == md_symbol else allele_right)

                # append a row to the matrices of left and right sides of genotypes
                gt_left_matrix.append(sample_gt_left_list)
                gt_right_matrix.append(sample_gt_right_list)

                print_counters()
                (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

        xlib.Message.print('verbose', '\n')

    finally:
        # close the VCF file
        vcf_file_id.close()

    # review the imputed missing data when the type of the converted file is 1
    if structure_file_type == '1':

        # detect variants with any imputed missing data
        excluded_variant_index_list = [
            i for i in range(len(gt_left_matrix))
            if any(gt_left_matrix[i][j] == imputed_md_id or gt_right_matrix[i][j] == imputed_md_id for j in range(sample_number))
        ]
        xlib.Message.print('trace', f'excluded_variant_index_list: {excluded_variant_index_list}')

        # remove data of these variants (from the end, so the pending indexes are still valid)
        for k in reversed(excluded_variant_index_list):
            variant_code_list.pop(k)
            gt_left_matrix.pop(k)
            gt_right_matrix.pop(k)

    def transform_allele(allele):
        ''' Add 100 to an integer allele when the transformation ADD100 is requested. '''
        # NOTE(review): an integer-like new_md_id is also transformed here -- confirm that it is intended
        if xlib.check_int(allele) and allele_transformation == 'ADD100':
            return str(int(allele) + 100)
        return allele

    # open the output converted file
    if output_converted_file.endswith('.gz'):
        try:
            output_converted_file_id = gzip.open(output_converted_file, mode='wt', encoding='iso-8859-1', newline='\n')
        except Exception as e:
            raise xlib.ProgramException(e, 'F004', output_converted_file)
    else:
        try:
            output_converted_file_id = open(output_converted_file, mode='w', encoding='iso-8859-1', newline='\n')
        except Exception as e:
            raise xlib.ProgramException(e, 'F003', output_converted_file)

    # the "with" statement closes the output file even if a write operation fails
    with output_converted_file_id:

        # write header record
        variant_code_list_text = '\t'.join(variant_code_list)
        output_converted_file_id.write(f'sample_id\tspecies_id\t{variant_code_list_text}\n')

        # write two records per sample (the sample i is the column i of the matrices)
        for i, (sample_id, numeric_species_id) in enumerate(sample_info_list):

            # write the first record of the sample (left side of genotypes)
            sample_variant_gt_left_list_text = '\t'.join(transform_allele(variant_row[i]) for variant_row in gt_left_matrix)
            output_converted_file_id.write(f'{sample_id}\t{numeric_species_id}\t{sample_variant_gt_left_list_text}\n')

            # write the second record of the sample (right side of genotypes)
            sample_variant_gt_right_list_text = '\t'.join(transform_allele(variant_row[i]) for variant_row in gt_right_matrix)
            output_converted_file_id.write(f'{sample_id}\t{numeric_species_id}\t{sample_variant_gt_right_list_text}\n')

    # print OK message
    xlib.Message.print('info', f'The converted file {os.path.basename(output_converted_file)} is created.')
def check_busco_config_file(strict):
    '''
    Check the BUSCO config file of a run.

    The option dictionary is built from the file returned by
    get_busco_config_file() and the sections "identification" and
    "BUSCO parameters" are reviewed key by key. Every problem found adds a
    message to the error list.

    Parameters:
        strict: it is not used by this check.

    Returns:
        A tuple (OK, error_list): OK is True only when no check failed and
        error_list holds the messages of the problems found (with a final
        warning when the file is not valid).
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # initialize the value used when a key is not found
    not_found = '***NOTFOUND***'

    # get the option dictionary
    try:
        busco_option_dict = xlib.get_option_dict(get_busco_config_file())
    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append('*** ERROR: The option dictionary could not be built from the config file')
        OK = False
    else:

        def get_option(section, key):
            ''' Get the value of a key of a section or the not-found value. '''
            return busco_option_dict.get(section, {}).get(key, not_found)

        # get the sections list
        sections_list = sorted(busco_option_dict.keys())

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append('*** ERROR: the section "identification" is not found.')
            OK = False
        else:

            # check section "identification" - key "experiment_id"
            experiment_id = get_option('identification', 'experiment_id')
            if experiment_id == not_found:
                error_list.append('*** ERROR: the key "experiment_id" is not found in the section "identification".')
                OK = False

            # check section "identification" - key "assembly_software"
            assembly_software = get_option('identification', 'assembly_software')
            if assembly_software == not_found:
                error_list.append('*** ERROR: the key "assembly_software" is not found in the section "identification".')
                OK = False
            elif not xlib.check_code(assembly_software, get_assembly_software_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "assembly_software" has to be {get_assembly_software_code_list_text()}.')
                OK = False

            # check section "identification" - key "assembly_dataset_id"
            assembly_dataset_id = get_option('identification', 'assembly_dataset_id')
            if assembly_dataset_id == not_found:
                error_list.append('*** ERROR: the key "assembly_dataset_id" is not found in the section "identification".')
                OK = False
            elif not xlib.check_startswith(assembly_dataset_id, get_assembly_software_code_list(), case_sensitive=True):
                error_list.append(f'*** ERROR: the key "assembly_dataset_id" has to start with {get_assembly_software_code_list_text()}.')
                OK = False

            # check section "identification" - key "assembly_type"
            # (CONTIGS or SCAFFOLDS when the dataset starts with the SOAPdenovo-Trans code, NONE otherwise)
            assembly_type = get_option('identification', 'assembly_type')
            if assembly_type == not_found:
                error_list.append('*** ERROR: the key "assembly_type" is not found in the section "identification".')
                OK = False
            elif assembly_dataset_id.startswith(xlib.get_soapdenovotrans_code()) and assembly_type.upper() not in ['CONTIGS', 'SCAFFOLDS'] or \
                not assembly_dataset_id.startswith(xlib.get_soapdenovotrans_code()) and assembly_type.upper() != 'NONE':
                error_list.append(f'*** ERROR: the key "assembly_type" has to be CONTIGS or SCAFFOLDS in {xlib.get_soapdenovotrans_name()} or NONE in any other case.')
                OK = False

        # check section "BUSCO parameters"
        if 'BUSCO parameters' not in sections_list:
            error_list.append('*** ERROR: the section "BUSCO parameters" is not found.')
            OK = False
        else:

            # check section "BUSCO parameters" - key "ncpu"
            ncpu = get_option('BUSCO parameters', 'ncpu')
            if ncpu == not_found:
                error_list.append('*** ERROR: the key "ncpu" is not found in the section "BUSCO parameters".')
                OK = False
            elif not xlib.check_int(ncpu, minimum=1):
                error_list.append('*** ERROR: the key "ncpu" has to be an integer number greater than or equal to 1.')
                OK = False

            # check section "BUSCO parameters" - key "lineage_data_url"
            lineage_data_url = get_option('BUSCO parameters', 'lineage_data_url')
            if lineage_data_url == not_found:
                error_list.append('*** ERROR: the key "lineage_data_url" is not found in the section "BUSCO parameters"')
                OK = False
            else:
                try:
                    # the response is only opened to know if the address is reachable;
                    # the "with" statement closes it at once
                    with urllib.request.urlopen(lineage_data_url):
                        pass
                except Exception as e:
                    error_list.append(f'*** EXCEPTION: "{e}".')
                    error_list.append('*** ERROR: the key "lineage_data_url" has to be a reachable address.')
                    OK = False

            # check section "BUSCO parameters" - key "mode"
            mode = get_option('BUSCO parameters', 'mode')
            if mode == not_found:
                error_list.append('*** ERROR: the key "mode" is not found in the section "BUSCO parameters".')
                OK = False
            elif not xlib.check_code(mode, get_mode_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "mode" has to be {get_mode_code_list_text()}.')
                OK = False

            # check section "BUSCO parameters" - key "evalue"
            evalue = get_option('BUSCO parameters', 'evalue')
            if evalue == not_found:
                error_list.append('*** ERROR: the key "evalue" is not found in the section "BUSCO parameters".')
                OK = False
            elif not xlib.check_float(evalue, minimum=0., mne=1E-12):
                error_list.append('*** ERROR: the key "evalue" has to be a float number greater than 0.')
                OK = False

            # check section "BUSCO parameters" - key "limit"
            limit = get_option('BUSCO parameters', 'limit')
            if limit == not_found:
                error_list.append('*** ERROR: the key "limit" is not found in the section "BUSCO parameters".')
                OK = False
            elif not xlib.check_int(limit, minimum=1):
                error_list.append('*** ERROR: the key "limit" has to be an integer number greater than or equal to 1.')
                OK = False

            # check section "BUSCO parameters" - key "species"
            species = get_option('BUSCO parameters', 'species')
            if species == not_found:
                error_list.append('*** ERROR: the key "species" is not found in the section "BUSCO parameters"')
                OK = False

            # check section "BUSCO parameters" - key "long"
            long_value = get_option('BUSCO parameters', 'long')
            if long_value == not_found:
                error_list.append('*** ERROR: the key "long" is not found in the section "BUSCO parameters".')
                OK = False
            elif not xlib.check_code(long_value, get_long_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "long" has to be {get_long_code_list_text()}.')
                OK = False

            # check section "BUSCO parameters" - key "augustus_options"
            augustus_options = get_option('BUSCO parameters', 'augustus_options')
            if augustus_options == not_found:
                error_list.append('*** ERROR: the key "augustus_options" is not found in the section "BUSCO parameters".')
                OK = False
            elif augustus_options.upper() != 'NONE':
                # the result of this check must not overwrite the control variable:
                # a valid parameter list would hide the errors found before
                (is_ok_augustus_options, error_list2) = xlib.check_parameter_list(augustus_options, "augustus_options", [])
                if not is_ok_augustus_options:
                    OK = False
                error_list = error_list + error_list2

    # warn that the results config file is not valid if there are any errors
    if not OK:
        error_list.append(f'\nThe {xlib.get_busco_name()} config file is not valid. Please, correct this file or recreate it.')

    # return the control variable and the error list
    return (OK, error_list)
def convert_vcf_to_phase_input(vcf_file, sample_file, sp1_id, sp2_id, hybrid_id, imputed_md_id, allele_transformation, output_dir, tvi_list):
    '''
    Convert a VCF file to the PHASE input format.

    The VCF file is read as gzip text when its name ends in ".gz". The
    consecutive variant records with the same value of "chrom" make up a
    sequence, and one file "<VCF name>-2phase-<chrom>.txt" is written in
    output_dir per sequence with:
        - four header records: the sample number, the variant number, "P"
          followed by the variant positions and the multiallelic status of
          every variant ("M" when more than two different alleles are found,
          "S" otherwise);
        - three records per sample: "#" followed by the sample identification,
          the left alleles and the right alleles of its genotypes.
    The alleles "." are written as "?" in variants "S" and as "-1" in variants "M".

    Parameters:
        vcf_file: path of the input VCF file.
        sample_file, sp1_id, sp2_id, hybrid_id: passed to xlib.get_sample_data
            to get the species identification of every sample.
        imputed_md_id: it is not used by this conversion.
        allele_transformation: when it is 'ADD100', 100 is added to every
            allele accepted by xlib.check_int.
        output_dir: directory where the converted files are written.
        tvi_list: identifications ("<chrom>-<pos>") of the variants whose
            data are traced.

    Raises:
        xlib.ProgramException: 'F001'/'F002' when the VCF file cannot be
            opened, 'F003' when an output file cannot be opened, 'L002' when a
            sample of the VCF file is not in the sample data, 'L003' when the
            column description record has no samples, 'L007' when FORMAT has
            no GT subfield and 'L008' when a genotype has neither "/" nor "|".
    '''

    # initialize the sample number and the sample information list
    sample_number = 0
    sample_info_list = []

    # get the sample data
    sample_dict = xlib.get_sample_data(sample_file, sp1_id, sp2_id, hybrid_id)

    # initialize the sample species identification list
    species_id_list = []

    # get the missing data symbol once (it was requested for every allele before)
    md_symbol = xlib.get_md_symbol()

    # open the VCF file
    if vcf_file.endswith('.gz'):
        try:
            vcf_file_id = gzip.open(vcf_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', vcf_file)
    else:
        try:
            vcf_file_id = open(vcf_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', vcf_file)

    # get the VCF file name without ".gz" and without its extension (it starts the name of every output file)
    if vcf_file.endswith('.gz'):
        vcf_base_name = os.path.basename(vcf_file[:-3])
    else:
        vcf_base_name = os.path.basename(vcf_file)
    file_name = os.path.splitext(vcf_base_name)[0]

    # initialize counters
    seq_counter = 0
    variant_counter = 0
    record_counter = 0

    def print_counters():
        ''' Print the current value of the counters. '''
        xlib.Message.print('verbose', f'\rProcessed VCF records ... {record_counter:8d} - Seqs ... {seq_counter:8d} - Variants ... {variant_counter:8d}')

    def get_phase_allele(allele, multiallelic_status):
        ''' Get the text of an allele in the PHASE file according to the status of its variant. '''
        # NOTE(review): the literal "." is used here instead of xlib.get_md_symbol() -- confirm that both are the same
        if allele == '.' and multiallelic_status == 'S':
            return '?'
        if allele == '.' and multiallelic_status == 'M':
            return '-1'
        if xlib.check_int(allele) and allele_transformation == 'ADD100':
            return str(int(allele) + 100)
        return allele

    # the "finally" clause guarantees that the VCF file is closed even if a record is wrong
    try:

        # read the first record of VCF file
        (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

        # while there are records in the VCF file
        while record != '':

            # process metadata records
            while record != '' and record.startswith('##'):
                record_counter += 1
                print_counters()
                (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

            # process the column description record
            if record.startswith('#CHROM'):

                record_counter += 1

                # build the sample information list and the sample species list
                # (the samples start at the tenth column)
                for sample_id in data_dict['record_data_list'][9:]:
                    try:
                        species_id = sample_dict[sample_id]['species_id']
                    except Exception as e:
                        raise xlib.ProgramException(e, 'L002', sample_id)
                    if species_id == sp1_id:
                        numeric_species_id = 1
                    elif species_id == sp2_id:
                        numeric_species_id = 2
                    else:
                        numeric_species_id = 3
                    sample_info_list.append([sample_id, numeric_species_id])
                    species_id_list.append(species_id)

                # check if the sample species list is empty
                if not species_id_list:
                    raise xlib.ProgramException('', 'L003')

                # set the sample number
                sample_number = len(species_id_list)

                print_counters()
                (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

            # process variant records (every iteration handles a sequence)
            while record != '' and not record.startswith(('##', '#CHROM')):

                # add 1 to the sequence counter and restart the variant counter
                seq_counter += 1
                variant_counter = 0

                # save the sequence
                old_seq = data_dict['chrom']

                # initialize the list of variant positions
                variant_position_list = []

                # initialize the matrices (rows: variants; columns: samples) on left and right sides of genotypes
                gt_left_matrix = []
                gt_right_matrix = []

                # initialize the list of the variant multiallelic status
                variant_multiallelic_status_list = []

                # process variant records of the same sequence
                while record != '' and not record.startswith(('##', '#CHROM')) and data_dict['chrom'] == old_seq:

                    # set the identification of the current variant; it has to be done for
                    # every record so that the traces match the variants of "tvi_list"
                    variant_id = f'{data_dict["chrom"]}-{data_dict["pos"]}'
                    is_traced = variant_id in tvi_list

                    record_counter += 1
                    variant_counter += 1

                    # append position to the list of variant positions
                    variant_position_list.append(data_dict['pos'])

                    # get the position of the genotype (subfield GT) in the field FORMAT
                    format_subfield_list = data_dict['format'].upper().split(':')
                    try:
                        gt_position = format_subfield_list.index('GT')
                    except Exception as e:
                        raise xlib.ProgramException(e, 'L007', 'GT', data_dict['chrom'], data_dict['pos'])

                    # build the list of sample genotypes of a variant
                    sample_gt_list = [data_dict['sample_list'][i].split(':')[gt_position] for i in range(sample_number)]
                    if is_traced:
                        xlib.Message.print('trace', f'sample_gt_list: {sample_gt_list}')

                    # build the lists of the left and right side of sample genotypes of a variant
                    sample_gt_left_list = []
                    sample_gt_right_list = []
                    for sample_gt in sample_gt_list:
                        # "/" has priority over "|" as separator, wherever each of them is
                        sep = '/' if '/' in sample_gt else '|'
                        (allele_left, found_sep, allele_right) = sample_gt.partition(sep)
                        if not found_sep:
                            raise xlib.ProgramException('', 'L008', 'GT', data_dict['chrom'], data_dict['pos'])
                        sample_gt_left_list.append(allele_left)
                        sample_gt_right_list.append(allele_right)

                    # count the alleles of the variant (missing data are not counted)
                    allele_counter_dict = {}
                    for allele_pair in zip(sample_gt_left_list, sample_gt_right_list):
                        for allele in allele_pair:
                            if allele != md_symbol:
                                allele_counter_dict[allele] = allele_counter_dict.get(allele, 0) + 1
                    if is_traced:
                        xlib.Message.print('trace', f'allele_counter_dict: {allele_counter_dict}')

                    # check if the variant is multiallelic
                    variant_multiallelic_status = 'M' if len(allele_counter_dict) > 2 else 'S'
                    if is_traced:
                        xlib.Message.print('trace', f'variant_multiallelic_status: {variant_multiallelic_status}.')

                    # append a row to the matrices of left and right sides of genotypes
                    gt_left_matrix.append(sample_gt_left_list)
                    gt_right_matrix.append(sample_gt_right_list)

                    # append to the list of the variant multiallelic status
                    variant_multiallelic_status_list.append(variant_multiallelic_status)

                    print_counters()
                    (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

                # set and open the output converted file of the sequence
                # (its name always ends in ".txt", so it is never written as gzip)
                seq_output_converted_file = f'{output_dir}/{file_name}-2phase-{old_seq}.txt'
                try:
                    seq_output_converted_file_id = open(seq_output_converted_file, mode='w', encoding='iso-8859-1', newline='\n')
                except Exception as e:
                    raise xlib.ProgramException(e, 'F003', seq_output_converted_file)

                # the "with" statement closes the output file even if a write operation fails
                with seq_output_converted_file_id:

                    # write header records
                    variant_position_list_text = ' '.join(variant_position_list)
                    variant_multiallelic_status_list_text = ''.join(variant_multiallelic_status_list)
                    seq_output_converted_file_id.write(f'{sample_number}\n')
                    seq_output_converted_file_id.write(f'{len(variant_position_list)}\n')
                    seq_output_converted_file_id.write(f'P {variant_position_list_text}\n')
                    seq_output_converted_file_id.write(f'{variant_multiallelic_status_list_text}\n')

                    # write three records per sample (the sample i is the column i of the matrices)
                    for i, sample_info in enumerate(sample_info_list):

                        # build left and right side lists of variants of a sample
                        sample_variant_gt_left_list = [
                            get_phase_allele(variant_row[i], status)
                            for (variant_row, status) in zip(gt_left_matrix, variant_multiallelic_status_list)
                        ]
                        sample_variant_gt_right_list = [
                            get_phase_allele(variant_row[i], status)
                            for (variant_row, status) in zip(gt_right_matrix, variant_multiallelic_status_list)
                        ]
                        sample_variant_gt_left_list_text = ' '.join(sample_variant_gt_left_list)
                        sample_variant_gt_right_list_text = ' '.join(sample_variant_gt_right_list)

                        # write the records of the sample
                        seq_output_converted_file_id.write(f'#{sample_info[0]}\n')
                        seq_output_converted_file_id.write(f'{sample_variant_gt_left_list_text}\n')
                        seq_output_converted_file_id.write(f'{sample_variant_gt_right_list_text}\n')

                xlib.Message.print('verbose', '\n')

                # print OK message
                xlib.Message.print('info', f'The converted file {os.path.basename(seq_output_converted_file)} is created.')

    finally:
        # close VCF file
        vcf_file_id.close()
def check_cutadapt_config_file(strict):
    '''
    Check the cutadapt config file of a run.

    The option dictionary is built from the file returned by
    get_cutadapt_config_file() and every expected section and key is checked.

    Parameters:
        strict: not used by this function.

    Returns:
        A tuple (OK, error_list): OK is True when no error was detected and
        error_list holds the text of every detected error.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # initialize the variable used when a value is not found
    not_found = '***NOTFOUND***'.upper()

    # initialize the flags used by the cross-checks between sections; they are
    # bound here so that a missing section can not leave them undefined
    is_ok_adapter_pe = False
    is_ok_front_pe = False
    is_ok_anywhere_pe = False
    is_ok_read_type = False

    # get the option dictionary
    try:
        cutadapt_option_dict = xlib.get_option_dict(get_cutadapt_config_file())
    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append('*** ERROR: The option dictionary could not be built from the config file')
        OK = False
    else:

        # get the sections list
        sections_list = sorted(cutadapt_option_dict.keys())

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append('*** ERROR: the section "identification" is not found.')
            OK = False
        else:
            identification = cutadapt_option_dict.get('identification', {})

            # check section "identification" - key "experiment_id"
            experiment_id = identification.get('experiment_id', not_found)
            if experiment_id == not_found:
                error_list.append('*** ERROR: the key "experiment_id" is not found in the section "identification".')
                OK = False

            # check section "identification" - key "read_dataset_id"
            read_dataset_id = identification.get('read_dataset_id', not_found)
            if read_dataset_id == not_found:
                error_list.append('*** ERROR: the key "read_dataset_id" is not found in the section "identification".')
                OK = False

        # check section "cutadapt parameters"
        if 'cutadapt parameters' not in sections_list:
            error_list.append('*** ERROR: the section "cutadapt parameters" is not found.')
            OK = False
        else:
            parameters = cutadapt_option_dict.get('cutadapt parameters', {})

            # check section "cutadapt parameters" - key "cores"
            cores = parameters.get('cores', not_found)
            if cores == not_found:
                error_list.append('*** ERROR: the key "cores" is not found in the section "cutadapt parameters".')
                OK = False
            elif not xlib.check_int(cores, minimum=0):
                error_list.append('*** ERROR: the key "cores" has to be an integer number greater than or equal to 0.')
                OK = False

            # check section "cutadapt parameters" - key "adapter"
            adapter = parameters.get('adapter', not_found)
            if adapter == not_found:
                error_list.append('*** ERROR: the key "adapter" is not found in the section "cutadapt parameters".')
                OK = False
            elif adapter.upper() == 'NONE':
                error_list.append('*** ERROR: the key "adapter" has to be different from NONE.')
                OK = False

            # check section "cutadapt parameters" - key "adapter_pe"
            adapter_pe = parameters.get('adapter_pe', not_found)
            if adapter_pe == not_found:
                error_list.append('*** ERROR: the key "adapter_pe" is not found in the section "cutadapt parameters".')
                OK = False
            else:
                is_ok_adapter_pe = True

            # check section "cutadapt parameters" - key "front"
            front = parameters.get('front', not_found)
            if front == not_found:
                error_list.append('*** ERROR: the key "front" is not found in the section "cutadapt parameters".')
                OK = False

            # check section "cutadapt parameters" - key "front_pe"
            front_pe = parameters.get('front_pe', not_found)
            if front_pe == not_found:
                error_list.append('*** ERROR: the key "front_pe" is not found in the section "cutadapt parameters".')
                OK = False
            else:
                is_ok_front_pe = True

            # check section "cutadapt parameters" - key "anywhere"
            anywhere = parameters.get('anywhere', not_found)
            if anywhere == not_found:
                error_list.append('*** ERROR: the key "anywhere" is not found in the section "cutadapt parameters".')
                OK = False

            # check section "cutadapt parameters" - key "anywhere_pe"
            anywhere_pe = parameters.get('anywhere_pe', not_found)
            if anywhere_pe == not_found:
                error_list.append('*** ERROR: the key "anywhere_pe" is not found in the section "cutadapt parameters".')
                OK = False
            else:
                is_ok_anywhere_pe = True

            # check section "cutadapt parameters" - key "other_parameters"
            not_allowed_parameters_list = ['cores', 'adapter', 'front', 'anywhere']
            other_parameters = parameters.get('other_parameters', not_found)
            if other_parameters == not_found:
                error_list.append('*** ERROR: the key "other_parameters" is not found in the section "cutadapt parameters".')
                OK = False
            elif other_parameters.upper() != 'NONE':
                # the result is kept in its own flag: assigning it directly to
                # OK would hide the errors detected before this point
                (is_ok_other_parameters, error_list2) = xlib.check_parameter_list(other_parameters, "other_parameters", not_allowed_parameters_list)
                error_list = error_list + error_list2
                if not is_ok_other_parameters:
                    OK = False

        # check section "library"
        if 'library' not in sections_list:
            error_list.append('*** ERROR: the section "library" is not found.')
            OK = False
        else:
            library = cutadapt_option_dict.get('library', {})

            # check section "library" - key "format"
            library_format = library.get('format', not_found)
            if library_format == not_found:
                error_list.append('*** ERROR: the key "format" is not found in the section "library".')
                OK = False
            elif not xlib.check_code(library_format, get_format_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "format" has to be {get_format_code_list_text()}.')
                OK = False

            # check section "library" - key "read_type"
            read_type = library.get('read_type', not_found)
            if read_type == not_found:
                error_list.append('*** ERROR: the key "read_type" is not found in the section "library".')
                OK = False
            elif not xlib.check_code(read_type, get_read_type_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "read_type" has to be {get_read_type_code_list_text()}.')
                OK = False
            else:
                is_ok_read_type = True

            # check "adapter_pe" is NONE if the read type is SE
            if is_ok_read_type and is_ok_adapter_pe and read_type.upper() == 'SE' and adapter_pe.upper() != 'NONE':
                error_list.append('*** ERROR: the key "adapter_pe" has to be NONE when de read type is SE.')
                OK = False

            # check "front_pe" is NONE if the read type is SE
            if is_ok_read_type and is_ok_front_pe and read_type.upper() == 'SE' and front_pe.upper() != 'NONE':
                error_list.append('*** ERROR: the key "front_pe" has to be NONE when de read type is SE.')
                OK = False

            # check "anywhere_pe" is NONE if the read type is SE
            if is_ok_read_type and is_ok_anywhere_pe and read_type.upper() == 'SE' and anywhere_pe.upper() != 'NONE':
                error_list.append('*** ERROR: the key "anywhere_pe" has to be NONE when de read type is SE.')
                OK = False

        # check section "library-1"
        if 'library-1' not in sections_list:
            error_list.append('*** ERROR: the section "library-1" is not found.')
            OK = False

        # check all sections "library-n"
        for section in sections_list:

            if section not in ['identification', 'cutadapt parameters', 'library']:

                # check that the section identification is like library-n
                if not re.match('^library-[0-9]+$', section):
                    error_list.append(f'*** ERROR: the section "{section}" has a wrong identification.')
                    OK = False

                else:

                    # check section "library-n" - key "read_file_1"
                    read_file_1 = cutadapt_option_dict.get(section, {}).get('read_file_1', not_found)
                    if read_file_1 == not_found:
                        error_list.append(f'*** ERROR: the key "read_file_1" is not found in the section "{section}"')
                        OK = False

                    # check section "library-n" - key "read_file_2"
                    read_file_2 = cutadapt_option_dict.get(section, {}).get('read_file_2', not_found)
                    if read_file_2 == not_found:
                        error_list.append(f'*** ERROR: the key "read_file_2" is not found in the section "{section}"')
                        OK = False

    # warn that the config file is not valid if there are any errors
    # NOTE(review): this message previously used xlib.get_kallisto_name(), which
    # names the wrong tool; the literal is used because a cutadapt name helper
    # in xlib is not visible from this block - switch to it if it exists
    if not OK:
        error_list.append('\nThe cutadapt config file is not valid. Please, correct this file or recreate it.')

    # return the control variable and the error list
    return (OK, error_list)
def check_args(args):
    '''
    Verify the input arguments data.

    Missing optional arguments are replaced in "args" by their default values
    and valid numeric arguments are converted to int or float.

    Parameters:
        args: object holding the input arguments as attributes.

    Raises:
        xlib.ProgramException: with code 'P001' when any argument is not valid.
    '''

    # initialize the control variable
    OK = True

    # check the assembly_software_code value
    if args.assembly_software_code is None:
        xlib.Message.print('error', '*** The assembly software that generated the transcritpme file is not indicated in the input arguments.')
        OK = False
    elif args.assembly_software_code not in [xlib.Const.AS_TRINITY_CODE, xlib.Const.AS_SOAPDENOVOTRANS_CODE, xlib.Const.AS_GENERATED_BY_NGSCLOUD]:
        xlib.Message.print('error', f'*** {args.assembly_software_code} is not a valid code of assembly software.')
        OK = False

    # check the transcriptome_file value
    if args.transcriptome_file is None:
        xlib.Message.print('error', '*** A transcritpme file in Fasta format is not indicated in the input arguments.')
        OK = False
    elif not os.path.isfile(args.transcriptome_file):
        xlib.Message.print('error', f'*** The file {args.transcriptome_file} does not exist.')
        OK = False

    # check the score_file value
    if args.score_file is None:
        xlib.Message.print('error', '*** A score file where RSEM-EVAL (DETONATE package) saved the score of the transcriptome file is not indicated in the input arguments.')
        OK = False
    elif not os.path.isfile(args.score_file):
        xlib.Message.print('error', f'*** The file {args.score_file} does not exist.')
        OK = False

    # check the output_file value
    if args.output_file is None:
        xlib.Message.print('error', '*** A output file where filtered transcripts will be saved is not indicated in the input arguments.')
        OK = False
    else:
        try:
            # a file name without directory has an empty dirname: there is
            # nothing to create and os.makedirs('') would raise an exception
            output_dir = os.path.dirname(args.output_file)
            if output_dir and not os.path.exists(output_dir):
                os.makedirs(output_dir)
        except Exception as e:
            xlib.Message.print('error', f'*** EXCEPTION: "{e}".')
            xlib.Message.print('error', f'*** The directory {os.path.dirname(args.output_file)} of the file {args.output_file} is not valid.')
            OK = False

    # check the minlen value
    if args.minlen is None:
        args.minlen = xlib.Const.DEFAULT_MINLEN
    elif not xlib.check_int(args.minlen, minimum=1):
        xlib.Message.print('error', '*** The minlen has to be a integer number greater than 0.')
        OK = False
    else:
        args.minlen = int(args.minlen)

    # check the maxlen value
    if args.maxlen is None:
        args.maxlen = xlib.Const.DEFAULT_MAXLEN
    elif not xlib.check_int(args.maxlen, minimum=1):
        xlib.Message.print('error', '*** The maxlen has to be a integer number greater than 0.')
        OK = False
    else:
        args.maxlen = int(args.maxlen)

    # check the minFPKM value
    if args.minFPKM is None:
        args.minFPKM = xlib.Const.DEFAULT_MINFPKM
    elif not xlib.check_float(args.minFPKM, minimum=0.0):
        xlib.Message.print('error', '*** FPKM has to be a float number greater than or equal to 0.0.')
        OK = False
    else:
        args.minFPKM = float(args.minFPKM)

    # check the minTPM value
    if args.minTPM is None:
        args.minTPM = xlib.Const.DEFAULT_MINTPM
    elif not xlib.check_float(args.minTPM, minimum=0.0):
        xlib.Message.print('error', '*** TPM has to be a float number greater than or equal to 0.0.')
        OK = False
    else:
        args.minTPM = float(args.minTPM)

    # check "verbose"
    if args.verbose is None:
        args.verbose = xlib.Const.DEFAULT_VERBOSE
    elif not xlib.check_code(args.verbose, xlib.get_verbose_code_list(), case_sensitive=False):
        xlib.Message.print('error', f'*** verbose has to be {xlib.get_verbose_code_list_text()}.')
        OK = False
    if args.verbose.upper() == 'Y':
        xlib.Message.set_verbose_status(True)

    # check "trace"
    if args.trace is None:
        args.trace = xlib.Const.DEFAULT_TRACE
    elif not xlib.check_code(args.trace, xlib.get_trace_code_list(), case_sensitive=False):
        xlib.Message.print('error', f'*** trace has to be {xlib.get_trace_code_list_text()}.')
        OK = False
    if args.trace.upper() == 'Y':
        xlib.Message.set_trace_status(True)

    # check if maxlen value is greater than or equal to minlen value; it is
    # only done when there are no errors because both values are then numbers
    if OK:
        if args.maxlen < args.minlen:
            xlib.Message.print('error', '*** The maxlen value has to be greater than or equal to minlen.')
            OK = False

    # if there are errors, exit with exception
    if not OK:
        raise xlib.ProgramException('', 'P001')
def check_cd_hit_est_config_file(strict):
    '''
    Check the CD-HIT-EST config file of a run.

    The option dictionary is built from the file returned by
    get_cd_hit_est_config_file() and every expected section and key is checked.

    Parameters:
        strict: not used by this function.

    Returns:
        A tuple (OK, error_list): OK is True when no error was detected and
        error_list holds the text of every detected error.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # initialize the variable used when a value is not found
    not_found = '***NOTFOUND***'.upper()

    # get the option dictionary
    try:
        cd_hit_est_option_dict = xlib.get_option_dict(get_cd_hit_est_config_file())
    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append('*** ERROR: The option dictionary could not be built from the config file')
        OK = False
    else:

        # get the sections list
        sections_list = sorted(cd_hit_est_option_dict.keys())

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append('*** ERROR: the section "identification" is not found.')
            OK = False
        else:
            identification = cd_hit_est_option_dict.get('identification', {})

            # check section "identification" - key "experiment_id"
            experiment_id = identification.get('experiment_id', not_found)
            if experiment_id == not_found:
                error_list.append('*** ERROR: the key "experiment_id" is not found in the section "identification".')
                OK = False

            # check section "identification" - key "assembly_software"
            assembly_software = identification.get('assembly_software', not_found)
            if assembly_software == not_found:
                error_list.append('*** ERROR: the key "assembly_software" is not found in the section "identification".')
                OK = False
            elif not xlib.check_code(assembly_software, get_assembly_software_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "assembly_software" has to be {get_assembly_software_code_list_text()}.')
                OK = False

            # check section "identification" - key "assembly_dataset_id"
            assembly_dataset_id = identification.get('assembly_dataset_id', not_found)
            if assembly_dataset_id == not_found:
                error_list.append('*** ERROR: the key "assembly_dataset_id" is not found in the section "identification".')
                OK = False
            elif not xlib.check_startswith(assembly_dataset_id, get_assembly_software_code_list(), case_sensitive=True):
                error_list.append(f'*** ERROR: the key "assembly_dataset_id" has to start with {get_assembly_software_code_list_text()}.')
                OK = False

            # check section "identification" - key "assembly_type"
            assembly_type = identification.get('assembly_type', not_found)
            if assembly_type == not_found:
                error_list.append('*** ERROR: the key "assembly_type" is not found in the section "identification".')
                OK = False
            elif (assembly_dataset_id.startswith(xlib.get_soapdenovotrans_code()) and assembly_type.upper() not in ['CONTIGS', 'SCAFFOLDS']) or \
                (not assembly_dataset_id.startswith(xlib.get_soapdenovotrans_code()) and assembly_type.upper() != 'NONE'):
                error_list.append(f'*** ERROR: the key "assembly_type" has to be CONTIGS or SCAFFOLDS in {xlib.get_soapdenovotrans_name()} or NONE in any other case.')
                OK = False

        # check section "CD-HIT-EST parameters"
        if 'CD-HIT-EST parameters' not in sections_list:
            error_list.append('*** ERROR: the section "CD-HIT-EST parameters" is not found.')
            OK = False
        else:
            parameters = cd_hit_est_option_dict.get('CD-HIT-EST parameters', {})

            # check section "CD-HIT-EST parameters" - key "threads"
            threads = parameters.get('threads', not_found)
            if threads == not_found:
                error_list.append('*** ERROR: the key "threads" is not found in the section "CD-HIT-EST parameters".')
                OK = False
            elif not xlib.check_int(threads, minimum=0):
                error_list.append('*** ERROR: the key "threads" has to be an integer number greater than or equal to 0.')
                OK = False

            # check section "CD-HIT-EST parameters" - key "memory_limit"
            memory_limit = parameters.get('memory_limit', not_found)
            if memory_limit == not_found:
                error_list.append('*** ERROR: the key "memory_limit" is not found in the section "CD-HIT-EST parameters".')
                OK = False
            elif not xlib.check_int(memory_limit, minimum=0):
                error_list.append('*** ERROR: the key "memory_limit" has to be an integer number greater than or equal to 0.')
                OK = False

            # check section "CD-HIT-EST parameters" - key "seq_identity_threshold"
            seq_identity_threshold = parameters.get('seq_identity_threshold', not_found)
            if seq_identity_threshold == not_found:
                error_list.append('*** ERROR: the key "seq_identity_threshold" is not found in the section "CD-HIT-EST parameters".')
                OK = False
            elif not xlib.check_float(seq_identity_threshold, minimum=0., maximum=1.):
                error_list.append('*** ERROR: the key "seq_identity_threshold" has to be a float number between 0.0 and 1.0.')
                OK = False

            # check section "CD-HIT-EST parameters" - key "word_length"
            word_length = parameters.get('word_length', not_found)
            if word_length == not_found:
                error_list.append('*** ERROR: the key "word_length" is not found in the section "CD-HIT-EST parameters".')
                OK = False
            elif not xlib.check_int(word_length, minimum=1):
                error_list.append('*** ERROR: the key "word_length" has to be an integer number greater than or equal to 1.')
                OK = False

            # check section "CD-HIT-EST parameters" - key "mask"
            mask = parameters.get('mask', not_found)
            if mask == not_found:
                error_list.append('*** ERROR: the key "mask" is not found in the section "CD-HIT-EST parameters".')
                OK = False

            # check section "CD-HIT-EST parameters" - key "match"
            match_score = parameters.get('match', not_found)
            if match_score == not_found:
                error_list.append('*** ERROR: the key "match" is not found in the section "CD-HIT-EST parameters".')
                OK = False
            elif not xlib.check_int(match_score):
                error_list.append('*** ERROR: the key "match" has to be an integer number.')
                OK = False

            # check section "CD-HIT-EST parameters" - key "mismatch"
            mismatch_score = parameters.get('mismatch', not_found)
            if mismatch_score == not_found:
                error_list.append('*** ERROR: the key "mismatch" is not found in the section "CD-HIT-EST parameters".')
                OK = False
            elif not xlib.check_int(mismatch_score):
                error_list.append('*** ERROR: the key "mismatch" has to be an integer number.')
                OK = False

            # check section "CD-HIT-EST parameters" - key "other_parameters"
            not_allowed_parameters_list = ['T', 'M', 'c', 'n', 'mask', 'match', 'mismatch']
            other_parameters = parameters.get('other_parameters', not_found)
            if other_parameters == not_found:
                error_list.append('*** ERROR: the key "other_parameters" is not found in the section "CD-HIT-EST parameters".')
                OK = False
            elif other_parameters.upper() != 'NONE':
                # the result is kept in its own flag: assigning it directly to
                # OK would hide the errors detected before this point
                (is_ok_other_parameters, error_list2) = xlib.check_parameter_list(other_parameters, "other_parameters", not_allowed_parameters_list)
                error_list = error_list + error_list2
                if not is_ok_other_parameters:
                    OK = False

    # warn that the config file is not valid if there are any errors
    if not OK:
        error_list.append(f'\nThe {xlib.get_cd_hit_est_name()} config file is not valid. Please, correct this file or recreate it.')

    # return the control variable and the error list
    return (OK, error_list)
def check_express_config_file(strict):
    '''
    Check the eXpress config file of a run.

    The option dictionary is built from the file returned by
    get_express_config_file() and every expected section and key is checked.

    Parameters:
        strict: not used by this function.

    Returns:
        A tuple (OK, error_list): OK is True when no error was detected and
        error_list holds the text of every detected error.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # initialize the variable used when a value is not found
    not_found = '***NOTFOUND***'.upper()

    # get the option dictionary
    try:
        express_option_dict = xlib.get_option_dict(get_express_config_file())
    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append('*** ERROR: The option dictionary could not be built from the config file')
        OK = False
    else:

        # get the sections list
        sections_list = sorted(express_option_dict.keys())

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append('*** ERROR: the section "identification" is not found.')
            OK = False
        else:
            identification = express_option_dict.get('identification', {})

            # check section "identification" - key "experiment_id"
            experiment_id = identification.get('experiment_id', not_found)
            if experiment_id == not_found:
                error_list.append('*** ERROR: the key "experiment_id" is not found in the section "identification".')
                OK = False

            # check section "identification" - key "assembly_software"
            assembly_software = identification.get('assembly_software', not_found)
            if assembly_software == not_found:
                error_list.append('*** ERROR: the key "assembly_software" is not found in the section "identification".')
                OK = False
            elif not xlib.check_code(assembly_software, get_assembly_software_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "assembly_software" has to be {get_assembly_software_code_list_text()}.')
                OK = False

            # check section "identification" - key "assembly_dataset_id"
            assembly_dataset_id = identification.get('assembly_dataset_id', not_found)
            if assembly_dataset_id == not_found:
                error_list.append('*** ERROR: the key "assembly_dataset_id" is not found in the section "identification".')
                OK = False
            elif not xlib.check_startswith(assembly_dataset_id, get_assembly_software_code_list(), case_sensitive=True):
                error_list.append(f'*** ERROR: the key "assembly_dataset_id" has to start with {get_assembly_software_code_list_text()}.')
                OK = False

            # check section "identification" - key "assembly_type"
            assembly_type = identification.get('assembly_type', not_found)
            if assembly_type == not_found:
                error_list.append('*** ERROR: the key "assembly_type" is not found in the section "identification".')
                OK = False
            elif (assembly_dataset_id.startswith(xlib.get_soapdenovotrans_code()) and assembly_type.upper() not in ['CONTIGS', 'SCAFFOLDS']) or \
                (not assembly_dataset_id.startswith(xlib.get_soapdenovotrans_code()) and assembly_type.upper() != 'NONE'):
                error_list.append(f'*** ERROR: the key "assembly_type" has to be CONTIGS or SCAFFOLDS in {xlib.get_soapdenovotrans_name()} or NONE in any other case.')
                OK = False

        # check section "alignment-dataset-1"
        if 'alignment-dataset-1' not in sections_list:
            error_list.append('*** ERROR: the section "alignment-dataset-1" is not found.')
            OK = False

        # check all sections "alignment-dataset-n"
        for section in sections_list:

            if section not in ['identification', 'eXpress parameters']:

                # check that the section identification is like alignment-dataset-n
                if not re.match('^alignment-dataset-[0-9]+$', section):
                    error_list.append(f'*** ERROR: the section "{section}" has a wrong identification.')
                    OK = False

                else:

                    # check section "alignment-dataset-n" - key "alignment_software"
                    alignment_software = express_option_dict.get(section, {}).get('alignment_software', not_found)
                    if alignment_software == not_found:
                        error_list.append(f'*** ERROR: the key "alignment_software" is not found in the section "{section}".')
                        OK = False
                    elif not xlib.check_code(alignment_software, get_alignment_software_code_list(), case_sensitive=False):
                        error_list.append(f'*** ERROR: the key "alignment_software" has to be {get_alignment_software_code_list_text()}.')
                        OK = False

                    # check section "alignment-dataset-n" - key "alignment_dataset_id"
                    alignment_dataset_id = express_option_dict.get(section, {}).get('alignment_dataset_id', not_found)
                    if alignment_dataset_id == not_found:
                        error_list.append(f'*** ERROR: the key "alignment_dataset_id" is not found in the section "{section}".')
                        OK = False
                    elif not xlib.check_startswith(alignment_dataset_id, get_alignment_software_code_list(), case_sensitive=True):
                        error_list.append(f'*** ERROR: the key "alignment_dataset_id" has to start with {get_alignment_software_code_list_text()}.')
                        OK = False

        # check section "eXpress parameters"
        if 'eXpress parameters' not in sections_list:
            error_list.append('*** ERROR: the section "eXpress parameters" is not found.')
            OK = False
        else:
            parameters = express_option_dict.get('eXpress parameters', {})

            # check section "eXpress parameters" - key "frag-len-mean"
            frag_len_mean = parameters.get('frag-len-mean', not_found)
            if frag_len_mean == not_found:
                error_list.append('*** ERROR: the key "frag-len-mean" is not found in the section "eXpress parameters".')
                OK = False
            elif not xlib.check_int(frag_len_mean, minimum=1):
                error_list.append('*** ERROR: the key "frag-len-mean" has to be an integer number greater than or equal to 1.')
                OK = False

            # check section "eXpress parameters" - key "frag-len-stddev"
            frag_len_stddev = parameters.get('frag-len-stddev', not_found)
            if frag_len_stddev == not_found:
                error_list.append('*** ERROR: the key "frag-len-stddev" is not found in the section "eXpress parameters".')
                OK = False
            elif not xlib.check_int(frag_len_stddev, minimum=1):
                error_list.append('*** ERROR: the key "frag-len-stddev" has to be an integer number greater than or equal to 1.')
                OK = False

            # check section "eXpress parameters" - key "library_type"
            library_type = parameters.get('library_type', not_found)
            if library_type == not_found:
                error_list.append('*** ERROR: the key "library_type" is not found in the section "eXpress parameters".')
                OK = False
            elif not xlib.check_code(library_type, get_library_type_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "library_type" has to be {get_library_type_code_list_text()}.')
                OK = False

            # check section "eXpress parameters" - key "max-indel-size"
            max_indel_size = parameters.get('max-indel-size', not_found)
            if max_indel_size == not_found:
                error_list.append('*** ERROR: the key "max-indel-size" is not found in the section "eXpress parameters".')
                OK = False
            elif not xlib.check_int(max_indel_size, minimum=0):
                error_list.append('*** ERROR: the key "max-indel-size" has to be an integer number greater than or equal to 0.')
                OK = False

            # check section "eXpress parameters" - key "no-bias-correct"
            no_bias_correct = parameters.get('no-bias-correct', not_found)
            if no_bias_correct == not_found:
                error_list.append('*** ERROR: the key "no-bias-correct" is not found in the section "eXpress parameters".')
                OK = False
            elif not xlib.check_code(no_bias_correct, get_no_bias_correct_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "no-bias-correct" has to be {get_no_bias_correct_code_list_text()}.')
                OK = False

            # check section "eXpress parameters" - key "no-error-model"
            no_error_model = parameters.get('no-error-model', not_found)
            if no_error_model == not_found:
                error_list.append('*** ERROR: the key "no-error-model" is not found in the section "eXpress parameters".')
                OK = False
            elif not xlib.check_code(no_error_model, get_no_error_model_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "no-error-model" has to be {get_no_error_model_code_list_text()}.')
                OK = False

            # check section "eXpress parameters" - key "other_parameters"
            not_allowed_parameters_list = ['no-update-check', 'frag-len-mean', 'frag-len-stddev', 'max-indel-size', 'fr-stranded', 'rf-stranded', 'f-stranded', 'r-stranded', 'no-bias-correct', 'no-error-model', 'output-dir']
            other_parameters = parameters.get('other_parameters', not_found)
            if other_parameters == not_found:
                error_list.append('*** ERROR: the key "other_parameters" is not found in the section "eXpress parameters".')
                OK = False
            elif other_parameters.upper() != 'NONE':
                # the result is kept in its own flag: assigning it directly to
                # OK would hide the errors detected before this point
                (is_ok_other_parameters, error_list2) = xlib.check_parameter_list(other_parameters, "other_parameters", not_allowed_parameters_list)
                error_list = error_list + error_list2
                if not is_ok_other_parameters:
                    OK = False

    # warn that the config file is not valid if there are any errors
    if not OK:
        error_list.append(f'\nThe {xlib.get_express_name()} config file is not valid. Please, correct this file or recreate it.')

    # return the control variable and the error list
    return (OK, error_list)
def check_args(args):
    '''
    Verify the input arguments data.

    Missing optional arguments are replaced in "args" by their default values
    and valid length arguments are converted to int.

    Parameters:
        args: object holding the input arguments as attributes.

    Raises:
        xlib.ProgramException: with code 'P001' when any argument is not valid.
    '''

    # initialize the control variable
    OK = True

    # check "fasta_file"
    if args.fasta_file is None:
        xlib.Message.print('error', '*** The input FASTA file is not indicated in the input arguments.')
        OK = False
    elif not os.path.isfile(args.fasta_file):
        xlib.Message.print('error', f'*** The file {args.fasta_file} does not exist.')
        OK = False

    # check the output_file value
    if args.output_file is None:
        xlib.Message.print('error', '*** A output file where filtered transcripts will be saved is not indicated in the input arguments.')
        OK = False
    else:
        try:
            # a file name without directory has an empty dirname: there is
            # nothing to create and os.makedirs('') would raise an exception
            output_dir = os.path.dirname(args.output_file)
            if output_dir and not os.path.exists(output_dir):
                os.makedirs(output_dir)
        except Exception as e:
            xlib.Message.print('error', f'*** EXCEPTION: "{e}".')
            xlib.Message.print('error', f'*** The directory {os.path.dirname(args.output_file)} of the file {args.output_file} is not valid.')
            OK = False

    # check the minlen value
    if args.minlen is None:
        args.minlen = xlib.Const.DEFAULT_MINLEN
    elif not xlib.check_int(args.minlen, minimum=1):
        xlib.Message.print('error', '*** The minlen has to be a integer number greater than 0.')
        OK = False
    else:
        args.minlen = int(args.minlen)

    # check the maxlen value
    if args.maxlen is None:
        args.maxlen = xlib.Const.DEFAULT_MAXLEN
    elif not xlib.check_int(args.maxlen, minimum=1):
        xlib.Message.print('error', '*** The maxlen has to be a integer number greater than 0.')
        OK = False
    else:
        args.maxlen = int(args.maxlen)

    # check "verbose"
    if args.verbose is None:
        args.verbose = xlib.Const.DEFAULT_VERBOSE
    elif not xlib.check_code(args.verbose, xlib.get_verbose_code_list(), case_sensitive=False):
        xlib.Message.print('error', f'*** verbose has to be {xlib.get_verbose_code_list_text()}.')
        OK = False
    if args.verbose.upper() == 'Y':
        xlib.Message.set_verbose_status(True)

    # check "trace"
    if args.trace is None:
        args.trace = xlib.Const.DEFAULT_TRACE
    elif not xlib.check_code(args.trace, xlib.get_trace_code_list(), case_sensitive=False):
        xlib.Message.print('error', f'*** trace has to be {xlib.get_trace_code_list_text()}.')
        OK = False
    if args.trace.upper() == 'Y':
        xlib.Message.set_trace_status(True)

    # check if maxlen value is greater than or equal to minlen value; it is
    # only done when there are no errors because both values are then numbers
    if OK:
        if args.maxlen < args.minlen:
            xlib.Message.print('error', '*** The maxlen value has to be greater than or equal to minlen.')
            OK = False

    # if there are errors, exit with exception
    if not OK:
        raise xlib.ProgramException('', 'P001')
def check_fastqc_config_file(strict):
    '''
    Check the FastQC config file of a run.

    The option dictionary is built from the file returned by
    get_fastqc_config_file() and every expected section and key is checked.

    Parameters:
        strict: not used by this function.

    Returns:
        A tuple (OK, error_list): OK is True when no error was detected and
        error_list holds the text of every detected error.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # initialize the variable used when a value is not found
    not_found = '***NOTFOUND***'.upper()

    # get the option dictionary
    try:
        fastqc_option_dict = xlib.get_option_dict(get_fastqc_config_file())
    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append('*** ERROR: The option dictionary could not be built from the config file')
        OK = False
    else:

        # get the sections list
        sections_list = sorted(fastqc_option_dict.keys())

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append('*** ERROR: the section "identification" is not found.')
            OK = False
        else:
            identification = fastqc_option_dict.get('identification', {})

            # check section "identification" - key "experiment_id"
            experiment_id = identification.get('experiment_id', not_found)
            if experiment_id == not_found:
                error_list.append('*** ERROR: the key "experiment_id" is not found in the section "identification".')
                OK = False

            # check section "identification" - key "read_dataset_id"
            read_dataset_id = identification.get('read_dataset_id', not_found)
            if read_dataset_id == not_found:
                error_list.append('*** ERROR: the key "read_dataset_id" is not found in the section "identification".')
                OK = False

        # check section "FastQC parameters"
        if 'FastQC parameters' not in sections_list:
            error_list.append('*** ERROR: the section "FastQC parameters" is not found.')
            OK = False
        else:

            # check section "FastQC parameters" - key "threads"
            threads = fastqc_option_dict.get('FastQC parameters', {}).get('threads', not_found)
            if threads == not_found:
                # the message names the section that is actually inspected
                error_list.append('*** ERROR: the key "threads" is not found in the section "FastQC parameters".')
                OK = False
            elif not xlib.check_int(threads, minimum=1):
                error_list.append('*** ERROR: the key "threads" has to be an integer number greater than or equal to 1.')
                OK = False

        # check section "file-1"
        if 'file-1' not in sections_list:
            error_list.append('*** ERROR: the section "file-1" is not found.')
            OK = False

        # check all sections "file-n"
        for section in sections_list:

            if section not in ['identification', 'FastQC parameters']:

                # check that the section identification is like file-n
                if not re.match('^file-[0-9]+$', section):
                    error_list.append(f'*** ERROR: the section "{section}" has a wrong identification.')
                    OK = False

                else:

                    # check section "file-n" - key "file_name"
                    file_name = fastqc_option_dict.get(section, {}).get('file_name', not_found)
                    if file_name == not_found:
                        error_list.append(f'*** ERROR: the key "file_name" is not found in the section "{section}".')
                        OK = False
                    elif not xlib.is_valid_path(file_name, 'linux'):
                        error_list.append(f'*** ERROR: the file {file_name} in the key "file_name" of the section "{section}" has a non valid file name.')
                        OK = False

    # warn that the config file is not valid if there are any errors
    if not OK:
        error_list.append(f'\nThe {xlib.get_fastqc_name()} config file is not valid. Please, correct this file or recreate it.')

    # return the control variable and the error list
    return (OK, error_list)
def check_args(args):
    '''
    Validate the input arguments and normalize their values in place.

    Missing optional arguments receive their default values, the numeric
    arguments are converted to float/int and the verbose/trace status of
    xlib.Message is switched on when requested.

    Parameters:
        args: the object holding the input arguments as attributes.

    Raises:
        xlib.ProgramException: with the code 'P001' if any check fails.
    '''

    def fail(text):
        '''
        Print an error message and return the value of a failed control variable.
        '''
        xlib.Message.print('error', text)
        return False

    def percentage_is_right(name, missing_text, invalid_text):
        '''
        Check a percentage argument (0.0 to 100.0) and convert it to float.
        '''
        value = getattr(args, name)
        if value is None:
            return fail(missing_text)
        if not xlib.check_float(value, minimum=0.0, maximum=100.0):
            return fail(invalid_text)
        setattr(args, name, float(value))
        return True

    # the control variable only changes to False when a check fails
    is_valid = True

    # "input_vcf_file": mandatory and it has to exist
    if args.input_vcf_file is None:
        is_valid = fail('*** The VCF file is not indicated in the input arguments.')
    elif not os.path.isfile(args.input_vcf_file):
        is_valid = fail(f'*** The file {args.input_vcf_file} does not exist.')

    # "sample_file": mandatory and it has to exist
    if args.sample_file is None:
        is_valid = fail('*** The sample file is not indicated in the input arguments.')
    elif not os.path.isfile(args.sample_file):
        is_valid = fail(f'*** The file {args.sample_file} does not exist.')

    # "fix": mandatory code, saved in upper case
    if args.fix is None:
        is_valid = fail('*** Fix is not indicated in the input arguments.')
    elif xlib.check_code(args.fix, xlib.get_fix_code_list(), case_sensitive=False):
        args.fix = args.fix.upper()
    else:
        is_valid = fail(f'*** fix has to be {xlib.get_fix_code_list_text()}.')

    # "scenario": mandatory code
    if args.scenario is None:
        is_valid = fail('*** The scenario is not indicated in the input arguments.')
    elif not xlib.check_code(args.scenario, xlib.get_scenario_code_list(), case_sensitive=False):
        is_valid = fail(f'*** The scenario has to be {xlib.get_scenario_code_list_text()}.')

    # "min_aa_percentage": mandatory percentage
    if not percentage_is_right(
            'min_aa_percentage',
            '*** The minimum percent of alternative alleles per species is not indicated in the input arguments.',
            'The minimum percent of alternative alleles per species has to be a float number between 0.0 and 100.0.'):
        is_valid = False

    # "min_md_imputation_percentage": mandatory percentage
    if not percentage_is_right(
            'min_md_imputation_percentage',
            '*** The minimum percentage of missing data imputation to a new alternative allele per species is not indicated in the input arguments.',
            'The minimum percentage of missing data imputation to a new alternative allele per species has to be a float number between 0.0 and 100.0.'):
        is_valid = False

    # "imputed_md_id": optional with default value
    if args.imputed_md_id is None:
        args.imputed_md_id = xlib.Const.DEFAULT_IMPUTED_MD_ID

    # "sp1_id": mandatory
    if args.sp1_id is None:
        is_valid = fail('*** The identification of the first species is not indicated in the input arguments.')

    # "sp1_max_md_percentage": mandatory percentage
    if not percentage_is_right(
            'sp1_max_md_percentage',
            '*** The maximum percentage of missing data of the first species is not indicated in the input arguments.',
            'The maximum percentage of missing data of the first species has to be a float number between 0.0 and 100.0.'):
        is_valid = False

    # "sp2_id": mandatory
    if args.sp2_id is None:
        is_valid = fail('*** The identification of the second species is not indicated in the input arguments.')

    # "sp2_max_md_percentage": mandatory percentage
    if not percentage_is_right(
            'sp2_max_md_percentage',
            '*** The maximum percentage of missing data of the second species is not indicated in the input arguments.',
            'The maximum percentage of missing data of the second species has to be a float number between 0.0 and 100.0.'):
        is_valid = False

    # "hybrid_id": optional, 'NONE' when it is not indicated
    if args.hybrid_id is None:
        args.hybrid_id = 'NONE'

    # "min_afr_percentage": mandatory percentage
    if not percentage_is_right(
            'min_afr_percentage',
            '*** The minimum percentage of allele frequency per species is not indicated in the input arguments.',
            'The minimum percentage of allele frequency per species has to be a float number between 0.0 and 100.0.'):
        is_valid = False

    # "min_depth": optional integer greater than or equal to 1
    if args.min_depth is None:
        args.min_depth = xlib.Const.DEFAULT_MIN_DEPTH
    elif xlib.check_int(args.min_depth, minimum=1):
        args.min_depth = int(args.min_depth)
    else:
        is_valid = fail('The minimum combined depth across samples has to be an integer number greater than or equal to 1.')

    # "output_vcf_file": mandatory
    if args.output_vcf_file is None:
        is_valid = fail('*** The output VCF file is not indicated in the input arguments.')

    # "verbose": optional code; the status is set even if the code is wrong
    if args.verbose is None:
        args.verbose = xlib.Const.DEFAULT_VERBOSE
    elif not xlib.check_code(args.verbose, xlib.get_verbose_code_list(), case_sensitive=False):
        is_valid = fail(f'*** verbose has to be {xlib.get_verbose_code_list_text()}.')
    if args.verbose.upper() == 'Y':
        xlib.Message.set_verbose_status(True)

    # "trace": optional code; the status is set even if the code is wrong
    if args.trace is None:
        args.trace = xlib.Const.DEFAULT_TRACE
    elif not xlib.check_code(args.trace, xlib.get_trace_code_list(), case_sensitive=False):
        is_valid = fail(f'*** trace has to be {xlib.get_trace_code_list_text()}.')
    if args.trace.upper() == 'Y':
        xlib.Message.set_trace_status(True)

    # "tvi_list": empty list when it is not indicated or it is 'NONE'
    no_tvi = args.tvi_list is None or args.tvi_list == 'NONE'
    args.tvi_list = [] if no_tvi else xlib.split_literal_to_string_list(args.tvi_list)

    # the identifications are only compared when the previous checks are right
    if is_valid:
        same_species = args.sp1_id == args.sp2_id
        hybrid_repeated = args.hybrid_id is not None and (args.sp1_id == args.hybrid_id or args.sp2_id == args.hybrid_id)
        if same_species or hybrid_repeated:
            is_valid = fail('The identifications must be different.')

    # exit with exception when any check has failed
    if not is_valid:
        raise xlib.ProgramException('', 'P001')
def check_args(args):
    '''
    Check the input arguments.

    Missing optional arguments receive their default values,
    "max_separation" is converted to int, "tsi_list" is converted to a list
    and the verbose/trace status of xlib.Message is switched on when
    requested.

    Parameters:
        args: the object holding the input arguments as attributes.

    Raises:
        xlib.ProgramException: with the code 'P001' if any check fails.
    '''

    # initialize the control variable
    OK = True

    # check "ngshelper_database"
    if args.ngshelper_database is None:
        xlib.Message.print('error', '*** The NGShelper database is not indicated in the input arguments.')
        OK = False

    # check "sp1_id"
    if args.sp1_id is None:
        xlib.Message.print('error', '*** The identification of the first species is not indicated in the input arguments.')
        OK = False

    # check "sp2_id"
    if args.sp2_id is None:
        xlib.Message.print('error', '*** The identification of the second species is not indicated in the input arguments.')
        OK = False

    # check "hybrid_id"
    if args.hybrid_id is None:
        args.hybrid_id = 'NONE'

    # check "max_separation"
    if args.max_separation is None:
        xlib.Message.print('error', '*** The maximum separation between variants of the same intergenic fragment is not indicated in the input arguments.')
        OK = False
    elif not xlib.check_int(args.max_separation, minimum=1):
        # the check is done with minimum=1, so the message has to say that 1 is a valid value
        xlib.Message.print('error', 'The maximum separation between variants of the same intergenic fragment has to be an integer number greater than or equal to 1.')
        OK = False
    else:
        args.max_separation = int(args.max_separation)

    # check "output_dir"
    if args.output_dir is None:
        xlib.Message.print('error', '*** The output directory is not indicated in the input arguments.')
        OK = False
    elif not os.path.isdir(args.output_dir):
        xlib.Message.print('error', '*** The output directory does not exist.')
        OK = False

    # check "verbose" (the status is set even if the code is wrong)
    if args.verbose is None:
        args.verbose = xlib.Const.DEFAULT_VERBOSE
    elif not xlib.check_code(args.verbose, xlib.get_verbose_code_list(), case_sensitive=False):
        xlib.Message.print('error', f'*** verbose has to be {xlib.get_verbose_code_list_text()}.')
        OK = False
    if args.verbose.upper() == 'Y':
        xlib.Message.set_verbose_status(True)

    # check "trace" (the status is set even if the code is wrong)
    if args.trace is None:
        args.trace = xlib.Const.DEFAULT_TRACE
    elif not xlib.check_code(args.trace, xlib.get_trace_code_list(), case_sensitive=False):
        xlib.Message.print('error', f'*** trace has to be {xlib.get_trace_code_list_text()}.')
        OK = False
    if args.trace.upper() == 'Y':
        xlib.Message.set_trace_status(True)

    # check "tsi_list"
    if args.tsi_list is None or args.tsi_list == 'NONE':
        args.tsi_list = []
    else:
        args.tsi_list = xlib.split_literal_to_string_list(args.tsi_list)

    # check the identification set (only when the previous checks are right)
    if OK:
        if args.sp1_id == args.sp2_id or (args.sp1_id == args.hybrid_id or args.sp2_id == args.hybrid_id):
            xlib.Message.print('error', 'The identifications must be different.')
            OK = False

    # if there are errors, exit with exception
    if not OK:
        raise xlib.ProgramException('', 'P001')
def check_args(args):
    '''
    Check the input arguments.

    Missing optional arguments receive their default values, the codes
    "type" and "header" are saved in upper case, "record_number_per_file"
    is converted to int and the verbose/trace status of xlib.Message is
    switched on when requested.

    Parameters:
        args: the object holding the input arguments as attributes.

    Raises:
        xlib.ProgramException: with the code 'P001' if any check fails.
    '''

    # initialize the control variable
    OK = True

    # check "annotation_file"
    if args.annotation_file is None:
        xlib.Message.print('error', '*** The annotation file is not indicated in the input arguments.')
        OK = False
    elif not os.path.isfile(args.annotation_file):
        xlib.Message.print('error', f'*** The file {args.annotation_file} does not exist.')
        OK = False

    # check "type"
    if args.type is None:
        xlib.Message.print('error', '*** The type of annotation file is not indicated in the input arguments.')
        OK = False
    elif not xlib.check_code(args.type, xlib.get_type_code_list(), case_sensitive=False):
        xlib.Message.print('error', f'*** The type of annotation file has to be {xlib.get_type_code_list_text()}.')
        OK = False
    else:
        args.type = args.type.upper()

    # check "header"
    if args.header is None:
        args.header = xlib.Const.DEFAULT_HEADER
    elif not xlib.check_code(args.header, xlib.get_header_code_list(), case_sensitive=False):
        xlib.Message.print('error', f'*** header has to be {xlib.get_header_code_list_text()}.')
        OK = False
    else:
        args.header = args.header.upper()

    # check "record_number_per_file"
    if args.record_number_per_file is None:
        args.record_number_per_file = xlib.Const.DEFAULT_RNUM
    elif not xlib.check_int(args.record_number_per_file, minimum=1):
        xlib.Message.print('error', '*** The record number per splitted file has to be an integer number greater than 0.')
        OK = False
    else:
        args.record_number_per_file = int(args.record_number_per_file)

    # check "verbose" (the status is set even if the code is wrong)
    if args.verbose is None:
        args.verbose = xlib.Const.DEFAULT_VERBOSE
    elif not xlib.check_code(args.verbose, xlib.get_verbose_code_list(), case_sensitive=False):
        xlib.Message.print('error', f'*** verbose has to be {xlib.get_verbose_code_list_text()}.')
        OK = False
    if args.verbose.upper() == 'Y':
        xlib.Message.set_verbose_status(True)

    # check "trace" (the status is set even if the code is wrong)
    if args.trace is None:
        args.trace = xlib.Const.DEFAULT_TRACE
    elif not xlib.check_code(args.trace, xlib.get_trace_code_list(), case_sensitive=False):
        xlib.Message.print('error', f'*** trace has to be {xlib.get_trace_code_list_text()}.')
        OK = False
    if args.trace.upper() == 'Y':
        xlib.Message.set_trace_status(True)

    # if there are errors, exit with exception
    # (the first argument is the originating exception, empty in this case, and
    # the second one is the error code, as in the other ProgramException raises)
    if not OK:
        raise xlib.ProgramException('', 'P001')
def _get_simhyb_allele_data(variant_data_dict, allele_index, variant_number, atcg_conversion_dict, allele_transformation):
    '''
    Get the allele and its frequency to be written in a SimHyb record.

    Parameters:
        variant_data_dict: dictionary allele -> frequency of a variant in a species.
        allele_index: position of the allele in the sorted allele list of the variant.
        variant_number: ordinal number of the variant (key of atcg_conversion_dict).
        atcg_conversion_dict: dictionary variant number -> ATCG conversion list.
        allele_transformation: 'ADD100', 'ATCG' or any other value (no transformation).

    Returns:
        A tuple (allele, allele_frequency); it is (0, 0) when the variant has
        no allele in the position allele_index.
    '''

    # get the sorted allele list (it is empty when the variant has no data)
    allele_list = sorted(variant_data_dict.keys())

    # there is no allele in this position
    if allele_index >= len(allele_list):
        return (0, 0)

    # get the allele and its frequency
    allele = allele_list[allele_index]
    allele_frequency = variant_data_dict[allele]

    # transform the allele
    if allele_transformation == 'ADD100' and xlib.check_int(allele):
        allele = int(allele) + 100
    elif allele_transformation == 'ATCG':
        allele = atcg_conversion_dict[variant_number][int(allele)]

    return (allele, allele_frequency)


def build_allele_frequency(vcf_file, sample_file, sp1_id, sp2_id, hybrid_id, output_dir, variant_number_per_file, allele_transformation, tvi_list):
    '''
    Build the allele frequencies per species of the variants of a VCF file
    and write them in SimHyb files.

    Only the samples whose mother identification is 'NONE' are counted.
    Every output file holds at most variant_number_per_file variants and it
    has a record per allele position with the pairs allele;frequency of the
    first species followed by the pairs of the second species.

    Parameters:
        vcf_file: path of the input VCF file (gzip compressed if it ends with '.gz').
        sample_file: path of the sample file read by xlib.get_sample_data.
        sp1_id: identification of the first species.
        sp2_id: identification of the second species.
        hybrid_id: identification of the hybrids.
        output_dir: directory where the SimHyb files are written.
        variant_number_per_file: maximum variant number per SimHyb file.
        allele_transformation: 'ADD100' (100 is added to integer alleles),
            'ATCG' (alleles are converted to A -> 1; T -> 2; C -> 3; G -> 4)
            or any other value (alleles are written as they are).
        tvi_list: list of variant identifications (chrom-pos) to be traced.

    Raises:
        xlib.ProgramException: if a file can not be opened (F001, F002, F003),
            a sample is not found in the sample data (L002), the VCF file has
            no samples (L003), the subfield GT is not found (L007), a genotype
            has no separator (L008) or the bases are not found in 'ATCG' (L016).
    '''

    # initialize the sample number
    sample_number = 0

    # initialize counters
    input_record_counter = 0
    total_variant_counter = 0

    # get the sample data
    sample_dict = xlib.get_sample_data(sample_file, sp1_id, sp2_id, hybrid_id)

    # get the missing data symbol
    md_symbol = xlib.get_md_symbol()

    # initialize the sample species and mother identification lists per variant
    species_id_list = []
    mother_id_list = []

    # initialize the maximum allele number per variant
    maximum_allele_number = 0

    # initialize allele frequency dictionaries
    allele_frequency_dict_1 = {}
    allele_frequency_dict_2 = {}

    # initialize ATCG conversion dictionary
    # A -> 1; T -> 2; C -> 3; G -> 4
    atcg = 'ATCG'
    atcg_conversion_dict = {}

    # open the input VCF file
    if vcf_file.endswith('.gz'):
        try:
            vcf_file_id = gzip.open(vcf_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', vcf_file)
    else:
        try:
            vcf_file_id = open(vcf_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', vcf_file)

    # the VCF file is closed even if an exception is raised while it is processed
    with vcf_file_id:

        # read the first record of input VCF file
        (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

        # while there are records in input VCF file
        while record != '':

            # process metadata records
            while record != '' and record.startswith('##'):

                # add 1 to the read sequence counter
                input_record_counter += 1

                # print the counters
                xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d}')

                # read the next record of the input VCF file
                (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

            # process the column description record
            if record.startswith('#CHROM'):

                # add 1 to the read sequence counter
                input_record_counter += 1

                # get the record data list
                record_data_list = data_dict['record_data_list']

                # build the sample species and mother identification lists per variant
                # (the samples are in the columns after the ninth one)
                for sample_id in record_data_list[9:]:
                    try:
                        species_id = sample_dict[sample_id]['species_id']
                        mother_id = sample_dict[sample_id]['mother_id']
                    except Exception as e:
                        raise xlib.ProgramException(e, 'L002', sample_id)
                    species_id_list.append(species_id)
                    mother_id_list.append(mother_id)

                # check if the sample species list is empty
                # (there is no originating exception in this case)
                if species_id_list == []:
                    raise xlib.ProgramException('', 'L003')

                # set the sample number
                sample_number = len(species_id_list)

                # print the counters
                xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d}')

                # read the next record of the input VCF file
                (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

            # process variant records
            while record != '' and not record.startswith(('##', '#CHROM')):

                # set the variant identification
                variant_id = f'{data_dict["chrom"]}-{data_dict["pos"]}'

                # add 1 to the read sequence counter
                input_record_counter += 1

                # add 1 to the total variant counter
                total_variant_counter += 1

                # check if the variant has to be traced
                is_traced = variant_id in tvi_list

                if is_traced:
                    xlib.Message.print('trace', f'\n\n\n\nseq_id: {data_dict["chrom"]} - position {data_dict["pos"]}')
                    xlib.Message.print('trace', f'total_variant_counter: {total_variant_counter}')

                # get the reference bases (field REF) and the alternative alleles list (field ALT)
                reference_bases = data_dict['ref']
                alternative_allele_list = data_dict['alt'].split(',')

                # build ATCG conversion list (reference bases first, then the alternative alleles)
                # NOTE(review): str.find also matches multi-base values that are substrings
                # of 'ATCG' (e.g. 'AT') - confirm that only single-base alleles are expected
                atcg_conversion_list = []
                for bases in [reference_bases] + alternative_allele_list:
                    index = atcg.find(bases.upper())
                    if index == -1:
                        raise xlib.ProgramException('', 'L016')
                    atcg_conversion_list.append(index + 1)
                atcg_conversion_dict[total_variant_counter] = atcg_conversion_list

                # get the position of the genotype (subfield GT) in the field FORMAT
                format_subfield_list = data_dict['format'].upper().split(':')
                try:
                    gt_position = format_subfield_list.index('GT')
                except ValueError as e:
                    raise xlib.ProgramException(e, 'L007', 'GT', data_dict['chrom'], data_dict['pos'])

                # build the list of sample genotypes of a variant
                sample_gt_list = [data_dict['sample_list'][i].split(':')[gt_position] for i in range(sample_number)]

                # build the lists of the left and right side of sample genotypes of a variant
                sample_gt_left_list = []
                sample_gt_right_list = []
                for sample_gt in sample_gt_list:
                    sep_pos = sample_gt.find('/')
                    if sep_pos == -1:
                        sep_pos = sample_gt.find('|')
                    if sep_pos == -1:
                        raise xlib.ProgramException('', 'L008', 'GT', data_dict['chrom'], data_dict['pos'])
                    sample_gt_left_list.append(sample_gt[:sep_pos])
                    sample_gt_right_list.append(sample_gt[sep_pos + 1:])

                if is_traced:
                    xlib.Message.print('trace', f'reference_bases: {reference_bases}')
                    xlib.Message.print('trace', f'alternative_allele_list: {alternative_allele_list}')
                    xlib.Message.print('trace', f'sample_gt_list: {sample_gt_list}')

                # get the allele counters per species
                allele_counter_dict_1 = {}
                allele_counter_dict_2 = {}
                allele_counter_dict_h = {}
                for (mother_id, species_id, gt_left, gt_right) in zip(mother_id_list, species_id_list, sample_gt_left_list, sample_gt_right_list):

                    # only when the sample is an adult
                    if mother_id != 'NONE':
                        continue

                    # select the counter dictionary of the sample species
                    if species_id == sp1_id:
                        allele_counter_dict = allele_counter_dict_1
                    elif species_id == sp2_id:
                        allele_counter_dict = allele_counter_dict_2
                    else:
                        allele_counter_dict = allele_counter_dict_h

                    # count the alleles of both sides of the genotype except missing data
                    for allele in (gt_left, gt_right):
                        if allele != md_symbol:
                            allele_counter_dict[allele] = allele_counter_dict.get(allele, 0) + 1

                if is_traced:
                    xlib.Message.print('trace', f'allele_counter_dict_1: {allele_counter_dict_1}')
                    xlib.Message.print('trace', f'allele_counter_dict_2: {allele_counter_dict_2}')
                    xlib.Message.print('trace', f'allele_counter_dict_h: {allele_counter_dict_h}')

                # calculate the maximum allele number
                maximum_allele_number = max(maximum_allele_number, len(allele_counter_dict_1), len(allele_counter_dict_2))

                # calculate the variant allele frequencies of the first species
                allele_frequency_dict_1[total_variant_counter] = {}
                sp1_allele_total = sum(allele_counter_dict_1.values())
                for (allele, allele_counter) in allele_counter_dict_1.items():
                    allele_frequency_dict_1[total_variant_counter][allele] = allele_counter / sp1_allele_total
                    if is_traced:
                        xlib.Message.print('trace', f'allele_frequency_dict_1[{total_variant_counter}][{allele}]: {allele_frequency_dict_1[total_variant_counter][allele]}')

                # calculate the variant allele frequencies of the second species
                allele_frequency_dict_2[total_variant_counter] = {}
                sp2_allele_total = sum(allele_counter_dict_2.values())
                for (allele, allele_counter) in allele_counter_dict_2.items():
                    allele_frequency_dict_2[total_variant_counter][allele] = allele_counter / sp2_allele_total
                    if is_traced:
                        xlib.Message.print('trace', f'allele_frequency_dict_2[{total_variant_counter}][{allele}]: {allele_frequency_dict_2[total_variant_counter][allele]}')

                # print the counters
                xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d}')

                # read the next record of the input VCF file
                (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

    xlib.Message.print('verbose', '\n')

    # calculate the output SimHyb file number
    simhyb_file_num = math.ceil(total_variant_counter / variant_number_per_file)

    # get the name of the VCF file without the extension (nor '.gz')
    uncompressed_vcf_file = vcf_file[:-3] if vcf_file.endswith('.gz') else vcf_file
    file_name = os.path.splitext(os.path.basename(uncompressed_vcf_file))[0]

    # initialize the begin and end variant
    begin_variant = 1
    end_variant = min(variant_number_per_file, total_variant_counter)

    # write the variant allele frequencies per species in the output SimHyb files
    for file_index in range(simhyb_file_num):

        xlib.Message.print('trace', f'\n\n\n\nbegin_variant: {begin_variant} - end_variant: {end_variant}')

        # set the SimHyb file name
        if simhyb_file_num == 1:
            current_simhyb_file = f'{output_dir}/{file_name}-allelefreq.csv'
        else:
            current_simhyb_file = f'{output_dir}/{file_name}-allelefreq-{file_index:03d}.csv'

        # open the output SimHyb file (its name always ends with '.csv', so it is never compressed)
        try:
            current_simhyb_file_id = open(current_simhyb_file, mode='w', encoding='iso-8859-1', newline='\n')
        except Exception as e:
            raise xlib.ProgramException(e, 'F003', current_simhyb_file)

        # the SimHyb file is closed even if an exception is raised while it is written
        with current_simhyb_file_id:

            # write allele frequency records (a record per allele position)
            for allele_index in range(maximum_allele_number):

                xlib.Message.print('trace', f'i: {allele_index}')

                # initialize the list of the record parts
                record_part_list = []

                # species 1
                for j in range(begin_variant, end_variant + 1):
                    xlib.Message.print('trace', f'j: {j}')
                    variant_data_dict = allele_frequency_dict_1.get(j, {})
                    xlib.Message.print('trace', f'variant_data_dict: {variant_data_dict}')
                    (allele, allele_frequency) = _get_simhyb_allele_data(variant_data_dict, allele_index, j, atcg_conversion_dict, allele_transformation)
                    record_part_list.append(f'{allele};{allele_frequency}')

                # species 2
                for j in range(begin_variant, end_variant + 1):
                    variant_data_dict = allele_frequency_dict_2.get(j, {})
                    (allele, allele_frequency) = _get_simhyb_allele_data(variant_data_dict, allele_index, j, atcg_conversion_dict, allele_transformation)
                    record_part_list.append(f'{allele};{allele_frequency}')

                # write the record
                current_simhyb_file_id.write(';'.join(record_part_list) + '\n')

        # print OK message
        xlib.Message.print('info', f'The SimHyb file {os.path.basename(current_simhyb_file)} is created.')

        # set the new begin and end variant
        begin_variant = end_variant + 1
        end_variant = min(begin_variant + variant_number_per_file - 1, total_variant_counter)
def load_vcf_data(conn, vcf_file, sample_file, sp1_id, sp2_id, hybrid_id, imputed_md_id, new_md_id, allele_transformation, tvi_list): ''' Load data of a VCF file. ''' # get the sample data sample_dict = xlib.get_sample_data(sample_file, sp1_id, sp2_id, hybrid_id) # drop table "vcf_samples" (if it exists) xlib.Message.print('verbose', 'Droping the table "vcf_samples" ...\n') xsqlite.drop_vcf_samples(conn) xlib.Message.print('verbose', 'The table is droped.\n') # create table "vcf_samples" xlib.Message.print('verbose', 'Creating the table "vcf_samples" ...\n') xsqlite.create_vcf_samples(conn) xlib.Message.print('verbose', 'The table is created.\n') # insert samples data into table "vcf_samples" xlib.Message.print('verbose', 'Inserting sample data into the table "vcf_samples" ...\n') for key, value in sample_dict.items(): value['type'] = 'N/A' xsqlite.insert_vcf_samples_row(conn, value) xlib.Message.print('verbose', 'Data are inserted.\n') # create index "vcf_samples_index" with columns "dataset_id" and "gene_id" (if not exists) xlib.Message.print('verbose', 'Creating the index on the table "vcf_samples" (if it does not exist) ...\n') xsqlite.create_vcf_samples_index(conn) xlib.Message.print('verbose', 'The index is created.\n') # get the sample type dictionary sample_type_dict = xsqlite.get_sample_type_dict(conn) # update the type of each sample for key in sample_type_dict.keys(): xsqlite.update_vcf_samples_row(conn, sample_type_dict[key]['sample_id'], sample_type_dict[key]['type']) # drop table "vcf_variants" (if it exists) xlib.Message.print('verbose', 'Droping the table "vcf_variants" ...\n') xsqlite.drop_vcf_variants(conn) xlib.Message.print('verbose', 'The table is droped.\n') # create table "vcf_variants" xlib.Message.print('verbose', 'Creating the table "vcf_variants" ...\n') xsqlite.create_vcf_variants(conn) xlib.Message.print('verbose', 'The table is created.\n') # drop table "vcf_alleles" (if it exists) xlib.Message.print('verbose', 'Droping the table 
"vcf_alleles" ...\n') xsqlite.drop_vcf_alleles(conn) xlib.Message.print('verbose', 'The table is droped.\n') # create table "vcf_alleles" xlib.Message.print('verbose', 'Creating the table "vcf_alleles" ...\n') xsqlite.create_vcf_alleles(conn) xlib.Message.print('verbose', 'The table is created.\n') # drop table "vcf_samples_alleles" (if it exists) xlib.Message.print('verbose', 'Droping the table "vcf_samples_alleles" ...\n') xsqlite.drop_vcf_samples_alleles(conn) xlib.Message.print('verbose', 'The table is droped.\n') # create table "vcf_samples_alleles" xlib.Message.print('verbose', 'Creating the table "vcf_samples_alleles" ...\n') xsqlite.create_vcf_samples_alleles(conn) xlib.Message.print('verbose', 'The table is created.\n') # initialize the row data dictionary corresponding to the tables "vcf_variants" and "vcf_samples_alleles" vcf_variants_row_dict = {} vcf_alleles_row_dict = {} vcf_samples_alleles_row_dict = {} # build the list of imputed and missing data alleles M_I_list = [imputed_md_id, xlib.get_md_symbol()] # initialize the sample number sample_number = 0 # initialize counters input_record_counter = 0 total_variant_counter = 0 vcf_variants_inserted_row_counter = 0 vcf_alleles_inserted_row_counter = 0 vcf_samples_alleles_inserted_row_counter = 0 # initialize the sample species and mother identification lists per variant sample_id_list = [] species_id_list = [] mother_id_list = [] # open the input VCF file if vcf_file.endswith('.gz'): try: vcf_file_id = gzip.open(vcf_file, mode='rt', encoding='iso-8859-1') except Exception as e: raise xlib.ProgramException(e, 'F002', vcf_file) else: try: vcf_file_id = open(vcf_file, mode='r', encoding='iso-8859-1') except Exception as e: raise xlib.ProgramException(e, 'F001', vcf_file) # read the first record of input VCF file (record, key, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number) # while there are records in input VCF file while record != '': # process metadata records while record != '' and 
record.startswith('##'): # add 1 to the read sequence counter input_record_counter += 1 # print the counters xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... { total_variant_counter:8d}') # read the next record of the input VCF file (record, key, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number) # process the column description record if record.startswith('#CHROM'): # add 1 to the read sequence counter input_record_counter += 1 # get the record data list record_data_list = data_dict['record_data_list'] # build the sample species and mother identification lists per variant for i in range(9, len(record_data_list)): try: sample_id = record_data_list[i] species_id = sample_dict[record_data_list[i]]['species_id'] mother_id = sample_dict[record_data_list[i]]['mother_id'] except Exception as e: raise xlib.ProgramException(e, 'L002', record_data_list[i]) sample_id_list.append(sample_id) species_id_list.append(species_id) mother_id_list.append(mother_id) # check if the sample species list is empty if species_id_list == []: raise xlib.ProgramException('', 'L003') # set the sample number sample_number = len(species_id_list) # print the counters xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... 
{total_variant_counter:8d}') # read the next record of the input VCF file (record, key, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number) # process variant record while record != '' and not record.startswith('##') and not record.startswith('#CHROM'): # add set the variant identification variant_id = f'{data_dict["chrom"]}-{data_dict["pos"]}' # add 1 to the read sequence counter input_record_counter += 1 # add 1 to the total variant counter total_variant_counter += 1 if variant_id in tvi_list: xlib.Message.print('trace', f'\n\n\n\nseq_id: {data_dict["chrom"]} - position {data_dict["pos"]}') if variant_id in tvi_list: xlib.Message.print('trace', f'total_variant_counter: {total_variant_counter}') # get the reference bases (field REF) and alternative alleles (field ALT) reference_bases = data_dict['ref'] alternative_alleles = data_dict['alt'] # build the alternative alleles list from field ALT alternative_allele_list = data_dict['alt'].split(',') # build the alleles list from reference bases and alternative alleles list if alternative_alleles == xlib.get_md_symbol(): alleles_list = [reference_bases] else: alleles_list = [reference_bases] + alternative_allele_list # check if the variant is an indel (both SAMtools/BCFtools and Freebayes) or SNP or multiallelic or N/A variant_type = '' if alternative_alleles == xlib.get_md_symbol(): variant_type = 'N/A' else: is_indel = False if len(reference_bases) > 1: is_indel = True else: for alternative_allele in alternative_allele_list: if len(alternative_allele) > 1: is_indel = True break if is_indel: variant_type = 'INDEL' elif len(alternative_allele_list) > 1: variant_type = 'MULTIALLELIC' else: variant_type = 'SNP' # get the position of the genotype (subfield GT) in the field FORMAT format_subfield_list = data_dict['format'].upper().split(':') try: gt_position = format_subfield_list.index('GT') except Exception as e: raise xlib.ProgramException(e, 'L007', 'GT', data_dict['chrom'], data_dict['pos']) # build the list of 
sample genotypes of a variant sample_gt_list = [] for i in range(sample_number): sample_data_list = data_dict['sample_list'][i].split(':') sample_gt_list.append(sample_data_list[gt_position]) # build the lists of the left and right side of sample genotypes of a variant sample_gt_left_list = [] sample_gt_right_list = [] for i in range(sample_number): sep = '/' sep_pos = sample_gt_list[i].find(sep) if sep_pos == -1: sep = '|' sep_pos = sample_gt_list[i].find(sep) if sep_pos == -1: raise xlib.ProgramException('', 'L008', 'GT', data_dict['chrom'], data_dict['pos']) sample_gt_left_list.append(sample_gt_list[i][:sep_pos]) sample_gt_right_list.append(sample_gt_list[i][sep_pos+1:]) if variant_id in tvi_list: xlib.Message.print('trace', f'reference_bases: {reference_bases}') if variant_id in tvi_list: xlib.Message.print('trace', f'alternative_allele_list: {alternative_allele_list}') if variant_id in tvi_list: xlib.Message.print('trace', f'sample_gt_list: {sample_gt_list}') # set data and insert row into the table "vcf_variants" vcf_variants_row_dict['variant_id'] = variant_id vcf_variants_row_dict['seq_id'] = data_dict['chrom'] vcf_variants_row_dict['position'] = data_dict['pos'] vcf_variants_row_dict['reference_bases'] = reference_bases vcf_variants_row_dict['alternative_alleles'] = alternative_alleles vcf_variants_row_dict['variant_type'] = variant_type xsqlite.insert_vcf_variants_row(conn, vcf_variants_row_dict) vcf_variants_inserted_row_counter += 1 # set data and insert rows into the table "vcf_alleles" vcf_alleles_row_dict['variant_id'] = variant_id # reference bases and alternative alleles for j in range(len(alleles_list)): vcf_alleles_row_dict['allele_id'] = str(j) vcf_alleles_row_dict['bases'] = alleles_list[j] if xlib.check_int(j) and allele_transformation == 'ADD100': structure_allele_id = str(int(j) + 100) else: structure_allele_id = j vcf_alleles_row_dict['structure_allele_id'] = structure_allele_id xsqlite.insert_vcf_alleles_row(conn, vcf_alleles_row_dict) 
vcf_alleles_inserted_row_counter += 1 # missing data vcf_alleles_row_dict['allele_id'] = xlib.get_md_symbol() vcf_alleles_row_dict['bases'] = 'N/D' if xlib.check_int(new_md_id) and allele_transformation == 'ADD100': structure_allele_id = str(int(new_md_id) + 100) else: structure_allele_id = new_md_id vcf_alleles_row_dict['structure_allele_id'] = structure_allele_id xsqlite.insert_vcf_alleles_row(conn, vcf_alleles_row_dict) vcf_alleles_inserted_row_counter += 1 # imputed missing data vcf_alleles_row_dict['allele_id'] = imputed_md_id vcf_alleles_row_dict['bases'] = 'N/D' if xlib.check_int(imputed_md_id) and allele_transformation == 'ADD100': structure_allele_id = str(int(imputed_md_id) + 100) else: structure_allele_id = imputed_md_id vcf_alleles_row_dict['structure_allele_id'] = structure_allele_id xsqlite.insert_vcf_alleles_row(conn, vcf_alleles_row_dict) vcf_alleles_inserted_row_counter += 1 # set data and insert rows into the table "vcf_samples_alleles" vcf_samples_alleles_row_dict['variant_id'] = variant_id for i in range(sample_number): vcf_samples_alleles_row_dict['sample_id'] = sample_id_list[i] # initialize genotype distribution dictionary genotype_distribution_dict = {} for j in range(len(alleles_list)): genotype_distribution_dict[alleles_list[j]] = 0 for j in range(len(M_I_list)): genotype_distribution_dict[M_I_list[j]] = 0 # calculate genotype distribution dictionary if sample_gt_left_list[i] in M_I_list: genotype_distribution_dict[sample_gt_left_list[i]] += 1 else: genotype_distribution_dict[alleles_list[int(sample_gt_left_list[i])]] += 1 if sample_gt_right_list[i] in M_I_list: genotype_distribution_dict[sample_gt_right_list[i]] += 1 else: genotype_distribution_dict[alleles_list[int(sample_gt_right_list[i])]] += 1 # calculate precuency and insert rows for reference bases and alternative alleles for j in range(len(alleles_list)): if genotype_distribution_dict[alleles_list[j]] > 0: # -- vcf_samples_alleles_row_dict['allele_id'] = alleles_list[j] 
vcf_samples_alleles_row_dict['allele_id'] = j vcf_samples_alleles_row_dict['frecuency'] = genotype_distribution_dict[alleles_list[j]] / 2 xsqlite.insert_vcf_samples_alleles_row(conn, vcf_samples_alleles_row_dict) vcf_samples_alleles_inserted_row_counter += 1 # calculate precuency and insert rows for imputed missing data if genotype_distribution_dict[imputed_md_id] > 0: vcf_samples_alleles_row_dict['allele_id'] = imputed_md_id vcf_samples_alleles_row_dict['frecuency'] = genotype_distribution_dict[imputed_md_id] / 2 xsqlite.insert_vcf_samples_alleles_row(conn, vcf_samples_alleles_row_dict) vcf_samples_alleles_inserted_row_counter += 1 # calculate precuency and insert rows for missing data if genotype_distribution_dict[xlib.get_md_symbol()] > 0: vcf_samples_alleles_row_dict['allele_id'] = xlib.get_md_symbol() vcf_samples_alleles_row_dict['frecuency'] = genotype_distribution_dict[xlib.get_md_symbol()] / 2 xsqlite.insert_vcf_samples_alleles_row(conn, vcf_samples_alleles_row_dict) vcf_samples_alleles_inserted_row_counter += 1 # print the counters xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d} - vcf_variants ... {vcf_variants_inserted_row_counter:8d} - vcf_alleles ... {vcf_alleles_inserted_row_counter:8d} - vcf_samples_alleles ... 
{vcf_samples_alleles_inserted_row_counter:8d}') # read the next record of the input VCF file (record, key, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number) xlib.Message.print('verbose', '\n') # create the index "vcf_variants_index" on the table "vcf_variants" xlib.Message.print('verbose', 'Creating the index on the table "vcf_variants" ...\n') xsqlite.create_vcf_variants_index(conn) xlib.Message.print('verbose', 'The index is created.\n') # create the index "vcf_alleles_index" on the table "vcf_alleles" xlib.Message.print('verbose', 'Creating the index on the table "vcf_alleles" ...\n') xsqlite.create_vcf_alleles_index(conn) xlib.Message.print('verbose', 'The index is created.\n') # create the index "vcf_samples_alleles_index" on the table "vcf_samples_alleles" xlib.Message.print('verbose', 'Creating the index on the table "vcf_samples_alleles" ...\n') xsqlite.create_vcf_samples_alleles_index(conn) xlib.Message.print('verbose', 'The index is created.\n') # save changes into NGShelper database xlib.Message.print('verbose', 'Saving changes into NGShelper database ...\n') conn.commit() xlib.Message.print('verbose', 'Changes are saved.\n') # close the VCF file vcf_file_id.close()
def check_args(args):
    '''
    Verify the input arguments.

    Every argument is checked even when a previous one is wrong, so all the
    error messages are printed in a single run. Side effects on "args":
      - valid numeric arguments are converted in place to float or int;
      - "verbose" and "trace" take their default value when they are None;
      - the verbose/trace status of xlib.Message is switched on when the
        corresponding argument is "Y" (case-insensitive).

    Raises xlib.ProgramException (code "P001") when any check fails.
    '''

    # initialize the control variable
    OK = True

    # check "fasta_file"
    if args.fasta_file is None:
        xlib.Message.print('error', '*** The input FASTA file is not indicated in the input arguments.')
        OK = False
    elif not os.path.isfile(args.fasta_file):
        xlib.Message.print('error', f'*** The file {args.fasta_file} does not exist.')
        OK = False

    # check "output_file" (only its presence is verified)
    if args.output_file is None:
        xlib.Message.print('error', '*** The FASTA file with debased sequences is not indicated in the input arguments.')
        OK = False

    # check "fragmentation_probability"
    if args.fragmentation_probability is None:
        xlib.Message.print('error', '*** The fragmentation probability is not indicated in the input arguments.')
        OK = False
    elif not xlib.check_float(args.fragmentation_probability, minimum=xlib.Const.FRAGPROB_LOWEST, maximum=xlib.Const.FRAGPROB_UPPEST):
        xlib.Message.print('error', f'The fragmentation probability has to be a float number between {xlib.Const.FRAGPROB_LOWEST} and {xlib.Const.FRAGPROB_UPPEST}.')
        OK = False
    else:
        args.fragmentation_probability = float(args.fragmentation_probability)

    # check "max_fragment_number"
    if args.max_fragment_number is None:
        xlib.Message.print('error', '*** The maximum fragment number is not indicated in the input arguments.')
        OK = False
    elif not xlib.check_int(args.max_fragment_number, minimum=xlib.Const.MAXFRAGNUM_LOWEST, maximum=xlib.Const.MAXFRAGNUM_UPPEST):
        xlib.Message.print('error', f'The maximum fragment number has to be an integer number between {xlib.Const.MAXFRAGNUM_LOWEST} and {xlib.Const.MAXFRAGNUM_UPPEST}.')
        OK = False
    else:
        args.max_fragment_number = int(args.max_fragment_number)

    # check "max_end_shortening"
    if args.max_end_shortening is None:
        xlib.Message.print('error', '*** The maximum shortening of a fragment end is not indicated in the input arguments.')
        OK = False
    elif not xlib.check_int(args.max_end_shortening, minimum=xlib.Const.MAXSHORTENING_LOWEST, maximum=xlib.Const.MAXSHORTENING_UPPEST):
        xlib.Message.print('error', f'The maximum shortening of a fragment end has to be an integer number between {xlib.Const.MAXSHORTENING_LOWEST} and {xlib.Const.MAXSHORTENING_UPPEST}.')
        OK = False
    else:
        args.max_end_shortening = int(args.max_end_shortening)

    # check "min_fragment_length"
    if args.min_fragment_length is None:
        xlib.Message.print('error', '*** The minimum fragment length is not indicated in the input arguments.')
        OK = False
    elif not xlib.check_int(args.min_fragment_length, minimum=1):
        xlib.Message.print('error', 'The minimum fragment length has to be an integer number greater than 0.')
        OK = False
    else:
        args.min_fragment_length = int(args.min_fragment_length)

    # check "mutation_probability"
    if args.mutation_probability is None:
        xlib.Message.print('error', '*** The mutation probability is not indicated in the input arguments.')
        OK = False
    elif not xlib.check_float(args.mutation_probability, minimum=xlib.Const.MUTPROB_LOWEST, maximum=xlib.Const.MUTPROB_UPPEST):
        xlib.Message.print('error', f'The mutation probability has to be a float number between {xlib.Const.MUTPROB_LOWEST} and {xlib.Const.MUTPROB_UPPEST}.')
        OK = False
    else:
        args.mutation_probability = float(args.mutation_probability)

    # check "max_mutation_number"
    if args.max_mutation_number is None:
        xlib.Message.print('error', '*** The maximum mutation number is not indicated in the input arguments.')
        OK = False
    elif not xlib.check_int(args.max_mutation_number, minimum=xlib.Const.MAXMUTNUM_LOWEST, maximum=xlib.Const.MAXMUTNUM_UPPEST):
        xlib.Message.print('error', f'The maximum mutation number has to be an integer number between {xlib.Const.MAXMUTNUM_LOWEST} and {xlib.Const.MAXMUTNUM_UPPEST}.')
        OK = False
    else:
        args.max_mutation_number = int(args.max_mutation_number)

    # check "indel_probability"
    if args.indel_probability is None:
        xlib.Message.print('error', '*** The insertion/deletion probability is not indicated in the input arguments.')
        OK = False
    elif not xlib.check_float(args.indel_probability, minimum=xlib.Const.INDELPROB_LOWEST, maximum=xlib.Const.INDELPROB_UPPEST):
        xlib.Message.print('error', f'The insertion/deletion probability has to be a float number between {xlib.Const.INDELPROB_LOWEST} and {xlib.Const.INDELPROB_UPPEST}.')
        OK = False
    else:
        args.indel_probability = float(args.indel_probability)

    # check "max_mutation_size"
    if args.max_mutation_size is None:
        xlib.Message.print('error', '*** The maximum mutation size is not indicated in the input arguments.')
        OK = False
    elif not xlib.check_int(args.max_mutation_size, minimum=xlib.Const.MAXMUTSIZE_LOWEST, maximum=xlib.Const.MAXMUTSIZE_UPPEST):
        xlib.Message.print('error', f'The maximum mutation size has to be an integer number between {xlib.Const.MAXMUTSIZE_LOWEST} and {xlib.Const.MAXMUTSIZE_UPPEST}.')
        OK = False
    else:
        args.max_mutation_size = int(args.max_mutation_size)

    # check "verbose"
    if args.verbose is None:
        args.verbose = xlib.Const.DEFAULT_VERBOSE
    elif args.verbose.upper() not in get_verbose_code_list():
        xlib.Message.print('error', f'The verbose has to be {get_verbose_code_list_text()}.')
        OK = False
    # the status is evaluated after the default has been applied
    if args.verbose.upper() == 'Y':
        xlib.Message.set_verbose_status(True)

    # check "trace"
    if args.trace is None:
        args.trace = xlib.Const.DEFAULT_TRACE
    elif args.trace.upper() not in get_trace_code_list():
        xlib.Message.print('error', f'The trace has to be {get_trace_code_list_text()}.')
        OK = False
    # the status is evaluated after the default has been applied
    if args.trace.upper() == 'Y':
        xlib.Message.set_trace_status(True)

    # if there are errors, exit with exception
    if not OK:
        raise xlib.ProgramException('', 'P001')
def check_htseq_count_config_file(strict):
    '''
    Check the htseq-count config file of a run.

    The config file path is got from get_htseq_count_config_file() and it is
    loaded with xlib.get_option_dict(). The sections "identification",
    "alignment-dataset-n" and "htseq-count parameters" are verified and an
    error text is accumulated for every problem found.

    The parameter "strict" is accepted but it is not used by this function.

    Returns the tuple (OK, error_list): OK is True only if no error has been
    found; error_list holds the error texts plus, when OK is False, a final
    line warning that the config file is not valid.
    '''

    # initialize the control variable and the error list
    OK = True
    error_list = []

    # initialize the variable used when a value is not found
    not_found = '***NOTFOUND***'

    # get the option dictionary
    try:
        htseq_count_option_dict = xlib.get_option_dict(get_htseq_count_config_file())
    except Exception as e:
        error_list.append(f'*** EXCEPTION: "{e}".')
        error_list.append('*** ERROR: The option dictionary could not be built from the config file')
        OK = False
    else:

        # get the sorted sections list
        sections_list = sorted(htseq_count_option_dict.keys())

        # check section "identification"
        if 'identification' not in sections_list:
            error_list.append('*** ERROR: the section "identification" is not found.')
            OK = False
        else:

            # check section "identification" - key "experiment_id"
            experiment_id = htseq_count_option_dict.get('identification', {}).get('experiment_id', not_found)
            if experiment_id == not_found:
                error_list.append('*** ERROR: the key "experiment_id" is not found in the section "identification".')
                OK = False

            # check section "identification" - key "reference_dataset_id"
            reference_dataset_id = htseq_count_option_dict.get('identification', {}).get('reference_dataset_id', not_found)
            if reference_dataset_id == not_found:
                error_list.append('*** ERROR: the key "reference_dataset_id" is not found in the section "identification".')
                OK = False

            # check section "identification" - key "annotation_file"
            annotation_file = htseq_count_option_dict.get('identification', {}).get('annotation_file', not_found)
            if annotation_file == not_found:
                error_list.append('*** ERROR: the key "annotation_file" is not found in the section "identification".')
                OK = False
            elif os.path.splitext(annotation_file)[1] not in ['.gtf', '.gff']:
                error_list.append('*** ERROR: the key "annotation_file" has to be a file name with .gtf/.gff extension.')
                OK = False

        # check section "alignment-dataset-1"
        if 'alignment-dataset-1' not in sections_list:
            error_list.append('*** ERROR: the section "alignment-dataset-1" is not found.')
            OK = False

        # check all sections "alignment-dataset-n"
        for section in sections_list:

            if section not in ['identification', 'htseq-count parameters']:

                # check that the section identification is like alignment-dataset-n
                if not re.match(r'^alignment-dataset-[0-9]+$', section):
                    error_list.append(f'*** ERROR: the section "{section}" has a wrong identification.')
                    OK = False

                else:

                    # check section "alignment-dataset-n" - key "alignment_software"
                    alignment_software = htseq_count_option_dict.get(section, {}).get('alignment_software', not_found)
                    if alignment_software == not_found:
                        error_list.append(f'*** ERROR: the key "alignment_software" is not found in the section "{section}".')
                        OK = False
                    elif not xlib.check_code(alignment_software, get_alignment_software_code_list(), case_sensitive=False):
                        error_list.append(f'*** ERROR: the key "alignment_software" has to be {get_alignment_software_code_list_text()}.')
                        OK = False

                    # check section "alignment-dataset-n" - key "alignment_dataset_id"
                    alignment_dataset_id = htseq_count_option_dict.get(section, {}).get('alignment_dataset_id', not_found)
                    if alignment_dataset_id == not_found:
                        error_list.append(f'*** ERROR: the key "alignment_dataset_id" is not found in the section "{section}".')
                        OK = False
                    elif not xlib.check_startswith(alignment_dataset_id, get_alignment_software_code_list(), case_sensitive=True):
                        error_list.append(f'*** ERROR: the key "alignment_dataset_id" has to start with {get_alignment_software_code_list_text()}.')
                        OK = False

        # check section "htseq-count parameters"
        if 'htseq-count parameters' not in sections_list:
            error_list.append('*** ERROR: the section "htseq-count parameters" is not found.')
            OK = False
        else:

            # check section "htseq-count parameters" - key "nprocesses"
            nprocesses = htseq_count_option_dict.get('htseq-count parameters', {}).get('nprocesses', not_found)
            if nprocesses == not_found:
                error_list.append('*** ERROR: the key "nprocesses" is not found in the section "htseq-count parameters".')
                OK = False
            elif not xlib.check_int(nprocesses, minimum=1):
                error_list.append('*** ERROR: the key "nprocesses" has to be an integer number greater than or equal to 1.')
                OK = False

            # check section "htseq-count parameters" - key "stranded"
            stranded = htseq_count_option_dict.get('htseq-count parameters', {}).get('stranded', not_found)
            if stranded == not_found:
                error_list.append('*** ERROR: the key "stranded" is not found in the section "htseq-count parameters".')
                OK = False
            elif not xlib.check_code(stranded, get_stranded_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "stranded" has to be {get_stranded_code_list_text()}.')
                OK = False

            # check section "htseq-count parameters" - key "minaqual"
            minaqual = htseq_count_option_dict.get('htseq-count parameters', {}).get('minaqual', not_found)
            if minaqual == not_found:
                error_list.append('*** ERROR: the key "minaqual" is not found in the section "htseq-count parameters".')
                OK = False
            elif not xlib.check_int(minaqual):
                error_list.append('*** ERROR: the key "minaqual" has to be an integer number.')
                OK = False

            # check section "htseq-count parameters" - key "type"
            # (the local name avoids shadowing the builtin "type")
            feature_type = htseq_count_option_dict.get('htseq-count parameters', {}).get('type', not_found)
            if feature_type == not_found:
                error_list.append('*** ERROR: the key "type" is not found in the section "htseq-count parameters".')
                OK = False

            # check section "htseq-count parameters" - key "idattr"
            idattr = htseq_count_option_dict.get('htseq-count parameters', {}).get('idattr', not_found)
            if idattr == not_found:
                error_list.append('*** ERROR: the key "idattr" is not found in the section "htseq-count parameters".')
                OK = False

            # check section "htseq-count parameters" - key "mode"
            mode = htseq_count_option_dict.get('htseq-count parameters', {}).get('mode', not_found)
            if mode == not_found:
                error_list.append('*** ERROR: the key "mode" is not found in the section "htseq-count parameters".')
                OK = False
            elif not xlib.check_code(mode, get_mode_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "mode" has to be {get_mode_code_list_text()}.')
                OK = False

            # check section "htseq-count parameters" - key "nonunique"
            nonunique = htseq_count_option_dict.get('htseq-count parameters', {}).get('nonunique', not_found)
            if nonunique == not_found:
                error_list.append('*** ERROR: the key "nonunique" is not found in the section "htseq-count parameters".')
                OK = False
            elif not xlib.check_code(nonunique, get_nonunique_code_list(), case_sensitive=False):
                error_list.append(f'*** ERROR: the key "nonunique" has to be {get_nonunique_code_list_text()}.')
                OK = False

            # check section "htseq-count parameters" - key "other_parameters"
            not_allowed_parameters_list = ['nprocesses', 'format', 'stranded', 'minaqual', 'type', 'idattr', 'mode', 'nonunique', 'quiet']
            other_parameters = htseq_count_option_dict.get('htseq-count parameters', {}).get('other_parameters', not_found)
            if other_parameters == not_found:
                error_list.append('*** ERROR: the key "other_parameters" is not found in the section "htseq-count parameters".')
                OK = False
            elif other_parameters.upper() != 'NONE':
                # the result is kept in its own variable: assigning it directly
                # to OK would discard the errors detected by the previous checks
                (other_parameters_OK, error_list2) = xlib.check_parameter_list(other_parameters, "other_parameters", not_allowed_parameters_list)
                error_list = error_list + error_list2
                if not other_parameters_OK:
                    OK = False

    # warn that the results config file is not valid if there are any errors
    if not OK:
        error_list.append(f'\nThe {xlib.get_htseq_count_name()} config file is not valid. Please, correct this file or recreate it.')

    # return the control variable and the error list
    return (OK, error_list)
def check_args(args):
    '''
    Check the input arguments.

    Every argument is checked even when a previous one is wrong, so all the
    error messages are printed in a single run. Side effects on "args":
      - "nucleotide_number", "verbose" and "trace" take their default value
        when they are None;
      - a valid "nucleotide_number" is converted in place to int;
      - the verbose/trace status of xlib.Message is switched on when the
        corresponding argument is "Y" (case-insensitive).

    Raises xlib.ProgramException (code "P001") when any check fails.
    '''

    # initialize the control variable
    OK = True

    # check "vcf_file"
    if args.vcf_file is None:
        xlib.Message.print('error', '*** The input VCF file is not indicated in the input arguments.')
        OK = False
    elif not os.path.isfile(args.vcf_file):
        xlib.Message.print('error', f'*** The file {args.vcf_file} does not exist.')
        OK = False

    # check "genome_file"
    if args.genome_file is None:
        xlib.Message.print('error', '*** The FASTA genome file is not indicated in the input arguments.')
        OK = False
    elif not os.path.isfile(args.genome_file):
        xlib.Message.print('error', f'*** The file {args.genome_file} does not exist.')
        OK = False

    # check "flanking_region_file" (only its presence is verified)
    if args.flanking_region_file is None:
        xlib.Message.print('error', '*** The FASTA file with flanking regions is not indicated in the input arguments.')
        OK = False

    # check "nucleotide_number"
    if args.nucleotide_number is None:
        args.nucleotide_number = xlib.Const.DEFAULT_NUCLEOTIDE_NUMBER
    elif not xlib.check_int(args.nucleotide_number, minimum=1):
        # the message names the argument actually being checked
        xlib.Message.print('error', 'The nucleotide number has to be an integer number greater than or equal to 1.')
        OK = False
    else:
        args.nucleotide_number = int(args.nucleotide_number)

    # check "verbose"
    if args.verbose is None:
        args.verbose = xlib.Const.DEFAULT_VERBOSE
    elif not xlib.check_code(args.verbose, xlib.get_verbose_code_list(), case_sensitive=False):
        xlib.Message.print('error', f'*** verbose has to be {xlib.get_verbose_code_list_text()}.')
        OK = False
    # the status is evaluated after the default has been applied
    if args.verbose.upper() == 'Y':
        xlib.Message.set_verbose_status(True)

    # check "trace"
    if args.trace is None:
        args.trace = xlib.Const.DEFAULT_TRACE
    elif not xlib.check_code(args.trace, xlib.get_trace_code_list(), case_sensitive=False):
        xlib.Message.print('error', f'*** trace has to be {xlib.get_trace_code_list_text()}.')
        OK = False
    # the status is evaluated after the default has been applied
    if args.trace.upper() == 'Y':
        xlib.Message.set_trace_status(True)

    # if there are errors, exit with exception
    if not OK:
        raise xlib.ProgramException('', 'P001')