def collapse_indels(input_vcf_file, sample_file, imputed_md_id, sp1_id, sp2_id, hybrid_id, output_vcf_file, stats_file, tvi_list):
    '''
    Collapse the variant records corresponding to an indel in a VCF file.

    Metadata records ("##") and the column description record ("#CHROM") are
    copied unchanged to the output VCF file. Variant records are grouped in runs:
    a run starts at a record and keeps absorbing the following records while they
    have the same sequence identification, their position is the start position
    plus the number of records already absorbed, and the last absorbed record had
    at least one adult sample (mother identification "NONE") with the imputed
    missing data allele in its genotype. Each run is written as a single record
    whose REF is the concatenation of the REF fields of the run; a statistics
    record is written per run.

    Parameters:
        input_vcf_file: path of the input VCF file (read with gzip if it ends with ".gz").
        sample_file: sample file path passed to xlib.get_sample_data.
        imputed_md_id: allele identification used in genotypes for imputed missing data.
        sp1_id, sp2_id, hybrid_id: identifications passed to xlib.get_sample_data.
        output_vcf_file: path of the collapsed VCF file (written with gzip if it ends with ".gz").
        stats_file: path of the statistics file (written with gzip if it ends with ".gz").
        tvi_list: variant identifications ("seq_id-position") with additional trace messages.

    Raises:
        xlib.ProgramException: F001-F004 when a file cannot be opened, L002 when a
            sample of the VCF file is not found in the sample data, L003 when the VCF
            file has no samples, L007 when the field FORMAT has no subfield GT and
            L008 when a genotype has neither "/" nor "|".
    '''

    # initialize the sample number
    sample_number = 0

    # get the sample data
    sample_dict = xlib.get_sample_data(sample_file, sp1_id, sp2_id, hybrid_id)

    # initialize the species and mother identification lists per variant
    species_id_list = []
    mother_id_list = []

    # get the missing data symbol
    md_symbol = xlib.get_md_symbol()

    # initialize counters
    input_record_counter = 0
    total_variant_counter = 0
    # NOTE(review): the two following counters are printed but never incremented in this function
    collapsed_variant_counter = 0
    created_indel_counter = 0

    def print_counters():
        '''Print the current values of the counters.'''
        xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d} - Collapsed variants ... {collapsed_variant_counter:8d} - Created indels ... {created_indel_counter}')

    def is_variant_record(text):
        '''Check if a non-empty record is neither a metadata nor a column description record.'''
        return text != '' and not text.startswith(('##', '#CHROM'))

    # initialize the file handles so that the opened ones can be closed whatever happens
    input_vcf_file_id = None
    output_vcf_file_id = None
    stats_file_id = None

    try:

        # open the input VCF file
        if input_vcf_file.endswith('.gz'):
            try:
                input_vcf_file_id = gzip.open(input_vcf_file, mode='rt', encoding='iso-8859-1')
            except Exception as e:
                raise xlib.ProgramException(e, 'F002', input_vcf_file)
        else:
            try:
                input_vcf_file_id = open(input_vcf_file, mode='r', encoding='iso-8859-1')
            except Exception as e:
                raise xlib.ProgramException(e, 'F001', input_vcf_file)

        # open the output VCF file
        if output_vcf_file.endswith('.gz'):
            try:
                output_vcf_file_id = gzip.open(output_vcf_file, mode='wt', encoding='iso-8859-1', newline='\n')
            except Exception as e:
                raise xlib.ProgramException(e, 'F004', output_vcf_file)
        else:
            try:
                output_vcf_file_id = open(output_vcf_file, mode='w', encoding='iso-8859-1', newline='\n')
            except Exception as e:
                raise xlib.ProgramException(e, 'F003', output_vcf_file)

        # open the statistics file
        if stats_file.endswith('.gz'):
            try:
                stats_file_id = gzip.open(stats_file, mode='wt', encoding='iso-8859-1', newline='\n')
            except Exception as e:
                raise xlib.ProgramException(e, 'F004', stats_file)
        else:
            try:
                stats_file_id = open(stats_file, mode='w', encoding='iso-8859-1', newline='\n')
            except Exception as e:
                raise xlib.ProgramException(e, 'F003', stats_file)

        # write the statistics header
        stats_file_id.write('"seq_id";"position";"records";"length";"imputed"\n')

        # read the first record of input VCF file
        (record, _, data_dict) = xlib.read_vcf_file(input_vcf_file_id, sample_number)

        # while there are records in input VCF file
        while record != '':

            # process metadata records
            while record != '' and record.startswith('##'):

                # add 1 to the read sequence counter
                input_record_counter += 1

                # write the metadata record
                output_vcf_file_id.write(record)

                # print the counters
                print_counters()

                # read the next record of the input VCF file
                (record, _, data_dict) = xlib.read_vcf_file(input_vcf_file_id, sample_number)

            # process the column description record
            if record.startswith('#CHROM'):

                # add 1 to the read sequence counter
                input_record_counter += 1

                # build the sample species and mother identification lists per variant
                # (the sample names are the columns that follow the nine fixed ones)
                for sample_name in data_dict['record_data_list'][9:]:
                    try:
                        sample_data = sample_dict[sample_name]
                        # the sample identification is not used later; it is looked up only to validate the sample data
                        _ = sample_data['sample_id']
                        species_id = sample_data['species_id']
                        mother_id = sample_data['mother_id']
                    except Exception as e:
                        raise xlib.ProgramException(e, 'L002', sample_name)
                    species_id_list.append(species_id)
                    mother_id_list.append(mother_id)

                # check if the sample species list is empty
                if not species_id_list:
                    raise xlib.ProgramException('', 'L003')

                # set the sample number
                sample_number = len(species_id_list)

                # write the column description record
                output_vcf_file_id.write(record)

                # print the counters
                print_counters()

                # read the next record of the input VCF file
                (record, _, data_dict) = xlib.read_vcf_file(input_vcf_file_id, sample_number)

            # process variant records
            while is_variant_record(record):

                xlib.Message.print('trace', 'Iniciando...')

                # set the sequence identification and position control variables
                w_seq_id = data_dict['chrom']
                w_position = int(data_dict['pos'])

                # initialize the record counter of the "actual" variant
                actual_variant_record_counter = 0

                # initialize the reference bases (field REF)
                reference_bases = ''

                # initialize the found best sample list control variable
                found_best_sample_list = False

                # initialize the collapse control variable
                collapse = True

                # process variant records of same "actual" variant
                # (the first record always satisfies the condition, so the fields written below are always set)
                while (is_variant_record(record)
                       and data_dict['chrom'] == w_seq_id
                       and int(data_dict['pos']) == w_position + actual_variant_record_counter
                       and collapse):

                    xlib.Message.print('trace', 'Inside the loop')
                    xlib.Message.print('trace', f'data_dict["chrom"]: {data_dict["chrom"]} - w_seq_id: {w_seq_id} - position: {data_dict["pos"]} - w_position: {w_position} - actual_variant_record_counter: {actual_variant_record_counter}')

                    # set the variant identification
                    variant_id = f'{data_dict["chrom"]}-{data_dict["pos"]}'
                    if variant_id in tvi_list:
                        xlib.Message.print('trace', f'\n\n\n\nseq_id: {data_dict["chrom"]} - position {data_dict["pos"]}')

                    # add 1 to the read sequence counter
                    input_record_counter += 1

                    # add 1 to the total variant counter
                    total_variant_counter += 1

                    # add 1 to the record counter of the "actual" variant
                    actual_variant_record_counter += 1

                    # get the position of the genotype (subfield GT) in the field FORMAT
                    format_subfield_list = data_dict['format'].upper().split(':')
                    try:
                        gt_position = format_subfield_list.index('GT')
                    except ValueError as e:
                        raise xlib.ProgramException(e, 'L007', 'GT', data_dict['chrom'], data_dict['pos'])

                    # build the list of sample genotypes of a variant
                    sample_gt_list = [data_dict['sample_list'][i].split(':')[gt_position] for i in range(sample_number)]

                    # build the lists of the left and right side of sample genotypes of a variant
                    # (the genotype is split at the first "/" or, if there is none, at the first "|")
                    sample_gt_left_list = []
                    sample_gt_right_list = []
                    for sample_gt in sample_gt_list:
                        (gt_left, sep, gt_right) = sample_gt.partition('/')
                        if not sep:
                            (gt_left, sep, gt_right) = sample_gt.partition('|')
                        if not sep:
                            raise xlib.ProgramException('', 'L008', 'GT', data_dict['chrom'], data_dict['pos'])
                        sample_gt_left_list.append(gt_left)
                        sample_gt_right_list.append(gt_right)

                    if variant_id in tvi_list:
                        xlib.Message.print('trace', f'sample_gt_list: {sample_gt_list}')

                    # count the adult samples (mother identification "NONE") with imputed data
                    imputed_adult_count = 0
                    for i in range(sample_number):
                        if mother_id_list[i] == 'NONE' and imputed_md_id in (sample_gt_left_list[i], sample_gt_right_list[i]):
                            imputed_adult_count += 1
                    xlib.Message.print('trace', f'variant_id: {variant_id} - imputed_adult_count: {imputed_adult_count}')

                    # concat the current reference bases to the new reference bases
                    reference_bases = f'{reference_bases}{data_dict["ref"]}'

                    # decide if the fields of the current record are the ones of the collapsed record
                    if imputed_adult_count == 0:
                        # there are not imputed adults: the run ends with this record
                        keep_record_fields = True
                        collapse = False
                    elif actual_variant_record_counter == 1:
                        # first record of the run
                        keep_record_fields = True
                        if data_dict['alt'] == md_symbol:
                            found_best_sample_list = True
                    elif not found_best_sample_list and data_dict['alt'] == md_symbol:
                        # first record of the run whose field ALT is the missing data symbol
                        keep_record_fields = True
                        found_best_sample_list = True
                    else:
                        keep_record_fields = False

                    if keep_record_fields:
                        variant_field_id = data_dict['id']
                        alternative_alleles = data_dict['alt']
                        qual = data_dict['qual']
                        filter_field = data_dict['filter']
                        info = data_dict['info']
                        format_field = data_dict['format']
                        best_sample_list = data_dict['sample_list']

                    # read the next record of the input VCF file
                    xlib.Message.print('trace', 'Reading ...')
                    (record, _, data_dict) = xlib.read_vcf_file(input_vcf_file_id, sample_number)
                    if record != '':
                        xlib.Message.print('trace', f'data_dict["chrom"]: {data_dict["chrom"]} - w_seq_id: {w_seq_id} - position: {data_dict["pos"]} - w_position: {w_position} - actual_variant_record_counter: {actual_variant_record_counter}')

                # write the variant record
                xlib.Message.print('trace', 'Writing VCF ...')
                xlib.Message.print('trace', f'w_seq_id: {w_seq_id} - w_position: {w_position} - actual_variant_record_counter: {actual_variant_record_counter}')
                sample_list_text = '\t'.join(best_sample_list)
                output_vcf_file_id.write(f'{w_seq_id}\t{w_position}\t{variant_field_id}\t{reference_bases}\t{alternative_alleles}\t{qual}\t{filter_field}\t{info}\t{format_field}\t{sample_list_text}\n')

                # write the collapsing statistics record
                # (the imputation flag corresponds to the last record of the run)
                xlib.Message.print('trace', 'Writing stats...')
                is_imputed = 'IMPUTED' if imputed_adult_count > 0 else '-'
                stats_file_id.write(f'{w_seq_id};{w_position};{actual_variant_record_counter};{len(reference_bases)};{is_imputed}\n')

                # print the counters
                print_counters()

        xlib.Message.print('verbose', '\n')

    finally:

        # close the files that were opened, also when an error is raised
        for file_id in (input_vcf_file_id, output_vcf_file_id, stats_file_id):
            if file_id is not None:
                file_id.close()

    # print OK message
    xlib.Message.print('info', f'The file {os.path.basename(output_vcf_file)} is created.')
def load_vcf_data(conn, vcf_file, sample_file, sp1_id, sp2_id, hybrid_id, imputed_md_id, new_md_id, allele_transformation, tvi_list):
    '''
    Load data of a VCF file.

    The tables "vcf_samples", "vcf_variants", "vcf_alleles" and "vcf_samples_alleles"
    are dropped and created again through the module xsqlite. The sample data are
    inserted into "vcf_samples"; then, for every variant record of the VCF file, a row
    is inserted into "vcf_variants", a row per allele (plus the missing data and the
    imputed missing data alleles) into "vcf_alleles" and a row per sample and allele
    present in the sample genotype into "vcf_samples_alleles". Finally the indexes
    are created and the changes are committed.

    Parameters:
        conn: database connection passed to the xsqlite functions and committed at the end.
        vcf_file: path of the VCF file (read with gzip if it ends with ".gz").
        sample_file: sample file path passed to xlib.get_sample_data.
        sp1_id, sp2_id, hybrid_id: identifications passed to xlib.get_sample_data.
        imputed_md_id: allele identification used in genotypes for imputed missing data.
        new_md_id: value used to build the structure allele identification of missing data.
        allele_transformation: if "ADD100", 100 is added to the integer allele identifications.
        tvi_list: variant identifications ("seq_id-position") with additional trace messages.

    Raises:
        xlib.ProgramException: F001-F002 when the VCF file cannot be opened, L002 when
            a sample of the VCF file is not found in the sample data, L003 when the VCF
            file has no samples, L007 when the field FORMAT has no subfield GT and
            L008 when a genotype has neither "/" nor "|".
    '''

    def recreate_table(table_name, drop_function, create_function):
        '''Drop a table (if it exists) and create it again.'''
        xlib.Message.print('verbose', f'Droping the table "{table_name}" ...\n')
        drop_function(conn)
        xlib.Message.print('verbose', 'The table is droped.\n')
        xlib.Message.print('verbose', f'Creating the table "{table_name}" ...\n')
        create_function(conn)
        xlib.Message.print('verbose', 'The table is created.\n')

    def get_structure_allele_id(allele_id):
        '''Apply the allele transformation to an allele identification.'''
        if xlib.check_int(allele_id) and allele_transformation == 'ADD100':
            return str(int(allele_id) + 100)
        return allele_id

    # get the sample data
    sample_dict = xlib.get_sample_data(sample_file, sp1_id, sp2_id, hybrid_id)

    # drop (if it exists) and create the table "vcf_samples"
    recreate_table('vcf_samples', xsqlite.drop_vcf_samples, xsqlite.create_vcf_samples)

    # insert samples data into table "vcf_samples"
    xlib.Message.print('verbose', 'Inserting sample data into the table "vcf_samples" ...\n')
    for sample_data in sample_dict.values():
        sample_data['type'] = 'N/A'
        xsqlite.insert_vcf_samples_row(conn, sample_data)
    xlib.Message.print('verbose', 'Data are inserted.\n')

    # create the index on the table "vcf_samples" (if not exists)
    xlib.Message.print('verbose', 'Creating the index on the table "vcf_samples" (if it does not exist) ...\n')
    xsqlite.create_vcf_samples_index(conn)
    xlib.Message.print('verbose', 'The index is created.\n')

    # get the sample type dictionary
    sample_type_dict = xsqlite.get_sample_type_dict(conn)

    # update the type of each sample
    for sample_type_data in sample_type_dict.values():
        xsqlite.update_vcf_samples_row(conn, sample_type_data['sample_id'], sample_type_data['type'])

    # drop (if they exist) and create the tables "vcf_variants", "vcf_alleles" and "vcf_samples_alleles"
    recreate_table('vcf_variants', xsqlite.drop_vcf_variants, xsqlite.create_vcf_variants)
    recreate_table('vcf_alleles', xsqlite.drop_vcf_alleles, xsqlite.create_vcf_alleles)
    recreate_table('vcf_samples_alleles', xsqlite.drop_vcf_samples_alleles, xsqlite.create_vcf_samples_alleles)

    # get the missing data symbol
    md_symbol = xlib.get_md_symbol()

    # build the list of imputed and missing data alleles
    md_allele_id_list = [imputed_md_id, md_symbol]

    # initialize the sample number
    sample_number = 0

    # initialize counters
    input_record_counter = 0
    total_variant_counter = 0
    vcf_variants_inserted_row_counter = 0
    vcf_alleles_inserted_row_counter = 0
    vcf_samples_alleles_inserted_row_counter = 0

    # initialize the sample, species and mother identification lists per variant
    sample_id_list = []
    species_id_list = []
    mother_id_list = []

    # open the input VCF file
    if vcf_file.endswith('.gz'):
        try:
            vcf_file_id = gzip.open(vcf_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', vcf_file)
    else:
        try:
            vcf_file_id = open(vcf_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', vcf_file)

    try:

        # read the first record of input VCF file
        (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

        # while there are records in input VCF file
        while record != '':

            # process metadata records
            while record != '' and record.startswith('##'):

                # add 1 to the read sequence counter
                input_record_counter += 1

                # print the counters
                xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d}')

                # read the next record of the input VCF file
                (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

            # process the column description record
            if record.startswith('#CHROM'):

                # add 1 to the read sequence counter
                input_record_counter += 1

                # build the sample, species and mother identification lists per variant
                # (the sample names are the columns that follow the nine fixed ones)
                for sample_name in data_dict['record_data_list'][9:]:
                    try:
                        species_id = sample_dict[sample_name]['species_id']
                        mother_id = sample_dict[sample_name]['mother_id']
                    except Exception as e:
                        raise xlib.ProgramException(e, 'L002', sample_name)
                    sample_id_list.append(sample_name)
                    species_id_list.append(species_id)
                    mother_id_list.append(mother_id)

                # check if the sample species list is empty
                if not species_id_list:
                    raise xlib.ProgramException('', 'L003')

                # set the sample number
                sample_number = len(species_id_list)

                # print the counters
                xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d}')

                # read the next record of the input VCF file
                (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

            # process variant records
            while record != '' and not record.startswith(('##', '#CHROM')):

                # set the variant identification
                variant_id = f'{data_dict["chrom"]}-{data_dict["pos"]}'
                is_traced = variant_id in tvi_list

                # add 1 to the read sequence counter
                input_record_counter += 1

                # add 1 to the total variant counter
                total_variant_counter += 1

                if is_traced:
                    xlib.Message.print('trace', f'\n\n\n\nseq_id: {data_dict["chrom"]} - position {data_dict["pos"]}')
                    xlib.Message.print('trace', f'total_variant_counter: {total_variant_counter}')

                # get the reference bases (field REF) and alternative alleles (field ALT)
                reference_bases = data_dict['ref']
                alternative_alleles = data_dict['alt']

                # build the alternative alleles list from field ALT
                alternative_allele_list = alternative_alleles.split(',')

                # build the alleles list and check if the variant is an indel
                # (both SAMtools/BCFtools and Freebayes) or SNP or multiallelic or N/A
                if alternative_alleles == md_symbol:
                    alleles_list = [reference_bases]
                    variant_type = 'N/A'
                else:
                    alleles_list = [reference_bases] + alternative_allele_list
                    if len(reference_bases) > 1 or any(len(allele) > 1 for allele in alternative_allele_list):
                        variant_type = 'INDEL'
                    elif len(alternative_allele_list) > 1:
                        variant_type = 'MULTIALLELIC'
                    else:
                        variant_type = 'SNP'

                # get the position of the genotype (subfield GT) in the field FORMAT
                format_subfield_list = data_dict['format'].upper().split(':')
                try:
                    gt_position = format_subfield_list.index('GT')
                except ValueError as e:
                    raise xlib.ProgramException(e, 'L007', 'GT', data_dict['chrom'], data_dict['pos'])

                # build the list of sample genotypes of a variant
                sample_gt_list = [data_dict['sample_list'][i].split(':')[gt_position] for i in range(sample_number)]

                # build the lists of the left and right side of sample genotypes of a variant
                # (the genotype is split at the first "/" or, if there is none, at the first "|");
                # all the genotypes are checked before any row of the variant is inserted
                sample_gt_left_list = []
                sample_gt_right_list = []
                for sample_gt in sample_gt_list:
                    (gt_left, sep, gt_right) = sample_gt.partition('/')
                    if not sep:
                        (gt_left, sep, gt_right) = sample_gt.partition('|')
                    if not sep:
                        raise xlib.ProgramException('', 'L008', 'GT', data_dict['chrom'], data_dict['pos'])
                    sample_gt_left_list.append(gt_left)
                    sample_gt_right_list.append(gt_right)

                if is_traced:
                    xlib.Message.print('trace', f'reference_bases: {reference_bases}')
                    xlib.Message.print('trace', f'alternative_allele_list: {alternative_allele_list}')
                    xlib.Message.print('trace', f'sample_gt_list: {sample_gt_list}')

                # set data and insert row into the table "vcf_variants"
                vcf_variants_row_dict = {
                    'variant_id': variant_id,
                    'seq_id': data_dict['chrom'],
                    'position': data_dict['pos'],
                    'reference_bases': reference_bases,
                    'alternative_alleles': alternative_alleles,
                    'variant_type': variant_type,
                }
                xsqlite.insert_vcf_variants_row(conn, vcf_variants_row_dict)
                vcf_variants_inserted_row_counter += 1

                # set data and insert rows into the table "vcf_alleles":
                # -- reference bases and alternative alleles
                for j, bases in enumerate(alleles_list):
                    vcf_alleles_row_dict = {
                        'variant_id': variant_id,
                        'allele_id': str(j),
                        'bases': bases,
                        'structure_allele_id': get_structure_allele_id(j),
                    }
                    xsqlite.insert_vcf_alleles_row(conn, vcf_alleles_row_dict)
                    vcf_alleles_inserted_row_counter += 1
                # -- missing data (transformed from new_md_id) and imputed missing data
                for (allele_id, structure_source_id) in ((md_symbol, new_md_id), (imputed_md_id, imputed_md_id)):
                    vcf_alleles_row_dict = {
                        'variant_id': variant_id,
                        'allele_id': allele_id,
                        'bases': 'N/D',
                        'structure_allele_id': get_structure_allele_id(structure_source_id),
                    }
                    xsqlite.insert_vcf_alleles_row(conn, vcf_alleles_row_dict)
                    vcf_alleles_inserted_row_counter += 1

                # set data and insert rows into the table "vcf_samples_alleles"
                for i in range(sample_number):

                    # calculate the genotype distribution dictionary (its keys are the allele
                    # bases and the imputed and missing data allele identifications)
                    genotype_distribution_dict = dict.fromkeys(alleles_list + md_allele_id_list, 0)
                    for gt_allele in (sample_gt_left_list[i], sample_gt_right_list[i]):
                        if gt_allele in md_allele_id_list:
                            genotype_distribution_dict[gt_allele] += 1
                        else:
                            genotype_distribution_dict[alleles_list[int(gt_allele)]] += 1

                    # build the pairs (allele identification, count): reference bases and
                    # alternative alleles first, then imputed missing data and missing data
                    allele_count_list = [(j, genotype_distribution_dict[bases]) for j, bases in enumerate(alleles_list)]
                    allele_count_list.append((imputed_md_id, genotype_distribution_dict[imputed_md_id]))
                    allele_count_list.append((md_symbol, genotype_distribution_dict[md_symbol]))

                    # calculate the frequency and insert a row for every allele present in the genotype
                    for (allele_id, allele_count) in allele_count_list:
                        if allele_count > 0:
                            vcf_samples_alleles_row_dict = {
                                'variant_id': variant_id,
                                'sample_id': sample_id_list[i],
                                'allele_id': allele_id,
                                'frecuency': allele_count / 2,
                            }
                            xsqlite.insert_vcf_samples_alleles_row(conn, vcf_samples_alleles_row_dict)
                            vcf_samples_alleles_inserted_row_counter += 1

                # print the counters
                xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d} - vcf_variants ... {vcf_variants_inserted_row_counter:8d} - vcf_alleles ... {vcf_alleles_inserted_row_counter:8d} - vcf_samples_alleles ... {vcf_samples_alleles_inserted_row_counter:8d}')

                # read the next record of the input VCF file
                (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

        xlib.Message.print('verbose', '\n')

        # create the index "vcf_variants_index" on the table "vcf_variants"
        xlib.Message.print('verbose', 'Creating the index on the table "vcf_variants" ...\n')
        xsqlite.create_vcf_variants_index(conn)
        xlib.Message.print('verbose', 'The index is created.\n')

        # create the index "vcf_alleles_index" on the table "vcf_alleles"
        xlib.Message.print('verbose', 'Creating the index on the table "vcf_alleles" ...\n')
        xsqlite.create_vcf_alleles_index(conn)
        xlib.Message.print('verbose', 'The index is created.\n')

        # create the index "vcf_samples_alleles_index" on the table "vcf_samples_alleles"
        xlib.Message.print('verbose', 'Creating the index on the table "vcf_samples_alleles" ...\n')
        xsqlite.create_vcf_samples_alleles_index(conn)
        xlib.Message.print('verbose', 'The index is created.\n')

        # save changes into NGShelper database
        xlib.Message.print('verbose', 'Saving changes into NGShelper database ...\n')
        conn.commit()
        xlib.Message.print('verbose', 'Changes are saved.\n')

    finally:

        # close the VCF file, also when an error is raised
        vcf_file_id.close()
def extract_vcf_genotypes(input_vcf_file, imputed_md_id, output_genotype_file, tvi_list):
    '''
    Extract genotype data of every variant from a VCF file.

    The work is done in two phases. First, the VCF file is read and a temporal
    genotype data file is written with a record per variant and genotype holding
    the number of samples with this genotype (and a record with the number of
    samples with missing data). Then the temporal file is read and the genotype
    data file is written with a record per variant whose columns are the counters
    divided by the number of samples without missing data, followed by the
    missing data counter. The temporal file is not deleted.

    Genotypes are unordered ("1/0" is counted as "0/1") and the imputed missing
    data allele is identified by 99.

    Parameters:
        input_vcf_file: path of the input VCF file (read with gzip if it ends with ".gz").
        imputed_md_id: allele identification used in genotypes for imputed missing data.
        output_genotype_file: path of the genotype data file (written with gzip if it
            ends with ".gz"); the temporal file name is derived from it.
        tvi_list: variant identifications ("seq_id-position") with additional trace messages.

    Raises:
        xlib.ProgramException: F001-F004 when a file cannot be opened, L007 when the
            field FORMAT has no subfield GT and L008 when a genotype has neither "/"
            nor "|".
    '''

    def build_genotype_key_list(allele_number):
        '''Build the list of the unordered genotypes of allele_number alleles plus the imputed allele 99.'''
        allele_label_list = [str(allele) for allele in range(allele_number)] + ['99']
        return [f'{left}/{right}' for pos, left in enumerate(allele_label_list) for right in allele_label_list[pos:]]

    def get_gt_allele(allele_text):
        '''Convert a genotype side to the missing data symbol, 99 (imputed missing data) or its integer value.'''
        if allele_text == md_symbol:
            return md_symbol
        if allele_text == imputed_md_id:
            return 99
        return int(allele_text)

    # get the missing data symbol
    md_symbol = xlib.get_md_symbol()

    # initialize the sample number
    sample_number = 0

    # initialize the maximum allele number
    maximum_allele_number = 0

    # initialize the identification of the last variant read
    # (None if the VCF file has no variant records)
    variant_id = None

    # set temporal genotype data file name
    if output_genotype_file.endswith('.gz'):
        tmp_genotype_file = f'{output_genotype_file[:-3]}.tmp.gz'
    else:
        tmp_genotype_file = f'{output_genotype_file}.tmp'

    # -- first phase: write the temporal genotype data file from the VCF file

    # initialize the file handles so that the opened ones can be closed whatever happens
    input_vcf_file_id = None
    tmp_genotype_file_id = None

    try:

        # open the input VCF file
        if input_vcf_file.endswith('.gz'):
            try:
                input_vcf_file_id = gzip.open(input_vcf_file, mode='rt', encoding='iso-8859-1')
            except Exception as e:
                raise xlib.ProgramException(e, 'F002', input_vcf_file)
        else:
            try:
                input_vcf_file_id = open(input_vcf_file, mode='r', encoding='iso-8859-1')
            except Exception as e:
                raise xlib.ProgramException(e, 'F001', input_vcf_file)

        # open the temporal genotype data file
        if tmp_genotype_file.endswith('.gz'):
            try:
                tmp_genotype_file_id = gzip.open(tmp_genotype_file, mode='wt', encoding='iso-8859-1', newline='\n')
            except Exception as e:
                raise xlib.ProgramException(e, 'F004', tmp_genotype_file)
        else:
            try:
                tmp_genotype_file_id = open(tmp_genotype_file, mode='w', encoding='iso-8859-1', newline='\n')
            except Exception as e:
                raise xlib.ProgramException(e, 'F003', tmp_genotype_file)

        # write the header of the temporal genotype data file
        tmp_genotype_file_id.write('seq_id;position;ref;alt;genotype;counter\n')

        # initialize counters
        input_record_counter = 0
        total_variant_counter = 0

        # read the first record of input VCF file
        (record, _, data_dict) = xlib.read_vcf_file(input_vcf_file_id, sample_number)

        # while there are records in input VCF file
        while record != '':

            # process metadata records
            while record != '' and record.startswith('##'):

                # add 1 to the read sequence counter
                input_record_counter += 1

                # print the counters
                xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d}')

                # read the next record of the input VCF file
                (record, _, data_dict) = xlib.read_vcf_file(input_vcf_file_id, sample_number)

            # process the column description record
            if record.startswith('#CHROM'):

                # add 1 to the read sequence counter
                input_record_counter += 1

                # set the sample number (the samples are the columns that follow the nine fixed ones)
                sample_number = len(data_dict['record_data_list']) - 9

                # print the counters
                xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d}')

                # read the next record of the input VCF file
                (record, _, data_dict) = xlib.read_vcf_file(input_vcf_file_id, sample_number)

            # process variant records
            while record != '' and not record.startswith(('##', '#CHROM')):

                # set the variant identification
                variant_id = f'{data_dict["chrom"]}-{data_dict["pos"]}'
                is_traced = variant_id in tvi_list
                if is_traced:
                    xlib.Message.print('trace', f'\n\n\n\nseq_id: {data_dict["chrom"]} - position {data_dict["pos"]}')

                # add 1 to the read sequence counter
                input_record_counter += 1

                # add 1 to the total variant counter
                total_variant_counter += 1

                # get the reference bases (field REF) and alternative alleles (field ALT)
                reference_bases = data_dict['ref']
                alternative_alleles = data_dict['alt']
                if is_traced:
                    xlib.Message.print('trace', f'reference_bases: {reference_bases} - alternative_alleles: {alternative_alleles}')

                # build the alternative alleles list from field ALT (without the missing data symbol)
                alternative_allele_list = alternative_alleles.split(',')
                try:
                    alternative_allele_list.remove(md_symbol)
                except ValueError:
                    # the field ALT does not contain the missing data symbol
                    pass

                # set the allele number of the variant and the maximum allele number
                allele_number = 1 + len(alternative_allele_list)
                maximum_allele_number = max(maximum_allele_number, allele_number)

                # get the position of the genotype (subfield GT) in the field FORMAT
                format_subfield_list = data_dict['format'].upper().split(':')
                try:
                    gt_position = format_subfield_list.index('GT')
                except ValueError as e:
                    raise xlib.ProgramException(e, 'L007', 'GT', data_dict['chrom'], data_dict['pos'])

                # build the list of sample genotypes of a variant
                sample_gt_list = [data_dict['sample_list'][i].split(':')[gt_position] for i in range(sample_number)]

                # build the lists of the left and right side of sample genotypes of a variant
                # (the genotype is split at the first "/" or, if there is none, at the first "|")
                sample_gt_left_list = []
                sample_gt_right_list = []
                for sample_gt in sample_gt_list:
                    (gt_left, sep, gt_right) = sample_gt.partition('/')
                    if not sep:
                        (gt_left, sep, gt_right) = sample_gt.partition('|')
                    if not sep:
                        raise xlib.ProgramException('', 'L008', 'GT', data_dict['chrom'], data_dict['pos'])
                    sample_gt_left_list.append(get_gt_allele(gt_left))
                    sample_gt_right_list.append(get_gt_allele(gt_right))

                # initialize genotype counter dictionary
                genotype_counter_dict = dict.fromkeys(build_genotype_key_list(allele_number), 0)
                if is_traced:
                    xlib.Message.print('trace', f'genotype_counter_dict: {genotype_counter_dict}')

                # initialize missing data counter
                md_counter = 0

                # count genotypes; a genotype is missing data when any of its sides is the
                # missing data symbol (each side is checked per sample, so "0/." is missing too)
                for (gt_left, gt_right) in zip(sample_gt_left_list, sample_gt_right_list):
                    if gt_left == md_symbol or gt_right == md_symbol:
                        md_counter += 1
                    else:
                        (j, k) = (gt_left, gt_right) if gt_left <= gt_right else (gt_right, gt_left)
                        genotype_counter_dict[f'{j}/{k}'] += 1
                if is_traced:
                    xlib.Message.print('trace', f'genotype_counter_dict: {genotype_counter_dict}')

                # write the variant genotype count records
                for (genotype, genotype_counter) in genotype_counter_dict.items():
                    tmp_genotype_file_id.write(f'{data_dict["chrom"]};{data_dict["pos"]};{reference_bases};{alternative_alleles};{genotype};{genotype_counter}\n')
                tmp_genotype_file_id.write(f'{data_dict["chrom"]};{data_dict["pos"]};{reference_bases};{alternative_alleles};{md_symbol};{md_counter}\n')

                # print the counters
                xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d}')

                # read the next record of the input VCF file
                (record, _, data_dict) = xlib.read_vcf_file(input_vcf_file_id, sample_number)

        xlib.Message.print('verbose', '\n')

    finally:

        # close the files that were opened, also when an error is raised
        for file_id in (input_vcf_file_id, tmp_genotype_file_id):
            if file_id is not None:
                file_id.close()

    # print OK message
    xlib.Message.print('info', f'The file {os.path.basename(tmp_genotype_file)} is created.')

    # -- second phase: write the genotype data file from the temporal genotype data file

    # build the genotype list corresponding to the maximum allele number
    maximum_genotype_key_list = build_genotype_key_list(maximum_allele_number)

    # initialize the file handles so that the opened ones can be closed whatever happens
    tmp_genotype_file_id = None
    output_genotype_file_id = None

    try:

        # open the temporal genotype data file
        if tmp_genotype_file.endswith('.gz'):
            try:
                tmp_genotype_file_id = gzip.open(tmp_genotype_file, mode='rt', encoding='iso-8859-1', newline='\n')
            except Exception as e:
                raise xlib.ProgramException(e, 'F002', tmp_genotype_file)
        else:
            try:
                tmp_genotype_file_id = open(tmp_genotype_file, mode='r', encoding='iso-8859-1', newline='\n')
            except Exception as e:
                raise xlib.ProgramException(e, 'F001', tmp_genotype_file)

        # open the genotype data file
        if output_genotype_file.endswith('.gz'):
            try:
                output_genotype_file_id = gzip.open(output_genotype_file, mode='wt', encoding='iso-8859-1', newline='\n')
            except Exception as e:
                raise xlib.ProgramException(e, 'F004', output_genotype_file)
        else:
            try:
                output_genotype_file_id = open(output_genotype_file, mode='w', encoding='iso-8859-1', newline='\n')
            except Exception as e:
                raise xlib.ProgramException(e, 'F003', output_genotype_file)

        # initialize record counters
        input_record_counter = 0

        # write the header of the genotype data file
        maximum_variant_list = maximum_genotype_key_list + ['.']
        # the trace depends on the last variant read; nothing is traced if there were no variants
        if variant_id is not None and variant_id in tvi_list:
            xlib.Message.print('trace', f'maximum_variant_list: {maximum_variant_list}')
        output_genotype_file_id.write('seq_id;position;ref;alt;{0}\n'.format(';'.join(maximum_variant_list)))

        # read the first record of the temporal genotype data file
        (record, _, data_dict) = read_temporal_genotype_data_file_record(tmp_genotype_file, tmp_genotype_file_id, input_record_counter)

        # skip the header record
        if record != '':
            (record, _, data_dict) = read_temporal_genotype_data_file_record(tmp_genotype_file, tmp_genotype_file_id, input_record_counter)

        # while there are data records in the temporal genotype data file
        while record != '':

            # save old values
            old_seq_id = data_dict['seq_id']
            old_position = data_dict['position']
            old_ref = data_dict['ref']
            old_alt = data_dict['alt']
            is_traced = f'{old_seq_id}-{old_position}' in tvi_list

            # initialize genotype counter dictionary (the genotypes that the variant does not have remain 0)
            genotype_counter_dict = dict.fromkeys(maximum_genotype_key_list, 0)
            genotype_counter_dict['.'] = 0
            if is_traced:
                xlib.Message.print('trace', f'***genotype_counter_dict: {genotype_counter_dict}')

            # while there are records in the temporal genotype data file and the same variant
            while record != '' and data_dict['seq_id'] == old_seq_id and data_dict['position'] == old_position:

                # save the genotype counter in the genotype counter dictionary
                genotype_counter_dict[data_dict['genotype']] = data_dict['counter']

                # read the next record of the temporal genotype data file
                (record, _, data_dict) = read_temporal_genotype_data_file_record(tmp_genotype_file, tmp_genotype_file_id, input_record_counter)

            if is_traced:
                xlib.Message.print('trace', f'***genotype_counter_dict: {genotype_counter_dict}')

            # write the variant genotype count record
            # NOTE(review): when all the samples have missing data, only the missing data counter
            # is written and the record has fewer columns than the header
            genotyped_sample_number = sample_number - int(genotype_counter_dict['.'])
            genotype_counter_list = []
            if genotyped_sample_number > 0:
                genotype_counter_list = [str(int(genotype_counter_dict[genotype]) / genotyped_sample_number) for genotype in maximum_genotype_key_list]
            # the counter is converted to string so that the join works whatever its type is
            genotype_counter_list.append(str(genotype_counter_dict['.']))
            genotype_counter_list_text = ';'.join(genotype_counter_list)
            output_genotype_file_id.write(f'{old_seq_id};{old_position};{old_ref};{old_alt};{genotype_counter_list_text}\n')

    finally:

        # close the files that were opened, also when an error is raised
        for file_id in (tmp_genotype_file_id, output_genotype_file_id):
            if file_id is not None:
                file_id.close()

    # print OK message
    xlib.Message.print('info', f'The file {os.path.basename(output_genotype_file)} is created.')
def build_allele_frequency(vcf_file, sample_file, sp1_id, sp2_id, hybrid_id, output_dir, variant_number_per_file, allele_transformation, tvi_list):
    '''
    Build the allele frequency files of the species 1 and 2 from a VCF file.

    The alleles are counted only in the samples whose mother identification is
    "NONE". The allele frequencies of every variant are calculated per species
    and written in files "<output_dir>/<VCF file name>-allelefreq[-NNN].csv"
    holding at most "variant_number_per_file" variants each. The row number N of
    a file holds, for every variant, the pair "allele;frequency" of the N-th
    allele (in sorted order) of the species 1 followed by the same pairs of the
    species 2; "0;0" is written when a variant has no N-th allele.

    Parameters:
        vcf_file: path of the input VCF file (read with gzip if it ends with ".gz").
        sample_file: path of the sample file passed to xlib.get_sample_data.
        sp1_id: identification of the species 1.
        sp2_id: identification of the species 2.
        hybrid_id: hybrid identification (only passed to xlib.get_sample_data).
        output_dir: directory where the allele frequency files are written.
        variant_number_per_file: maximum number of variants per output file.
        allele_transformation: "ADD100" adds 100 to the integer alleles; "ATCG"
            replaces the allele by the code of its base (A=1, T=2, C=3, G=4);
            any other value leaves the allele as it was read.
        tvi_list: identifications ("<chrom>-<pos>") of the variants to be traced.

    Raises:
        xlib.ProgramException: F001/F002 if the VCF file cannot be opened; F003
            if an output file cannot be opened; L002 if the data of a sample
            cannot be got from the sample data; L003 if there are no samples;
            L007 if the field FORMAT has no subfield GT; L008 if a genotype has
            no separator; L016 if an allele is not a single base A, T, C or G.
    '''

    # initialize the sample number
    sample_number = 0

    # initialize counters
    input_record_counter = 0
    total_variant_counter = 0

    # get the sample data
    sample_dict = xlib.get_sample_data(sample_file, sp1_id, sp2_id, hybrid_id)

    # get the missing data symbol
    md_symbol = xlib.get_md_symbol()

    # initialize the sample species and mother identification lists per variant
    species_id_list = []
    mother_id_list = []

    # initialize the maximum allele number per variant
    maximum_allele_number = 0

    # initialize allele frequency dictionaries (key: variant number)
    allele_frequency_dict_1 = {}
    allele_frequency_dict_2 = {}

    # initialize ATCG conversion dictionary (key: variant number)
    # A -> 1; T -> 2; C -> 3; G -> 4
    atcg = 'ATCG'
    atcg_conversion_dict = {}

    # open the input VCF file
    if vcf_file.endswith('.gz'):
        try:
            vcf_file_id = gzip.open(vcf_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', vcf_file)
    else:
        try:
            vcf_file_id = open(vcf_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', vcf_file)

    # read the first record of input VCF file
    (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

    # while there are records in input VCF file
    while record != '':

        # process metadata records
        while record != '' and record.startswith('##'):

            # add 1 to the read sequence counter
            input_record_counter += 1

            # print the counters
            xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d}')

            # read the next record of the input VCF file
            (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

        # process the column description record
        if record.startswith('#CHROM'):

            # add 1 to the read sequence counter
            input_record_counter += 1

            # get the record data list
            record_data_list = data_dict['record_data_list']

            # build the sample species and mother identification lists per variant
            # (the sample names start in the 10th column)
            for sample_name in record_data_list[9:]:
                try:
                    species_id = sample_dict[sample_name]['species_id']
                    mother_id = sample_dict[sample_name]['mother_id']
                except Exception as e:
                    raise xlib.ProgramException(e, 'L002', sample_name)
                species_id_list.append(species_id)
                mother_id_list.append(mother_id)

            # check if the sample species list is empty
            # (there is no exception to report here, so an empty one is passed)
            if not species_id_list:
                raise xlib.ProgramException('', 'L003')

            # set the sample number
            sample_number = len(species_id_list)

            # print the counters
            xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d}')

            # read the next record of the input VCF file
            (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

        # process variant record
        while record != '' and not record.startswith('##') and not record.startswith('#CHROM'):

            # set the variant identification
            variant_id = f'{data_dict["chrom"]}-{data_dict["pos"]}'
            is_traced = variant_id in tvi_list

            # add 1 to the read sequence counter
            input_record_counter += 1

            # add 1 to the total variant counter
            total_variant_counter += 1

            if is_traced: xlib.Message.print('trace', f'\n\n\n\nseq_id: {data_dict["chrom"]} - position {data_dict["pos"]}')
            if is_traced: xlib.Message.print('trace', f'total_variant_counter: {total_variant_counter}')

            # get the reference bases (field REF)
            reference_bases = data_dict['ref']

            # build the alternative alleles list from field ALT
            alternative_allele_list = data_dict['alt'].split(',')

            # build ATCG conversion list (index 0: reference; index N: N-th alternative allele)
            atcg_conversion_list = []
            for bases in [reference_bases] + alternative_allele_list:
                index = atcg.find(bases.upper())
                # str.find looks for substrings, so the length has to be checked too:
                # otherwise "AT", "TC" or "" would be accepted as if they were a single base
                if len(bases) != 1 or index == -1:
                    raise xlib.ProgramException('', 'L016')
                atcg_conversion_list.append(index + 1)
            atcg_conversion_dict[total_variant_counter] = atcg_conversion_list

            # get the position of the genotype (subfield GT) in the field FORMAT
            format_subfield_list = data_dict['format'].upper().split(':')
            try:
                gt_position = format_subfield_list.index('GT')
            except Exception as e:
                raise xlib.ProgramException(e, 'L007', 'GT', data_dict['chrom'], data_dict['pos'])

            # build the list of sample genotypes of a variant
            sample_gt_list = [data_dict['sample_list'][i].split(':')[gt_position] for i in range(sample_number)]

            # build the lists of the left and right side of sample genotypes of a variant
            sample_gt_left_list = []
            sample_gt_right_list = []
            for sample_gt in sample_gt_list:
                sep_pos = sample_gt.find('/')
                if sep_pos == -1:
                    sep_pos = sample_gt.find('|')
                if sep_pos == -1:
                    raise xlib.ProgramException('', 'L008', 'GT', data_dict['chrom'], data_dict['pos'])
                sample_gt_left_list.append(sample_gt[:sep_pos])
                sample_gt_right_list.append(sample_gt[sep_pos + 1:])

            if is_traced: xlib.Message.print('trace', f'reference_bases: {reference_bases}')
            if is_traced: xlib.Message.print('trace', f'alternative_allele_list: {alternative_allele_list}')
            if is_traced: xlib.Message.print('trace', f'sample_gt_list: {sample_gt_list}')

            # get the allele counters per species
            allele_counter_dict_1 = {}
            allele_counter_dict_2 = {}
            allele_counter_dict_h = {}
            for i in range(sample_number):

                # only when the sample is an adult
                if mother_id_list[i] != 'NONE':
                    continue

                # select the counter dictionary corresponding to the sample species
                if species_id_list[i] == sp1_id:
                    allele_counter_dict = allele_counter_dict_1
                elif species_id_list[i] == sp2_id:
                    allele_counter_dict = allele_counter_dict_2
                else:
                    allele_counter_dict = allele_counter_dict_h

                # count the alleles of both sides of the genotype except the missing data
                for allele in (sample_gt_left_list[i], sample_gt_right_list[i]):
                    if allele != md_symbol:
                        allele_counter_dict[allele] = allele_counter_dict.get(allele, 0) + 1

            if is_traced: xlib.Message.print('trace', f'allele_counter_dict_1: {allele_counter_dict_1}')
            if is_traced: xlib.Message.print('trace', f'allele_counter_dict_2: {allele_counter_dict_2}')
            if is_traced: xlib.Message.print('trace', f'allele_counter_dict_h: {allele_counter_dict_h}')

            # calculate the maximum allele number
            maximum_allele_number = max(maximum_allele_number, len(allele_counter_dict_1), len(allele_counter_dict_2))

            # calculate the variant allele frequencies per species
            allele_frequency_dict_1[total_variant_counter] = {}
            sp1_allele_total = sum(allele_counter_dict_1.values())
            for allele, allele_counter in allele_counter_dict_1.items():
                allele_frequency_dict_1[total_variant_counter][allele] = allele_counter / sp1_allele_total
                if is_traced: xlib.Message.print('trace', f'allele_frequency_dict_1[{total_variant_counter}][{allele}]: {allele_frequency_dict_1[total_variant_counter][allele]}')
            allele_frequency_dict_2[total_variant_counter] = {}
            sp2_allele_total = sum(allele_counter_dict_2.values())
            for allele, allele_counter in allele_counter_dict_2.items():
                allele_frequency_dict_2[total_variant_counter][allele] = allele_counter / sp2_allele_total
                if is_traced: xlib.Message.print('trace', f'allele_frequency_dict_2[{total_variant_counter}][{allele}]: {allele_frequency_dict_2[total_variant_counter][allele]}')

            # print the counters
            xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d}')

            # read the next record of the input VCF file
            (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

    xlib.Message.print('verbose', '\n')

    # close the VCF file
    vcf_file_id.close()

    def get_allele_data(variant_data_dict, allele_index, variant_number):
        '''
        Get the allele in the position "allele_index" (sorted order) of a variant
        and its frequency, or (0, 0) if the variant has not so many alleles.
        '''
        allele_list = sorted(variant_data_dict)
        if allele_index >= len(allele_list):
            return (0, 0)
        allele = allele_list[allele_index]
        allele_frequency = variant_data_dict[allele]
        if allele_transformation == 'ADD100' and xlib.check_int(allele):
            allele = int(allele) + 100
        elif allele_transformation == 'ATCG':
            allele = atcg_conversion_dict[variant_number][int(allele)]
        return (allele, allele_frequency)

    # calculate the output SimHyb file number
    simhyb_file_num = math.ceil(total_variant_counter / variant_number_per_file)

    # get the VCF file name without extensions (it is the same for all the SimHyb files)
    if vcf_file.endswith('.gz'):
        file_name, _ = os.path.splitext(os.path.basename(vcf_file[:-3]))
    else:
        file_name, _ = os.path.splitext(os.path.basename(vcf_file))

    # initialize the begin and end variant
    begin_variant = 1
    end_variant = min(variant_number_per_file, total_variant_counter)

    # write the variant allele frequencies per species in the output SimHyb files
    for file_index in range(simhyb_file_num):

        xlib.Message.print('trace', f'\n\n\n\nbegin_variant: {begin_variant} - end_variant: {end_variant}')

        # set the SimHyb file name
        if simhyb_file_num == 1:
            current_simhyb_file = f'{output_dir}/{file_name}-allelefreq.csv'
        else:
            current_simhyb_file = f'{output_dir}/{file_name}-allelefreq-{file_index:03d}.csv'

        # open the output SimHyb file (its name always ends with ".csv", so it is never compressed)
        try:
            current_simhyb_file_id = open(current_simhyb_file, mode='w', encoding='iso-8859-1', newline='\n')
        except Exception as e:
            raise xlib.ProgramException(e, 'F003', current_simhyb_file)

        # write allele frequency records; the file is closed even if a write fails
        with current_simhyb_file_id:
            for allele_index in range(maximum_allele_number):

                xlib.Message.print('trace', f'i: {allele_index}')

                # initialize the list of the parts "allele;frequency" of the record
                record_part_list = []

                # species 1
                for j in range(begin_variant, end_variant + 1):
                    xlib.Message.print('trace', f'j: {j}')
                    variant_data_dict = allele_frequency_dict_1.get(j, {})
                    xlib.Message.print('trace', f'variant_data_dict: {variant_data_dict}')
                    (allele, allele_frequency) = get_allele_data(variant_data_dict, allele_index, j)
                    record_part_list.append(f'{allele};{allele_frequency}')

                # species 2
                for j in range(begin_variant, end_variant + 1):
                    variant_data_dict = allele_frequency_dict_2.get(j, {})
                    (allele, allele_frequency) = get_allele_data(variant_data_dict, allele_index, j)
                    record_part_list.append(f'{allele};{allele_frequency}')

                # write the record
                current_simhyb_file_id.write(';'.join(record_part_list) + '\n')

        # print OK message
        xlib.Message.print('info', f'The SimHyb file {os.path.basename(current_simhyb_file)} is created.')

        # set the new begin and end variant
        begin_variant = end_variant + 1
        end_variant = min(begin_variant + variant_number_per_file - 1, total_variant_counter)
def convert_vcf_to_phase_input(vcf_file, sample_file, sp1_id, sp2_id, hybrid_id, imputed_md_id, allele_transformation, output_dir, tvi_list):
    '''
    Convert a VCF file to the PHASE input format.

    A file "<output_dir>/<VCF file name>-2phase-<sequence>.txt" is written for
    every run of consecutive variant records with the same value in the field
    CHROM. Each file has four header records (sample number, variant number,
    "P" followed by the variant positions and the multiallelic status "S"/"M"
    of every variant) and three records per sample (the sample name preceded
    by "#", the left side alleles and the right side alleles).

    Parameters:
        vcf_file: path of the input VCF file (read with gzip if it ends with ".gz").
        sample_file: path of the sample file passed to xlib.get_sample_data.
        sp1_id: identification of the species 1.
        sp2_id: identification of the species 2.
        hybrid_id: hybrid identification (only passed to xlib.get_sample_data).
        imputed_md_id: not used by this function.
        allele_transformation: "ADD100" adds 100 to the integer alleles; any
            other value leaves the alleles as they were read.
        output_dir: directory where the converted files are written.
        tvi_list: identifications ("<chrom>-<pos>") of the variants to be traced.

    Raises:
        xlib.ProgramException: F001/F002 if the VCF file cannot be opened; F003
            if a converted file cannot be opened; L002 if the data of a sample
            cannot be got from the sample data; L003 if there are no samples;
            L007 if the field FORMAT has no subfield GT; L008 if a genotype has
            no separator.
    '''

    # initialize the sample number
    sample_number = 0

    # initialize the sample information list
    sample_info_list = []

    # get the sample data
    sample_dict = xlib.get_sample_data(sample_file, sp1_id, sp2_id, hybrid_id)

    # initialize the sample species identification list per variant
    species_id_list = []

    def transform_allele(allele, variant_multiallelic_status):
        '''
        Get the PHASE value of an allele: missing data are "?" in biallelic
        variants and "-1" in multiallelic ones; 100 is added to the integer
        alleles when the allele transformation is "ADD100".
        '''
        if allele == '.' and variant_multiallelic_status == 'S':
            return '?'
        if allele == '.' and variant_multiallelic_status == 'M':
            return '-1'
        if xlib.check_int(allele) and allele_transformation == 'ADD100':
            return str(int(allele) + 100)
        return allele

    # open the VCF file
    if vcf_file.endswith('.gz'):
        try:
            vcf_file_id = gzip.open(vcf_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', vcf_file)
    else:
        try:
            vcf_file_id = open(vcf_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', vcf_file)

    # get the VCF file name without extensions (it is the same for all the converted files)
    if vcf_file.endswith('.gz'):
        file_name, _ = os.path.splitext(os.path.basename(vcf_file[:-3]))
    else:
        file_name, _ = os.path.splitext(os.path.basename(vcf_file))

    # initialize counters
    seq_counter = 0
    variant_counter = 0
    record_counter = 0

    # read the first record of VCF file
    (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

    # while there are records in the VCF file
    while record != '':

        # process metadata records
        while record != '' and record.startswith('##'):

            # add 1 to the VCF record counter
            record_counter += 1

            # print the counters
            xlib.Message.print('verbose', f'\rProcessed VCF records ... {record_counter:8d} - Seqs ... {seq_counter:8d} - Variants ... {variant_counter:8d}')

            # read the next record of the VCF file
            (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

        # process the column description record
        if record.startswith('#CHROM'):

            # add 1 to the VCF record counter
            record_counter += 1

            # get the record data list
            record_data_list = data_dict['record_data_list']

            # build the sample information list and the sample species list
            # (the sample names start in the 10th column)
            for sample_name in record_data_list[9:]:
                try:
                    species_id = sample_dict[sample_name]['species_id']
                except Exception as e:
                    raise xlib.ProgramException(e, 'L002', sample_name)
                if species_id == sp1_id:
                    numeric_species_id = 1
                elif species_id == sp2_id:
                    numeric_species_id = 2
                else:
                    numeric_species_id = 3
                sample_info_list.append([sample_name, numeric_species_id])
                species_id_list.append(species_id)

            # check if the sample species list is empty
            if not species_id_list:
                raise xlib.ProgramException('', 'L003')

            # set the sample number
            sample_number = len(species_id_list)

            # print the counters
            xlib.Message.print('verbose', f'\rProcessed VCF records ... {record_counter:8d} - Seqs ... {seq_counter:8d} - Variants ... {variant_counter:8d}')

            # read the next record of the VCF file
            (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

        # process variant records
        while record != '' and not record.startswith('##') and not record.startswith('#CHROM'):

            # add 1 to the sequence counter
            seq_counter += 1

            # initialize the counter of variants of the sequence
            variant_counter = 0

            # save the sequence
            old_seq = data_dict['chrom']

            # initialize the list of variant positions
            variant_position_list = []

            # initialize the matrices (rows: variants; columns: samples) on left and right sides of genotypes
            gt_left_matrix = []
            gt_right_matrix = []

            # initialize the list of the variant multiallelic status
            variant_multiallelic_status_list = []

            # process variant records of the same sequence
            while record != '' and not record.startswith('##') and not record.startswith('#CHROM') and data_dict['chrom'] == old_seq:

                # set the variant identification; it has to be set for every variant
                # so that the trace checks below are done against the current variant
                variant_id = f'{data_dict["chrom"]}-{data_dict["pos"]}'
                is_traced = variant_id in tvi_list

                # add 1 to the VCF record counter
                record_counter += 1

                # add 1 to the counter of variants of the sequence
                variant_counter += 1

                # append position to the list of variant positions
                variant_position_list.append(data_dict['pos'])

                # get the position of the genotype (subfield GT) in the field FORMAT
                format_subfield_list = data_dict['format'].upper().split(':')
                try:
                    gt_position = format_subfield_list.index('GT')
                except Exception as e:
                    raise xlib.ProgramException(e, 'L007', 'GT', data_dict['chrom'], data_dict['pos'])

                # build the list of sample genotypes of a variant
                sample_gt_list = [data_dict['sample_list'][i].split(':')[gt_position] for i in range(sample_number)]
                if is_traced: xlib.Message.print('trace', f'sample_gt_list: {sample_gt_list}')

                # build the lists of the left and right side of sample genotypes of a variant
                sample_gt_left_list = []
                sample_gt_right_list = []
                for sample_gt in sample_gt_list:
                    sep_pos = sample_gt.find('/')
                    if sep_pos == -1:
                        sep_pos = sample_gt.find('|')
                    if sep_pos == -1:
                        raise xlib.ProgramException('', 'L008', 'GT', data_dict['chrom'], data_dict['pos'])
                    sample_gt_left_list.append(sample_gt[:sep_pos])
                    sample_gt_right_list.append(sample_gt[sep_pos + 1:])

                # get the allele counters of the variant (missing data are not counted)
                allele_counter_dict = {}
                for i in range(sample_number):
                    if sample_gt_left_list[i] != xlib.get_md_symbol():
                        allele_counter_dict[sample_gt_left_list[i]] = allele_counter_dict.get(sample_gt_left_list[i], 0) + 1
                    if sample_gt_right_list[i] != xlib.get_md_symbol():
                        allele_counter_dict[sample_gt_right_list[i]] = allele_counter_dict.get(sample_gt_right_list[i], 0) + 1
                if is_traced: xlib.Message.print('trace', f'allele_counter_dict: {allele_counter_dict}')

                # check if the variant is multiallelic
                if len(allele_counter_dict) > 2:
                    variant_multiallelic_status = 'M'
                else:
                    variant_multiallelic_status = 'S'
                if is_traced: xlib.Message.print('trace', f'variant_multiallelic_status: {variant_multiallelic_status}.')

                # append a row to the matrices (rows: variant; columns: samples) of left and right sides of genotypes
                gt_left_matrix.append(sample_gt_left_list)
                gt_right_matrix.append(sample_gt_right_list)

                # append to the list of the variant multiallelic status
                variant_multiallelic_status_list.append(variant_multiallelic_status)

                # print the counters
                xlib.Message.print('verbose', f'\rProcessed VCF records ... {record_counter:8d} - Seqs ... {seq_counter:8d} - Variants ... {variant_counter:8d}')

                # read the next record of the VCF file
                (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

            # set output converted file of the sequence
            seq_output_converted_file = f'{output_dir}/{file_name}-2phase-{old_seq}.txt'

            # open the output converted file (its name always ends with ".txt", so it is never compressed)
            try:
                seq_output_converted_file_id = open(seq_output_converted_file, mode='w', encoding='iso-8859-1', newline='\n')
            except Exception as e:
                raise xlib.ProgramException(e, 'F003', seq_output_converted_file)

            # write the records; the file is closed even if a write fails
            with seq_output_converted_file_id:

                # write header records
                seq_output_converted_file_id.write(f'{sample_number}\n')
                seq_output_converted_file_id.write(f'{len(variant_position_list)}\n')
                seq_output_converted_file_id.write(f'P {" ".join(variant_position_list)}\n')
                seq_output_converted_file_id.write(f'{"".join(variant_multiallelic_status_list)}\n')

                # write sample records
                for i in range(sample_number):

                    # build left and right side lists of variants of a sample
                    sample_variant_gt_left_list = []
                    sample_variant_gt_right_list = []
                    for j, variant_multiallelic_status in enumerate(variant_multiallelic_status_list):
                        sample_variant_gt_left_list.append(transform_allele(gt_left_matrix[j][i], variant_multiallelic_status))
                        sample_variant_gt_right_list.append(transform_allele(gt_right_matrix[j][i], variant_multiallelic_status))

                    # write the first record of the sample
                    seq_output_converted_file_id.write(f'#{sample_info_list[i][0]}\n')

                    # write the second record of the sample
                    seq_output_converted_file_id.write(f'{" ".join(sample_variant_gt_left_list)}\n')

                    # write the third record of the sample
                    seq_output_converted_file_id.write(f'{" ".join(sample_variant_gt_right_list)}\n')

            xlib.Message.print('verbose', '\n')

            # print OK message
            xlib.Message.print('info', f'The converted file {os.path.basename(seq_output_converted_file)} is created.')

    # close VCF file
    vcf_file_id.close()
def convert_vcf_to_structure(vcf_file, sample_file, sp1_id, sp2_id, hybrid_id, imputed_md_id, new_md_id, allele_transformation, structure_file_type, output_converted_file, tvi_list):
    '''
    Convert a VCF file to the Structure input formats.

    The converted file is tab-separated: a header record with the variant
    identifications ("<chrom>-<pos>") and two records per sample (left side and
    right side alleles), each one starting with the sample name and the numeric
    species identification (1: species 1; 2: species 2; 3: any other).

    Parameters:
        vcf_file: path of the input VCF file (read with gzip if it ends with ".gz").
        sample_file: path of the sample file passed to xlib.get_sample_data.
        sp1_id: identification of the species 1.
        sp2_id: identification of the species 2.
        hybrid_id: hybrid identification (only passed to xlib.get_sample_data).
        imputed_md_id: allele value identifying the imputed missing data.
        new_md_id: value that replaces the missing data symbol in the output
            (it is joined with the alleles, so it is presumably a string).
        allele_transformation: "ADD100" adds 100 to the integer alleles; any
            other value leaves the alleles as they were read.
        structure_file_type: when it is "1", the variants with any imputed
            missing data are excluded from the converted file.
        output_converted_file: path of the converted file (written with gzip if
            it ends with ".gz").
        tvi_list: identifications ("<chrom>-<pos>") of the variants to be traced.

    Raises:
        xlib.ProgramException: F001/F002 if the VCF file cannot be opened;
            F003/F004 if the converted file cannot be opened; L002 if the data
            of a sample cannot be got from the sample data; L003 if there are
            no samples; L007 if the field FORMAT has no subfield GT; L008 if a
            genotype has no separator.
    '''

    # initialize the sample number
    sample_number = 0

    # initialize the sample information list
    sample_info_list = []

    # initialize the variant code list
    variant_code_list = []

    # initialize the matrices (rows: variants; columns: samples) on left and right sides of genotypes
    gt_left_matrix = []
    gt_right_matrix = []

    # get the sample data
    sample_dict = xlib.get_sample_data(sample_file, sp1_id, sp2_id, hybrid_id)

    # get the missing data symbol
    md_symbol = xlib.get_md_symbol()

    def transform_allele(allele):
        '''
        Add 100 to an integer allele when the allele transformation is "ADD100".
        '''
        if xlib.check_int(allele) and allele_transformation == 'ADD100':
            return str(int(allele) + 100)
        return allele

    # open the VCF file
    if vcf_file.endswith('.gz'):
        try:
            vcf_file_id = gzip.open(vcf_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', vcf_file)
    else:
        try:
            vcf_file_id = open(vcf_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', vcf_file)

    # initialize counters
    record_counter = 0
    variant_counter = 0

    # read the first record of VCF file
    (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

    # while there are records in the VCF file
    while record != '':

        # process metadata records
        while record != '' and record.startswith('##'):

            # add 1 to the VCF record counter
            record_counter += 1

            # print the counters
            xlib.Message.print('verbose', f'\rProcessed VCF records ... {record_counter:8d} - Variants ... {variant_counter:8d}')

            # read the next record of the VCF file
            (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

        # process the column description record
        if record.startswith('#CHROM'):

            # add 1 to the VCF record counter
            record_counter += 1

            # get the record data list
            record_data_list = data_dict['record_data_list']

            # build the sample information list (the sample names start in the 10th column)
            for sample_name in record_data_list[9:]:
                try:
                    species_id = sample_dict[sample_name]['species_id']
                except Exception as e:
                    raise xlib.ProgramException(e, 'L002', sample_name)
                if species_id == sp1_id:
                    numeric_species_id = 1
                elif species_id == sp2_id:
                    numeric_species_id = 2
                else:
                    numeric_species_id = 3
                sample_info_list.append([sample_name, numeric_species_id])

            # check if the sample information list is empty
            if not sample_info_list:
                raise xlib.ProgramException('', 'L003')

            # set the sample number
            sample_number = len(sample_info_list)

            # print the counters
            xlib.Message.print('verbose', f'\rProcessed VCF records ... {record_counter:8d} - Variants ... {variant_counter:8d}')

            # read the next record of the VCF file
            (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

        # process variant records
        while record != '' and not record.startswith('##') and not record.startswith('#CHROM'):

            # set the variant identification
            variant_id = f'{data_dict["chrom"]}-{data_dict["pos"]}'

            # add 1 to the VCF record counter
            record_counter += 1

            # add 1 to the variant counter
            variant_counter += 1

            # append the variant identification to the variant code list
            variant_code_list.append(variant_id)

            # get the position of the genotype (subfield GT) in the field FORMAT
            format_subfield_list = data_dict['format'].upper().split(':')
            try:
                gt_position = format_subfield_list.index('GT')
            except Exception as e:
                raise xlib.ProgramException(e, 'L007', 'GT', data_dict['chrom'], data_dict['pos'])

            # build the list of sample genotypes of a variant
            sample_gt_list = [data_dict['sample_list'][i].split(':')[gt_position] for i in range(sample_number)]
            if variant_id in tvi_list: xlib.Message.print('trace', f'(4) sample_gt_list: {sample_gt_list}')

            # build the lists of the left and right side of sample genotypes of a variant
            # replacing the missing data symbol by the new missing data identification
            sample_gt_left_list = []
            sample_gt_right_list = []
            for sample_gt in sample_gt_list:
                sep_pos = sample_gt.find('/')
                if sep_pos == -1:
                    sep_pos = sample_gt.find('|')
                if sep_pos == -1:
                    raise xlib.ProgramException('', 'L008', 'GT', data_dict['chrom'], data_dict['pos'])
                gt_left = sample_gt[:sep_pos]
                gt_right = sample_gt[sep_pos + 1:]
                sample_gt_left_list.append(new_md_id if gt_left == md_symbol else gt_left)
                sample_gt_right_list.append(new_md_id if gt_right == md_symbol else gt_right)

            # append a row to the matrices (rows: variant; columns: samples) of left and right sides of genotypes
            gt_left_matrix.append(sample_gt_left_list)
            gt_right_matrix.append(sample_gt_right_list)

            # print the counters
            xlib.Message.print('verbose', f'\rProcessed VCF records ... {record_counter:8d} - Variants ... {variant_counter:8d}')

            # read the next record of the VCF file
            (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

    xlib.Message.print('verbose', '\n')

    # close the VCF file
    vcf_file_id.close()

    # review the imputed missing data when the type of the converted file is 1
    if structure_file_type == '1':

        # detect variants with any imputed missing data
        excluded_variant_index_list = [
            i for i in range(len(gt_left_matrix))
            if any(gt_left_matrix[i][j] == imputed_md_id or gt_right_matrix[i][j] == imputed_md_id for j in range(sample_number))
        ]
        xlib.Message.print('trace', f'excluded_variant_index_list: {excluded_variant_index_list}')

        # remove data of variants with any imputed missing data
        # (from the last one to the first one so that the pending indexes remain valid)
        for k in reversed(excluded_variant_index_list):
            variant_code_list.pop(k)
            gt_left_matrix.pop(k)
            gt_right_matrix.pop(k)

    # open the output converted file
    if output_converted_file.endswith('.gz'):
        try:
            output_converted_file_id = gzip.open(output_converted_file, mode='wt', encoding='iso-8859-1', newline='\n')
        except Exception as e:
            raise xlib.ProgramException(e, 'F004', output_converted_file)
    else:
        try:
            output_converted_file_id = open(output_converted_file, mode='w', encoding='iso-8859-1', newline='\n')
        except Exception as e:
            raise xlib.ProgramException(e, 'F003', output_converted_file)

    # write the records; the file is closed even if a write fails
    with output_converted_file_id:

        # write header record
        variant_code_list_text = '\t'.join(variant_code_list)
        output_converted_file_id.write(f'sample_id\tspecies_id\t{variant_code_list_text}\n')

        # write sample records
        for i in range(sample_number):

            # build left and right side lists of variants of a sample
            sample_variant_gt_left_list = []
            sample_variant_gt_right_list = []
            for j in range(len(gt_left_matrix)):
                sample_variant_gt_left_list.append(transform_allele(gt_left_matrix[j][i]))
                sample_variant_gt_right_list.append(transform_allele(gt_right_matrix[j][i]))

            # write the first record of the sample
            sample_variant_gt_left_list_text = '\t'.join(sample_variant_gt_left_list)
            output_converted_file_id.write(f'{sample_info_list[i][0]}\t{sample_info_list[i][1]}\t{sample_variant_gt_left_list_text}\n')

            # write the second record of the sample
            sample_variant_gt_right_list_text = '\t'.join(sample_variant_gt_right_list)
            output_converted_file_id.write(f'{sample_info_list[i][0]}\t{sample_info_list[i][1]}\t{sample_variant_gt_right_list_text}\n')

    # print OK message
    xlib.Message.print('info', f'The converted file {os.path.basename(output_converted_file)} is created.')
def filter_variant(input_vcf_file, value, output_purged_file):
    '''
    Filter variants containing a determined value in left or right sides of
    sample genotypes in a VCF file.

    The process has two passes:
      1. The input VCF file is copied to a temporal file (the output file name
         plus ".tmp") skipping every variant where some sample has "value" in
         the left or right side of its genotype (subfield GT).
      2. The temporal file is copied to the output purged file skipping the
         "##contig" metadata records of the sequences that kept no variant;
         then the temporal file is deleted.

    Parameters:
        input_vcf_file: path of the input VCF file (read with gzip when it
            ends with ".gz").
        value: text compared with each side of every sample genotype.
        output_purged_file: path of the output VCF file (written with gzip
            when it ends with ".gz").

    Raises:
        xlib.ProgramException: when a file cannot be opened (F001 to F004),
            when the field FORMAT has no subfield GT (L007) or when a genotype
            has neither "/" nor "|" as separator (L008).
    '''

    # initialize the sample number
    sample_number = 0

    # initialize the identifications of the sequences with non-filtered variants
    # (a set, so the membership checks done per variant and per contig are O(1))
    non_filtered_seq_id_set = set()

    # set the temporal VCF file; its name always ends with ".tmp", never with ".gz",
    # so it is always handled as a plain text file
    temporal_vcf_file = f'{output_purged_file}.tmp'

    # open the input VCF file
    if input_vcf_file.endswith('.gz'):
        try:
            input_vcf_file_id = gzip.open(input_vcf_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', input_vcf_file)
    else:
        try:
            input_vcf_file_id = open(input_vcf_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', input_vcf_file)

    # open the temporal VCF file
    try:
        temporal_vcf_file_id = open(temporal_vcf_file, mode='w', encoding='iso-8859-1', newline='\n')
    except Exception as e:
        raise xlib.ProgramException(e, 'F003', temporal_vcf_file)

    # initialize counters
    input_record_counter = 0
    total_variant_counter = 0
    filtered_variant_counter = 0

    # the files are closed when the block ends, even if an error is raised
    with input_vcf_file_id, temporal_vcf_file_id:

        # read the first record of input VCF file
        (record, _, data_dict) = xlib.read_vcf_file(input_vcf_file_id, sample_number)

        # while there are records in input VCF file
        while record != '':

            # process metadata records
            while record != '' and record.startswith('##'):

                # add 1 to the read sequence counter
                input_record_counter += 1

                # write the metadata record
                temporal_vcf_file_id.write(record)

                # print the counters
                xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d} - Filtered variants ... {filtered_variant_counter:8d}')

                # read the next record of the input VCF file
                (record, _, data_dict) = xlib.read_vcf_file(input_vcf_file_id, sample_number)

            # process the column description record
            if record.startswith('#CHROM'):

                # add 1 to the read sequence counter
                input_record_counter += 1

                # set the sample number (the first nine columns are not samples)
                sample_number = len(data_dict['record_data_list']) - 9

                # write the column description record
                temporal_vcf_file_id.write(record)

                # print the counters
                xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d} - Filtered variants ... {filtered_variant_counter:8d}')

                # read the next record of the input VCF file
                (record, _, data_dict) = xlib.read_vcf_file(input_vcf_file_id, sample_number)

            # process variant record
            while record != '' and not record.startswith('##') and not record.startswith('#CHROM'):

                # add 1 to the read sequence counter
                input_record_counter += 1

                # add 1 to the total variant counter
                total_variant_counter += 1

                # get the sequence identification and the position of the variant
                chrom = data_dict['chrom']
                pos = data_dict['pos']

                # get the position of the genotype (subfield GT) in the field FORMAT
                format_subfield_list = data_dict['format'].upper().split(':')
                try:
                    gt_position = format_subfield_list.index('GT')
                except Exception as e:
                    raise xlib.ProgramException(e, 'L007', 'GT', chrom, pos)

                # build the lists of the sample data and of the left and right sides of
                # the sample genotypes; every genotype is checked before deciding if
                # the variant is written
                sample_list = []
                sample_gt_left_list = []
                sample_gt_right_list = []
                for i in range(sample_number):
                    sample_data = data_dict['sample_list'][i]
                    sample_gt = sample_data.split(':')[gt_position]
                    sep_pos = sample_gt.find('/')
                    if sep_pos == -1:
                        sep_pos = sample_gt.find('|')
                    if sep_pos == -1:
                        raise xlib.ProgramException('', 'L008', 'GT', chrom, pos)
                    # no genotype is modified by this function, so each sample keeps its
                    # own data unchanged (rebuilding all the samples from the data list
                    # of the last sample would overwrite their non-GT subfields)
                    sample_list.append(sample_data)
                    sample_gt_left_list.append(sample_gt[:sep_pos])
                    sample_gt_right_list.append(sample_gt[sep_pos + 1:])

                # detect value in left or right sides of sample genotypes
                write_the_variant = not any(left == value or right == value for left, right in zip(sample_gt_left_list, sample_gt_right_list))

                # if the process has to write the variant
                if write_the_variant:

                    # add the sequence identification to the non-filtered sequence identifications
                    non_filtered_seq_id_set.add(chrom)

                    # write the variant record
                    sample_list_text = '\t'.join(sample_list)
                    temporal_vcf_file_id.write(f'{chrom}\t{pos}\t{data_dict["id"]}\t{data_dict["ref"]}\t{data_dict["alt"]}\t{data_dict["qual"]}\t{data_dict["filter"]}\t{data_dict["info"]}\t{data_dict["format"]}\t{sample_list_text}\n')

                # if the process does not have to write the variant
                else:

                    # add 1 to the filtered variant counter
                    filtered_variant_counter += 1

                # print the counters
                xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d} - Filtered variants ... {filtered_variant_counter:8d}')

                # read the next record of the input VCF file
                (record, _, data_dict) = xlib.read_vcf_file(input_vcf_file_id, sample_number)

        xlib.Message.print('verbose', '\n')

    # print OK message
    xlib.Message.print('info', f'The temporal file {os.path.basename(temporal_vcf_file)} containing the filtered variants is created.')

    xlib.Message.print('info', 'Removing metadata of filtered variants ...')

    # open the temporal VCF file
    try:
        temporal_vcf_file_id = open(temporal_vcf_file, mode='r', encoding='iso-8859-1')
    except Exception as e:
        raise xlib.ProgramException(e, 'F001', temporal_vcf_file)

    # open the output purged file
    if output_purged_file.endswith('.gz'):
        try:
            output_purged_file_id = gzip.open(output_purged_file, mode='wt', encoding='iso-8859-1', newline='\n')
        except Exception as e:
            raise xlib.ProgramException(e, 'F004', output_purged_file)
    else:
        try:
            output_purged_file_id = open(output_purged_file, mode='w', encoding='iso-8859-1', newline='\n')
        except Exception as e:
            raise xlib.ProgramException(e, 'F003', output_purged_file)

    # the files are closed when the block ends, even if an error is raised
    with temporal_vcf_file_id, output_purged_file_id:

        # read the first record of temporal VCF file
        record = temporal_vcf_file_id.readline()

        # while there are records in temporal VCF file
        while record != '':

            # process contig records
            if record.startswith('##contig'):

                # get the sequence identification; the record is expected to start with
                # "##contig=<ID=" (13 characters) followed by the identification, which
                # ends at the first "," or, when ID is the only key, at the closing ">"
                seq_id = ''
                i1 = 13
                i2 = record.find(',', i1)
                if i2 == -1:
                    i2 = record.find('>', i1)
                if i2 > -1:
                    seq_id = record[i1:i2]

                # write the record when the sequence identification was not filtered
                if seq_id in non_filtered_seq_id_set:
                    output_purged_file_id.write(record)

            # process other records
            else:

                # write record
                output_purged_file_id.write(record)

            # read the next record
            record = temporal_vcf_file_id.readline()

    # print OK message
    xlib.Message.print('info', f'The purged file {os.path.basename(output_purged_file)} is created.')

    # delete temporal VCF file
    os.remove(temporal_vcf_file)
    xlib.Message.print('info', f'The temporal VCF file {os.path.basename(temporal_vcf_file)} is deleted.')
def change_value(input_vcf_file, value, new_value, output_purged_file):
    '''
    Change a value in left and right sides of sample genotypes by a new value
    in a VCF file.

    Metadata and column description records are copied unchanged. In every
    variant record, each side of the genotype (subfield GT) of every sample
    that is equal to "value" is replaced by "new_value"; the other subfields
    of each sample are kept.

    Parameters:
        input_vcf_file: path of the input VCF file (read with gzip when it
            ends with ".gz").
        value: text searched in the left and right sides of the genotypes.
        new_value: replacement written instead of "value".
        output_purged_file: path of the output VCF file (written with gzip
            when it ends with ".gz").

    Raises:
        xlib.ProgramException: when a file cannot be opened (F001 to F004),
            when the field FORMAT has no subfield GT (L007) or when a genotype
            has neither "/" nor "|" as separator (L008).
    '''

    # initialize the sample number
    sample_number = 0

    # open the input VCF file
    if input_vcf_file.endswith('.gz'):
        try:
            input_vcf_file_id = gzip.open(input_vcf_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', input_vcf_file)
    else:
        try:
            input_vcf_file_id = open(input_vcf_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', input_vcf_file)

    # open the output purged file
    if output_purged_file.endswith('.gz'):
        try:
            output_purged_file_id = gzip.open(output_purged_file, mode='wt', encoding='iso-8859-1', newline='\n')
        except Exception as e:
            raise xlib.ProgramException(e, 'F004', output_purged_file)
    else:
        try:
            output_purged_file_id = open(output_purged_file, mode='w', encoding='iso-8859-1', newline='\n')
        except Exception as e:
            raise xlib.ProgramException(e, 'F003', output_purged_file)

    # initialize counters
    input_record_counter = 0
    total_variant_counter = 0
    changed_data = 0

    # the files are closed when the block ends, even if an error is raised
    with input_vcf_file_id, output_purged_file_id:

        # read the first record of input VCF file
        (record, _, data_dict) = xlib.read_vcf_file(input_vcf_file_id, sample_number)

        # while there are records in input VCF file
        while record != '':

            # process metadata records
            while record != '' and record.startswith('##'):

                # add 1 to the read sequence counter
                input_record_counter += 1

                # write the metadata record
                output_purged_file_id.write(record)

                # print the counters
                xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d} - Changed data ... {changed_data:8d}')

                # read the next record of the input VCF file
                (record, _, data_dict) = xlib.read_vcf_file(input_vcf_file_id, sample_number)

            # process the column description record
            if record.startswith('#CHROM'):

                # add 1 to the read sequence counter
                input_record_counter += 1

                # set the sample number (the first nine columns are not samples)
                sample_number = len(data_dict['record_data_list']) - 9

                # write the column description record
                output_purged_file_id.write(record)

                # print the counters
                xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d} - Changed data ... {changed_data:8d}')

                # read the next record of the input VCF file
                (record, _, data_dict) = xlib.read_vcf_file(input_vcf_file_id, sample_number)

            # process variant record
            while record != '' and not record.startswith('##') and not record.startswith('#CHROM'):

                # add 1 to the read sequence counter
                input_record_counter += 1

                # add 1 to the total variant counter
                total_variant_counter += 1

                # get the sequence identification and the position of the variant
                chrom = data_dict['chrom']
                pos = data_dict['pos']

                # get the position of the genotype (subfield GT) in the field FORMAT
                format_subfield_list = data_dict['format'].upper().split(':')
                try:
                    gt_position = format_subfield_list.index('GT')
                except Exception as e:
                    raise xlib.ProgramException(e, 'L007', 'GT', chrom, pos)

                # rebuild the data of every sample changing the value in its genotype
                sample_list = []
                for i in range(sample_number):

                    # split the data of this sample; each sample is rebuilt from its OWN
                    # subfields (reusing the data list of the last sample would overwrite
                    # the non-GT subfields of all the other samples)
                    sample_data_list = data_dict['sample_list'][i].split(':')
                    sample_gt = sample_data_list[gt_position]

                    # get the separator and the left and right sides of the genotype
                    sep = '/'
                    sep_pos = sample_gt.find(sep)
                    if sep_pos == -1:
                        sep = '|'
                        sep_pos = sample_gt.find(sep)
                    if sep_pos == -1:
                        raise xlib.ProgramException('', 'L008', 'GT', chrom, pos)
                    gt_left = sample_gt[:sep_pos]
                    gt_right = sample_gt[sep_pos + 1:]

                    # change the value in left and right sides of the genotype
                    if gt_left == value:
                        gt_left = new_value
                        changed_data += 1
                    if gt_right == value:
                        gt_right = new_value
                        changed_data += 1

                    # rebuild the subfield GT and the data of the sample
                    sample_data_list[gt_position] = f'{gt_left}{sep}{gt_right}'
                    sample_list.append(':'.join(sample_data_list))

                # write the variant record
                sample_list_text = '\t'.join(sample_list)
                output_purged_file_id.write(f'{chrom}\t{pos}\t{data_dict["id"]}\t{data_dict["ref"]}\t{data_dict["alt"]}\t{data_dict["qual"]}\t{data_dict["filter"]}\t{data_dict["info"]}\t{data_dict["format"]}\t{sample_list_text}\n')

                # print the counters
                xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d} - Changed data ... {changed_data:8d}')

                # read the next record of the input VCF file
                (record, _, data_dict) = xlib.read_vcf_file(input_vcf_file_id, sample_number)

        xlib.Message.print('verbose', '\n')

    # print OK message
    xlib.Message.print('info', f'The purged file {os.path.basename(output_purged_file)} is created.')
def build_haplotype(input_vcf_file, sample_file, imputed_md_id, sp1_id, sp2_id, hybrid_id, haplotype_file, tvi_list):
    '''
    Builds the haplotype of a sample set from a VCF file.

    Consecutive variant records sharing the key returned by xlib.read_vcf_file
    are grouped; for each group and sample, one symbol per variant is joined
    with "-": "_" for indels, "N" when a genotype side is the missing data
    symbol, "U" when a side is the imputed missing data identification, the
    allele when both sides are equal, or the symbol returned by
    xlib.get_nucleotide_list_symbol for two different alleles. The result is
    written in FASTA format, one entry per group and sample, with the header
    ">{sequence identification}-{sample identification}".

    Parameters:
        input_vcf_file: path of the input VCF file (read with gzip when it
            ends with ".gz").
        sample_file: path of the sample file passed to xlib.get_sample_data.
        imputed_md_id: identification of imputed missing data in genotypes.
        sp1_id: identification of the first species.
        sp2_id: identification of the second species.
        hybrid_id: identification of the hybrids.
        haplotype_file: path of the output file (written with gzip when it
            ends with ".gz").
        tvi_list: identifications ("chrom-pos") of the variants to be traced.

    Raises:
        xlib.ProgramException: when a file cannot be opened (F001 to F004),
            a sample of the VCF file is not in the sample file (L002), there
            are no samples (L003), the field FORMAT has no subfield GT (L007),
            a genotype is wrong (L008) or two alleles have no symbol (D004).
    '''

    # initialize the sample number
    sample_number = 0

    # initialize counters
    input_record_counter = 0
    total_variant_counter = 0
    total_seq_counter = 0

    # initialize the sample information list
    sample_info_list = []

    # initialize the sequence code list
    seq_code_list = []

    # initialize the haplotype matrix (rows: sequences; columns: samples)
    haplotype_matrix = []

    # get the sample data
    sample_dict = xlib.get_sample_data(sample_file, sp1_id, sp2_id, hybrid_id)

    # open the VCF file
    if input_vcf_file.endswith('.gz'):
        try:
            input_vcf_file_id = gzip.open(input_vcf_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', input_vcf_file)
    else:
        try:
            input_vcf_file_id = open(input_vcf_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', input_vcf_file)

    # the VCF file is closed when the block ends, even if an error is raised
    with input_vcf_file_id:

        # read the first record of the VCF file
        (record, key, data_dict) = xlib.read_vcf_file(input_vcf_file_id, sample_number)

        # while there are records in the VCF file
        while record != '':

            # process metadata records
            while record != '' and record.startswith('##'):

                # add 1 to the read sequence counter
                input_record_counter += 1

                # print the counters
                xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d} - Total seqs ... {total_seq_counter:8d}')

                # read the next record of the VCF file
                (record, key, data_dict) = xlib.read_vcf_file(input_vcf_file_id, sample_number)

            # process the column description record
            if record.startswith('#CHROM'):

                # add 1 to the read sequence counter
                input_record_counter += 1

                # get the record data list
                record_data_list = data_dict['record_data_list']

                # build the sample information list (the first nine columns are not samples)
                for i in range(9, len(record_data_list)):
                    try:
                        species_id = sample_dict[record_data_list[i]]['species_id']
                    except Exception as e:
                        raise xlib.ProgramException(e, 'L002', record_data_list[i])
                    if species_id == sp1_id:
                        numeric_species_id = 1
                    elif species_id == sp2_id:
                        numeric_species_id = 2
                    else:
                        numeric_species_id = 3
                    sample_info_list.append([record_data_list[i], numeric_species_id])

                # check if the sample information list is empty
                if sample_info_list == []:
                    raise xlib.ProgramException('', 'L003')

                # set the sample number
                sample_number = len(sample_info_list)

                # print the counters
                xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d} - Total seqs ... {total_seq_counter:8d}')

                # read the next record of the VCF file
                (record, key, data_dict) = xlib.read_vcf_file(input_vcf_file_id, sample_number)

            # process variant records
            while record != '' and not record.startswith('##') and not record.startswith('#CHROM'):

                # add 1 to the total sequence counter
                total_seq_counter += 1

                # set the old key
                old_key = key

                # append sequence identification to the sequence code list
                seq_code_list.append(data_dict['chrom'])

                # initialize the sequence haplotype list
                seq_haplotype_list = []

                # process the variant records with the same key
                while record != '' and not record.startswith('##') and not record.startswith('#CHROM') and old_key == key:

                    # add 1 to the read sequence counter
                    input_record_counter += 1

                    # add 1 to the total variant counter
                    total_variant_counter += 1

                    # set the variant identification of the CURRENT record, so the trace
                    # check below never uses the identification of a previous record
                    variant_id = f'{data_dict["chrom"]}-{data_dict["pos"]}'
                    is_traced = variant_id in tvi_list

                    if is_traced: xlib.Message.print('trace', f'\n\n\n\nseq_id: {data_dict["chrom"]} - position {data_dict["pos"]}')

                    # get the reference bases (field REF)
                    reference_bases = data_dict['ref']
                    if is_traced: xlib.Message.print('trace', f'reference_bases: {reference_bases}')

                    # build the alternative alleles list from field ALT
                    alternative_allele_list = data_dict['alt'].split(',')
                    if is_traced: xlib.Message.print('trace', f'alternative_allele_list: {alternative_allele_list}')

                    # check if the variant is an indel (to SAMtools/BCFtools and Freebayes)
                    is_indel = len(reference_bases) > 1 or any(len(alternative_allele) > 1 for alternative_allele in alternative_allele_list)
                    if is_traced: xlib.Message.print('trace', f'INDEL?: {is_indel}')

                    # get the position of the genotype (subfield GT) in the field FORMAT
                    format_subfield_list = data_dict['format'].upper().split(':')
                    try:
                        gt_position = format_subfield_list.index('GT')
                    except Exception as e:
                        raise xlib.ProgramException(e, 'L007', 'GT', data_dict['chrom'], data_dict['pos'])

                    # build the list of sample genotypes of a variant
                    sample_gt_list = []
                    for i in range(sample_number):
                        sample_data_list = data_dict['sample_list'][i].split(':')
                        sample_gt_list.append(sample_data_list[gt_position])

                    # build the sample nucleotide list of a variant
                    if is_indel:
                        sample_nucleotide_list = ['_'] * sample_number
                    else:
                        sample_nucleotide_list = [_get_haplotype_nucleotide(sample_gt, reference_bases, alternative_allele_list, imputed_md_id, data_dict['chrom'], data_dict['pos']) for sample_gt in sample_gt_list]

                    # concat sample nucleotide list of a variant to sequence haplotype list
                    if seq_haplotype_list == []:
                        seq_haplotype_list = sample_nucleotide_list
                    else:
                        for i in range(sample_number):
                            seq_haplotype_list[i] += f'-{sample_nucleotide_list[i]}'

                    # print the counters
                    xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d} - Total seqs ... {total_seq_counter:8d}')

                    # read the next record of VCF file
                    (record, key, data_dict) = xlib.read_vcf_file(input_vcf_file_id, sample_number)

                # append a row to haplotype matrix (rows: sequences; columns: samples)
                haplotype_matrix.append(seq_haplotype_list)

        xlib.Message.print('verbose', '\n')

    # open the output haplotype file
    if haplotype_file.endswith('.gz'):
        try:
            haplotype_file_id = gzip.open(haplotype_file, mode='wt', encoding='iso-8859-1', newline='\n')
        except Exception as e:
            raise xlib.ProgramException(e, 'F004', haplotype_file)
    else:
        try:
            haplotype_file_id = open(haplotype_file, mode='w', encoding='iso-8859-1', newline='\n')
        except Exception as e:
            raise xlib.ProgramException(e, 'F003', haplotype_file)

    # write FASTA sequences per sequence and sample
    with haplotype_file_id:
        for i in range(total_seq_counter):
            for j in range(sample_number):

                # write haplotype identification record
                haplotype_file_id.write(f'>{seq_code_list[i]}-{sample_info_list[j][0]}\n')

                # write haplotype sequence record
                haplotype_file_id.write(f'{haplotype_matrix[i][j]}\n')

    # print OK message
    xlib.Message.print('info', f'The converted file {os.path.basename(haplotype_file)} is created.')


def _get_haplotype_nucleotide(sample_gt, reference_bases, alternative_allele_list, imputed_md_id, chrom, pos):
    '''
    Get the haplotype symbol of a non-indel variant for a sample from its
    genotype (subfield GT), used by build_haplotype.

    Returns "N" when a side is the missing data symbol, "U" when a side is
    the imputed missing data identification, the allele when both sides are
    equal, or the symbol given by xlib.get_nucleotide_list_symbol otherwise.
    '''

    # get the position of the separator ("/" or, if it is not found, "|")
    sep_pos = sample_gt.find('/')
    if sep_pos == -1:
        sep_pos = sample_gt.find('|')
    if sep_pos == -1:
        # the first argument is the (empty) original exception, then the error code
        raise xlib.ProgramException('', 'L008', 'GT', chrom, pos)

    # get the left and right sides of the genotype
    gt_left = sample_gt[:sep_pos]
    gt_right = sample_gt[sep_pos + 1:]

    # missing data
    md_symbol = xlib.get_md_symbol()
    if gt_left == md_symbol or gt_right == md_symbol:
        return 'N'

    # imputed missing data
    if gt_left == imputed_md_id or gt_right == imputed_md_id:
        return 'U'

    # get the alleles (0: reference bases; n > 0: n-th alternative allele) and their symbol
    try:
        left_number = int(gt_left)
        right_number = int(gt_right)
        left_nucleotide = reference_bases if left_number == 0 else alternative_allele_list[left_number - 1]
        right_nucleotide = reference_bases if right_number == 0 else alternative_allele_list[right_number - 1]
        if left_nucleotide == right_nucleotide:
            return right_nucleotide
        nucleotide = xlib.get_nucleotide_list_symbol([left_nucleotide, right_nucleotide])
    except Exception as e:
        raise xlib.ProgramException(e, 'L008', 'GT', chrom, pos)

    # this check is outside the "try" so that the error D004 cannot be caught by the
    # "except" above and reported as L008
    if nucleotide == '':
        raise xlib.ProgramException('', 'D004', 'GT', chrom, pos)

    return nucleotide
def extract_ff_features(input_gff_file, gff_format, vcf_file, output_gff_file):
    '''
    Extract genomic features from a GFF file corresponding to the variant of a VCF file.

    First, the positions of the variants of the VCF file are loaded per
    sequence identification. Then, every data record of the GFF file whose
    interval [start, end] contains some variant position of its sequence is
    written in the output file with two additional tab-separated columns: the
    found positions separated by "," and a fragment identification built as
    "{sequence identification up to its first "."}_{positions separated by "-"}".

    Parameters:
        input_gff_file: path of the input GFF file (read with gzip when it
            ends with ".gz").
        gff_format: format of the GFF file (it is not used by this function).
        vcf_file: path of the VCF file (read with gzip when it ends with ".gz").
        output_gff_file: path of the output file (written with gzip when it
            ends with ".gz").

    Raises:
        xlib.ProgramException: when a file cannot be opened (F001 to F004),
            when a variant position is not an integer (L005) or when a GFF
            data record has wrong start or end columns (F009).
    '''

    # initialize the variant dictionary (key: sequence identification; value: position list)
    variant_dict = {}

    # open the VCF file
    if vcf_file.endswith('.gz'):
        try:
            vcf_file_id = gzip.open(vcf_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', vcf_file)
    else:
        try:
            vcf_file_id = open(vcf_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', vcf_file)

    # initialize counters
    record_counter = 0
    variant_counter = 0

    # the VCF file is closed when the block ends, even if an error is raised
    with vcf_file_id:

        # read the first record of VCF file
        (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number=0, check_sample_number=False)

        # while there are records in the VCF file
        while record != '':

            # process metadata records
            while record != '' and record.startswith('##'):

                # add 1 to the VCF record counter
                record_counter += 1

                # print the counters
                xlib.Message.print('verbose', f'\rProcessed VCF records ... {record_counter:8d} - Variants ... {variant_counter:8d}')

                # read the next record of the VCF file
                (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number=0, check_sample_number=False)

            # process the column description record
            if record.startswith('#CHROM'):

                # add 1 to the VCF record counter
                record_counter += 1

                # print the counters
                xlib.Message.print('verbose', f'\rProcessed VCF records ... {record_counter:8d} - Variants ... {variant_counter:8d}')

                # read the next record of the VCF file
                (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number=0, check_sample_number=False)

            # process variant records
            while record != '' and not record.startswith('##') and not record.startswith('#CHROM'):

                # add 1 to the VCF record counter
                record_counter += 1

                # add 1 to the variant counter
                variant_counter += 1

                # add the sequence and position to the variant dictionary
                try:
                    pos = int(data_dict['pos'])
                except Exception as e:
                    raise xlib.ProgramException(e, 'L005', data_dict['chrom'], data_dict['pos'])
                variant_dict.setdefault(data_dict['chrom'], []).append(pos)

                # print the counters
                xlib.Message.print('verbose', f'\rProcessed VCF records ... {record_counter:8d} - Variants ... {variant_counter:8d}')

                # read the next record of the VCF file
                (record, _, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number=0, check_sample_number=False)

        xlib.Message.print('verbose', '\n')

    # open the input GFF file
    if input_gff_file.endswith('.gz'):
        try:
            input_gff_file_id = gzip.open(input_gff_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', input_gff_file)
    else:
        try:
            input_gff_file_id = open(input_gff_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', input_gff_file)

    # open the output GFF file
    if output_gff_file.endswith('.gz'):
        try:
            output_gff_file_id = gzip.open(output_gff_file, mode='wt', encoding='iso-8859-1', newline='\n')
        except Exception as e:
            raise xlib.ProgramException(e, 'F004', output_gff_file)
    else:
        try:
            output_gff_file_id = open(output_gff_file, mode='w', encoding='iso-8859-1', newline='\n')
        except Exception as e:
            raise xlib.ProgramException(e, 'F003', output_gff_file)

    # initialize counters
    input_record_counter = 0
    output_record_counter = 0

    # the GFF files are closed when the block ends, even if an error is raised
    with input_gff_file_id, output_gff_file_id:

        # read the first record
        record = input_gff_file_id.readline()

        # while there are records
        while record != '':

            # add 1 to input record counter
            input_record_counter += 1

            # process data records
            if not record.startswith('#'):

                # extract data
                # record format: seq_id\tsource\ttype\tstart\tend\tscore\tstrand\tphase\tattributes
                data_list = [data.strip() for data in record.split('\t')]
                try:
                    seq_id = data_list[0]
                    start = int(data_list[3])
                    end = int(data_list[4])
                except Exception as e:
                    # report the number of the GFF record, not the VCF record counter
                    raise xlib.ProgramException(e, 'F009', os.path.basename(input_gff_file), input_record_counter)

                # get the variant positions of the sequence that are inside the feature
                found_position_list = [str(position) for position in variant_dict.get(seq_id, []) if start <= position <= end]

                # if the feature has variants, write in the output file
                if found_position_list:
                    # get the sequence identification up to its first "."; partition keeps
                    # the whole identification when there is no "." (slicing with the -1
                    # returned by find would drop its last character)
                    seq_id_prefix = seq_id.partition('.')[0]
                    fragment_id = f'{seq_id_prefix}_{"-".join(found_position_list)}'
                    output_record = f'{record.strip()}\t{",".join(found_position_list)}\t{fragment_id}\n'
                    output_gff_file_id.write(output_record)
                    output_record_counter += 1

            # print record counter
            xlib.Message.print('verbose', f'\rGFF file: {input_record_counter} processed records - {output_record_counter} selected records.')

            # read the next record
            record = input_gff_file_id.readline()

        xlib.Message.print('verbose', '\n')
def impute_adults(input_vcf_file, sample_file, fix, scenario, min_aa_percentage, min_md_imputation_percentage, imputed_md_id, sp1_id, sp1_max_md_percentage, sp2_id, sp2_max_md_percentage, hybrid_id, min_afr_percentage, min_depth, output_vcf_file, tvi_list):
    '''
    Filter and fix the variant data of a VCF file, imputing missing data in adult individuals.

    The work is done in two passes:

    1. Every variant record of the input VCF file is revised (optional fix of
       the reference, removal of rare alternative alleles in non-indel
       variants, renumbering of the remaining alternative alleles, imputation
       of the missing genotypes of adults depending on the scenario) and the
       variants that pass the filters are written in a temporal file named
       "<output_vcf_file>.tmp".
    2. The temporal file is copied to the output VCF file dropping the
       "##contig" metadata records of the sequences without written variants.
       The temporal file is deleted at the end.

    A sample is considered an adult when its "mother_id" is 'NONE'.

    Parameters:
        input_vcf_file: path of the input VCF file (read with gzip when it ends with '.gz').
        sample_file: path of the sample file passed to xlib.get_sample_data.
        fix: 'Y' (case-insensitive) to replace the reference by the first alternative
            allele when no sample has the allele 0 in its genotype.
        scenario: '0' (no imputation), '1' (standard), '2' (maximum possible imputation)
            or '3' (maximum possible missing data).
        min_aa_percentage: in non-indel variants, genotypes whose right allele is 2 or
            higher are set to missing data when the percentage of adults of the sample
            species with that allele is lower than this value.
        min_md_imputation_percentage: missing data percentage of a species above which
            the missing genotypes of its adults are imputed.
        imputed_md_id: identification written for the imputed alleles.
        sp1_id, sp2_id, hybrid_id: identifications of both species and hybrids.
        sp1_max_md_percentage, sp2_max_md_percentage: a variant is filtered when the
            missing data percentages of BOTH species are greater than these values.
        min_afr_percentage: minimum allele frequency (percentage) per species in
            non-indel variants.
        min_depth: minimum combined depth (subfield DP of the field INFO).
        output_vcf_file: path of the output VCF file (written with gzip when it ends with '.gz').
        tvi_list: identifications ("chrom-pos") of the variants whose processing is traced.

    Raises:
        xlib.ProgramException: when a file cannot be opened (F001-F004), a sample of the
            VCF file is not in the sample file (L002), the VCF file has no samples (L003)
            or the subfields DP/GT are not found (L007) or are wrong (L008).
    '''

    # initialize the sample number
    sample_number = 0

    # get the sample data
    sample_dict = xlib.get_sample_data(sample_file, sp1_id, sp2_id, hybrid_id)

    # calculate the adult individual number of both species and hybrids
    adult_num_1 = 0
    adult_num_2 = 0
    adult_num_h = 0
    for value in sample_dict.values():
        if value['mother_id'] == 'NONE':
            if value['species_id'] == sp1_id:
                adult_num_1 += 1
            elif value['species_id'] == sp2_id:
                adult_num_2 += 1
            else:
                adult_num_h += 1
    xlib.Message.print('verbose', f'{sp1_id} adults: {adult_num_1} - {sp2_id} adults: {adult_num_2} - hybrid adults: {adult_num_h}\n')

    # initialize the sample species and mother identification lists per variant
    species_id_list = []
    mother_id_list = []

    # initialize the non-filtered sequence identifications (a set because it is only used in membership tests)
    non_filtered_seq_id_set = set()

    # set the temporal VCF file (it always ends with ".tmp", so it is never gzipped)
    temporal_vcf_file = f'{output_vcf_file}.tmp'

    # open the input VCF file
    if input_vcf_file.endswith('.gz'):
        try:
            input_vcf_file_id = gzip.open(input_vcf_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', input_vcf_file)
    else:
        try:
            input_vcf_file_id = open(input_vcf_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', input_vcf_file)

    # open the temporal VCF file (the input file is closed if this open fails)
    try:
        temporal_vcf_file_id = open(temporal_vcf_file, mode='w', encoding='iso-8859-1', newline='\n')
    except Exception as e:
        input_vcf_file_id.close()
        raise xlib.ProgramException(e, 'F003', temporal_vcf_file)

    # both files are closed when the block ends, even if an exception is raised
    with input_vcf_file_id, temporal_vcf_file_id:

        # initialize counters
        input_record_counter = 0
        total_variant_counter = 0
        filtered_variant_counter = 0

        def print_counters():
            '''Print the current values of the record and variant counters.'''
            xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d} - Filtered variants ... {filtered_variant_counter:8d}')

        # read the first record of input VCF file
        (record, _, data_dict) = xlib.read_vcf_file(input_vcf_file_id, sample_number)

        # while there are records in input VCF file
        while record != '':

            # process metadata records
            while record != '' and record.startswith('##'):

                # add 1 to the read sequence counter
                input_record_counter += 1

                # write the metadata record
                temporal_vcf_file_id.write(record)

                # print the counters
                print_counters()

                # read the next record of the input VCF file
                (record, _, data_dict) = xlib.read_vcf_file(input_vcf_file_id, sample_number)

            # process the column description record
            if record.startswith('#CHROM'):

                # add 1 to the read sequence counter
                input_record_counter += 1

                # get the record data list
                record_data_list = data_dict['record_data_list']

                # build the sample species and mother identification lists per variant
                # (the sample columns start at the tenth column)
                for i in range(9, len(record_data_list)):
                    try:
                        species_id = sample_dict[record_data_list[i]]['species_id']
                        mother_id = sample_dict[record_data_list[i]]['mother_id']
                    except Exception as e:
                        raise xlib.ProgramException(e, 'L002', record_data_list[i])
                    species_id_list.append(species_id)
                    mother_id_list.append(mother_id)

                # check if the sample species list is empty
                if species_id_list == []:
                    raise xlib.ProgramException('', 'L003')

                # set the sample number
                sample_number = len(species_id_list)

                # write the column description record
                temporal_vcf_file_id.write(record)

                # print the counters
                print_counters()

                # read the next record of the input VCF file
                (record, _, data_dict) = xlib.read_vcf_file(input_vcf_file_id, sample_number)

            # process variant records
            while record != '' and not record.startswith('##') and not record.startswith('#CHROM'):

                # set the variant identification and check once if it has to be traced
                variant_id = f'{data_dict["chrom"]}-{data_dict["pos"]}'
                trace_variant = variant_id in tvi_list

                # add 1 to the read sequence counter
                input_record_counter += 1

                # add 1 to the total variant counter
                total_variant_counter += 1

                if trace_variant:
                    xlib.Message.print('trace', f'\n\n\n\nseq_id: {data_dict["chrom"]} - position {data_dict["pos"]}')

                # get the reference bases (field REF)
                reference_bases = data_dict['ref']

                # build the alternative alleles list from field ALT
                alternative_allele_list = data_dict['alt'].split(',')

                # check if the variant is an indel (both SAMtools/BCFtools and Freebayes)
                is_indel = len(reference_bases) > 1 or any(len(alternative_allele) > 1 for alternative_allele in alternative_allele_list)
                if trace_variant:
                    xlib.Message.print('trace', f'(1) INDEL?: {is_indel}')

                # get the combined depth across samples (subfield DP) from field INFO
                dp = -1
                for info_field in data_dict['info'].upper().split(';'):
                    if info_field.startswith('DP='):
                        try:
                            dp = int(info_field[3:])
                        except Exception as e:
                            raise xlib.ProgramException(e, 'L008', 'DP', data_dict['chrom'], data_dict['pos'])
                        break
                if dp == -1:
                    raise xlib.ProgramException('', 'L007', 'DP', data_dict['chrom'], data_dict['pos'])

                # get the position of the genotype (subfield GT) in the field FORMAT
                format_subfield_list = data_dict['format'].upper().split(':')
                try:
                    gt_position = format_subfield_list.index('GT')
                except Exception as e:
                    raise xlib.ProgramException(e, 'L007', 'GT', data_dict['chrom'], data_dict['pos'])

                # build the list of sample genotypes of a variant
                sample_gt_list = []
                for i in range(sample_number):
                    sample_data_list = data_dict['sample_list'][i].split(':')
                    sample_gt_list.append(sample_data_list[gt_position])

                # build the lists of the left and right side of sample genotypes of a variant
                # (-1 in both sides means missing data)
                sample_gt_left_list = []
                sample_sep_list = []
                sample_gt_right_list = []
                for sample_gt in sample_gt_list:
                    sep = '/'
                    sep_pos = sample_gt.find(sep)
                    if sep_pos == -1:
                        sep = '|'
                        sep_pos = sample_gt.find(sep)
                    if sep_pos == -1:
                        raise xlib.ProgramException('', 'L008', 'GT', data_dict['chrom'], data_dict['pos'])
                    sample_sep_list.append(sep)
                    if sample_gt not in xlib.get_md_code_list():
                        try:
                            sample_gt_left_list.append(int(sample_gt[:sep_pos]))
                            sample_gt_right_list.append(int(sample_gt[sep_pos + 1:]))
                        except Exception as e:
                            raise xlib.ProgramException(e, 'L008', 'GT', data_dict['chrom'], data_dict['pos'])
                    else:
                        sample_gt_left_list.append(-1)
                        sample_gt_right_list.append(-1)

                if trace_variant:
                    xlib.Message.print('trace', f'(2) reference_bases: {reference_bases}')
                    xlib.Message.print('trace', f'(3) alternative_allele_list: {alternative_allele_list}')
                    xlib.Message.print('trace', f'(4) sample_gt_list: {sample_gt_list}')

                # fix the reference base(s) when there are not individual with this reference
                if fix.upper() == 'Y':

                    # check if there are samples with 0/n or 0|n in their field GT
                    found_0_n = any(left == 0 or right == 0 for left, right in zip(sample_gt_left_list, sample_gt_right_list))

                    # if there is not any sample with 0/n or 0|n in its field GT
                    if not found_0_n:

                        # the first alternative allele becomes the reference
                        reference_bases = alternative_allele_list[0]
                        alternative_allele_list = alternative_allele_list[1:]
                        if alternative_allele_list == []:
                            alternative_allele_list = [xlib.get_md_symbol()]
                        if trace_variant:
                            xlib.Message.print('trace', '(5) 0 is not found, the reference_bases and alternative_allele_list have been changed.')

                        # shift the allele numbers of the field GT of every sample
                        for i in range(sample_number):
                            if sample_gt_left_list[i] >= 1:
                                sample_gt_left_list[i] -= 1
                                sample_gt_right_list[i] -= 1

                # calculate the alternative allele counter per allele (2 or higher) and species and their percentages
                aa_counter_list_1 = [0] * len(alternative_allele_list)
                aa_counter_list_2 = [0] * len(alternative_allele_list)
                aa_counter_list_h = [0] * len(alternative_allele_list)
                for i in range(sample_number):
                    if mother_id_list[i] == 'NONE' and sample_gt_right_list[i] >= 2:
                        if species_id_list[i] == sp1_id:
                            aa_counter_list_1[sample_gt_right_list[i] - 1] += 1
                        elif species_id_list[i] == sp2_id:
                            aa_counter_list_2[sample_gt_right_list[i] - 1] += 1
                        else:
                            aa_counter_list_h[sample_gt_right_list[i] - 1] += 1
                if trace_variant:
                    xlib.Message.print('trace', f'(6) aa_counter_list_1: {aa_counter_list_1} - aa_counter_list_2 {aa_counter_list_2} - aa_counter_list_h: {aa_counter_list_h}')
                aa_percentage_list_1 = []
                aa_percentage_list_2 = []
                aa_percentage_list_h = []
                for i in range(len(alternative_allele_list)):
                    aa_percentage_list_1.append(aa_counter_list_1[i] / adult_num_1 * 100)
                    aa_percentage_list_2.append(aa_counter_list_2[i] / adult_num_2 * 100)
                    # the hybrid percentages are only calculated when hybrids are considered;
                    # the division is skipped when there are no hybrid adults
                    if hybrid_id != 'NONE':
                        aa_percentage_list_h.append(aa_counter_list_h[i] / adult_num_h * 100 if adult_num_h > 0 else 0)
                if trace_variant:
                    xlib.Message.print('trace', f'(7) aa_percentage_list_1: {aa_percentage_list_1} - aa_percentage_list_2 {aa_percentage_list_2} - aa_percentage_list_h: {aa_percentage_list_h}')

                # fix the GT field of alternative alleles if the alternative allele percentage is less than
                # the minimum percentage in the sample species when the variant is not a indel
                if not is_indel:
                    for i in range(sample_number):
                        if sample_gt_right_list[i] >= 2:
                            if (species_id_list[i] == sp1_id and aa_percentage_list_1[sample_gt_right_list[i] - 1] < min_aa_percentage) or \
                               (species_id_list[i] == sp2_id and aa_percentage_list_2[sample_gt_right_list[i] - 1] < min_aa_percentage) or \
                               (species_id_list[i] == hybrid_id and aa_percentage_list_h[sample_gt_right_list[i] - 1] < min_aa_percentage):
                                # set missing data
                                if trace_variant:
                                    xlib.Message.print('trace', f'(8) Setting missing data in i: {i} - sample_gt_left_list[i]: {sample_gt_left_list[i]} - sample_gt_right_list[i]: {sample_gt_right_list[i]}')
                                sample_gt_left_list[i] = -1
                                sample_gt_right_list[i] = -1

                # fix the alternative allele list when a alternative allele does not have any sample
                alternative_allele_counter_list = [0] * len(alternative_allele_list)
                for i in range(sample_number):
                    if sample_gt_left_list[i] > 0:
                        alternative_allele_counter_list[sample_gt_left_list[i] - 1] += 1
                    if sample_gt_right_list[i] > 0:
                        alternative_allele_counter_list[sample_gt_right_list[i] - 1] += 1
                alternative_allele_list = [
                    alternative_allele
                    for alternative_allele, allele_counter in zip(alternative_allele_list, alternative_allele_counter_list)
                    if allele_counter != 0
                ]
                if alternative_allele_list == []:
                    alternative_allele_list = [xlib.get_md_symbol()]
                if trace_variant:
                    xlib.Message.print('trace', f'(9) alternative_allele_counter_list: {alternative_allele_counter_list}')

                # calculate the missing data counter per species and their percentages
                md_counter_1 = 0
                md_counter_2 = 0
                md_counter_h = 0
                for i in range(sample_number):
                    if mother_id_list[i] == 'NONE' and sample_gt_right_list[i] == -1:
                        if species_id_list[i] == sp1_id:
                            md_counter_1 += 1
                        elif species_id_list[i] == sp2_id:
                            md_counter_2 += 1
                        else:
                            md_counter_h += 1
                md_percentage_1 = md_counter_1 / adult_num_1 * 100
                md_percentage_2 = md_counter_2 / adult_num_2 * 100
                # there may be no hybrid adults at all; avoid the division by zero
                md_percentage_h = md_counter_h / adult_num_h * 100 if adult_num_h > 0 else 0
                if trace_variant:
                    xlib.Message.print('trace', f'(10) {sp1_id} missing data: {md_percentage_1:5.2f}% - {sp2_id} missing data: {md_percentage_2:5.2f}% - {hybrid_id} missing data: {md_percentage_h:5.2f}%')

                # determine per sample if the missing data percentage of its species is greater than
                # the minimum percentage of missing data imputation (hybrids depend on both species)
                md_over_1 = md_percentage_1 > min_md_imputation_percentage
                md_over_2 = md_percentage_2 > min_md_imputation_percentage
                imputable_list = [
                    (species_id == sp1_id and md_over_1)
                    or (species_id == sp2_id and md_over_2)
                    or (species_id == hybrid_id and (md_over_1 or md_over_2))
                    for species_id in species_id_list
                ]

                # when the sample is an adult individual with missing data and its species is imputable,
                # mark both alleles as imputed missing data (99)
                for i in range(sample_number):
                    if mother_id_list[i] == 'NONE' and sample_gt_right_list[i] == -1 and imputable_list[i]:
                        sample_gt_left_list[i] = 99
                        sample_gt_right_list[i] = 99

                # get a list with the new order of the alternative alleles
                new_order_list = []
                order = 1
                for allele_counter in alternative_allele_counter_list:
                    if allele_counter > 0:
                        new_order_list.append(order)
                        order += 1
                    else:
                        new_order_list.append(0)
                if trace_variant:
                    xlib.Message.print('trace', f'(11) new_order_list: {new_order_list}')

                # check if all adults are monomorphic (adults with missing data are ignored;
                # an imputed genotype makes the variant non-monomorphic)
                monomorphic = True
                left_allele = None
                right_allele = None
                for i in range(sample_number):
                    if mother_id_list[i] == 'NONE':
                        if sample_gt_right_list[i] == 99:
                            monomorphic = False
                            break
                        elif sample_gt_right_list[i] != -1:
                            if left_allele is None:
                                left_allele = sample_gt_left_list[i]
                            if right_allele is None:
                                right_allele = sample_gt_right_list[i]
                            if left_allele != sample_gt_left_list[i] or right_allele != sample_gt_right_list[i]:
                                monomorphic = False
                                break
                if trace_variant:
                    xlib.Message.print('trace', f'(12) monomorphic: {monomorphic}')

                if trace_variant:
                    literal = ' ' + ''.join(f'{str(sample_gt_left_list[i])}{sample_sep_list[i]}{str(sample_gt_right_list[i])} ' for i in range(sample_number))
                    xlib.Message.print('trace', f'(13) genotype list before imputation revision: {literal}')

                # review depending on the scenario (only adult individuals)
                for i in range(sample_number):
                    if mother_id_list[i] != 'NONE':
                        continue
                    same_alleles = sample_gt_left_list[i] == sample_gt_right_list[i]

                    # revision when the scenario is '0' (no imputation) or '2' (maximum possible imputation)
                    if scenario in ['0', '2']:
                        if imputable_list[i] and same_alleles:
                            sample_gt_right_list[i] = 99

                    # revision when the scenario is '1' (standard): nothing is changed
                    elif scenario == '1':
                        pass

                    # revision when the scenario is '3' (maximum possible missing data)
                    elif scenario == '3':
                        if same_alleles:
                            sample_gt_right_list[i] = -1

                if trace_variant:
                    literal = ' ' + ''.join(f'{str(sample_gt_left_list[i])}{sample_sep_list[i]}{str(sample_gt_right_list[i])} ' for i in range(sample_number))
                    xlib.Message.print('trace', f'(14) genotype list after imputation revision: {literal}')

                # rebuild the list of the field GT for every sample
                for i in range(sample_number):
                    if sample_gt_left_list[i] == -1:
                        left = xlib.get_md_symbol()
                    elif sample_gt_left_list[i] == 99:
                        left = imputed_md_id
                    else:
                        left = new_order_list[sample_gt_left_list[i] - 1] if sample_gt_left_list[i] > 0 else 0
                    if sample_gt_right_list[i] == -1:
                        right = xlib.get_md_symbol()
                    elif sample_gt_right_list[i] == 99:
                        right = imputed_md_id
                    else:
                        right = new_order_list[sample_gt_right_list[i] - 1] if sample_gt_right_list[i] > 0 else 0
                    sample_gt_left_list[i] = left
                    sample_gt_right_list[i] = right
                    sample_gt_list[i] = f'{sample_gt_left_list[i]}{sample_sep_list[i]}{sample_gt_right_list[i]}'

                # rebuild the alternative alleles and its corresponding record data
                alternative_alleles = ','.join(alternative_allele_list)

                # rebuild the sample genotype data list and their corresponding record data
                # (the data of each sample are split again so that every sample keeps its own
                # non-GT subfields)
                sample_list = []
                for i in range(sample_number):
                    sample_data_list = data_dict['sample_list'][i].split(':')
                    sample_data_list[gt_position] = sample_gt_list[i]
                    sample_list.append(':'.join(sample_data_list))

                if trace_variant:
                    xlib.Message.print('trace', f'(15) reference_bases: {reference_bases}')
                    xlib.Message.print('trace', f'(16) alternative_allele_list: {alternative_allele_list}')
                    xlib.Message.print('trace', f'(17) sample_gt_list: {sample_gt_list}')

                # check the allele frecuencies when the variant is not a indel
                allele_frequency_OK = True
                if not is_indel:

                    # get the allele counters per species (only adults without missing data in the right allele)
                    allele_counter_dict_1 = {}
                    allele_counter_dict_2 = {}
                    allele_counter_dict_h = {}
                    for i in range(sample_number):
                        if mother_id_list[i] == 'NONE' and sample_gt_right_list[i] != xlib.get_md_symbol():
                            if species_id_list[i] == sp1_id:
                                allele_counter_dict = allele_counter_dict_1
                            elif species_id_list[i] == sp2_id:
                                allele_counter_dict = allele_counter_dict_2
                            else:
                                allele_counter_dict = allele_counter_dict_h
                            allele_counter_dict[sample_gt_left_list[i]] = allele_counter_dict.get(sample_gt_left_list[i], 0) + 1
                            allele_counter_dict[sample_gt_right_list[i]] = allele_counter_dict.get(sample_gt_right_list[i], 0) + 1
                    if trace_variant:
                        xlib.Message.print('trace', f'(18) allele_counter_dict_1: {allele_counter_dict_1}')
                        xlib.Message.print('trace', f'(19) allele_counter_dict_2: {allele_counter_dict_2}')
                        xlib.Message.print('trace', f'(20) allele_counter_dict_h: {allele_counter_dict_h}')

                    # check the allele frecuencies per species: more than two alleles (not counting
                    # the imputed missing data allele) means a multiallelic variant
                    max_allele_number_1 = 3 if imputed_md_id in allele_counter_dict_1 else 2
                    max_allele_number_2 = 3 if imputed_md_id in allele_counter_dict_2 else 2
                    if len(allele_counter_dict_1) > max_allele_number_1 or len(allele_counter_dict_2) > max_allele_number_2:
                        allele_frequency_OK = False
                        if trace_variant:
                            xlib.Message.print('trace', '(21) multiallelic variant.')
                    else:
                        sp1_allele_total = sum(allele_counter_dict_1.values())
                        for allele, allele_counter in allele_counter_dict_1.items():
                            allele_frequency = allele_counter / sp1_allele_total * 100
                            if allele_frequency < min_afr_percentage:
                                allele_frequency_OK = False
                                if trace_variant:
                                    xlib.Message.print('trace', f'(20) allele {allele} in species 1 has a frequency {allele_frequency:5.2f}% less than maf')
                        sp2_allele_total = sum(allele_counter_dict_2.values())
                        for allele, allele_counter in allele_counter_dict_2.items():
                            allele_frequency = allele_counter / sp2_allele_total * 100
                            if allele_frequency < min_afr_percentage:
                                allele_frequency_OK = False
                                if trace_variant:
                                    xlib.Message.print('trace', f'(21) allele {allele} in species 2 has a frequency {allele_frequency:5.2f}% less than maf')

                # check if there are imputation in adult individuals when the scenario is 0 (no imputation)
                scenario0_are_there_imputations = scenario == '0' and any(
                    mother_id_list[i] == 'NONE' and (sample_gt_left_list[i] == imputed_md_id or sample_gt_right_list[i] == imputed_md_id)
                    for i in range(sample_number)
                )

                if trace_variant:
                    xlib.Message.print('trace', f'(22) dp: {dp} - md_percentage_1: {md_percentage_1:5.2f}% - md_percentage_2: {md_percentage_2:5.2f}% - allele_frequency_OK: {allele_frequency_OK}')

                # if DP is less than the minimum combined depth or all adults are monomorphic or the missing data
                # percentage is greater than the maximum in both species or allele frequency is not OK or there
                # are imputations in the scenario 0
                if dp < min_depth or monomorphic or (md_percentage_1 > sp1_max_md_percentage and md_percentage_2 > sp2_max_md_percentage) or not allele_frequency_OK or scenario0_are_there_imputations:

                    # add 1 to the filtered variant counter
                    filtered_variant_counter += 1

                    if trace_variant:
                        xlib.Message.print('trace', '(23) This variant is deleted!!!')

                # in any other case
                else:

                    # add the sequence identification to the non filtered sequence identifications
                    non_filtered_seq_id_set.add(data_dict['chrom'])

                    # write the variant record
                    sample_list_text = '\t'.join(sample_list)
                    temporal_vcf_file_id.write(f'{data_dict["chrom"]}\t{data_dict["pos"]}\t{data_dict["id"]}\t{reference_bases}\t{alternative_alleles}\t{data_dict["qual"]}\t{data_dict["filter"]}\t{data_dict["info"]}\t{data_dict["format"]}\t{sample_list_text}\n')

                # print the counters
                print_counters()

                # read the next record of the input VCF file
                (record, _, data_dict) = xlib.read_vcf_file(input_vcf_file_id, sample_number)

        xlib.Message.print('verbose', '\n')

    # print OK message
    xlib.Message.print('info', f'The temporal file {os.path.basename(temporal_vcf_file)} containing the filtered variants is created.')

    xlib.Message.print('info', 'Removing metadata of filtered variants ...')

    # open the temporal VCF file
    try:
        temporal_vcf_file_id = open(temporal_vcf_file, mode='r', encoding='iso-8859-1')
    except Exception as e:
        raise xlib.ProgramException(e, 'F001', temporal_vcf_file)

    # open the output VCF file (the temporal file is closed if this open fails)
    if output_vcf_file.endswith('.gz'):
        try:
            output_vcf_file_id = gzip.open(output_vcf_file, mode='wt', encoding='iso-8859-1', newline='\n')
        except Exception as e:
            temporal_vcf_file_id.close()
            raise xlib.ProgramException(e, 'F004', output_vcf_file)
    else:
        try:
            output_vcf_file_id = open(output_vcf_file, mode='w', encoding='iso-8859-1', newline='\n')
        except Exception as e:
            temporal_vcf_file_id.close()
            raise xlib.ProgramException(e, 'F003', output_vcf_file)

    # copy the temporal VCF file to the output VCF file
    with temporal_vcf_file_id, output_vcf_file_id:
        for record in temporal_vcf_file_id:

            # process contig records
            if record.startswith('##contig'):

                # get the sequence identification; it starts after "##contig=<ID=" (13 characters)
                # and ends at the next attribute (",") or at the end of the record data (">")
                seq_id = ''
                i1 = 13
                i2 = record.find(',', i1)
                if i2 == -1:
                    i2 = record.find('>', i1)
                if i2 > -1:
                    seq_id = record[i1:i2]

                # write the record when the sequence identification was not filtered
                if seq_id in non_filtered_seq_id_set:
                    output_vcf_file_id.write(record)

            # process other records
            else:
                output_vcf_file_id.write(record)

    # print OK message
    xlib.Message.print('info', f'The file {os.path.basename(output_vcf_file)} containing the filtered variants is created.')

    # delete temporal VCF file
    os.remove(temporal_vcf_file)
    xlib.Message.print('info', f'The temporal VCF file {os.path.basename(temporal_vcf_file)} is deleted.')