for file in file_list: base_file_name = os.path.splitext(file)[0] annotation_input_file = base_file_name + '_HGVS.temp' output_file = base_file_name + '.vcf' # Clean LOVD data for VCF lovd_file = pd.read_csv(file, sep=COLUMN_DELIMITER) lovd_file = vcf.remove_malformed_fields(lovd_file) # Output VCF variants for annotation column_list = [ 'dna_change', 'protein_change', 'var_pub_as', 'rna_change', 'db_id', 'variant_remarks', 'reference', 'frequency' ] vcf_format = vcf.convert_to_vcf_format(lovd_file[column_list], rm, 'dna_change', 'LOVD') vcf_column_order = [ 'CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO' ] with open(annotation_input_file, 'w') as f: vcf_header = [ '##fileformat=VCFv4.0', vcf.get_vcf_info_header(lovd_file[column_list], 'LOVD', 'Data from LOVD'), '#' + '\t'.join(vcf_column_order) ] f.write('\n'.join(vcf_header) + '\n') vcf_format[vcf_column_order].to_csv(f, sep=COLUMN_DELIMITER,
# One remapper instance is created up front and handed to every
# convert_to_vcf_format call in the loop below.
rm = VariantRemapper()

for file in file_list:
    # Both generated paths reuse the input path with its extension removed.
    base_file_name = os.path.splitext(file)[0]
    annotation_input_file = base_file_name + '_HGVS.temp'
    output_file = base_file_name + '.vcf'

    # Clean LOVD data for VCF
    lovd_file = vcf.remove_malformed_fields(
        pd.read_csv(file, sep=COLUMN_DELIMITER)
    )

    # Output VCF variants for annotation
    column_list = [
        'dna_change',
        'protein_change',
        'var_pub_as',
        'rna_change',
        'db_id',
        'variant_remarks',
        'reference',
        'frequency',
    ]
    vcf_format = vcf.convert_to_vcf_format(
        lovd_file[column_list],
        rm,
        'dna_change',
        'LOVD',
    )
    vcf_column_order = [
        'CHROM',
        'POS',
        'ID',
        'REF',
        'ALT',
        'QUAL',
        'FILTER',
        'INFO',
    ]

    with open(annotation_input_file, 'w') as f:
        # Header: file-format line, the INFO header produced by the vcf
        # helper, then the '#'-prefixed, tab-joined column names.
        vcf_header = [
            '##fileformat=VCFv4.0',
            vcf.get_vcf_info_header(
                lovd_file[column_list],
                'LOVD',
                'Data from LOVD',
            ),
            '#' + '\t'.join(vcf_column_order),
        ]
        # The trailing empty element makes the join end with a newline.
        f.write('\n'.join(vcf_header + ['']))

        # Rows go to the same handle, directly below the header, without
        # pandas' own header row or index column.
        # NOTE(review): rows are separated by COLUMN_DELIMITER while the
        # header line above is joined with '\t' -- confirm they agree.
        vcf_format[vcf_column_order].to_csv(
            f,
            sep=COLUMN_DELIMITER,
            header=False,
            index=False,
        )

    # Annotate with VEP
    annotate_vcf.annotate_vep(annotation_input_file, output_file)

    # Combine VEP VCF output with original LOVD data