def mutations_to_vcf_file(mutations, out_variant):
    """Write SNV mutation strings (e.g. 'a123t') as a VCF-style table and annotate with snpEff.

    Each mutation string is <ref-base><1-based position><alt-base>; the table is
    written tab-separated to *out_variant*, then snpEff output goes to
    *out_variant* + '.snpeff'.
    """
    rows = [
        [
            "NC_045512.2",        # CHROM: SARS-CoV-2 reference accession
            mut[1:-1],            # POS: the digits between ref and alt letters
            ".",                  # ID
            mut[0].upper(),       # REF base
            mut[-1].upper(),      # ALT base
            "100.0",              # QUAL
            "PASS",               # FILTER
            "INFO",
        ]
        for mut in mutations
    ]
    vcf_df = pd.DataFrame(rows)
    vcf_df[1] = vcf_df[1].astype(int)          # POS must sort numerically
    vcf_df = vcf_df.drop_duplicates().sort_values(1)
    vcf_df.to_csv(out_variant, sep='\t', header=None, index=False)

    # Refresh the snpEff annotation: drop any stale file, then regenerate.
    bashCommunicator("rm -f %s" % (out_variant + ".snpeff"))
    print("Delete old annotation file")
    annotate_cmd = "java -Xmx4g -jar /storage/apps/opt/snpEff/snpEff_covid/snpEff.jar -v NC_045512.2 %s > %s " % (
        out_variant, out_variant + ".snpeff")
    bashCommunicator(annotate_cmd)
    print("Generating new annotation file")
def add_deletions_to_vcf(df, out_variant):
    """Write deletions as left-anchored VCF-style records and annotate with snpEff.

    Deletions are collected from *df* via the project helpers; each record uses
    the base preceding the deleted run as the anchor (standard VCF deletion
    encoding), with the strain name placed in the FILTER column.
    """
    deletions_by_strain = deletions_for_vcf(collect_deletions(df))
    reference_file = "/storage/scratch/covid/container/reference/MN908947.fasta"
    reference = SeqIO.read(reference_file, "fasta")

    rows = []
    for entry in deletions_by_strain:
        strain = entry[0]
        del_dict = entry[1]
        for coordinate, deleted_nt in del_dict.items():
            position = coordinate - 1          # shift to the anchor base
            anchor = reference[position - 1]   # base before the deleted run
            rows.append([
                "NC_045512.2",
                position,
                ".",
                anchor + deleted_nt,  # REF: anchor + deleted bases
                anchor,               # ALT: anchor alone
                "100.0",
                strain,
                "INFO",
            ])

    vcf_df = pd.DataFrame(rows)
    vcf_df[1] = vcf_df[1].astype(int)
    vcf_df = vcf_df.drop_duplicates().sort_values(1)
    vcf_df.to_csv(out_variant, sep='\t', header=None, index=False)

    # Refresh the snpEff annotation for the deletion records.
    bashCommunicator("rm -f %s" % (out_variant + ".snpeff"))
    print("Delete old Deletion annotation file")
    annotate_cmd = "java -Xmx4g -jar /storage/apps/opt/snpEff/snpEff_covid/snpEff.jar -v NC_045512.2 %s > %s " % (
        out_variant, out_variant + ".snpeff")
    bashCommunicator(annotate_cmd)
    print("Generating new Deletion annotation file")
def generate_mcov_db(ref_dir):
    """Join order, patient and sample-log extracts into the curated MCoV workbook.

    Downloads/copies the three inputs into *ref_dir*, left-joins patient ZIP
    onto orders by MRN, then left-joins onto the sample log by ORDER_ID
    (keeping the merge indicator), and writes the result to Excel.
    """
    orders_csv = "orders.csv"
    patients_csv = "patients.csv"
    sample_log = "CoVID_Sample_Log.xlsx"
    out_file = "4_Curated_MCOV_MRN_Strains.xlsx"

    # NOTE(review): wget is invoked with no URL — presumably redacted; confirm
    # the real download locations before relying on this step.
    bashCommunicator("wget -P " + ref_dir + " ")
    df_orders = pd.read_csv(ref_dir + orders_csv,
                            dtype={'MRN': str, 'ORDER_ID': str},
                            low_memory=False)
    df_orders.drop_duplicates("ORDER_ID", inplace=True)
    df_orders = df_orders[[
        'MRN', 'ORDER_ID', 'COLLECTION_DT', 'ORDERING_CLINIC_ID',
        'ORDERING_CLINIC_NAME', 'FACILITY', 'ADMISSION_DT', 'DISCHARGE_DT',
        'HIS_PATIENT_TYPE'
    ]]

    bashCommunicator("wget -P " + ref_dir + " ")
    df_patients = pd.read_csv(ref_dir + patients_csv,
                              dtype={'MRN': str, 'ORDER.ORDER_ID': str},
                              low_memory=False)
    df_patients = df_patients[['MRN', 'ZIP', 'ORDER.ORDER_ID']]
    df_patients.rename(columns={'ORDER.ORDER_ID': 'ORDER_ID'}, inplace=True)
    df_patients.drop_duplicates("ORDER_ID", inplace=True)
    # Normalize ZIP to its 5-digit prefix.
    df_patients['ZIP'] = [str(z)[:5] for z in df_patients['ZIP']]

    bashCommunicator("cp " + params["container"] + "temp/" + sample_log + " " +
                     ref_dir)
    df_log = pd.read_excel(ref_dir + sample_log)
    df_log = df_log[['Musser Lab No.', 'Full Order Number.2']]
    df_log.columns = ['MCoVNumber', 'ORDER_ID']

    merged = pd.merge(df_orders, df_patients[['MRN', 'ZIP']],
                      on='MRN', how='left')
    merged = pd.merge(df_log, merged, on='ORDER_ID', how='left', indicator=True)
    merged.rename(columns={'_merge': 'sequence_order_patient_data_match'},
                  inplace=True)
    merged.to_excel(ref_dir + out_file, index=False)
def generate_batch_alignment(REFERENCE, file_dir, out_dir):
    """Merge FASTA files in chunks, prepend the reference, and align with mafft.

    Splits all '*.fasta' files in *file_dir* into chunks of 100, concatenates
    each chunk into '<n>_merged.fasta' in *out_dir*, prepends
    '<REFERENCE>.fasta', runs mafft on the combined file, and returns the list
    of aligned FASTA file names (relative to *out_dir*).
    """
    CHUNK_SIZE = 100
    all_fasta_files = [p.name for p in Path(file_dir).glob('*.fasta')]
    df_all_strains = pd.DataFrame(all_fasta_files)

    all_aligned_fasta_files = []
    counter = 0
    for ii in chunked(df_all_strains.index, CHUNK_SIZE):
        counter += 1
        merged = out_dir + str(counter) + "_merged.fasta"
        # BUGFIX: the per-file 'cat >>' below appends, so a stale merged file
        # left over from a previous run would accumulate duplicate sequences.
        # Start each chunk from a clean file.
        bashCommunicator("rm -f %s" % merged)
        for fasta_file in df_all_strains.iloc[ii].values:
            bashCommunicator("cat %s >> %s " % (file_dir + fasta_file[0], merged))
        combined = out_dir + str(counter) + "_MN908947_merged.fasta"
        bashCommunicator("cat %s %s > %s " %
                         (out_dir + REFERENCE + ".fasta", merged, combined))
        aligned = str(counter) + "_MN908947_merged.mafft_algn.fasta"
        cmd = " mafft --thread 24 --reorder --retree 1 %s > %s " % (
            combined, out_dir + aligned)
        print("running mafft aligner ..." + str(counter))
        bashCommunicator(cmd)
        all_aligned_fasta_files.append(aligned)
    return all_aligned_fasta_files
] dfjoin.groupby( ["run_group_analysis", "run_id_seq"]).agg('count')["MCoVNumber"].reset_index().sort_values( "run_id_seq").to_excel(ref_dir + "run_samplesheet_match_.xlsx", index=False) dfjoin.sort_values("run_id", inplace=True) dfjoin.drop_duplicates("MCoVNumber", keep="last", inplace=True) dfjoin.to_csv(ref_dir + "5_strains_run_group_analysis_table_with_samplesheet.csv", index=False) params = read_run_params() run = params["current_run"] bashCommunicator("mkdir -p " + params["container"] + "output/" + run + "/database/") bashCommunicator("cp " + params["container"] + "temp/" + run + ".wuhan1.fa " + params["container"] + "input_alignment/ ") bashCommunicator("cp " + params["container"] + "temp/" + run + "_set1_SampleSheet.csv " + params["container"] + "input_samplesheet/ ") ref_dir = params["container"] + "output/" + run + "/database/" generate_mcov_db(ref_dir) generate_strain_ids(params, ref_dir) generate_rungroup_from_samplesheet(ref_dir)
import pandas as pd

from gen_utils.gen_io import read_run_params, log_msg, bashCommunicator

# Aggregate per-run spike-variant calls ('.var' files) into one Excel workbook
# covering every run except the legacy ones with a different file format.
params = read_run_params()
home_dir = params["container"] + "variant_analysis/"
runs = params["runs"]
current_run = params["current_run"]

# Stage the current run's .var file alongside the historical ones.
bashCommunicator("cp " + params["container"] + "temp/" + current_run + ".var " +
                 home_dir + "runs/ ")

SKIP_RUNS = {"run_old", "run0_10", "run11_15"}  # legacy runs: incompatible format
frames = []
for run in runs:
    if run in SKIP_RUNS:
        continue
    print("processing..." + run)
    df_cur = pd.read_csv(home_dir + "runs/" + run + ".var",
                         header=None, sep='\t')
    df_cur.columns = ['strain', 'num1', 'num2', 'spike-var']
    df_cur = df_cur[['strain', 'spike-var']]
    df_cur["run"] = run
    frames.append(df_cur)

# BUGFIX: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# collecting the frames and concatenating once is also O(n) rather than the
# O(n^2) of repeated append.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
df.to_excel(home_dir + "final_result_with_var_assignment_" + current_run +
            ".xlsx", index=False)