Example 1
import pandas as pd

from gen_utils.gen_io import bashCommunicator


def mutations_to_vcf_file(mutations, out_variant):
    """Write point mutations as a sorted, deduplicated VCF-style file and annotate it with snpEff."""
    vcf_file = []
    for variant in mutations:
        # Each mutation string encodes REF + 1-based position + ALT, e.g. "a123t".
        # Columns: CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO.
        vcf_file.append([
            "NC_045512.2", variant[1:-1], ".",
            variant[0].upper(), variant[-1].upper(), "100.0",
            "PASS", "INFO"
        ])

    df_vcf_file = pd.DataFrame(vcf_file)
    df_vcf_file[1] = df_vcf_file[1].astype(int)  # POS column must sort numerically
    df_vcf_file = df_vcf_file.drop_duplicates()
    df_vcf_file = df_vcf_file.sort_values(1)
    df_vcf_file.to_csv(out_variant, sep='\t', header=False, index=False)

    cmd = "rm -f %s" % (out_variant + ".snpeff")
    bashCommunicator(cmd)
    print("Delete old annotation file")
    cmd = "java -Xmx4g -jar /storage/apps/opt/snpEff/snpEff_covid/snpEff.jar -v   NC_045512.2  %s > %s " % (
        out_variant, out_variant + ".snpeff")
    bashCommunicator(cmd)
    print("Generating new annotation file")
Example 2
import pandas as pd
from Bio import SeqIO

from gen_utils.gen_io import bashCommunicator


def add_deletions_to_vcf(df, out_variant):
    """Encode per-strain deletions as VCF records and annotate them with snpEff."""
    del_list = collect_deletions(df)

    # Each entry pairs a strain name with its {coordinate: deleted bases} map.
    del_list_withstrain = deletions_for_vcf(del_list)

    # Deletions are anchored against the Wuhan-Hu-1 (MN908947) reference.
    reference_file = "/storage/scratch/covid/container/reference/MN908947.fasta"
    reference = SeqIO.read(reference_file, "fasta")

    vcf_file = []

    for del_entry in del_list_withstrain:
        strain = del_entry[0]
        del_dict = del_entry[1]
        for coordinate in del_dict:
            deleted_nt = del_dict[coordinate]
            # VCF reports a deletion on the base just before it: POS is the
            # anchor base, REF = anchor + deleted bases, ALT = anchor alone.
            position = coordinate - 1
            ref = reference[position - 1] + deleted_nt
            alt = reference[position - 1]
            vcf_file.append([
                "NC_045512.2", position, ".", ref, alt, "100.0", strain, "INFO"
            ])

    df_vcf_file = pd.DataFrame(vcf_file)
    df_vcf_file[1] = df_vcf_file[1].astype(int)  # POS column must sort numerically
    df_vcf_file = df_vcf_file.drop_duplicates()
    df_vcf_file = df_vcf_file.sort_values(1)
    df_vcf_file.to_csv(out_variant, sep='\t', header=False, index=False)

    cmd = "rm -f %s" % (out_variant + ".snpeff")
    bashCommunicator(cmd)
    print("Delete old Deletion annotation file")
    cmd = "java -Xmx4g -jar /storage/apps/opt/snpEff/snpEff_covid/snpEff.jar -v   NC_045512.2  %s > %s " % (
        out_variant, out_variant + ".snpeff")
    bashCommunicator(cmd)
    print("Generating new Deletion annotation file")
Example 3
import pandas as pd

from gen_utils.gen_io import bashCommunicator


def generate_mcov_db(ref_dir):
    """Join order, patient, and sample-log records into one curated MRN/strain table."""
    orders = "orders.csv"
    patients = "patients.csv"
    sample_log = "CoVID_Sample_Log.xlsx"
    out_file = "4_Curated_MCOV_MRN_Strains.xlsx"
    #######################
    # Download the orders extract into ref_dir (the URL is elided in the source).
    bashCommunicator("wget -P " + ref_dir + " ")
    # MRN and ORDER_ID are read as strings to preserve leading zeros.
    df_orders = pd.read_csv(ref_dir + orders,
                            dtype={
                                'MRN': str,
                                'ORDER_ID': str
                            },
                            low_memory=False)
    df_orders.drop_duplicates("ORDER_ID", inplace=True)
    df_orders = df_orders[[
        'MRN', 'ORDER_ID', 'COLLECTION_DT', 'ORDERING_CLINIC_ID',
        'ORDERING_CLINIC_NAME', 'FACILITY', 'ADMISSION_DT', 'DISCHARGE_DT',
        'HIS_PATIENT_TYPE'
    ]]

    bashCommunicator("wget -P " + ref_dir + " ")
    df_patients = pd.read_csv(ref_dir + patients,
                              dtype={
                                  'MRN': str,
                                  'ORDER.ORDER_ID': str
                              },
                              low_memory=False)
    df_patients = df_patients[['MRN', 'ZIP', 'ORDER.ORDER_ID']]
    df_patients.rename(columns={'ORDER.ORDER_ID': 'ORDER_ID'}, inplace=True)
    df_patients.drop_duplicates("ORDER_ID", inplace=True)
    df_patients['ZIP'] = [str(x)[0:5] for x in df_patients['ZIP']]

    bashCommunicator("cp " + params["container"] + "temp/" + sample_log + " " +
                     ref_dir)
    df = pd.read_excel(ref_dir + sample_log)
    df = df[['Musser Lab No.', 'Full Order Number.2']]
    df.columns = ['MCoVNumber', 'ORDER_ID']

    # Left-join patient ZIPs onto orders, then join the sample log onto that;
    # the merge indicator records whether each sequenced sample found a match.
    dfjoin = pd.merge(df_orders,
                      df_patients[['MRN', 'ZIP']],
                      on='MRN',
                      how='left')
    dfjoin = pd.merge(df, dfjoin, on='ORDER_ID', how='left', indicator=True)
    dfjoin.rename(columns={'_merge': 'sequence_order_patient_data_match'},
                  inplace=True)
    dfjoin.to_excel(ref_dir + out_file, index=False)
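The audit column relies on pandas' merge indicator: with indicator=True, pd.merge appends a categorical _merge column whose values are both, left_only, or right_only. A minimal illustration with hypothetical frames:

import pandas as pd

samples = pd.DataFrame({"ORDER_ID": ["1", "2"]})
orders = pd.DataFrame({"ORDER_ID": ["1"], "MRN": ["A"]})
merged = pd.merge(samples, orders, on="ORDER_ID", how="left", indicator=True)
print(merged["_merge"].tolist())  # ['both', 'left_only']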
Example 4
from pathlib import Path

import pandas as pd
from more_itertools import chunked  # assumed source of chunked()

from gen_utils.gen_io import bashCommunicator


def generate_batch_alignment(REFERENCE, file_dir, out_dir):
    """Merge FASTA files in chunks, prepend the reference genome, and align each chunk with MAFFT."""
    all_aligned_fasta_files = []
    all_fasta_files = []
    for fasta_file in Path(file_dir).glob('*.fasta'):
        all_fasta_files.append(fasta_file.name)

    df_all_strains = pd.DataFrame(all_fasta_files)

    # Cap each MAFFT run at CHUNK_SIZE genomes to bound memory and runtime.
    CHUNK_SIZE = 100
    index_chunks = chunked(df_all_strains.index, CHUNK_SIZE)

    counter = 0
    for ii in index_chunks:
        counter += 1
        # Concatenate this chunk's FASTA files into one merged file.
        for fasta_file in df_all_strains.iloc[ii].values:
            cmd = "cat %s >> %s" % (file_dir + fasta_file[0],
                                    out_dir + str(counter) + "_merged.fasta")
            bashCommunicator(cmd)

        # Prepend the reference so every chunk aligns against MN908947.
        cmd = "cat %s %s > %s" % (out_dir + REFERENCE + ".fasta", out_dir +
                                  str(counter) + "_merged.fasta", out_dir +
                                  str(counter) + "_MN908947_merged.fasta")

        bashCommunicator(cmd)

        cmd = " mafft --thread 24 --reorder --retree 1  %s > %s " % (
            out_dir + str(counter) + "_MN908947_merged.fasta",
            out_dir + str(counter) + "_MN908947_merged.mafft_algn.fasta")

        print("running mafft aligner ..." + str(counter))
        bashCommunicator(cmd)

        all_aligned_fasta_files.append(
            str(counter) + "_MN908947_merged.mafft_algn.fasta")

    return all_aligned_fasta_files
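A hypothetical invocation; note that file_dir and out_dir must end with a slash, since the function joins paths by plain string concatenation, and the reference FASTA must already exist in out_dir as REFERENCE + ".fasta":

aligned = generate_batch_alignment("MN908947",
                                   "/data/consensus_fasta/",  # hypothetical input dir
                                   "/data/alignments/")       # hypothetical output dir
print(aligned)  # ['1_MN908947_merged.mafft_algn.fasta', '2_MN908947_merged.mafft_algn.fasta', ...]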
Example 5
    # (excerpt begins mid-statement; the opening of this list is not shown)
    ]

    # Audit: count samples per (run_group_analysis, run_id_seq) pair.
    dfjoin.groupby(
        ["run_group_analysis",
         "run_id_seq"]).agg('count')["MCoVNumber"].reset_index().sort_values(
             "run_id_seq").to_excel(ref_dir + "run_samplesheet_match_.xlsx",
                                    index=False)

    dfjoin.sort_values("run_id", inplace=True)

    # Keep only the most recent run's record for each strain.
    dfjoin.drop_duplicates("MCoVNumber", keep="last", inplace=True)

    dfjoin.to_csv(ref_dir +
                  "5_strains_run_group_analysis_table_with_samplesheet.csv",
                  index=False)


# Module-level driver: stage this run's inputs, then build the database tables.
params = read_run_params()
run = params["current_run"]
bashCommunicator("mkdir -p " + params["container"] + "output/" + run +
                 "/database/")
bashCommunicator("cp " + params["container"] + "temp/" + run + ".wuhan1.fa " +
                 params["container"] + "input_alignment/ ")
bashCommunicator("cp " + params["container"] + "temp/" + run +
                 "_set1_SampleSheet.csv " + params["container"] +
                 "input_samplesheet/ ")
ref_dir = params["container"] + "output/" + run + "/database/"

generate_mcov_db(ref_dir)
generate_strain_ids(params, ref_dir)
generate_rungroup_from_samplesheet(ref_dir)
# A second script in this example aggregates spike-variant calls across runs.
import pandas as pd

from gen_utils.gen_io import read_run_params, log_msg, bashCommunicator

params = read_run_params()
home_dir = params["container"] + "variant_analysis/"
runs = params["runs"]
current_run = params["current_run"]

bashCommunicator("cp " + params["container"] + "temp/" + current_run +
                 ".var " + home_dir + "runs/ ")

# Collect per-run variant tables, skipping legacy run groups, then
# concatenate once (DataFrame.append was removed in pandas 2.0).
frames = []
for run in runs:
    if run in ["run_old", "run0_10", "run11_15"]:
        continue
    print("processing..." + run)
    df_cur = pd.read_csv(home_dir + "runs/" + run + ".var",
                         header=None,
                         sep='\t')
    df_cur.columns = ['strain', 'num1', 'num2', 'spike-var']
    df_cur = df_cur[['strain', 'spike-var']]
    df_cur["run"] = run
    frames.append(df_cur)

df = pd.concat(frames, ignore_index=True)

df.to_excel(home_dir + "final_result_with_var_assignment_" + current_run +
            ".xlsx",
            index=False)
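read_run_params (and log_msg) come from gen_utils.gen_io and are not shown in these excerpts. A minimal stand-in, assuming the run parameters live in a JSON file with at least the keys used above (container, current_run, runs); the file name is hypothetical:

import json


def read_run_params(path="run_params.json"):
    # Hypothetical sketch of gen_utils.gen_io.read_run_params: load the
    # pipeline configuration consumed by the scripts above.
    with open(path) as fh:
        return json.load(fh)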