def merge_paired_files(input_dir, out_dir): """ :param input_dir: tmp directory path of the cirseq pipeline analysis :param out_dir: the output directory path :return: merged files """ # print(input_dir) files1 = glob.glob(input_dir + "/*_*_*_R1_001.fastq") print(files1) lst_files = [] for fastq1 in files1: filename = os.path.basename(fastq1) print(filename) #NextSeq # lane = filename.split("_")[1] sample = filename.split("_")[0] + "_" + filename.split( "_")[1] + "_" + filename.split("_")[2] print(sample) fastq2 = ("%s/%s_R2_001.fastq") % (input_dir, sample) if not os.path.exists(out_dir): out_dir = os.system("mkdir %s" % out_dir) output_file = "%s/%s_merged.fastq" % (out_dir, sample) pbs_runners.script_runner( "python /sternadi/home/volume3/okushnir/SternLab/scripts/merge_fastq_files.py -f %s -e %s -o %s -r 60" % (fastq1, fastq2, output_file), alias="merge_RV")
def run_pipeline_vs_founder(pipeline): for sample in pipeline: freqs_filename = sample.split('_')[0] + '.freqs' output_filename = sample + '.fasta' script_runner( 'python /sternadi/home/volume3/omer/SternLab/NGS_analysis/make_reference_from_consensus.py ' '-f sternadi/home/volume1/shared/data/ref_genomes/HXB2.fasta ' '-p /sternadi/home/volume1/shared/analysis/HIV_ravi_gupta/run2/$sample/$freqs_filename ' '-o /sternadi/home/volume1/shared/analysis/HIV_ravi_gupta/refs/$output_filename ' '-i $PBS_ARRAY_INDEX', jnum=76, alias='make_ref_from_con_{}'.format(sample), load_python=True)
def index(csv_file, fastq_path, output_dir): indexing = pd.read_csv(csv_file) # your_csv_dir sample_id = list(indexing.SampleID) index_1 = list(indexing.Index1Sequence) index_2 = list(indexing.Index2Sequence) index_dic = {} for i in range(len(sample_id)): index_dic[index_1[i] + "+" + index_2[i]] = sample_id[i] files = glob.glob(fastq_path) for f in files: for key in index_dic.keys(): output = index_dic[key] + f.split("/")[-1].split(".fastq")[0].split("S")[1] + ".fastq" output_file = output_dir + output pbs_runners.script_runner("grep '%s' %s -A 3 > %s" % (key, f, output_file), alias="OST_Sample")
def main(): # parser = OptionParser("usage: %prog [options]") # parser.add_option("-i", "--input_file", dest="input_file", help="fastq file") # parser.add_option("-o", "--output_dir", dest="output_dir", help="output dir") # (options, args) = parser.parse_args() # file = options.input_file # output_dir = options.output_dir # 1st thing to do is to index the output files from the Nextseq # csv_file = "/Volumes/STERNADILABHOME$/volume3/okushnir/RNAseq/180725_M04473_0026_000000000-BT9GF/index.csv" # fastq_path = "/Volumes/STERNADILABHOME$/volume3/okushnir/RNAseq/180725_M04473_0026_000000000-BT9GF/" # output_dir = "/Volumes/STERNADILABHOME$/volume3/okushnir/RNAseq/180725_M04473_0026_000000000-BT9GF/indexed" # index(csv_file, fastq_path, output_dir) # 2nd is to clean the files from -- # trim(file, output_dir) # input_dir = ("/sternadi/datasets/volume1/180503_OST_FINAL_03052018/indexed/") # output_dir = ("/sternadi/home/volume3/okushnir/AccuNGS/180503_OST_FINAL_03052018/sheri_clean/") # files = glob.glob(input_dir + "*.fastq") # for f in files: # trim(f, output_dir) # 3rd merge the files # one by one approach # sample = "CVB3_p2_L001-ds.7381dd36e2ce42768db0cc15d5c79a31" # input_dir = "/sternadi/home/volume3/okushnir/AccuNGS/190627_RV_CV/clean/" + sample # merge_output_dir = "/sternadi/home/volume3/okushnir/AccuNGS/190627_RV_CV/merged/" # merge_paired_files(input_dir, merge_output_dir) # automatic approach # input_dir = "/sternadi/home/volume3/okushnir/AccuNGS/190807_RV_p2_p10/clean/" # merge_input_dir = glob.glob(input_dir+"*") # for dir in merge_input_dir: # target = dir.split("/")[-1] # # print(target) # merge_output_dir = "/sternadi/home/volume3/okushnir/AccuNGS/190807_RV_p2_p10/merged/" + target # print(merge_output_dir) # try: # os.mkdir(merge_output_dir) # except OSError: # print("Creation of the directory %s failed" % merge_output_dir) # else: # print("Successfully created the directory %s " % merge_output_dir) # print("merging...") # merge_paired_files(dir, merge_output_dir) # # # # 4th run pipeline: input_dir_OPV_33 = "/sternadi/datasets/volume1/sabin2/33mixedMOI" ref_opv_33 = "/sternadi/datasets/volume1/sabin2/OPV2_reference_AY184220.fasta" folders = glob.glob(input_dir_OPV_33 + "/*") for passage in folders: input_dir = passage passage = passage.split("/")[-1] output_dir = "/sternadi/home/volume3/okushnir/Cirseq/PV/OPV/%s" % passage try: os.mkdir(output_dir) except OSError: print("Creation of the directory %s failed" % output_dir) else: print("Successfully created the directory %s " % output_dir) output_dir = "/sternadi/home/volume3/okushnir/Cirseq/PV/OPV/%s/q30_3UTR" % passage try: os.mkdir(output_dir) except OSError: print("Creation of the directory %s failed" % output_dir) else: print("Successfully created the directory %s " % output_dir) cmd = "python /sternadi/home/volume1/shared/SternLab/pipeline_runner.py -i %s -o %s -r %s -NGS_or_Cirseq 2 -rep 2 -q 30 -t z" % ( input_dir, output_dir, ref_opv_33) pbs_runners.script_runner(cmd, alias="pipeline_OPV")
""" @Author: odedkushnir """ import os import pbs_runners srr = "SRR349755" ref = "JN562723.SRR349755.fasta" folder = "/sternadi/home/volume3/okushnir/SRP/%s/fastq" % (srr) # for d in folders: output_dir = "/sternadi/home/volume3/okushnir/SRP/%s/q30_consensus_1e-03" % ( srr) if not os.path.exists(output_dir): os.mkdir(output_dir) cmd = "python /sternadi/home/volume1/shared/SternLab/pipeline_runner.py -i %s -o %s -r /sternadi/home/volume3/" \ "okushnir/SRP/%s -NGS_or_Cirseq 1 -rep 1 -q 30 -ev 0.001" % (folder, output_dir, ref) pbs_runners.script_runner(cmd, alias="pipeline_d")
def main(): # parser = OptionParser("usage: %prog [options]") # parser.add_option("-i", "--input_file", dest="input_file", help="fastq file") # parser.add_option("-o", "--output_dir", dest="output_dir", help="output dir") # (options, args) = parser.parse_args() # file = options.input_file # output_dir = options.output_dir """1st thing to do is to index the output files from the Nextseq""" # csv_file = "/Volumes/STERNADILABHOME$/volume3/okushnir/RNAseq/180725_M04473_0026_000000000-BT9GF/index.csv" # fastq_path = "/Volumes/STERNADILABHOME$/volume3/okushnir/RNAseq/180725_M04473_0026_000000000-BT9GF/" # output_dir = "/Volumes/STERNADILABHOME$/volume3/okushnir/RNAseq/180725_M04473_0026_000000000-BT9GF/indexed" # index(csv_file, fastq_path, output_dir) """2nd is to clean the files from --""" # trim(file, output_dir) # input_dir = ("/sternadi/datasets/volume1/180503_OST_FINAL_03052018/indexed/") # output_dir = ("/sternadi/home/volume3/okushnir/AccuNGS/180503_OST_FINAL_03052018/sheri_clean/") # files = glob.glob(input_dir + "*.fastq") # for f in files: # trim(f, output_dir) """MiSeq""" """PrimerID""" """Option 1""" # /sternadi/home/volume3/okushnir/Cluster_Scripts/extract-primer-ids20201016_I.cmd # /sternadi/home/volume3/okushnir/Cluster_Scripts/extract-primer-ids20201016_II.cmd # CHANGE THE PARAMETERS """Option 2""" #1 # /sternadi/home/volume3/okushnir/Cluster_Scripts/extract-primer-ids20201016_I.cmd # /sternadi/home/volume3/okushnir/Cluster_Scripts/extract-primer-ids20201016_II.cmd # CHANGE THE PARAMETERS #2 # /sternadi/home/volume3/okushnir/Cluster_Scripts/barcode.cmd """3rd merge the files""" # one by one approach # sample = "CVB3_p2_L001-ds.7381dd36e2ce42768db0cc15d5c79a31" # input_dir = "/sternadi/home/volume3/okushnir/AccuNGS/190627_RV_CV/clean/" + sample # merge_output_dir = "/sternadi/home/volume3/okushnir/AccuNGS/190627_RV_CV/merged/" # merge_paired_files(input_dir, merge_output_dir) """automatic approach""" # input_dir = "/sternadi/home/volume3/okushnir/AccuNGS/190807_RV_p2_p10/clean/" # merge_input_dir = glob.glob(input_dir+"*") # for dir in merge_input_dir: # target = dir.split("/")[-1] # merge_output_dir = "/sternadi/home/volume3/okushnir/AccuNGS/190807_RV_p2_p10/merged/" + target # print(merge_output_dir) # try: # os.mkdir(merge_output_dir) # except OSError: # print("Creation of the directory %s failed" % merge_output_dir) # else: # print("Successfully created the directory %s " % merge_output_dir) # print("merging...") # merge_paired_files(dir, merge_output_dir) """4th run pipeline:""" input_dir_RV_p7 = "/sternadi/home/volume3/okushnir/AccuNGS/190217_RV_p7/merged" input_dir_RV_new = "/sternadi/home/volume3/okushnir/AccuNGS/190807_RV_p2_p10/merged" input_dir_patients = "/sternadi/home/volume3/okushnir/AccuNGS/20201008RV-202329127/merged/patients" input_dir_RV = "/sternadi/home/volume3/okushnir/AccuNGS/20201008RV-202329127/merged/passages" input_dir_CV = "/sternadi/home/volume3/okushnir/AccuNGS/190627_RV_CV/merged/CVB3" input_dir_ATCG1 = "/sternadi/home/volume3/okushnir/AccuNGS/190627_RV_CV/merged/ATCG1" input_dir_FLNA = "/sternadi/home/volume3/okushnir/AccuNGS/190627_RV_CV/merged/FLNA" ref_rv = "/sternadi/home/volume3/okushnir/ref/RVB14/pWR3.26_1-7212.fasta" ref_cv = "/sternadi/home/volume3/okushnir/ref/CVB3/CVB3_from_pT7CVB3_1-7417.fasta" ref_atcg1 = "/sternadi/home//volume3/okushnir/ref/human/NM_001614.5_ACTG1.fa" ref_flna1 = "/sternadi/home/volume3/okushnir/ref/human/FLNA.fasta" "Passages" passages_lst = ["p2_1", "p2_2", "p2_3", "p5_1", "p5_2", "p5_3", "p8_1", "p8_2", "p8_3", "p10_1", "p10_2", "p10_3", "p12_1", "p12_2", "p12_3"] q = "38" for passage in passages_lst: folders = glob.glob(input_dir_RV + "/%s/fits_pipeline" % passage) # print(folders) for d in folders: output_dir = (d + "/20201214_q%s" % q) # print(output_dir) try: os.mkdir(output_dir) except OSError: print("Creation of the directory %s failed" % output_dir) else: print("Successfully created the directory %s " % output_dir) cmd = "python /sternadi/home/volume1/shared/SternLab/pipeline_runner.py -i %s -o %s -r %s -NGS_or_Cirseq 2 " \ "-rep 2 -q %s" % (d, output_dir, ref_rv, q) pbs_runners.script_runner(cmd, alias="pipeline_d") """Clinical samples""" # make_reference_from_consensus_clinical_samples.py # cycle = "6" # patient_lst = ("Patient_1", "Patient_4", "Patient_5", "Patient_9", "Patient_16", "Patient_17", "Patient_20") # # ref_rv_patient = "/sternadi/home/volume3/okushnir/ref/RVA/JX025555.fasta" # ref_dic = {"Patient_1": "/sternadi/home/volume3/okushnir/ref/RVA/Patient_1_consenX%s.fasta" % cycle, # "Patient_4": "/sternadi/home/volume3/okushnir/ref/RVA/Patient_4_consenX%s.fasta" % cycle, # "Patient_5": "/sternadi/home/volume3/okushnir/ref/RVA/Patient_5_consenX%s.fasta" % cycle, # "Patient_9": "/sternadi/home/volume3/okushnir/ref/RVA/Patient_9_consenX%s.fasta" % cycle, # "Patient_16": "/sternadi/home/volume3/okushnir/ref/RVA/Patient_16_consenX%s.fasta" % cycle, # "Patient_17": "/sternadi/home/volume3/okushnir/ref/RVA/Patient_17_consenX%s.fasta" % cycle, # "Patient_20": "/sternadi/home/volume3/okushnir/ref/RVA/Patient_20_consenX%s.fasta" % cycle} # # for patient in patient_lst: # folders = glob.glob(input_dir_patients + "/%s" % patient) # # print(folders) # # for d in folders: # output_dir = (d + "/20201017_q30_consensusX%s" % cycle) # # print(output_dir) # try: # os.mkdir(output_dir) # except OSError: # print("Creation of the directory %s failed" % output_dir) # else: # print("Successfully created the directory %s " % output_dir) # # cmd = "python /sternadi/home/volume1/shared/SternLab/pipeline_runner.py -i %s -o %s -r %s -NGS_or_Cirseq 2 " \ # "-rep 2 -q 30 -b 40" % (d, output_dir, checkKey(ref_dic, patient)) # pbs_runners.script_runner(cmd, alias="pipeline_d") """5th analyze the freqs""" """6th run variant_caller localy to check context mutations""" # """using /Users/odedkushnir/Google Drive/Studies/PhD/Python_Scripts/variant_caller_github.py (Maoz script) # I added pval to each position according to gamma distribution fit to RNA-Control - args = sample control -c min_coverage -o outpu_file_path # for example: # /Volumes/STERNADILABHOME$/volume3/okushnir/AccuNGS/20201008RV-202329127/merged/capsid/Capsid_32_Ultra/20201012_q38 # /Capsid-32-Ultra.freqs /Volumes/STERNADILABHOME$/volume3/okushnir/AccuNGS/20201008RV-202329127/merged/controls/ # IVT_3_Control/20201012_q38/IVT-3-Control.freqs -c 5000 -o /Volumes/STERNADILABHOME$/volume3/okushnir/AccuNGS/ # 20201008RV-202329127/merged/capsid/Capsid_32_Ultra/20201012_q38/Capsid-32_UltravsControl.csv""" # # """Using /Users/odedkushnir/Google Drive/Studies/PhD/Python_Scripts/after_variant_caller.py # I merged the freqs files to the variant_caller output file, to create a dataframe with the pval and Prob # Using /Users/odedkushnir/Google Drive/Studies/PhD/Python_Scripts/Context_analysis_RV.py # I created all new q38_data_mutation.csv, q38_data_UpX_by_mutation.csv, q38_data_XpA_by_mutation.csv files with new merged files.""" """Context_analysis_X.py""" """RV""" # input_dir = "/Volumes/STERNADILABHOME$/volume3/okushnir/AccuNGS/20201008RV-202329127/merged/passages" # prefix = "/p*" # min_coverage = 5000 # virus = "RVB14" # date = "20201012" # q = "q38" # control_file_rnd = "/Users/odedkushnir/Projects/fitness/AccuNGS/190627_RV_CV/RVB14/RVB14_RNA-Control/q38_3UTR/" \ # "RVB14-RNA-Control.merged.with.mutation.type.freqs" # label_control1 = "RNA Control_RND" # control_file_spe = "/Volumes/STERNADILABHOME$/volume3/okushnir/AccuNGS/20201008RV-202329127/merged/controls/" \ # "IVT_3_Control/20201012_q38/IVT-3-Control.merged.with.mutation.type.freqs" # label_control2 = "RNA Control\nPrimer ID" # control_dict = {label_control1: control_file_rnd, label_control2: control_file_spe} # # creating_data_mutation_df(input_dir, prefix, min_coverage, virus, date, q, control_dict) """RV-Capsid_Free""" # input_dir = "/Volumes/STERNADILABHOME$/volume3/okushnir/AccuNGS/20201008RV-202329127/merged/capsid" # prefix = "/*_3*" # min_coverage = 5000 # virus = "RVB14" # date = "20201012" # q = "q38" # # control_file_id = "/Volumes/STERNADILABHOME$/volume3/okushnir/AccuNGS/20201008RV-202329127/merged/controls/" \ # "IVT_3_Control/20201012_q38/IVT-3-Control.merged.with.mutation.type.freqs" # label_control1 = "RNA Control\nPrimer ID" # control_file_mix = "/Volumes/STERNADILABHOME$/volume3/okushnir/AccuNGS/20201008RV-202329127/merged/passages/p8_2/" \ # "20201012_q38/p8-2.merged.with.mutation.type.freqs" # label_control2 = "p8 Mixed Population" # control_dict = {label_control1: control_file_id, label_control2: control_file_mix} # creating_data_mutation_df(input_dir, prefix, min_coverage, virus, date, q, control_dict) # """RV-Patients""" # input_dir = "/Volumes/STERNADILABHOME$/volume3/okushnir/AccuNGS/20201008RV-202329127/merged/patients" # prefix = "/Patient_*" # min_coverage = 10000 # virus = "RVA" # date = "20201124" # q = "q30" # patient_con = "_consensusX7" # # control_file_id = "/Volumes/STERNADILABHOME$/volume3/okushnir/AccuNGS/20201008RV-202329127/merged/controls/IVT_5_Control/20201012_q38/IVT-5-Control.merged.with.mutation.type.freqs" # label_control1 = "RNA Control\nPrimer ID" # control_file_cell = "/Volumes/STERNADILABHOME$/volume3/okushnir/AccuNGS/20201008RV-202329127/merged/controls/p3_Control/20201012_q38/p3-Control.merged.with.mutation.type.freqs" # label_control2 = "p3 Cell Culture\nControl" # control_dict = {label_control1: control_file_id, label_control2: control_file_cell} # creating_data_mutation_df(input_dir, prefix, min_coverage, virus, date, q, patient_con, control_dict) # """CV""" # input_dir = "/Users/odedkushnir/Projects/fitness/AccuNGS/190627_RV_CV/CVB3" # prefix = "/CVB3_p*" # min_coverage = 5000 # virus = "CVB3" # date = "q38" # q ="3UTR" # # control_file = "/Users/odedkushnir/Projects/fitness/AccuNGS/190627_RV_CV/CVB3/CVB3_RNA_Control/q38_3UTR/" \ # "CVB3-RNA-Control.merged.with.mutation.type.freqs" # label_control = "CVB3-RNA Control" # control_dict = {label_control: control_file} # creating_data_mutation_df(input_dir, prefix, min_coverage, virus, date, q, control_dict) # """PV1""" # input_dir = "/Users/odedkushnir/Projects/fitness/CirSeq/PV/Mahoney" # prefix = "/p*" # min_coverage = 10000 # virus = "Human poliovirus 1" # date = "20181210" # q = "q30" # creating_data_mutation_df(input_dir, prefix, min_coverage, virus, date, q) # """OPV2""" # input_dir = "/Users/odedkushnir/Projects/fitness/CirSeq/PV/OPV" # prefix = "/p*" # min_coverage = 10000 # virus = "OPV" # date = "20190226" # q = "q23" # creating_data_mutation_df(input_dir, prefix, min_coverage, virus, date, q) """AccuNGS_analysis/new_analysis_X.py""" """RV"""