def run_qc():
    """Unzip, de-interleave and quality-trim every metatranscript sample.

    For each entry under /ORG-Data/Wetlands/Metatranscripts/ (for example
    .../Plant_11_14_A) a directory named after the sample is created in the
    current working directory and filled with:

      <sample>.anqrpht.fastq        gunzip of <entry>/*.anqrpht.fastq.gz
      R1_no_Ribo.fastq / R2_...     written by toolbox.deinterleave_fastq_reads
      R1_no_Ribo_trimmed.fastq,
      R2_no_Ribo_trimmed.fastq,
      R1R2_no_Ribo_trimmed.fastq    output of "sickle pe"

    A sample whose directory cannot be created (normally because it already
    exists from an earlier run) is skipped.  The function never returns: it
    ends the script with sys.exit(0).
    """
    sample_dirs = glob.glob('/ORG-Data/Wetlands/Metatranscripts/*')

    for sample_dir in sample_dirs:
        print(sample_dir)

        # The last path component is the sample name, e.g. Plant_11_14_A.
        sample_name = sample_dir.split("/")[-1]
        print(sample_name)

        # NOTE: to restrict a run to a subset of samples, filter on
        # sample_name here (a "Mud_11" prefix filter was used in the past).

        # An existing directory marks the sample as already handled.  Only
        # OSError is caught so that Ctrl-C and sys.exit() are not swallowed
        # the way a bare "except:" would swallow them.
        try:
            os.mkdir(sample_name)
        except OSError:
            continue

        prefix = sample_name + "/"
        interleaved = prefix + sample_name + ".anqrpht.fastq"
        forward = prefix + "R1_no_Ribo.fastq"
        reverse = prefix + "R2_no_Ribo.fastq"

        # Unzip the reads that already had the ribosomal reads removed.
        toolbox.run_system("gunzip -c " + sample_dir +
                           "/*.anqrpht.fastq.gz > " + interleaved)

        # The unzipped file holds interleaved forward/reverse reads; split
        # it into one file per direction.
        toolbox.deinterleave_fastq_reads(interleaved, forward, reverse)

        # Quality-trim the pairs (-s receives the singles file).
        toolbox.run_system(
            "sickle pe -f " + forward + " -r " + reverse +
            " -t sanger -o " + prefix + "R1_no_Ribo_trimmed.fastq -p " +
            prefix + "R2_no_Ribo_trimmed.fastq -s " +
            prefix + "R1R2_no_Ribo_trimmed.fastq")

    print("")
    print("Script finished")
    sys.exit(0)
def make_otu_table():
    """Pool the per-sample outputs and build the filtered OTU tables.

    Steps, in order:
      1. Concatenate every */STEP1_OUT/STEP2_OUT/split_library_log.txt into
         ALL_split_library_log.txt.
      2. Concatenate every */STEP1_OUT/STEP2_OUT/seqs_chimeras_filtered.fna
         into combined_seqs_chimeras_filtered.fna.
      3. pick_open_reference_otus.py against the Silva 111 97% rep set,
         writing to <cwd>/<args.source>_STEP3_OUT.
      4. summarize_taxa.py at levels 2-6.
      5. wrapper_filter_otus_from_otu_table.py (-n 10 -p 25).
      6. biom convert of the filtered table to TSV with a taxonomy column.
      7. calculate_relative_abundance.py on that TSV.

    Reads the module-level ``args.source`` and ``qiime_source``.  Steps 3-7
    are run as "<qiime_source> && <command>"; qiime_source presumably sets
    up the QIIME environment -- confirm where it is defined.

    The "*/" globs assume the current directory contains only the
    per-sample folders created earlier by this script.
    """

    def run_in_qiime(command):
        # Every QIIME-dependent command is chained after the setup snippet.
        toolbox.run_system(qiime_source + " && " + command)

    print("Combining all the log files")
    toolbox.run_system(
        "cat */STEP1_OUT/STEP2_OUT/split_library_log.txt > ALL_split_library_log.txt")

    print("Combining all the seqs_chimeras_filtered.fna files")
    toolbox.run_system(
        "cat */STEP1_OUT/STEP2_OUT/seqs_chimeras_filtered.fna > combined_seqs_chimeras_filtered.fna")

    # NOTE: an optional step that rewrote the sequence-id prefixes of
    # combined_seqs_chimeras_filtered.fna from an ``args.ids`` mapping file
    # used to be parked here inside a string literal.  It never executed and
    # has been removed; recover it from version control if it is needed.

    pwd = os.getcwd()
    step3_folder = pwd + "/" + args.source + "_STEP3_OUT"
    otu_biom = step3_folder + "/otu_table_mc2_w_tax.biom"
    filtered_biom = step3_folder + "/percent_filtered_otu_table_mc2_w_tax.biom"
    filtered_txt = step3_folder + "/percent_filtered_otu_table_mc2_w_tax.txt"

    print("Running pick_open_reference_otus.py")
    run_in_qiime(
        "pick_open_reference_otus.py -i " + pwd +
        "/combined_seqs_chimeras_filtered.fna"
        " -r /home2/Database/Silva/rep_set/97_Silva_111_rep_set.fasta -o " +
        step3_folder + " -f -a -O 60")

    print("Making otu from biom file")
    run_in_qiime(
        "summarize_taxa.py -i " + otu_biom + " -o " + step3_folder +
        "/taxonomy_summaries/ -L 2,3,4,5,6")

    # Filtering step added on request ("per lindsey" in the original notes).
    run_in_qiime(
        "python /ORG-Data/scripts/wrapper_filter_otus_from_otu_table.py -i " +
        otu_biom + " -o " + filtered_biom + " -n 10 -p 25")

    # Convert the filtered biom table to tab-separated text.
    run_in_qiime(
        "biom convert -i " + filtered_biom + " -o " + filtered_txt +
        " --to-tsv --header-key taxonomy")

    run_in_qiime(
        "python /ORG-Data/scripts/calculate_relative_abundance.py -i " +
        filtered_txt + " -o " + step3_folder +
        "/percent_calculate_relative_abundance_output.txt")
sample_ids[index] + "/reads-2.fq") #trim the primers from the reads trim_reads(sample_ids[index] + "/reads-1.fq", sample_ids[index] + "/reads-2.fq", sample_ids[index] + "/trimmed_reads-1.fq", sample_ids[index] + "/trimmed_reads-2.fq") #pass the forward and reverse file #join paired ends # join_paired_ends.py -f no_primers/reads.fastq -r no_reverse_primers/reads.fastq -o NO_BC_STEP1_OUT/ cmd = "join_paired_ends.py -f " + sample_ids[ index] + "/trimmed_reads-1.fq -r " + sample_ids[ index] + "/trimmed_reads-2.fq -o " + sample_ids[ index] + "/STEP1_OUT/" toolbox.run_system(qiime_source + " && " + cmd) #make a mapping file with the id # 10279.1.153921.CATCATGAGGC.fastq id = 10279.1.153921.CATCATGAGGC.fastq #id = item.split(".")[3] f = open(sample_ids[index] + "/" + sample_ids[index] + "_mapping.txt", "w") f.write("#SampleID\tBarcodeSequence\tLinkerPrimerSequence\tDescription\n") f.write(id + "\t\t\t" + sample_ids[index] + "\n") f.close() #run split libraries #split_libraries_fastq.py -i fastqjoin.join.fastq -o STEP2_OUT/ -m file1_mapping.txt -q 19 --store_demultiplexed_fastq --barcode_type not-barcoded --sample_ids AACAGGTTCGC cmd = "split_libraries_fastq.py -i " + sample_ids[ index] + "/STEP1_OUT/fastqjoin.join.fastq -o " + sample_ids[ index] + "/STEP1_OUT/STEP2_OUT/ -m " + sample_ids[index] + "/" + sample_ids[ index] + "_mapping.txt -q 19 --store_demultiplexed_fastq --barcode_type not-barcoded --sample_ids " + sample_ids[
def run_garrett_mud():
    """Map every non-"Mud_11" sample to the NovMethanotrophBin index.

    For each entry under /ORG-Data/Wetlands/Metatranscripts/ whose name does
    not start with "Mud_11", the following commands are run in order through
    toolbox.run_system: bowtie2 (reporting all alignments, -a), sam_file.py
    with -v 2, samtools view (SAM to BAM), samtools sort, cufflinks, and
    cufflinks -u.  Trimmed reads are read from ../<sample>/.  The function
    never returns: it ends the script with sys.exit(0).
    """
    index_name = "NovMethanotrophBin"
    cufflinks = "/home2/opt/Cufflinks/cufflinks-2.2.1.Linux_x86_64/cufflinks"

    for sample_dir in glob.glob('/ORG-Data/Wetlands/Metatranscripts/*'):
        print(sample_dir)

        # Last path component, e.g. Plant_11_14_A.
        sample_name = sample_dir.split("/")[-1]
        print(sample_name)

        # Everything except the November mud samples is processed.
        if sample_name.startswith("Mud_11"):
            continue

        stem = sample_name + "_mismatches_2_mappedto_NovMethanotrophBin"
        raw_sam = sample_name + "_mappedto_NovMethanotrophBin.sam"
        filtered_sam = stem + ".sam"
        bam = stem + ".bam"
        sort_target = stem + "_SORTED.bam"
        # cufflinks is pointed at "<sort_target>.bam"; presumably the
        # installed samtools appends ".bam" to the sort output name --
        # TODO confirm against the samtools version in use.
        sorted_bam = sort_target + ".bam"

        pipeline = (
            # bowtie2 against the index, multiple alignments via -a
            " ".join(["bowtie2 -D 10 -R 2 -N 1 -L 22 -i S,0,2.50 -a -p 40 -x",
                      index_name,
                      "-S", raw_sam,
                      "-1", "../" + sample_name + "/R1_no_Ribo_trimmed.fastq",
                      "-2", "../" + sample_name + "/R2_no_Ribo_trimmed.fastq"]),
            # mismatch filtering with -v 2
            " ".join(["python /ORG-Data/scripts/sam_file.py -i", raw_sam,
                      "-v 2 -o", filtered_sam]),
            # SAM -> BAM
            " ".join(["samtools view -@ 60 -bS", filtered_sam, ">", bam]),
            # sort the BAM
            " ".join(["samtools sort -@ 60", bam, sort_target]),
            # cufflinks
            " ".join([cufflinks, "-o",
                      sample_name + "cufflinks_NovMethanotrophBin",
                      sorted_bam]),
            # cufflinks with multi-read correction (-u)
            " ".join([cufflinks, "-u -o",
                      sample_name + "cufflinks_corrected_NovMethanotrophBin",
                      sorted_bam]),
        )
        for command in pipeline:
            toolbox.run_system(command)

    print("")
    print("Script finished")
    sys.exit(0)
def run_database():
    """Map the twelve wetland samples to the database given in args.fasta.

    Reads the module-level ``args``:
      args.fasta        open fasta file; it is closed at once because only
                        its name is used
      args.skip_bowtie  when equal to "F" the bowtie2 index is built and the
                        reads are aligned; otherwise both steps are skipped
      args.mismatches   value passed to sam_file.py -v and embedded in the
                        output names

    Per sample, in order: bowtie2 (optional), sam_file.py, samtools view,
    samtools sort, cufflinks, cufflinks -u.  Trimmed reads come from
    /home2/projects/Wetlands/Metatranscripts/<sample>/.  The function never
    returns: it ends the script with sys.exit(0).
    """
    # Only the file's name is needed from here on.
    args.fasta.close()
    fasta_path = args.fasta.name
    file_name = fasta_path.split("/")[-1]  # strip any leading directories
    # Joined with a space to print exactly what "print a, b" prints.
    print(" ".join(("file_name = ", file_name)))

    run_bowtie = args.skip_bowtie == "F"
    if run_bowtie:
        print("Making bowtie index")
        toolbox.run_system("bowtie2-build %s %s" % (fasta_path, file_name))

    samples = (
        'Plant_11_14_A', 'Plant_11_14_B', 'Plant_11_14_C',
        'Mud_11_14_A', 'Mud_11_14_B', 'Mud_11_14_C',
        'Plant_9_15_A', 'Plant_9_15_B', 'Plant_9_15_C',
        'Mud_9_15_A', 'Mud_9_15_B', 'Mud_9_15_C',
    )
    reads_root = "/home2/projects/Wetlands/Metatranscripts/"
    cufflinks = "/home2/opt/Cufflinks/cufflinks-2.2.1.Linux_x86_64/cufflinks"
    allowed = str(args.mismatches)

    for sample_name in samples:
        print(sample_name)

        # The bowtie2 index carries the fasta file's name.
        index_name = file_name
        sam_file = "%s_mappedto_%s.sam" % (sample_name, index_name)
        forward = reads_root + sample_name + "/R1_no_Ribo_trimmed.fastq"
        reverse = reads_root + sample_name + "/R2_no_Ribo_trimmed.fastq"
        filtered_sam = "mismatches_%s_%s" % (allowed, sam_file)
        bam_file = filtered_sam + ".bam"
        sort_target = "SORTED_" + bam_file
        plain_dir = "%s_cufflinks_%s_mis_%s" % (sample_name, index_name, allowed)
        corrected_dir = "%s_cufflinks_corrected_%s_mis_%s" % (
            sample_name, index_name, allowed)

        # Align with multiple alignments reported (-a).
        if run_bowtie:
            toolbox.run_system(
                "bowtie2 -D 10 -R 2 -N 1 -L 22 -i S,0,2.50 -a -p 20"
                " -x %s -S %s -1 %s -2 %s"
                % (index_name, sam_file, forward, reverse))

        # Mismatch filtering at the requested threshold.
        toolbox.run_system(
            "python /ORG-Data/scripts/sam_file.py -i %s -v %s -o %s"
            % (sam_file, allowed, filtered_sam))

        # SAM -> BAM.
        toolbox.run_system(
            "samtools view -@ 20 -bS %s > %s" % (filtered_sam, bam_file))

        # Sort the BAM.
        toolbox.run_system(
            "samtools sort -@ 20 %s %s" % (bam_file, sort_target))

        # cufflinks is pointed at "<sort_target>.bam"; presumably the
        # installed samtools appends ".bam" to the sort output name --
        # TODO confirm against the samtools version in use.
        sorted_bam = sort_target + ".bam"

        toolbox.run_system("%s -o %s %s" % (cufflinks, plain_dir, sorted_bam))

        # Second pass with multi-read correction (-u).
        toolbox.run_system(
            "%s -u -o %s %s" % (cufflinks, corrected_dir, sorted_bam))

    print("")
    print("Script finished")
    sys.exit(0)
#extract the sample name sample_name = item.split("/")[-1] print sample_name if not sample_name.startswith("Mud_11"): #only want to process Nov Mud continue #make a dir with sample name os.mkdir(sample_name) #unzip reads with ribo removed #gunzip -c /ORG-Data/Wetlands/Metatranscripts/10376.5.159127.TGACCA.anqrpht.fastq.gz > 10376.5.159127.TGACCA.anqrpht.fastq unzipped_file = sample_name + "/" + sample_name + ".anqrpht.fastq" cmd = "gunzip -c " + item + "/*.anqrpht.fastq.gz > " + unzipped_file toolbox.run_system(cmd) #unzipped file is Plant_11_14_A/Plant_11_14_A.anqrpht.fastq #seperate reads into R1_no_Ribo.fastq and R2_no_Ribo.fastq #-these are interleaved forward and reverse reads #paste - - - - - - - - < Plant_11_14_A.anqrpht.fastq | tee >(cut -f 1-4 | tr "\t" "\n" > R1_no_Ribo.fastq) | cut -f 5-8 | tr "\t" "\n" > R2_no_Ribo.fastq #cmd = "paste - - - - - - - - < " + unzipped_file + ' | tee >(cut -f 1-4 | tr "\\t" "\\n" > ' + sample_name + '/R1_no_Ribo.fastq) | cut -f 5-8 | tr "\\t" "\\n" > ' + sample_name + "/R2_no_Ribo.fastq" #toolbox.run_system(cmd) toolbox.deinterleave_fastq_reads(unzipped_file, sample_name + "/R1_no_Ribo.fastq", sample_name + "/R2_no_Ribo.fastq") #trim the reads cmd = "sickle pe -f " + sample_name + "/R1_no_Ribo.fastq -r " + sample_name + "/R2_no_Ribo.fastq -t sanger -o " + sample_name + "/R1_no_Ribo_trimmed.fastq -p " + sample_name + "/R2_no_Ribo_trimmed.fastq -s " + sample_name + "/R1R2_no_Ribo_trimmed.fastq" toolbox.run_system(cmd)
def make_mismatch_table():
    """Write a mismatch-by-sam-file count table.

    Reads the module-level ``args``:
      args.sam_list             open text file, one sam file path per line
      args.make_mismatch_table  open, writable file that receives the table

    For every listed sam file, "sam_file.py -i <sam> -c T" is run; it is
    expected to leave its counts in "<sam>_mismatches.txt" as
    whitespace-separated "<tag> <count>" lines (e.g. "XM:i:11 42").

    Output is tab separated: a header row "mismatches" followed by the sam
    file names, then one row per tag in first-seen order with one count per
    sam file ("0" where a file did not report that tag).

    Blank lines in the sam list and in the count files are ignored.  A
    non-blank count line with fewer than two fields raises IndexError.
    The function never returns: it ends the script with sys.exit(0).
    """
    # Skip blank lines so an empty string is never treated as a file name.
    sam_files = []
    for line in args.sam_list:
        name = line.rstrip()
        if name:
            sam_files.append(name)

    tags = []        # mismatch tags in the order they were first seen
    known_tags = set()
    per_file = []    # one {tag: count} dict per sam file, same order

    for sam_path in sam_files:
        print(sam_path)

        # Count the mismatches in this file.
        toolbox.run_system(
            "python /ORG-Data/scripts/sam_file.py -i " + sam_path + " -c T")

        file_counts = {}
        # "with" closes the file even if a malformed line raises.
        with open(sam_path + "_mismatches.txt", "rU") as handle:
            for line in handle:
                fields = line.split()
                if not fields:
                    continue
                tag, count = fields[0], fields[1]
                if tag not in known_tags:
                    known_tags.add(tag)
                    tags.append(tag)
                # A repeated tag within one file keeps its last count.
                file_counts[tag] = count
        per_file.append(file_counts)

    table = args.make_mismatch_table
    table.write("\t".join(["mismatches"] + sam_files) + "\n")
    for tag in tags:
        row = [tag] + [counts.get(tag, "0") for counts in per_file]
        table.write("\t".join(row) + "\n")

    print("")
    # Joined with a space to print exactly what "print a, b" prints.
    print(" ".join(("Number of sam file in this directory = ",
                    str(len(sam_files)))))
    print("")
    print("Script finished...")
    sys.exit(0)