def run_qc():
    """Unpack, de-interleave and quality-trim every metatranscript sample.

    Every path matched by ``/ORG-Data/Wetlands/Metatranscripts/*`` is treated
    as one sample directory (e.g. ``.../Plant_11_14_A``).  For each sample a
    working directory named after the sample is created in the current
    directory, and then:

    1. ``<sample dir>/*.anqrpht.fastq.gz`` is decompressed with ``gunzip -c``
       into ``<sample>/<sample>.anqrpht.fastq``;
    2. that interleaved file is split into ``R1_no_Ribo.fastq`` and
       ``R2_no_Ribo.fastq`` by ``toolbox.deinterleave_fastq_reads``;
    3. the pair is trimmed with ``sickle pe`` (``-t sanger``), producing
       ``R1_no_Ribo_trimmed.fastq``, ``R2_no_Ribo_trimmed.fastq`` and the
       singles file ``R1R2_no_Ribo_trimmed.fastq``.

    A sample whose working directory cannot be created (normally because it
    already exists from an earlier run) is skipped.

    The function does not return: it finishes with ``sys.exit(0)``.
    """
    # Single-argument print(...) gives the same output as the Python 2 print
    # statement, so the messages below are unchanged.
    sample_dirs = glob.glob('/ORG-Data/Wetlands/Metatranscripts/*')

    for item in sample_dirs:
        # Ex: /ORG-Data/Wetlands/Metatranscripts/Plant_11_14_A
        print(item)

        # The last path component is the sample name.
        sample_name = item.split("/")[-1]
        print(sample_name)

        # Create the per-sample working directory.  A failure here (typically
        # "already exists") means: skip this sample.  Only OSError is caught
        # so that Ctrl-C and programming errors are no longer swallowed.
        try:
            os.mkdir(sample_name)
        except OSError:
            continue

        # Unzip the reads that already had ribosomal sequences removed, e.g.
        # gunzip -c <dir>/10376.5.159127.TGACCA.anqrpht.fastq.gz > Plant_11_14_A/Plant_11_14_A.anqrpht.fastq
        unzipped_file = sample_name + "/" + sample_name + ".anqrpht.fastq"
        toolbox.run_system("gunzip -c " + item + "/*.anqrpht.fastq.gz > " +
                           unzipped_file)

        # The unzipped file holds interleaved forward/reverse reads; split
        # them into separate R1 / R2 files.
        r1_reads = sample_name + "/R1_no_Ribo.fastq"
        r2_reads = sample_name + "/R2_no_Ribo.fastq"
        toolbox.deinterleave_fastq_reads(unzipped_file, r1_reads, r2_reads)

        # Trim the reads.
        cmd = ("sickle pe -f " + r1_reads + " -r " + r2_reads +
               " -t sanger -o " + sample_name + "/R1_no_Ribo_trimmed.fastq" +
               " -p " + sample_name + "/R2_no_Ribo_trimmed.fastq" +
               " -s " + sample_name + "/R1R2_no_Ribo_trimmed.fastq")
        toolbox.run_system(cmd)

    print("")
    print("Script finished")
    sys.exit(0)
def make_otu_table():
    """Combine the per-sample outputs and build the OTU tables.

    Runs, in order, through ``toolbox.run_system``:

    * ``cat`` of every ``*/STEP1_OUT/STEP2_OUT/split_library_log.txt`` into
      ``ALL_split_library_log.txt``;
    * ``cat`` of every ``*/STEP1_OUT/STEP2_OUT/seqs_chimeras_filtered.fna``
      into ``combined_seqs_chimeras_filtered.fna``;
    * ``pick_open_reference_otus.py`` on the combined file, writing to
      ``<cwd>/<args.source>_STEP3_OUT``;
    * ``summarize_taxa.py``, the OTU percent filter wrapper,
      ``biom convert`` and ``calculate_relative_abundance.py`` on the
      resulting biom table.

    Every command from ``pick_open_reference_otus.py`` onwards is prefixed
    with the module-level ``qiime_source`` string and ``" && "``.

    Reads the globals ``args`` (``args.source``) and ``qiime_source``.
    Returns ``None``.
    """

    def run_in_qiime(command):
        # Chain the command after whatever ``qiime_source`` sets up.
        toolbox.run_system(qiime_source + " && " + command)

    # The globs below assume the current directory only contains the
    # per-sample folders created earlier by this pipeline.
    print("Combining all the log files")
    toolbox.run_system(
        "cat */STEP1_OUT/STEP2_OUT/split_library_log.txt > ALL_split_library_log.txt"
    )

    print("Combining all the seqs_chimeras_filtered.fna files")
    toolbox.run_system(
        "cat */STEP1_OUT/STEP2_OUT/seqs_chimeras_filtered.fna  > combined_seqs_chimeras_filtered.fna"
    )

    # Disabled step, kept verbatim as an inert string literal: rewriting the
    # sequence-header prefixes from an optional id file (args.ids).
    """
	if args.ids: #if id file was supplied then change the prefixes
	 	#note args is a global variable - we can read but not cange it ?
	 	print "args.ids.name = " + args.ids.name

		#make a list of ids and files
		ids = []
		files = []
		id_lines = 0

		line = args.ids.readline()

		while line:
			id_lines += 1
			line = line.rstrip() #remove endline
			cols = line.split() #splits on whitespace
			cols[0] = cols[0].replace("_", "") #remove all underscores
			if cols[0] in ids: #checks for duplicate ids
				print "Error ... id is already in id list = " + cols[0]
				sys.exit(1)
			else:
				ids.append(cols[0]) #id in first col

			#/global/dna/dm_archive/sdm/illumina/01/00/83/10083.1.147588.TTGTCGCACAA.fastq.gz
			#extract the id from the file name
			file_id = cols[1].split(".")[-3]
			if file_id in files:
				print "Error ... file_id is already in list = " + file_id
				sys.exit(1)
			else:
			 files.append(file_id)

			print cols[0] + " " + file_id

			line = args.ids.readline()
		
		args.ids.close
		print "Lines read from id file = ", id_lines

		#read the seqs.fna file and change the ids
		f = open ("combined_seqs_chimeras_filtered.fna", "r")
		out_file = open("NEW_combined_seqs_chimeras_filtered.fna", "w")

		line = f.readline()
		seqs_lines = 0
		seqs_sequences = 0
		while line:
			header,sequence,line,s_lines = toolbox.read_fasta(line,f)
			#Note the header and sequence still have endlines
			seqs_sequences += 1
			seqs_lines += (1 + s_lines) #the header line and the seq lines

			#>ACATATACGCG_0 MISEQ0.....
			header = header[1:] #remove >
			cols = header.split("_",1) #split on 1st _

			index = files.index(cols[0])
			header = ">" + ids[index] + "_" + cols[1]  #the id_0 MISEQ0.....

			out_file.write(header)
			out_file.write(sequence)

		f.close()
		out_file.close()
		print "Sequences in combined_seqs_chimeras_filtered.fna = ", seqs_sequences
		print "Lines in combined_seqs_chimeras_filtered.fna = ",seqs_lines 

		#swap the files
		cmd = "mv NEW_combined_seqs_chimeras_filtered.fna combined_seqs_chimeras_filtered.fna"
		toolbox.run_system(cmd)
	"""

    # Output locations for the OTU-picking step.
    pwd = os.getcwd()
    step3_folder = pwd + "/" + args.source + "_STEP3_OUT"
    raw_biom = step3_folder + "/otu_table_mc2_w_tax.biom"
    filtered_biom = step3_folder + "/percent_filtered_otu_table_mc2_w_tax.biom"
    filtered_txt = step3_folder + "/percent_filtered_otu_table_mc2_w_tax.txt"

    print("Running pick_open_reference_otus.py")
    run_in_qiime(
        "pick_open_reference_otus.py -i  " + pwd +
        "/combined_seqs_chimeras_filtered.fna" +
        " -r /home2/Database/Silva/rep_set/97_Silva_111_rep_set.fasta" +
        " -o " + step3_folder + " -f -a -O 60")

    print("Making otu from biom file")
    run_in_qiime("summarize_taxa.py -i " + raw_biom + " -o " + step3_folder +
                 "/taxonomy_summaries/  -L 2,3,4,5,6")

    # Percent-based OTU filtering (added per Lindsey).
    run_in_qiime(
        "python /ORG-Data/scripts/wrapper_filter_otus_from_otu_table.py -i " +
        raw_biom + " -o " + filtered_biom + " -n 10 -p 25")

    # Convert the filtered biom table to tab-separated text with taxonomy.
    run_in_qiime("biom convert -i " + filtered_biom + " -o " + filtered_txt +
                 " --to-tsv --header-key taxonomy")

    run_in_qiime(
        "python /ORG-Data/scripts/calculate_relative_abundance.py -i " +
        filtered_txt + " -o " + step3_folder +
        "/percent_calculate_relative_abundance_output.txt")
                                     sample_ids[index] + "/reads-2.fq")

    #trim the primers from the reads

    trim_reads(sample_ids[index] + "/reads-1.fq",
               sample_ids[index] + "/reads-2.fq",
               sample_ids[index] + "/trimmed_reads-1.fq", sample_ids[index] +
               "/trimmed_reads-2.fq")  #pass the forward and reverse file

    #join paired ends
    # join_paired_ends.py -f no_primers/reads.fastq -r no_reverse_primers/reads.fastq -o NO_BC_STEP1_OUT/
    cmd = "join_paired_ends.py -f " + sample_ids[
        index] + "/trimmed_reads-1.fq -r " + sample_ids[
            index] + "/trimmed_reads-2.fq -o " + sample_ids[
                index] + "/STEP1_OUT/"
    toolbox.run_system(qiime_source + " && " + cmd)

    #make a mapping file with the id
    #  10279.1.153921.CATCATGAGGC.fastq id = 10279.1.153921.CATCATGAGGC.fastq
    #id = item.split(".")[3]
    f = open(sample_ids[index] + "/" + sample_ids[index] + "_mapping.txt", "w")
    f.write("#SampleID\tBarcodeSequence\tLinkerPrimerSequence\tDescription\n")
    f.write(id + "\t\t\t" + sample_ids[index] + "\n")
    f.close()

    #run split libraries
    #split_libraries_fastq.py -i fastqjoin.join.fastq -o STEP2_OUT/ -m file1_mapping.txt -q 19 --store_demultiplexed_fastq --barcode_type not-barcoded --sample_ids AACAGGTTCGC
    cmd = "split_libraries_fastq.py -i " + sample_ids[
        index] + "/STEP1_OUT/fastqjoin.join.fastq -o " + sample_ids[
            index] + "/STEP1_OUT/STEP2_OUT/ -m " + sample_ids[index] + "/" + sample_ids[
                index] + "_mapping.txt -q 19 --store_demultiplexed_fastq --barcode_type not-barcoded --sample_ids " + sample_ids[
def run_garrett_mud():
    """Map samples to the NovMethanotrophBin index and run cufflinks on them.

    Every path matched by ``/ORG-Data/Wetlands/Metatranscripts/*`` names a
    sample; samples whose name starts with ``Mud_11`` are skipped.  For each
    remaining sample the following commands are run through
    ``toolbox.run_system``:

    1. ``bowtie2`` (all alignments, ``-a``) of the trimmed R1/R2 reads in
       ``../<sample>/`` against the ``NovMethanotrophBin`` index;
    2. ``sam_file.py -v 2`` on the resulting SAM file;
    3. ``samtools view`` to BAM, then ``samtools sort``;
    4. ``cufflinks`` twice on the sorted BAM, once plain and once with ``-u``.

    The function does not return: it finishes with ``sys.exit(0)``.
    """
    bt_db = "NovMethanotrophBin"
    cufflinks = "/home2/opt/Cufflinks/cufflinks-2.2.1.Linux_x86_64/cufflinks"

    for item in glob.glob('/ORG-Data/Wetlands/Metatranscripts/*'):
        # Ex: /ORG-Data/Wetlands/Metatranscripts/Plant_11_14_A
        print(item)

        sample_name = item.split("/")[-1]
        print(sample_name)

        # Process everything except the November mud samples.
        if sample_name.startswith("Mud_11"):
            continue

        # File names shared by the steps below.
        mapped_sam = sample_name + "_mappedto_NovMethanotrophBin.sam"
        stem = sample_name + "_mismatches_2_mappedto_NovMethanotrophBin"
        filtered_sam = stem + ".sam"
        filtered_bam = stem + ".bam"
        sort_prefix = stem + "_SORTED.bam"
        # cufflinks is pointed at "<prefix>.bam", i.e. "..._SORTED.bam.bam";
        # presumably the installed samtools appends ".bam" to the sort
        # prefix -- TODO confirm.
        sorted_bam = sort_prefix + ".bam"

        # Align with bowtie2, reporting multiple alignments (-a).
        toolbox.run_system(
            "bowtie2 -D 10 -R 2 -N 1 -L 22 -i S,0,2.50 -a -p 40 -x " + bt_db +
            " -S " + mapped_sam +
            " -1 ../" + sample_name + "/R1_no_Ribo_trimmed.fastq" +
            " -2 ../" + sample_name + "/R2_no_Ribo_trimmed.fastq")

        # Mismatch step with a threshold of 2 (see sam_file.py -v).
        toolbox.run_system("python /ORG-Data/scripts/sam_file.py -i " +
                           mapped_sam + " -v 2 -o " + filtered_sam)

        # SAM -> BAM.
        toolbox.run_system("samtools view -@ 60 -bS " + filtered_sam + " > " +
                           filtered_bam)

        # Sort the BAM.
        toolbox.run_system("samtools sort -@ 60 " + filtered_bam + " " +
                           sort_prefix)

        # NOTE(review): the output directory names have no separator between
        # the sample name and "cufflinks" (run_database uses "_cufflinks_").
        # Kept as is so existing output locations do not move.
        toolbox.run_system(cufflinks + " -o " + sample_name +
                           "cufflinks_NovMethanotrophBin " + sorted_bam)

        # Same again with -u, the "corrected for multialign" run.
        toolbox.run_system(cufflinks + " -u -o " + sample_name +
                           "cufflinks_corrected_NovMethanotrophBin " +
                           sorted_bam)

    print("")
    print("Script finished")
    sys.exit(0)
def run_database():
    """Map the twelve wetland samples to a user-supplied fasta and run cufflinks.

    Reads the global ``args``:

    * ``args.fasta`` -- open file object; it is closed at once because only
      its name is needed.  The last path component of that name is used as
      the bowtie2 index name and appears in every output file name.
    * ``args.skip_bowtie`` -- when equal to ``"F"`` the bowtie2 index is
      built and each sample is aligned; any other value skips both steps.
    * ``args.mismatches`` -- passed to ``sam_file.py -v`` and embedded in the
      output names.

    For each sample, trimmed reads are taken from
    ``/home2/projects/Wetlands/Metatranscripts/<sample>/`` and the pipeline
    runs bowtie2 (optional), ``sam_file.py``, ``samtools view``,
    ``samtools sort`` and ``cufflinks`` (plain and with ``-u``), all via
    ``toolbox.run_system``.

    The function does not return: it finishes with ``sys.exit(0)``.
    """
    # Only the file name is needed, not the open handle.
    args.fasta.close()

    # Strip any leading directories from the fasta path.
    file_name = args.fasta.name.split("/")[-1]

    # Two print items, as in the original statement: the label already ends
    # with a space and a separator space follows it.
    print("%s %s" % ("file_name = ", file_name))

    build_and_align = args.skip_bowtie == "F"
    if build_and_align:
        print("Making bowtie index")
        toolbox.run_system("bowtie2-build " + args.fasta.name + " " +
                           file_name)

    # Fixed sample list: 3 replicates each of Plant/Mud for 11_14 and 9_15.
    dir_files = [
        'Plant_11_14_A',
        'Plant_11_14_B',
        'Plant_11_14_C',
        'Mud_11_14_A',
        'Mud_11_14_B',
        'Mud_11_14_C',
        'Plant_9_15_A',
        'Plant_9_15_B',
        'Plant_9_15_C',
        'Mud_9_15_A',
        'Mud_9_15_B',
        'Mud_9_15_C',
    ]

    reads_root = "/home2/projects/Wetlands/Metatranscripts/"
    cufflinks = "/home2/opt/Cufflinks/cufflinks-2.2.1.Linux_x86_64/cufflinks"

    for sample_name in dir_files:
        print(sample_name)

        bt_db = file_name
        limit = str(args.mismatches)

        # Input reads and the chain of intermediate / output names.
        sam_file = sample_name + "_mappedto_" + bt_db + ".sam"
        r1 = reads_root + sample_name + "/R1_no_Ribo_trimmed.fastq"
        r2 = reads_root + sample_name + "/R2_no_Ribo_trimmed.fastq"
        mis_match_file = "mismatches_" + limit + "_" + sam_file
        bam_file = mis_match_file + ".bam"
        sort_prefix = "SORTED_" + bam_file
        run_suffix = bt_db + "_mis_" + limit
        cufflinks_dir = sample_name + "_cufflinks_" + run_suffix
        corrected_cufflinks_dir = (sample_name + "_cufflinks_corrected_" +
                                   run_suffix)

        # Align with bowtie2, reporting multiple alignments (-a).
        if build_and_align:
            toolbox.run_system(
                "bowtie2 -D 10 -R 2 -N 1 -L 22 -i S,0,2.50 -a -p 20 -x " +
                bt_db + " -S " + sam_file + " -1 " + r1 + " -2 " + r2)

        # Mismatch step with the requested threshold (see sam_file.py -v).
        toolbox.run_system("python /ORG-Data/scripts/sam_file.py -i " +
                           sam_file + " -v " + limit + " -o " +
                           mis_match_file)

        # SAM -> BAM.
        toolbox.run_system("samtools view -@ 20 -bS " + mis_match_file +
                           " > " + bam_file)

        # Sort the BAM.
        toolbox.run_system("samtools sort -@ 20 " + bam_file + " " +
                           sort_prefix)

        # cufflinks is pointed at "<prefix>.bam"; presumably the installed
        # samtools appends ".bam" to the sort prefix -- TODO confirm.
        sorted_bam_file = sort_prefix + ".bam"
        toolbox.run_system(cufflinks + " -o " + cufflinks_dir + " " +
                           sorted_bam_file)

        # Same again with -u, the "corrected for multialign" run.
        toolbox.run_system(cufflinks + " -u -o " + corrected_cufflinks_dir +
                           " " + sorted_bam_file)

    print("")
    print("Script finished")
    sys.exit(0)
    #extract the sample name
    sample_name = item.split("/")[-1]
    print sample_name

    if not sample_name.startswith("Mud_11"):  #only want to process Nov Mud
        continue

    #make a dir with sample name
    os.mkdir(sample_name)

    #unzip reads with ribo removed
    #gunzip -c /ORG-Data/Wetlands/Metatranscripts/10376.5.159127.TGACCA.anqrpht.fastq.gz > 10376.5.159127.TGACCA.anqrpht.fastq
    unzipped_file = sample_name + "/" + sample_name + ".anqrpht.fastq"
    cmd = "gunzip -c " + item + "/*.anqrpht.fastq.gz > " + unzipped_file
    toolbox.run_system(cmd)
    #unzipped file is Plant_11_14_A/Plant_11_14_A.anqrpht.fastq

    #seperate reads into R1_no_Ribo.fastq and R2_no_Ribo.fastq
    #-these are interleaved forward and reverse reads
    #paste - - - - - - - - < Plant_11_14_A.anqrpht.fastq  | tee >(cut -f 1-4 | tr "\t" "\n" > R1_no_Ribo.fastq)  | cut -f 5-8 | tr "\t" "\n" > R2_no_Ribo.fastq
    #cmd = "paste - - - - - - - - < " + unzipped_file + ' | tee >(cut -f 1-4 | tr "\\t" "\\n" > ' + sample_name + '/R1_no_Ribo.fastq) | cut -f 5-8 | tr "\\t" "\\n" > ' + sample_name + "/R2_no_Ribo.fastq"
    #toolbox.run_system(cmd)
    toolbox.deinterleave_fastq_reads(unzipped_file,
                                     sample_name + "/R1_no_Ribo.fastq",
                                     sample_name + "/R2_no_Ribo.fastq")

    #trim the reads
    cmd = "sickle pe -f " + sample_name + "/R1_no_Ribo.fastq -r " + sample_name + "/R2_no_Ribo.fastq -t sanger -o " + sample_name + "/R1_no_Ribo_trimmed.fastq -p " + sample_name + "/R2_no_Ribo_trimmed.fastq -s " + sample_name + "/R1R2_no_Ribo_trimmed.fastq"
    toolbox.run_system(cmd)
Beispiel #7
0
def make_mismatch_table():
    """Write a mismatch-count table covering every SAM file in ``args.sam_list``.

    Reads the global ``args``:

    * ``args.sam_list`` -- open text file with one SAM file path per line
      (blank lines are ignored);
    * ``args.make_mismatch_table`` -- open, writable file that receives the
      table.

    For each SAM file, ``sam_file.py -i <sam> -c T`` is run through
    ``toolbox.run_system`` and ``<sam>_mismatches.txt`` is then read.  Each
    non-blank line of that file is split on whitespace: the first field is a
    mismatch label (e.g. ``XM:i:11``) and the second its count.

    The output is tab separated: a header row ``mismatches`` followed by the
    SAM paths, then one row per label in first-seen order with one count per
    SAM file.  A label absent from a SAM file gets ``0``.

    The function does not return: it finishes with ``sys.exit(0)``.
    """
    # One SAM path per line.  Blank lines are skipped so they do not turn
    # into an empty file name passed to sam_file.py.
    sam_files = []
    for entry in args.sam_list:
        entry = entry.rstrip()
        if entry:
            sam_files.append(entry)

    mismatches = []  # row labels, in the order they were first seen
    row_of = {}  # label -> index into ``mismatches`` (avoids list.index scans)
    counts = []  # counts[k][r]: count string for sam_files[k], row r

    for sam in sam_files:
        # Single-argument print(...) matches the Python 2 print statement.
        print(sam)

        # Count the mismatches in this file; the results land in
        # "<sam>_mismatches.txt".
        toolbox.run_system("python /ORG-Data/scripts/sam_file.py -i " + sam +
                           " -c T")

        # Start this file's column with "0" for every label known so far.
        column = ["0"] * len(mismatches)

        # "U" (universal newlines) is kept from the original Python 2 code.
        # The with-block closes the file even if a line is malformed.
        with open(sam + "_mismatches.txt", "rU") as handle:
            for record in handle:
                fields = record.split()
                if not fields:
                    continue  # blank line: nothing to record
                label, count = fields[0], fields[1]

                row = row_of.get(label)
                if row is not None:
                    column[row] = count
                    continue

                # New label: add a row and back-fill "0" into the columns of
                # the files processed earlier.
                row_of[label] = len(mismatches)
                mismatches.append(label)
                for earlier in counts:
                    earlier.append("0")
                column.append(count)

        counts.append(column)

    # Header row, then one row per mismatch label.
    out = args.make_mismatch_table
    out.write("\t".join(["mismatches"] + sam_files) + "\n")
    for row, label in enumerate(mismatches):
        out.write("\t".join([label] + [column[row] for column in counts]) +
                  "\n")

    print("")
    # Two spaces after "=": the label's own trailing space plus the separator
    # the original two-item print statement emitted.
    print("Number of sam file in this directory =  %d" % len(sam_files))
    print("")
    print("Script finished...")

    sys.exit(0)