def get_picard_string(basedir, bam_file, aligned_files_dir, base_filename):
    metrics_dir = aligned_files_dir + "picard_metrics/"
    cluster.check_dir(metrics_dir)
    sge_filename = basedir + "sge_files/picard_" + base_filename + ".sge"
    picard_string1 = 'PICARD_SETTINGS="VERBOSITY=WARNING QUIET=true VALIDATION_STRINGENCY=LENIENT MAX_RECORDS_IN_RAM=2500000"'
    picard_string2 = "java -Xmx16G -jar ${PICARD_ROOT}/MarkDuplicates.jar $PICARD_SETTINGS REMOVE_DUPLICATES=true ASSUME_SORTED=false CREATE_INDEX=true INPUT=" + bam_file + " OUTPUT=" + aligned_files_dir + base_filename + ".dedup.bam METRICS_FILE=" + metrics_dir + base_filename + ".dups.txt"
    return picard_string1, picard_string2
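# Note that get_picard_string computes sge_filename but never writes it; submission
# happens elsewhere via cluster.qsub_sge_file (commented out in the pipeline
# functions below). A minimal sketch of how command strings like these end up in a
# job script, assuming a plain bash/SGE setup; write_sge_script is hypothetical and
# not part of this pipeline:
def write_sge_script(sge_filename, command_list, threads="1"):
    with open(sge_filename, "w") as sge:
        sge.write("#!/bin/bash\n")
        sge.write("#$ -cwd\n")
        sge.write("#$ -pe threaded " + threads + "\n")  # parallel environment name is an assumption
        for command in command_list:
            sge.write(command + "\n")  # e.g. picard_string1, then picard_string2
    return sge_filename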
def get_htseqcount_string(basedir, sorted_bam_filename, gtf_file, sample):
    htseq_counts_dir = basedir + "htseq_counts_third/"
    cluster.check_dir(htseq_counts_dir)
    # use this with gtf file:
    htseq_string = "htseq-count --stranded=no --format=bam " + sorted_bam_filename + " " + gtf_file + " > " + htseq_counts_dir + sample + "_counts.txt"
    # use this with gff file:
    #htseq_string="htseq-count --stranded=no --format=bam --idattr=ID --type=gene "+sorted_bam_filename+" "+gtf_file+" > "+htseq_counts_dir+sample+"_counts.txt"
    return htseq_string
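# The commented variant above differs only in the flags GFF input needs. A hedged
# sketch folding both into one helper; the gff_mode parameter is hypothetical and
# not part of the original pipeline:
def build_htseqcount_command(sorted_bam_filename, annotation_file, counts_filename, gff_mode=False):
    command = "htseq-count --stranded=no --format=bam "
    if gff_mode:
        # GFF gene records are keyed by ID= rather than gene_id, hence the extra flags
        command += "--idattr=ID --type=gene "
    return command + sorted_bam_filename + " " + annotation_file + " > " + counts_filename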
def tuxedo_pipeline_tophat_remdup_cufflinks(basedir,files_dictionary,files_dir,annotation,reference,reference_fasta):
    threads="8"
    process_name="tophat"
    #module_list=["cufflinks/2.2.0"]
    module_list=["bowtie2/2.1.0","tophat/2.0.9","cufflinks/2.2.0","samtools","picard-tools/1.88"]
    # makes a bunch of directories for file output:
    #tuxedo_files_dir=basedir+"tuxedo_4_Steve_gtf/"
    #tuxedo_files_dir=basedir+"tuxedo_3_UCSC/"
    #tuxedo_files_dir=basedir+"tuxedo_2/"
    tuxedo_files_dir=basedir+"tuxedo/"
    cluster.check_dir(tuxedo_files_dir)
    #print tuxedo_files_dir
    tophat_dir=tuxedo_files_dir+"tophat/"
    cluster.check_dir(tophat_dir)
    #print tophat_dir
    cufflinks_dir=tuxedo_files_dir+"cufflinks/"
    cluster.check_dir(cufflinks_dir)
    #print cufflinks_dir
    # dictionaries to be populated with per-sample cufflinks transcripts and dedup bam files:
    transcripts_output={}
    duplicates_removed_bam_files={}
    for sample in files_dictionary.keys():
        file_list=files_dictionary[sample]
        fastq_filename1=file_list[0]
        fastq_filename2=file_list[1]
        #print file_list
        base_filename=files.get_base_filename(fastq_filename1)
        #print base_filename
        # Separate directories have to be made in each process dir
        # because the output files are all generically named, e.g. transcripts.gtf, accepted_hits.bam, etc.
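        # e.g. (illustrative layout; sample names hypothetical):
        #   tuxedo/tophat/<base_filename>/accepted_hits.bam
        #   tuxedo/cufflinks/<base_filename>/transcripts.gtf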
        
        # start tuxedo pipeline:
        # tophat --> remove_duplicates (2 picardtools strings) --> cufflinks --> cuffcompare --> cuffmerge --> cuffquant --> cuffdiff
        # 1. tophat
        tophat_outputdir=tophat_dir+base_filename+"/"
        cluster.check_dir(tophat_outputdir)
        #print tophat_outputdir
        fastq_files_string=files.get_file_string(file_list)
        #print fastq_files_string[1:]
        tophat_string=tuxedo.get_tophat_string_annotation(reference,annotation,fastq_files_string[1:],tophat_outputdir)
        tophat_output=tuxedo.get_tophat_output(tophat_outputdir)
        # 2. Remove duplicates with picardtools
        picard_string1,picard_string2=samtools_picard.get_picard_string(basedir,tophat_output,tophat_outputdir,base_filename)
        duplicates_removed_bam=tuxedo.get_dupremoved_output(base_filename,tophat_outputdir)
        # 3. cufflinks
        cufflinks_outputdir=cufflinks_dir+base_filename+"/"
        cluster.check_dir(cufflinks_outputdir)
        cufflinks_output=cufflinks_outputdir+"transcripts.gtf"
        transcripts_output[sample]=cufflinks_output
        duplicates_removed_bam_files[sample]=duplicates_removed_bam
        cufflinks_string=tuxedo.get_cufflinks_string(cufflinks_outputdir,duplicates_removed_bam)
        #print cufflinks_output
        #print os.path.isfile(cufflinks_output)
        process_list=[tophat_string,picard_string1,picard_string2,cufflinks_string]
        #cluster.qsub_sge_file(basedir,process_name,module_list,base_filename,process_list,threads)
    #return transcripts_output
    return duplicates_removed_bam_files
def run_cuffdiff(basedir,cxb_output,reference_fasta,cuffmerge_file):
    cuffdiff_outputdir=basedir+"tuxedo/cuffdiff/"
    cluster.check_dir(cuffdiff_outputdir)
    Mutant,Control=get_lists_of_files_by_group(cxb_output)
    labels="Mutant,Control"
    groupsoffiles=groups_files(Mutant,Control)
    cuffdiff_string=tuxedo.get_cuffdiff_string(basedir,cuffdiff_outputdir,groupsoffiles,reference_fasta,labels,cuffmerge_file)
    threads="32"
    process_name="cuffdiff"
    module_list=["cufflinks/2.2.0"]
    base_filename="dasen_MutantvsControl"
    process_list=[cuffdiff_string]
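    # as in the other steps, submission is left commented out:
    #cluster.qsub_sge_file(basedir,process_name,module_list,base_filename,process_list,threads)

# groups_files (used above) is defined elsewhere in this module. cuffdiff expects
# one comma-separated list of .cxb files per condition, with conditions separated
# by a space, so a hedged sketch of what it likely does:
def groups_files_sketch(mutant_files, control_files):
    # e.g. "m1.cxb,m2.cxb c1.cxb,c2.cxb" to pair with labels "Mutant,Control"
    return ",".join(mutant_files) + " " + ",".join(control_files)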
def tuxedo_pipeline_cuffmerge(basedir,annotation,reference,reference_fasta,assembly_filename):
    threads="8"
    process_name="cuffmerge"
    module_list=["cufflinks/2.2.0"]
    base_filename="dasen2"
    cuffmerge_dir=basedir+"tuxedo/cufflinks/merged_asm/"
    cluster.check_dir(cuffmerge_dir)
    #print cuffmerge_dir
    # 5. cuffmerge - THIS HAS TO BE RUN AFTER CUFFLINKS AS A SEPARATE STEP, JUST ONCE!!!
    # produces a GTF file that contains all merged assemblies
    cuffmerge_string=tuxedo.get_cuffmerge_string(reference_fasta,annotation,assembly_filename,cuffmerge_dir)
    cuffmerge_output=cuffmerge_dir+"merge/merged.gtf"
    process_list=[cuffmerge_string]
    #cluster.qsub_sge_file(basedir,process_name,module_list,base_filename,process_list,threads)
    return cuffmerge_output
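# cuffmerge reads assembly_filename as a text file listing one cufflinks
# transcripts.gtf per line. A minimal sketch of producing it from the
# transcripts_output dictionary built above (write_assembly_list is hypothetical):
def write_assembly_list(assembly_filename, transcripts_output):
    with open(assembly_filename, "w") as listfile:
        for sample in sorted(transcripts_output.keys()):
            listfile.write(transcripts_output[sample] + "\n")
    return assembly_filename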
def get_RnaSeqMetrics(basedir, bam_file, aligned_files_dir, base_filename):
    # picard RnaSeqMetrics
    # http://broadinstitute.github.io/picard/picard-metric-definitions.html#RnaSeqMetrics
    metrics_dir = aligned_files_dir + "picard_metrics/"
    cluster.check_dir(metrics_dir)
    metrics_txt = metrics_dir + base_filename + ".txt"
    metrics_pdf = metrics_dir + base_filename + ".pdf"
    process_list = []
    # settings
    process_list.append(
        'REFFLAT="/local/data/iGenomes/Mus_musculus/Ensembl/NCBIM37/Annotation/Genes/refFlat.txt.gz"'
    )
    process_list.append(
        'PICARD_SETTINGS="VERBOSITY=WARNING QUIET=true VALIDATION_STRINGENCY=LENIENT MAX_RECORDS_IN_RAM=2500000"'
    )
    process_list.append(
        "java -Xmx16G -jar ${PICARD_ROOT}/CollectRnaSeqMetrics.jar \\")
    process_list.append("$PICARD_SETTINGS \\")
    process_list.append("REF_FLAT=${REFFLAT} \\")
    process_list.append("STRAND_SPECIFICITY=NONE \\")
    process_list.append("INPUT=" + bam_file + " \\")
    process_list.append("CHART_OUTPUT=" + metrics_pdf + " \\")
    process_list.append("OUTPUT=" + metrics_txt)
    return process_list
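# The trailing " \" continuations above make the java invocation span several
# process_list entries, which the job script re-joins line by line. A hedged
# sketch of flattening such a list back into whole shell commands for logging
# (flatten_shell_commands is hypothetical):
def flatten_shell_commands(process_list):
    commands, current = [], ""
    for line in process_list:
        if line.endswith("\\"):
            current += line[:-1].strip() + " "  # accumulate continued lines
        else:
            commands.append((current + line).strip())
            current = ""
    return commands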
def get_alignment_data(sample_data_dictionary,aligned_files_dir):
    # fragment: the original lines defining header and alignment_table_filename are cut off
    with open(alignment_table_filename,"w") as datafile:
        datafile.write("\t".join(header))
        datafile.write("\n")
        for sample in sample_data_dictionary:
            print "Sample:",sample
            filename=sample_data_dictionary[sample]
            data_list=tuxedo.get_bowtie1_alignment_data(filename)
            print data_list
            duplicates_removed=get_duplicates_removed(aligned_files_dir,sample)
            datafile.write(sample+"\t")
            datafile.write(duplicates_removed+"\t")
            datafile.write("\t".join(data_list))
            datafile.write("\n")
    print "Alignment stats written:",alignment_table_filename

filesdir="/ifs/home/cohenl06/data/sequencing/dasen/thoracic/fastq/"
basedir="/ifs/home/cohenl06/data/sequencing/dasen/thoracic/htseq/"
reference="/local/data/iGenomes/Mus_musculus/Ensembl/NCBIM37/Sequence/BowtieIndex/genome"
annotation="/local/data/iGenomes/Mus_musculus/Ensembl/NCBIM37/Annotation/Genes/genes.gtf"
aligned_files_dir=basedir+"bowtie1_aligned_files/"
cluster.check_dir(aligned_files_dir)
fileslist_all=os.listdir(filesdir)
fileslist=get_fileslist(fileslist_all,filesdir)
print fileslist
#gunzip_files_list=gunzip_files(basedir,fileslist)
files_dictionary=get_file_dictionaries(fileslist)
print files_dictionary
run_bowtie1_htseq(files_dictionary,annotation,reference,basedir,aligned_files_dir)
#get_alignment_data(get_sample_data(),aligned_files_dir)

filesdir="/ifs/home/cohenl06/data/sequencing/dasen/thoracic/fastq/"
basedir="/ifs/home/cohenl06/data/sequencing/dasen/thoracic/htseq/"
reference="/phoenix/iGenomes/Mus_musculus/Ensembl/NCBIM37/Sequence/BowtieIndex/genome"
annotation="/phoenix/iGenomes/Mus_musculus/Ensembl/NCBIM37/Annotation/Genes/genes.gtf"
aligned_files_dir=basedir+"bowtie1_aligned_files/"
cluster.check_dir(aligned_files_dir)
fileslist_all=os.listdir(filesdir)
fileslist=get_fileslist(fileslist_all)
print fileslist
gunzip_files_list=gunzip_files(basedir,fileslist)
files_dictionary=get_file_dictionaries(gunzip_files_list)
print files_dictionary
run_bowtie1_htseq(files_dictionary,annotation,reference,basedir,aligned_files_dir)
#get_alignment_data(get_sample_data(),aligned_files_dir)
def tuxedo_pipeline_cuffcompare_cuffquant(basedir,transcripts_output,annotation,reference,reference_fasta,cuffmerge_file):
    threads="4"
    process_name="cuffquant"
    #module_list=["cufflinks/2.2.0"]
    module_list=["cufflinks/2.2.0"]
    # makes a bunch of directories for file output:
    tuxedo_files_dir=basedir+"tuxedo/"
    cluster.check_dir(tuxedo_files_dir)
    #print tuxedo_files_dir
    tophat_dir=tuxedo_files_dir+"tophat/"
    cluster.check_dir(tophat_dir)
    #print tophat_dir
    cufflinks_dir=tuxedo_files_dir+"cufflinks/"
    cluster.check_dir(cufflinks_dir)
    #print cufflinks_dir
    #print os.path.isfile(cufflinks_output)
    cuffcompare_dir=tuxedo_files_dir+"cuffcompare/"
    cluster.check_dir(cuffcompare_dir)
    #print cuffcompare_dir
    cuffdiff_outputdir=tuxedo_files_dir+"cuffdiff/"
    cluster.check_dir(cuffdiff_outputdir)
    #print cuffdiff_outputdir
    cuffquant_dir=tuxedo_files_dir+"cuffquant/"
    cluster.check_dir(cuffquant_dir)
    #print cuffquant_dir
    cxb_output={}
    for sample in transcripts_output.keys():
        base_filename=sample
        tophat_outputdir=tophat_dir+base_filename+"/"
        duplicates_removed_bam=tuxedo.get_dupremoved_output(base_filename,tophat_outputdir)
        # 4. cuffcompare
        #cuffcompare_outputdir=cuffcompare_dir+base_filename+"/"
        #cluster.check_dir(cuffcompare_outputdir)
        #cuffcompare_string=tuxedo.get_cuffcompare_string(annotation,cuffcompare_outputdir,cufflinks_outputdir)
        # 6. cuffquant
        cuffquant_outputdir=cuffquant_dir+base_filename+"/"
        cluster.check_dir(cuffquant_outputdir)
        cuffquant_string=tuxedo.get_cuffquant_string(cuffquant_outputdir,duplicates_removed_bam,cuffmerge_file)
        # populates a dictionary with cuffquant filenames to be used in cuffdiff
        cxb_filename=tuxedo.get_cuffquant_cxb_output(cuffquant_outputdir)
        #cxb_output[sample]=duplicates_removed_bam
        cxb_output[sample]=cxb_filename
        # processes: tophat --> remove_duplicates (2 picardtools strings) --> cufflinks --> cuffcompare --> cuffmerge --> cuffquant --> cuffdiff
        process_list=[cuffquant_string]
        #cluster.qsub_sge_file(basedir,process_name,module_list,base_filename,process_list,threads)

    return cxb_output
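# Taken together, the functions above implement the commented chain
# (tophat -> remove duplicates -> cufflinks -> cuffmerge -> cuffquant -> cuffdiff).
# A hedged sketch of the intended end-to-end order; it assumes assembly_filename
# lists each sample's transcripts.gtf, and that the cuffquant step only needs the
# sample keys of the dictionary it is passed (as its loop above suggests):
def run_tuxedo_end_to_end(basedir, files_dictionary, files_dir, annotation, reference, reference_fasta, assembly_filename):
    bam_files = tuxedo_pipeline_tophat_remdup_cufflinks(basedir, files_dictionary, files_dir, annotation, reference, reference_fasta)
    cuffmerge_file = tuxedo_pipeline_cuffmerge(basedir, annotation, reference, reference_fasta, assembly_filename)
    cxb_output = tuxedo_pipeline_cuffcompare_cuffquant(basedir, bam_files, annotation, reference, reference_fasta, cuffmerge_file)
    run_cuffdiff(basedir, cxb_output, reference_fasta, cuffmerge_file)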
## reference files:
reference="/phoenix/iGenomes/Mus_musculus/Ensembl/NCBIM37/Sequence/Bowtie2Index/genome"
annotation="/phoenix/iGenomes/Mus_musculus/Ensembl/NCBIM37/Annotation/Genes/genes.gtf"
reference_fasta="/phoenix/iGenomes/Mus_musculus/Ensembl/NCBIM37/Sequence/Bowtie2Index/genome.fa"

#reference="/phoenix/iGenomes/Mus_musculus/UCSC/mm9/Sequence/Bowtie2Index/genome"
#annotation="/phoenix/iGenomes/Mus_musculus/UCSC/mm9/Annotation/Genes/genes.gtf"
#reference_fasta="/phoenix/iGenomes/Mus_musculus/UCSC/mm9/Sequence/Bowtie2Index/genome.fa"

#annotation="/ifs/data/sequence/share/GTC/Steve/mm9.gtf"

# fastq files:
results_files_dir="/ifs/data/sequence/results/dasenlab/2014-08-27/fastq/"
files_dir="/ifs/home/cohenl06/data/sequencing/dasen/merged/"
basedir="/ifs/home/cohenl06/data/sequencing/dasen/"
cluster.check_dir(files_dir)
# this is the list of raw files we're working with:
fileslist=os.listdir(results_files_dir)

# this will take raw files from the original directory, merge, and output to subdirectory in my home
files=dasen_files.merge_files(results_files_dir,files_dir,fileslist)
#print files

# need to write a different function so that merge_files doesn't have to be re-run each time;
# files need to be indexed in order, with R1 and R2 paired by sample (see the sketch below)

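# A hedged sketch of the helper the comment above asks for: reuse the already
# merged fastq files in files_dir instead of re-running merge_files, pairing R1
# and R2 by sample. The _R1/_R2 naming convention is an assumption about the
# merged filenames:
def get_merged_file_dictionary(files_dir):
    merged = {}
    for filename in sorted(os.listdir(files_dir)):
        if "_R1" in filename or "_R2" in filename:
            sample = filename.split("_R")[0]
            merged.setdefault(sample, []).append(files_dir + filename)
    return merged  # each value is [R1, R2] because of the sorted() walk
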
#transcripts_files=tuxedo_dasen.tuxedo_pipeline_tophat_remdup_cufflinks(basedir,files,files_dir,annotation,reference,reference_fasta)
duplicates_removed_bam=tuxedo_dasen.tuxedo_pipeline_tophat_remdup_cufflinks(basedir,files,files_dir,annotation,reference,reference_fasta)
print duplicates_removed_bam

igv.run_igvtools(basedir,duplicates_removed_bam)