def mapping():
    """
    Align one pair of cleaned reads with STAR.

    The pair is selected from get_paired_reads() with the 1-based index
    given in sys.argv[2]. Output files go to paths.star_outputs and are
    prefixed with the part of the forward read's file name that precedes
    "R1".
    """
    paths = RnaSeqPath()

    # best effort: any OSError from mkdir (e.g. the folder already exists)
    # is ignored
    try:
        os.mkdir(paths.star_outputs)
    except OSError:
        pass

    n_threads = os.cpu_count()

    # the task index on the command line is 1-based
    pair_index = int(sys.argv[2]) - 1
    reads = get_paired_reads()[pair_index]
    forward, reverse = reads[0], reads[1]

    out_put_prefix = os.path.basename(forward).split("R1")[0]

    # arguments for STAR
    star_options = {
        "--runThreadN": n_threads,
        "--genomeDir": paths.hg38_l1_root,
        "--readFilesIn": "{} {}".format(forward, reverse),
        "--outFileNamePrefix": os.path.join(paths.star_outputs,
                                            out_put_prefix),
        "--outSAMtype": "BAM Unsorted",
        "--genomeLoad": "NoSharedMemory",
        # "--readFilesCommand": "zcat"  (left disabled, as in the original)
    }

    # transform the arguments from dict to string and deliver the command
    os.system("STAR {}".format(dic_to_string(star_options)))
def htseq_count():
    """
    Submit one htseq-count job per name-sorted BAM file.

    Every file matching "*sorted.bam.bam" in paths.samtools_sorted gets its
    own qsub job that counts reads against paths.hg38_l1_annotation and
    redirects the result to "<name>_htseq.out" in paths.htseq_outputs,
    where <name> is the BAM file name up to its first ".".
    """
    paths = RnaSeqPath()

    # best effort: any OSError from mkdir (e.g. the folder already exists)
    # is ignored
    try:
        os.mkdir(paths.htseq_outputs)
    except OSError:
        pass

    # NOTE(review): samtools_sorting() in this pipeline names its outputs
    # "*_sorted.bam" and passes -f to samtools sort; confirm that files
    # ending in "sorted.bam.bam" are really what ends up in this folder.
    sorted_bams = glob(os.path.join(paths.samtools_sorted, "*sorted.bam.bam"))
    annotation = paths.hg38_l1_annotation

    for job_no, bam in enumerate(sorted_bams):
        sample = os.path.basename(bam).split(".")[0]
        counts_file = os.path.join(paths.htseq_outputs,
                                   "{}_htseq.out".format(sample))
        count_command = "htseq-count -f bam -r name {} {} > {}".format(
            bam, annotation, counts_file)

        job_script = generate_bash_file(
            filename_base="htseq",
            job_name="htseq_job_{}".format(job_no),
            commands=[
                "module load anaconda3",
                "source activate htseq",
                count_command,
            ])
        qsub(job_script)
def trimmomatic_qsub():
    """
    Submit the trimmomatic array job followed by a clean-up job.

    The array has one task per two "*.fastq.gz" files found in paths.fastq.
    Each task runs trimmomatic.py with the adapter file (passed to the shell
    script as $1) and its $SGE_TASK_ID.
    """
    paths = RnaSeqPath()

    # make the dir for outputs; any OSError from mkdir is ignored
    try:
        os.mkdir(paths.trimmomatic_outputs)
    except OSError:
        pass

    # two fastq files make up one job
    fastq_files = glob(os.path.join(paths.fastq, "*.fastq.gz"))
    n_jobs = len(fastq_files) // 2

    trim_commands = [
        "module load python/3.6.4",
        "python3 trimmomatic.py $1 $SGE_TASK_ID",
    ]
    job_script = generate_bash_file(filename_base="trim",
                                    job_name="rnaSeqTrimmomatic",
                                    threads=2,
                                    job_arr=n_jobs,
                                    commands=trim_commands)

    qsub(job_script, [paths.adapterfa])

    # the clean-up script is generated to hold until the trimming job is done
    qsub(clean(after="rnaSeqTrimmomatic"))
def samtools_filtering():
    """
    Write a filtered copy of one BAM file using `samtools view -b -F 4`.

    The input is chosen through get_file_name() with the 1-based index in
    sys.argv[1]. The result is stored in paths.samtools_outputs as
    "<name>_mapped.bam", where <name> is the input file name up to its
    first ".".
    """
    paths = RnaSeqPath()

    # best effort: any OSError from mkdir (e.g. the folder already exists)
    # is ignored
    try:
        os.mkdir(paths.samtools_outputs)
    except OSError:
        pass

    # the task index on the command line is 1-based
    source_bam = get_file_name(int(sys.argv[1]) - 1)

    sample = os.path.basename(source_bam).split(".")[0]
    mapped_bam = os.path.join(paths.samtools_outputs, sample + "_mapped.bam")

    os.system("samtools view -b -F 4 {} > {}".format(source_bam, mapped_bam))
def main():
    """
    Run fastqc on one batch of fastq files, one worker process per file.

    The "*.fastq.gz" files in paths.fastq are split into batches of
    multiprocessing.cpu_count() files. sys.argv[1] is the 0-based number of
    the batch handled by this invocation. Reports are written to
    paths.fastqc_outputs and the elapsed time is printed at the end.
    """
    start = datetime.now()
    paths = RnaSeqPath()

    # create output folder; any OSError from mkdir (e.g. the folder already
    # exists) is ignored
    outputs_path = paths.fastqc_outputs
    try:
        os.mkdir(outputs_path)
    except OSError:
        pass

    # Sort the file list: every batch is processed by a separate invocation,
    # and all of them must agree on which files belong to which batch.
    file_names = sorted(glob(os.path.join(paths.fastq, "*.fastq.gz")))

    # get the files to run in parallel
    batch_size = mp.cpu_count()
    n_batch = int(sys.argv[1])  # argv[1]th batch

    # Slicing never raises IndexError; a batch number past the end of the
    # list simply yields an empty batch.
    f_start = batch_size * n_batch
    batch_files = file_names[f_start:f_start + batch_size]
    if not batch_files:
        print("No fastq files in batch {}".format(n_batch))
        return

    # create tasks for multiprocessing
    tasks = ["fastqc -o {} {}".format(outputs_path, f) for f in batch_files]

    # os.system is passed directly: the callable given to Pool.map is
    # pickled and sent to the workers, and a lambda cannot be pickled.
    pool = mp.Pool(batch_size)
    try:
        pool.map(os.system, tasks)
    finally:
        # release the worker processes even if a task failed
        pool.close()
        pool.join()

    # print time consumed
    end = datetime.now()
    print("Time consumed: {}".format(end - start))
def clean(after=None):
    """
    Build the shell script that removes temporary files.

    input:
        after - job name passed to generate_bash_file() as hold_jid, so the
                cleaning is held until that job is done
    output:
        path of the shell file containing the cleaning commands
    """
    paths = RnaSeqPath()

    cleaning_commands = [
        "rm -f ./*temp",                    # "*temp" files in the working dir
        "rm -f {}/*".format(paths.temp),    # everything in the temp folder
    ]
    return generate_bash_file(hold_jid=after, commands=cleaning_commands)
def generate_genome_index():
    """
    Build the STAR genome index.

    Runs STAR in "genomeGenerate" mode on paths.hg38_l1_fasta with the
    annotation paths.hg38_l1_annotation, writing the index to
    paths.hg38_l1_root and using every CPU reported by os.cpu_count().
    """
    paths = RnaSeqPath()

    # arguments for STAR
    star_options = {
        "--runThreadN": os.cpu_count(),
        "--runMode": "genomeGenerate",
        "--genomeDir": paths.hg38_l1_root,
        "--genomeFastaFiles": paths.hg38_l1_fasta,
        "--sjdbGTFfile": paths.hg38_l1_annotation,
        "--sjdbOverhang": 150,
    }

    # transform the arguments from dict to string, then deliver the command
    os.system("STAR {}".format(dic_to_string(star_options)))
def samtools_qsub(opt):
    """
    Submit the samtools array job for one step, followed by a clean-up job.

    input:
        opt - "filtering": one task per "*.bam" file in paths.star_outputs
              "sorting":   one task per "*.bam" file in paths.samtools_outputs
    raises:
        ValueError - if opt is not one of the two steps above

    Each task runs samtools.py with its $SGE_TASK_ID and opt as arguments.
    """
    # attribute of RnaSeqPath holding the input folder of every step
    input_folders = {
        "filtering": "star_outputs",
        "sorting": "samtools_outputs",
    }

    # Reject an unknown step before anything is generated or submitted;
    # without this check the function failed later on an unbound variable.
    if opt not in input_folders:
        raise ValueError(
            "unknown samtools step {!r}, expected one of: {}".format(
                opt, ", ".join(sorted(input_folders))))

    paths = RnaSeqPath()
    bam_files = os.path.join(getattr(paths, input_folders[opt]), "*.bam")
    n_jobs = len(glob(bam_files))

    job_name = "samtools_{}".format(opt)
    shell_file = generate_bash_file(
        filename_base="samtools",
        job_name=job_name,
        job_arr=n_jobs,
        commands=[
            "module load samtools/0.1.19",
            "module load python/3.6.4",
            "python3 {} $SGE_TASK_ID {}".format(
                os.path.join(paths.scripts, "samtools.py"), opt)
        ])
    qsub(shell_file)

    # the clean-up script is generated to hold until the array job is done
    qsub(clean(after=job_name))
def get_file_name(index):
    """
    Return the index-th BAM file of the STAR outputs.

    The file list is read from the pickle file "sam_temp" in the working
    directory when that file can be opened. Otherwise the list is built by
    globbing "*.bam" in paths.star_outputs and saved to "sam_temp" before
    the requested entry is returned.

    input:
        index - position in the file list
    output:
        path of the selected BAM file
    """
    paths = RnaSeqPath()

    # reuse the list when "sam_temp" is already there
    try:
        cache = open("sam_temp", "br")
    except OSError:
        cache = None

    if cache is not None:
        with cache:
            cached_files = pk.load(cache)
        return cached_files[index]

    # no readable cache: build the list and store it
    bam_files = glob(os.path.join(paths.star_outputs, "*.bam"))
    with open("sam_temp", "bw") as cache:
        pk.dump(bam_files, cache)
    return bam_files[index]
def samtools_sorting():
    """
    Sort one filtered BAM file with `samtools sort -n -f`.

    The list of "*mapped.bam" files in paths.samtools_outputs is kept in
    the pickle file "samtools_sort_temp" in the working directory; it is
    created on first use and read back afterwards. The file to sort is
    picked with the 1-based index in sys.argv[1] and the result is written
    to paths.samtools_sorted as "<name>_sorted.bam", where <name> is the
    input file name up to its first ".".
    """
    paths = RnaSeqPath()
    list_cache = "samtools_sort_temp"

    # best effort: any OSError from mkdir (e.g. the folder already exists)
    # is ignored
    try:
        os.mkdir(paths.samtools_sorted)
    except OSError:
        pass

    if not os.path.exists(list_cache):
        mapped_bams = glob(os.path.join(paths.samtools_outputs,
                                        "*mapped.bam"))
        with open(list_cache, "bw") as handle:
            pk.dump(mapped_bams, handle)
    else:
        with open(list_cache, "br") as handle:
            mapped_bams = pk.load(handle)

    # the task index on the command line is 1-based
    source_bam = mapped_bams[int(sys.argv[1]) - 1]

    sample = os.path.basename(source_bam).split(".")[0]
    sorted_bam = os.path.join(paths.samtools_sorted, sample + "_sorted.bam")

    os.system("samtools sort -n -f {} {}".format(source_bam, sorted_bam))
import os
import sys
from utils import generate_bash_file, qsub, clean
from paths import RnaSeqPath
from glob import glob
from math import ceil

paths = RnaSeqPath()

# shell file shared by all fastqc submissions; the batch number reaches
# fastqc.py through $1
fastqc_commands = [
    "module load fastqc",
    "module load python/3.6.4",
    "python3 fastqc.py $1",
]
sh_file = generate_bash_file(filename_base="fastqc",
                             job_name="rnaSeqFastqc",
                             commands=fastqc_commands)

# one submission per batch of `threads` fastq files, where `threads` is the
# CPU count of the machine running this script
threads = os.cpu_count()
fastq_files = glob(os.path.join(paths.fastq, "*.fastq.gz"))
n_batches = int(ceil(len(fastq_files) / threads))

# submit jobs to hpc, passing the 0-based batch number as the argument
for batch_no in range(n_batches):
    qsub(sh_file, [batch_no])

# clean the temporary shell files after the main job is done
qsub(clean(after="rnaSeqFastqc"))
def _pair_fastq_files(files):
    """Pair every forward ("R1") file in files with its mate; return tuples."""
    remaining = list(files)
    pairs = []
    while remaining:
        # position of the next forward file; only the file name is checked,
        # so an "R1" somewhere in the folder path cannot cause a false match
        forward_idx = next(
            (idx for idx, name in enumerate(remaining)
             if "R1" in os.path.basename(name)),
            None)
        if forward_idx is None:
            # only files without a forward partner are left
            break
        forward = remaining.pop(forward_idx)

        # the identical part in the file names of the paired files: the name
        # up to its first "." minus its last 7 characters
        # (presumably a suffix such as "_R1_001" - TODO confirm)
        pair_id = os.path.basename(forward).split(".")[0][:-7]

        for idx, candidate in enumerate(remaining):
            candidate_name = os.path.basename(candidate)
            if pair_id in candidate_name and "cleaned" in candidate_name:
                pairs.append((forward, candidate))
                del remaining[idx]
                break
        # a forward file without a mate is dropped; since every pass removes
        # at least one file the loop always terminates
    return pairs


def get_paired_reads(path=None):
    """
    Find the paired-end RNA-seq forward and reverse read files.

    The result is cached in the pickle file "star_temp" inside paths.temp.
    When that file can be opened its content is returned directly, so that
    all processes work on the same list.

    arguments:
        path - folder containing the "*.cleaned.fastq" input files;
               paths.trimmomatic_outputs is used when it is not given
    return:
        list of (forward, reverse) tuples
    """
    paths = RnaSeqPath()
    cache_file = os.path.join(paths.temp, "star_temp")

    # open the 'star_temp' file and return the file pair list
    # if another process has created it
    # NOTE(review): pickle must only be used on trusted files; the cache is
    # expected to be written by this pipeline only.
    try:
        f = open(cache_file, "rb")
    except OSError:
        pass
    else:
        with f:
            return pk.load(f)

    # get the paths to all cleaned fastq files; sorted so that the order of
    # the pairs does not depend on the directory listing order
    fastq_folder = path if path else paths.trimmomatic_outputs
    files = sorted(glob(os.path.join(fastq_folder, "*.cleaned.fastq")))

    paired_files = _pair_fastq_files(files)

    # write the list of paired files into the cache file
    with open(cache_file, "wb") as f:
        pk.dump(paired_files, f)
    return paired_files
def generate_bash_file(filename_base="_qsub_temp", job_name=None,
                       threads=None, mem_free=None, job_arr=None,
                       out_log=None, err_log=None, hold_jid=None,
                       commands=None):
    """
    Automatically generate a bash file for qsub.

    The file is created in paths.temp and starts with "#!/bin/bash",
    followed by one "#$ ..." line per requested option and then the
    commands, one per line.

    inputs:
        filename_base - prefix of the generated file name
        job_name      - written as "-N <job_name>"
        threads       - written as "-pe openmpi 1-<threads>"
        mem_free      - written as "-l mem_free=<mem_free>"
        job_arr       - number of array tasks, written as "-t 1-<job_arr>"
        out_log       - file name (inside paths.qsub_outputs) for "-o";
                        "$JOB_NAME_$JOB_ID.out" when not given
        err_log       - file name (inside paths.qsub_outputs) for "-e";
                        "$JOB_NAME_$JOB_ID.err" when not given
        hold_jid      - written as "-hold_jid <hold_jid>"
        commands      - list of shell commands; None means no commands
    return:
        full path to the file.
    """
    paths = RnaSeqPath()
    temp = paths.temp

    # best effort: any OSError from mkdir (e.g. the folder already exists)
    # is ignored
    try:
        os.mkdir(temp)
    except OSError:
        pass

    # add timestamp to shell file name; the clock is read once so that all
    # parts of the stamp belong to the same instant
    now = datetime.now()
    timestamp = "{}{}{}{}".format(now.day, now.hour, now.minute,
                                  now.microsecond)
    file_path = os.path.join(temp,
                             "{}_{}.sh".format(filename_base, timestamp))

    header = ["#!/bin/bash"]
    if job_name:
        header.append("#$ -N {}".format(job_name))
    if threads:
        header.append("#$ -pe openmpi 1-{}".format(threads))
    if mem_free:
        header.append("#$ -l mem_free={}".format(mem_free))
    if job_arr:
        header.append("#$ -t 1-{}".format(job_arr))

    header.append("#$ -o {}".format(
        os.path.join(paths.qsub_outputs,
                     out_log if out_log else "$JOB_NAME_$JOB_ID.out")))
    # the error log path is built from err_log (it used to be built from
    # out_log by mistake)
    header.append("#$ -e {}".format(
        os.path.join(paths.qsub_outputs,
                     err_log if err_log else "$JOB_NAME_$JOB_ID.err")))

    if hold_jid:
        header.append("#$ -hold_jid {}".format(hold_jid))

    # create file
    with open(file_path, "w") as f:
        f.write("\n".join(header) + "\n")
        f.write("\n".join(commands if commands is not None else []))
    return file_path
def star_qsub(job):
    """
    Submit a STAR job to the cluster.

    input:
        job - string, "indexing" or "mapping"; any other value submits
              nothing
    outputs:
        indexing - a single job running "star.py indexing"
        mapping  - an array job running "star.py mapping $SGE_TASK_ID", one
                   task per two "*.cleaned.fastq" files found in
                   paths.trimmomatic_outputs, followed by a clean-up job
    """
    paths = RnaSeqPath()

    # STAR genome indexing job
    if job == "indexing":
        indexing_commands = [
            "module load star",
            "module load python/3.6.4",
            "python3 {} indexing".format(
                os.path.join(paths.scripts, 'star.py')),
        ]
        # create the shell file and submit it to hpc
        qsub(generate_bash_file(job_name="star_indexing",
                                threads=4,
                                out_log="star_indexing.out",
                                err_log="star_indexing.err",
                                commands=indexing_commands))
        return

    if job != "mapping":
        return

    # STAR RNA-seq alignment job
    # best effort: any OSError from mkdir (e.g. the folder already exists)
    # is ignored
    try:
        os.mkdir(paths.star_outputs)
    except OSError:
        pass

    mapping_commands = [
        "module load star",
        "module load python/3.6.4",
        "python3 {} mapping $SGE_TASK_ID".format(
            os.path.join(paths.scripts, 'star.py')),
    ]

    # two trimmomatic output files make up one job
    cleaned_reads = glob(
        os.path.join(paths.trimmomatic_outputs, "*.cleaned.fastq"))
    n_jobs = len(cleaned_reads) // 2

    # create the shell file and submit it to hpc
    qsub(generate_bash_file(job_name="star_mapping",
                            mem_free="35G",
                            threads=8,
                            job_arr=n_jobs,
                            commands=mapping_commands))

    # clean temp files once the mapping job is done
    qsub(clean(after="star_mapping"))