コード例 #1
0
ファイル: star.py プロジェクト: thomasly/rnaSeqAnalysis
def mapping():
    """
    Align one pair of reads with STAR.

    The pair is picked from the list returned by get_paired_reads() using
    the 1-based index found in sys.argv[2]. Output files are written to
    the star_outputs folder, prefixed with the part of the forward read's
    file name that precedes "R1".
    """

    rna_paths = RnaSeqPath()
    # the output folder may already be there; ignore the failure
    try:
        os.mkdir(rna_paths.star_outputs)
    except OSError:
        pass

    cpu_total = os.cpu_count()
    all_pairs = get_paired_reads()
    pair = all_pairs[int(sys.argv[2]) - 1]
    forward = pair[0]
    prefix = os.path.basename(forward).split("R1")[0]
    reverse = pair[1]

    # STAR command line options, in the order they are handed to
    # dic_to_string ("--readFilesCommand": "zcat" is intentionally left out)
    star_options = {}
    star_options["--runThreadN"] = cpu_total
    star_options["--genomeDir"] = rna_paths.hg38_l1_root
    star_options["--readFilesIn"] = "{} {}".format(forward, reverse)
    star_options["--outFileNamePrefix"] = os.path.join(
        rna_paths.star_outputs, prefix)
    star_options["--outSAMtype"] = "BAM Unsorted"
    star_options["--genomeLoad"] = "NoSharedMemory"

    # run STAR through the shell
    os.system("STAR {}".format(dic_to_string(star_options)))
コード例 #2
0
ファイル: htseq_qsub.py プロジェクト: thomasly/rnaSeqAnalysis
def htseq_count():
    """
    Submit one htseq-count job per sorted BAM file.

    Every file matching "*sorted.bam.bam" in the samtools_sorted folder
    gets its own generated shell script, which is handed to qsub. The
    counts are redirected to "<name>_htseq.out" in the htseq_outputs
    folder, where <name> is the BAM file name up to its first dot.
    """

    rna_paths = RnaSeqPath()

    # the output folder may already be there; ignore the failure
    try:
        os.mkdir(rna_paths.htseq_outputs)
    except OSError:
        pass

    pattern = os.path.join(rna_paths.samtools_sorted, "*sorted.bam.bam")
    sorted_bams = glob(pattern)
    annotation = rna_paths.hg38_l1_annotation

    for job_no, bam_path in enumerate(sorted_bams):
        sample = os.path.basename(bam_path).split(".")[0]
        counts_path = os.path.join(rna_paths.htseq_outputs,
                                   "{}_htseq.out".format(sample))
        count_command = "htseq-count -f bam -r name {} {} > {}".format(
            bam_path, annotation, counts_path)
        script = generate_bash_file(
            filename_base="htseq",
            job_name="htseq_job_{}".format(job_no),
            commands=[
                "module load anaconda3",
                "source activate htseq",
                count_command,
            ])
        qsub(script)
コード例 #3
0
def trimmomatic_qsub():
    """
    Submit the trimmomatic array job, followed by a cleanup job.

    The array holds one task for every two "*.fastq.gz" files found in
    the fastq folder. The adapter file path is passed to the generated
    script as its first argument.
    """

    rna_paths = RnaSeqPath()
    # the output folder may already be there; ignore the failure
    try:
        os.mkdir(rna_paths.trimmomatic_outputs)
    except OSError:
        pass

    # two fastq files make up one job
    fastq_files = glob(os.path.join(rna_paths.fastq, "*.fastq.gz"))
    pair_count = len(fastq_files) // 2

    trim_commands = [
        "module load python/3.6.4",
        "python3 trimmomatic.py $1 $SGE_TASK_ID",
    ]
    script = generate_bash_file(filename_base="trim",
                                job_name="rnaSeqTrimmomatic",
                                threads=2,
                                job_arr=pair_count,
                                commands=trim_commands)
    qsub(script, [rna_paths.adapterfa])

    # the cleanup script is generated to wait for the trimming job
    cleanup_script = clean(after="rnaSeqTrimmomatic")
    qsub(cleanup_script)
コード例 #4
0
ファイル: samtools.py プロジェクト: thomasly/rnaSeqAnalysis
def samtools_filtering():
    """
    Filter one BAM file with "samtools view -b -F 4".

    The input is chosen through get_file_name() with the 1-based index in
    sys.argv[1]. The result is written to the samtools_outputs folder as
    "<name>_mapped.bam", where <name> is the input file name up to its
    first dot.
    """

    rna_paths = RnaSeqPath()
    # the output folder may already be there; ignore the failure
    try:
        os.mkdir(rna_paths.samtools_outputs)
    except OSError:
        pass

    task_index = int(sys.argv[1]) - 1
    source_bam = get_file_name(task_index)
    sample = os.path.basename(source_bam).split(".")[0]
    target_bam = os.path.join(rna_paths.samtools_outputs,
                              sample + "_mapped.bam")
    os.system("samtools view -b -F 4 {} > {}".format(source_bam, target_bam))
コード例 #5
0
ファイル: fastqc.py プロジェクト: thomasly/rnaSeqAnalysis
def main():
    """
    Run fastqc on one batch of fastq files, one worker process per CPU.

    The 0-based batch number is read from sys.argv[1]. A batch holds as
    many files as this machine has CPUs; the files are the "*.fastq.gz"
    entries of the fastq folder, in sorted order so that every batch job
    slices the same list. Reports are written to the fastqc_outputs
    folder. Prints the elapsed time when done.
    """
    start = datetime.now()
    paths = RnaSeqPath()
    data_path = paths.fastq

    # create output folder (it may already exist)
    outputs_path = paths.fastqc_outputs
    try:
        os.mkdir(outputs_path)
    except OSError:
        pass

    # glob returns files in arbitrary order; sort so that batches cut by
    # independent jobs never overlap or skip a file
    file_names = sorted(glob(os.path.join(data_path, "*.fastq.gz")))

    # pick the files of this batch
    batch_size = mp.cpu_count()
    n_batch = int(sys.argv[1])  # argv[1]th batch
    f_start = batch_size * n_batch
    batch_files = file_names[f_start:f_start + batch_size]
    # slicing never raises IndexError; an out-of-range batch is just empty
    if not batch_files:
        print("No fastq files in batch {}".format(n_batch))
        return

    # one shell command per file
    tasks = ["fastqc -o {} {}".format(outputs_path, f) for f in batch_files]

    # os.system is passed directly: Pool.map has to pickle the callable,
    # and a lambda cannot be pickled
    pool = mp.Pool(batch_size)
    try:
        pool.map(os.system, tasks)
    finally:
        pool.close()
        pool.join()

    # print time consumed
    end = datetime.now()
    print("Time consumed: {}".format(end - start))
コード例 #6
0
def clean(after=None):
    """
    Build the shell script that removes temporary files.

    input:
    after - job name passed to generate_bash_file as hold_jid, so the
            cleanup is held until that job is done

    output:
    path of the shell file holding the cleanup commands
    """

    temp_dir = RnaSeqPath().temp
    # files ending in "temp" in the working dir, then the temp folder
    cleanup_commands = ["rm -f ./*temp", "rm -f {}/*".format(temp_dir)]
    return generate_bash_file(hold_jid=after, commands=cleanup_commands)
コード例 #7
0
ファイル: star.py プロジェクト: thomasly/rnaSeqAnalysis
def generate_genome_index():
    """
    Build the STAR genome index.

    STAR is run in genomeGenerate mode with one thread per CPU, using the
    fasta and annotation files of the hg38_l1 genome; the index is written
    to the genome root folder.
    """

    cpu_total = os.cpu_count()
    rna_paths = RnaSeqPath()

    # STAR options, in the order they are handed to dic_to_string
    star_options = {}
    star_options["--runThreadN"] = cpu_total
    star_options["--runMode"] = "genomeGenerate"
    star_options["--genomeDir"] = rna_paths.hg38_l1_root
    star_options["--genomeFastaFiles"] = rna_paths.hg38_l1_fasta
    star_options["--sjdbGTFfile"] = rna_paths.hg38_l1_annotation
    star_options["--sjdbOverhang"] = 150

    # turn the options into a command line and run it through the shell
    os.system("STAR {}".format(dic_to_string(star_options)))
コード例 #8
0
def samtools_qsub(opt):
    """
    Submit a samtools array job, followed by a cleanup job.

    input:
    opt - "filtering" (one task per "*.bam" file in the star_outputs
          folder) or "sorting" (one task per "*.bam" file in the
          samtools_outputs folder). The value is also passed on to
          samtools.py as its second argument.

    raises:
    ValueError - if opt is neither "filtering" nor "sorting"
    """

    # reject unknown jobs up front instead of failing later on an
    # unassigned variable
    if opt not in ("filtering", "sorting"):
        raise ValueError(
            "opt must be 'filtering' or 'sorting', got {!r}".format(opt))

    paths = RnaSeqPath()
    if opt == "filtering":
        source_dir = paths.star_outputs
    else:
        source_dir = paths.samtools_outputs
    n_jobs = len(glob(os.path.join(source_dir, "*.bam")))

    job_name = "samtools_{}".format(opt)
    shell_file = generate_bash_file(
        filename_base="samtools",
        job_name=job_name,
        job_arr=n_jobs,
        commands=[
            "module load samtools/0.1.19", "module load python/3.6.4",
            "python3 {} $SGE_TASK_ID {}".format(
                os.path.join(paths.scripts, "samtools.py"), opt)
        ])
    qsub(shell_file)
    # the cleanup script is generated to wait for the samtools job
    qsub(clean(after=job_name))
コード例 #9
0
ファイル: samtools.py プロジェクト: thomasly/rnaSeqAnalysis
def get_file_name(index):
    """
    Return the BAM file that belongs to a task index.

    The list of "*.bam" files in the star_outputs folder is cached as a
    pickle in "sam_temp" in the working directory. If that file can be
    opened, the cached list is used; otherwise the list is built, written
    to "sam_temp", and used.

    input:
    index - 0-based position in the file list

    output:
    path of the BAM file at that position

    raises:
    IndexError - if index is outside the file list
    OSError - if the cache file cannot be written
    """
    paths = RnaSeqPath()
    try:
        f = open("sam_temp", "br")
    except OSError:
        # no readable cache yet; build it below
        pass
    else:
        with f:
            files = pk.load(f)
        return files[index]

    # glob returns files in arbitrary order; sort so that every process
    # that builds the list maps the same index to the same file
    files = sorted(glob(os.path.join(paths.star_outputs, "*.bam")))
    with open("sam_temp", "bw") as f:
        pk.dump(files, f)

    return files[index]
コード例 #10
0
ファイル: samtools.py プロジェクト: thomasly/rnaSeqAnalysis
def samtools_sorting():
    """
    Sort one filtered BAM file by read name with "samtools sort -n -f".

    The list of "*mapped.bam" files in the samtools_outputs folder is
    cached as a pickle in "samtools_sort_temp" in the working directory.
    The file to sort is picked from that list with the 1-based index in
    sys.argv[1], and the result is written to the samtools_sorted folder
    as "<name>_sorted.bam", where <name> is the input file name up to its
    first dot.
    """
    paths = RnaSeqPath()
    # the output folder may already be there; ignore the failure
    try:
        os.mkdir(paths.samtools_sorted)
    except OSError:
        pass

    if os.path.exists("samtools_sort_temp"):
        with open("samtools_sort_temp", "br") as f:
            files = pk.load(f)
    else:
        # glob returns files in arbitrary order; sort so that every
        # process that builds the list maps the same index to the same
        # file
        files = sorted(
            glob(os.path.join(paths.samtools_outputs, "*mapped.bam")))
        with open("samtools_sort_temp", "bw") as f:
            pk.dump(files, f)

    bam_file = files[int(sys.argv[1]) - 1]
    output_file = os.path.basename(bam_file).split(".")[0] + "_sorted.bam"
    output_file = os.path.join(paths.samtools_sorted, output_file)

    command = "samtools sort -n -f {} {}".format(bam_file, output_file)
    os.system(command)
コード例 #11
0
import os, sys
from utils import generate_bash_file, qsub, clean
from paths import RnaSeqPath
from glob import glob
from math import ceil

paths = RnaSeqPath()

# shell script shared by every fastqc batch job; "$1" is the argument
# handed over at submission time
fastqc_commands = [
    "module load fastqc",
    "module load python/3.6.4",
    "python3 fastqc.py $1",
]
sh_file = generate_bash_file(filename_base="fastqc",
                             job_name="rnaSeqFastqc",
                             commands=fastqc_commands)

# number of submissions: fastq file count divided by the CPU count of
# this machine, rounded up
threads = os.cpu_count()
fastq_files = glob(os.path.join(paths.fastq, "*.fastq.gz"))
m = int(ceil(len(fastq_files) / threads))

# submit the jobs to hpc, passing the batch number as the argument
for t in range(m):
    qsub(sh_file, [t])

# submit the cleanup script, generated to wait for the rnaSeqFastqc jobs
qsub(clean(after="rnaSeqFastqc"))
コード例 #12
0
ファイル: star.py プロジェクト: thomasly/rnaSeqAnalysis
def _pair_reads(files):
    """
    Pair every forward ("R1") read file with its mate.

    arguments:
    files - list of fastq file paths; the list is not modified

    return:
    list of (forward, mate) tuples; files without a partner are left out
    """
    remaining = list(files)
    pairs = []
    while remaining:
        # take the next forward read out of the list; stop as soon as no
        # forward read is left, so no stale file name is ever reused
        for idx, name in enumerate(remaining):
            if "R1" in name:
                forward = remaining.pop(idx)
                break
        else:
            break

        # the part of the file name shared by both mates: the name up to
        # the first dot, minus its last seven characters
        # (presumably the read-specific suffix - verify against the
        # trimmomatic output naming)
        pair_id = os.path.basename(forward).split(".")[0][:-7]
        for idx, mate in enumerate(remaining):
            if pair_id in mate and "cleaned" in mate:
                pairs.append((forward, mate))
                remaining.pop(idx)
                break
    return pairs


def get_paired_reads(path=None):
    """
    find the paired end RNAseq forward and reverse reading pair

    The result is cached as a pickle in the file 'star_temp' in the temp
    folder. If that file can be opened, its content is returned as is and
    the path argument is not used.

    arguments:
    path - folder containing the input "*.cleaned.fastq" files; the
           trimmomatic_outputs folder is used when omitted

    return:
    list containing all inputs paired in tuples (forward, mate)

    raises:
    OSError - if the cache file cannot be written
    """

    paths = RnaSeqPath()
    cache_path = os.path.join(paths.temp, "star_temp")

    # return the cached pair list if another process has created it
    try:
        f = open(cache_path, "rb")
    except OSError:
        pass
    else:
        with f:
            return pk.load(f)

    # glob returns files in arbitrary order; sort so that every process
    # that builds the list gets the pairs in the same order
    search_dir = path if path else paths.trimmomatic_outputs
    files = sorted(glob(os.path.join(search_dir, "*.cleaned.fastq")))

    paired_files = _pair_reads(files)

    # write the list of paired files into the cache file
    with open(cache_path, "wb") as f:
        pk.dump(paired_files, f)

    return paired_files
コード例 #13
0
def generate_bash_file(filename_base="_qsub_temp",
                       job_name=None,
                       threads=None,
                       mem_free=None,
                       job_arr=None,
                       out_log=None,
                       err_log=None,
                       hold_jid=None,
                       commands=None):
    """
    automatically generate bash file for qsub

    The file is created in the temp folder and named
    "<filename_base>_<timestamp>.sh". It starts with "#!/bin/bash", then
    one "#$" directive line per option that is set, then the commands.

    inputs:
    filename_base - prefix of the generated file name
    job_name - written as "-N <job_name>"
    threads - written as "-pe openmpi 1-<threads>"
    mem_free - written as "-l mem_free=<mem_free>"
    job_arr - number of array tasks, written as "-t 1-<job_arr>"
    out_log - file name for "-o", inside the qsub_outputs folder;
              "$JOB_NAME_$JOB_ID.out" when omitted
    err_log - file name for "-e", inside the qsub_outputs folder;
              "$JOB_NAME_$JOB_ID.err" when omitted
    hold_jid - written as "-hold_jid <hold_jid>"
    commands - list of shell commands, written one per line; no commands
               when omitted

    return:
    full path to the file.
    """
    if commands is None:
        commands = []

    paths = RnaSeqPath()
    temp = paths.temp
    # the temp folder may already be there; ignore the failure
    try:
        os.mkdir(temp)
    except OSError:
        pass

    # add timestamp to shell file name; the clock is read once so that
    # all parts come from the same instant
    now = datetime.now()
    timestamp = "{}{}{}{}".format(now.day, now.hour, now.minute,
                                  now.microsecond)
    file_path = os.path.join(temp,
                             "{}_{}.sh".format(filename_base, timestamp))

    header = ["#!/bin/bash"]
    if job_name:
        header.append("#$ -N {}".format(job_name))
    if threads:
        header.append("#$ -pe openmpi 1-{}".format(threads))
    if mem_free:
        header.append("#$ -l mem_free={}".format(mem_free))
    if job_arr:
        header.append("#$ -t 1-{}".format(job_arr))

    out_name = out_log if out_log else "$JOB_NAME_$JOB_ID.out"
    header.append("#$ -o {}".format(
        os.path.join(paths.qsub_outputs, out_name)))

    # the error log gets its own file name (not the output log's)
    err_name = err_log if err_log else "$JOB_NAME_$JOB_ID.err"
    header.append("#$ -e {}".format(
        os.path.join(paths.qsub_outputs, err_name)))

    if hold_jid:
        header.append("#$ -hold_jid {}".format(hold_jid))

    # create file
    with open(file_path, "w") as f:
        f.write("\n".join(header) + "\n")
        f.write("\n".join(commands))

    return file_path
コード例 #14
0
def star_qsub(job):
    """
    Submit a STAR job to the cluster through a generated shell script.

    input:
    job - string, "indexing" or "mapping". Selects the job to submit; any
          other value submits nothing.

    outputs:
    indexing - a single job that runs "star.py indexing"
    mapping - an array job that runs "star.py mapping $SGE_TASK_ID", with
              one task for every two "*.cleaned.fastq" files in the
              trimmomatic_outputs folder, followed by a cleanup job
    """

    rna_paths = RnaSeqPath()

    if job == "indexing":
        # STAR genome indexing: build the script and submit it
        index_commands = [
            "module load star",
            "module load python/3.6.4",
            "python3 {} indexing".format(
                os.path.join(rna_paths.scripts, 'star.py')),
        ]
        script = generate_bash_file(job_name="star_indexing",
                                    threads=4,
                                    out_log="star_indexing.out",
                                    err_log="star_indexing.err",
                                    commands=index_commands)
        qsub(script)

    elif job == "mapping":
        # STAR RNA-seq alignment; the output folder may already exist
        try:
            os.mkdir(rna_paths.star_outputs)
        except OSError:
            pass

        map_commands = [
            "module load star",
            "module load python/3.6.4",
            "python3 {} mapping $SGE_TASK_ID".format(
                os.path.join(rna_paths.scripts, 'star.py')),
        ]

        # two cleaned fastq files make up one task
        cleaned = glob(
            os.path.join(rna_paths.trimmomatic_outputs, "*.cleaned.fastq"))
        pair_count = len(cleaned) // 2

        script = generate_bash_file(job_name="star_mapping",
                                    mem_free="35G",
                                    threads=8,
                                    job_arr=pair_count,
                                    commands=map_commands)
        qsub(script)

        # the cleanup script is generated to wait for the mapping job
        qsub(clean(after="star_mapping"))