Ejemplo n.º 1
0
def run_multiple_star(path, genome, outdir):
    """Write one STAR work script per sample plus two driver scripts.

    For every sample returned by ``check_sample_files(path)`` a directory
    ``<outdir>/<name>`` is created (when missing) and a ``work.sh`` holding
    the command string returned by ``run_star`` is written into it. Two
    driver scripts are then written into ``outdir``: ``work.sh`` runs every
    sample script with ``bash`` and ``qsub_work.sh`` submits every sample
    script with ``qsub -cwd -l vf=8g,p=8``. This function only writes
    scripts; it does not execute them.

    Args:
        path: Sample description handed to ``check_sample_files``.
        genome: Genome argument forwarded unchanged to ``run_star``.
        outdir: Directory receiving all scripts; created, including missing
            parent directories, when it does not exist.
    """
    samples = check_sample_files(path)
    if not os.path.exists(outdir):
        # makedirs (not mkdir) so a nested, not-yet-existing outdir works too.
        os.makedirs(outdir)
        print("[info] Create outdir in: {}".format(outdir))
    script_lists = []
    script_main_path = os.path.join(outdir, "work.sh")
    qsub_main_path = os.path.join(outdir, "qsub_work.sh")
    for sample in samples:
        # Each sample is indexed as (name, first fastq, second fastq).
        name = sample[0]
        sample_dir = os.path.join(outdir, name)
        script_path = os.path.join(sample_dir, "work.sh")
        if not os.path.exists(sample_dir):
            os.makedirs(sample_dir)
            print("[info] Create outdir for sample {} in: {}".format(
                name, sample_dir))
        else:
            print("[info] Outdir for sample {} exists in: {}".format(
                name, sample_dir))
        # A falsy second fastq is passed on as an empty string.
        script_cmd = run_star(sample[1], sample[2] or "", genome, sample_dir,
                              False)
        with open(script_path, "w") as handle:
            handle.write("#!/bin/bash" + "\n" + script_cmd + "\n")
        print("[info] work script written in {}".format(script_path))
        script_lists.append(script_path)
    # write the main scripts
    with open(script_main_path, "w") as handle:
        handle.write("\n".join(["bash " + x for x in script_lists]))
    with open(qsub_main_path, "w") as handle:
        handle.write("\n".join(
            ["qsub -cwd -l vf=8g,p=8 " + x for x in script_lists]))
    print("[info] Main script written in {}".format(script_main_path))
Ejemplo n.º 2
0
def build_FPKM_table(processdir, samplefile, outpath):
    """Aggregate per-sample FPKM values and mapping statistics into tables.

    For every sample name obtained from ``check_sample_files(samplefile)``
    this reads ``<processdir>/<sample>/genes.fpkm_tracking`` (header line
    skipped; field 5 is used as the gene key and field 10 as the FPKM value)
    and ``<processdir>/<sample>/flagstat.txt`` (first token of line 1 is
    taken as the read count, first token of line 5 as the mapped count).
    Samples without a tracking file are reported and skipped.

    Two files are written:

    * ``outpath + "fpkm.txt"``: one row per gene whose summed FPKM is
      positive, one column per processed sample.
    * ``outpath + "qc.txt"``: sample, reads, mapped, ratio and the number of
      genes with FPKM >= 1.

    Args:
        processdir: Directory holding one sub-directory per sample.
        samplefile: Sample description handed to ``check_sample_files``.
        outpath: Prefix that is concatenated directly with the output file
            names (no path separator is inserted).
    """
    samples = check_sample_files(samplefile)
    sample_names = [sample[0] for sample in samples]
    fpkm_file_path = outpath + "fpkm.txt"
    qc_file_path = outpath + "qc.txt"

    fpkm = {}
    # Only samples that were actually read may appear in the table header;
    # otherwise a skipped sample shifts every following column.
    processed = []
    qc = ["\t".join(['sample', 'reads', 'mapped', 'ratio', 'genecounts'])]
    for sample in sample_names:
        # build fpkm table
        tracking_path = os.path.join(processdir, sample,
                                     'genes.fpkm_tracking')
        if not os.path.exists(tracking_path):
            print("[info] File not exists for {}".format(sample))
            continue
        processed.append(sample)

        sample_fpkm = []
        with open(tracking_path, 'r') as infile:
            infile.readline()  # skip the header line
            for line in infile:
                infos = line.split("\t")
                value = float(infos[9])
                sample_fpkm.append(value)
                # NOTE(review): rows stay aligned with the header only if
                # every tracking file lists the same gene keys exactly once
                # -- confirm against the upstream tool's output.
                fpkm.setdefault(infos[4], []).append(value)

        # Genes Detected
        genes_FPKM_1 = sum(1 for x in sample_fpkm if x >= 1)

        # build QC data
        metainfo = os.path.join(processdir, sample, 'flagstat.txt')
        with open(metainfo, "r") as handle:
            qc_info = handle.readlines()
        total_reads = int(qc_info[0].split(" ")[0])
        mapped_reads = int(qc_info[4].split(" ")[0])
        # Guard against an empty library instead of raising ZeroDivisionError.
        ratio = float(mapped_reads) / total_reads if total_reads else 0.0
        qc_sample = [sample, total_reads, mapped_reads, ratio, genes_FPKM_1]
        qc.append("\t".join([str(x) for x in qc_sample]))

    # Write FPKM
    with open(fpkm_file_path, "w") as fpkm_file:
        fpkm_file.write("\t".join(["gene"] + processed) + "\n")
        for gene, values in fpkm.items():
            # Genes that are zero in every sample are left out of the table.
            if sum(values) > 0:
                fpkm_file.write(
                    "\t".join([gene] + [str(x) for x in values]) + "\n")
    print("[info] FPKM file: {}".format(fpkm_file_path))

    # Write QC Table
    with open(qc_file_path, "w") as qc_file:
        qc_file.write("\n".join(qc))
Ejemplo n.º 3
0
def QC(samplefile, fq1, fq2, out):
    """Build an Excel report with one row of quality figures per sample.

    Samples come from ``check_sample_files(samplefile, "sample", fq1, fq2)``.
    For each sample ``fastq_basecontent_quality(sample[0], sample[1])`` is
    called; elements 2 and 3 of its result are written as cell values and
    elements 0 and 1 are inserted as images. Judging by the column titles
    these are presumably mean quality, bias index, base-content plot and
    quality plot -- confirm against ``fastq_basecontent_quality``.

    The workbook is saved as ``QC.xlsx`` (relative path, i.e. the current
    working directory).

    NOTE(review): ``out`` is only echoed in the start-up message; the
    workbook path is hard-coded. Confirm whether ``out`` was meant to
    control where the report is written.

    Args:
        samplefile: Sample description handed to ``check_sample_files``.
        fq1: Forwarded to ``check_sample_files``.
        fq2: Forwarded to ``check_sample_files``.
        out: Printed in the start-up message only.
    """
    print("[info] Qualtity Static of the Fastq Files ...")
    print("[info] The result file will be write to {}".format(out))
    from baseq.fastq.quality import fastq_basecontent_quality
    from .sample_file import check_sample_files
    samples = check_sample_files(samplefile, "sample", fq1, fq2)
    print(samples)

    import xlsxwriter
    workbook = xlsxwriter.Workbook('QC.xlsx')

    # Restyle the workbook's first (default) format.
    default_format = workbook.formats[0]
    default_format.set_font_size(12)
    default_format.set_font_name('arial')

    def make_format(bold, size):
        # Register a cell format that differs only in weight and size.
        return workbook.add_format({
            'bold': bold,
            'font_size': size,
            'font_name': 'arial'
        })

    def image_options():
        # A fresh dict per call so no options object is shared between images.
        return {
            "x_scale": 0.7,
            "y_scale": 0.7,
            'x_offset': 5,
            'y_offset': 5
        }

    format_main = make_format(False, 12)
    format_header = make_format(True, 15)

    # Prepare the report page: widen the image columns, then the title row.
    qcpage = workbook.add_worksheet("Report")
    for column_range in ('D:D', 'E:E'):
        qcpage.set_column(column_range, 40)
    titles = (
        ('A1', 'Sample'),
        ('B1', 'MeanQuality'),
        ('C1', 'BiasIndex'),
        ('D1', 'BasePlot'),
        ('E1', 'QualityPlot'),
    )
    for cell, title in titles:
        qcpage.write(cell, title, format_header)

    # One tall row per sample: name, two values, two embedded images.
    for idx, sample in enumerate(samples):
        print(idx, sample)
        row = idx + 1
        result = fastq_basecontent_quality(sample[0], sample[1])
        qcpage.set_row(row, 120)
        qcpage.write(row, 0, sample[0], format_main)
        for offset in (1, 2):
            qcpage.write(row, offset, result[offset + 1], format_main)
        for offset in (0, 1):
            qcpage.insert_image(row, 3 + offset, result[offset],
                                image_options())

    workbook.close()
Ejemplo n.º 4
0
def filter_polyAT(samplefile, seqfile, fq1, fq2, name, thread):
    """Run ``filter_fastq_pair_by_sequence`` for every sample on a thread pool.

    Samples are resolved with ``check_sample_files`` and each one is
    submitted as ``filter_fastq_pair_by_sequence(sample[1], sample[2],
    seqfile, sample[0])``. The call returns once every job has finished; a
    job that raises is reported on stdout and the remaining jobs still run.

    Args:
        samplefile: Sample description handed to ``check_sample_files``.
        seqfile: Forwarded to ``filter_fastq_pair_by_sequence``.
        fq1: Forwarded to ``check_sample_files``.
        fq2: Forwarded to ``check_sample_files``.
        name: Sample name forwarded to ``check_sample_files``.
        thread: Number of worker threads (converted with ``int``).
    """
    print("[info] Filter the Reads with polyA/polyT...")
    from .filter_reads import filter_fastq_pair_by_sequence
    from baseq.fastq.sample_file import check_sample_files
    from concurrent.futures import ThreadPoolExecutor
    # Argument order is (samplefile, name, fq1, fq2), matching the call in
    # QC(); previously ``name`` was dropped, which shifted fq1/fq2 by one slot.
    samples = check_sample_files(samplefile, name, fq1, fq2)
    print("[info] Using the Multiple Threads: {}".format(thread))
    # The context manager waits for all jobs and shuts the pool down.
    with ThreadPoolExecutor(int(thread)) as pool:
        jobs = [(sample[0],
                 pool.submit(filter_fastq_pair_by_sequence, sample[1],
                             sample[2], seqfile, sample[0]))
                for sample in samples]
        for sample_name, job in jobs:
            try:
                job.result()
            except Exception as err:
                # Surface worker failures instead of dropping them silently,
                # while still letting the other samples finish.
                print("[error] Filtering failed for sample {}: {}".format(
                    sample_name, err))
Ejemplo n.º 5
0
def run_multiple_salmons(samplefile, genome, processname, parallel):
    """Run ``baseq-RNA run_salmon`` for every sample, then aggregate results.

    For each sample from ``check_sample_files(samplefile)`` a directory
    ``<processname>/<name>`` is created and a ``baseq-RNA run_salmon``
    command line is dispatched through ``run_cmd`` on a process pool. After
    all jobs have finished, ``build_tpm_table`` is called with
    ``processname`` as both the process directory and the output prefix.

    Args:
        samplefile: Sample description handed to ``check_sample_files``.
        genome: Value passed to ``run_salmon`` via ``-g``.
        processname: Working directory (created, including parents, when
            missing); also used as the prefix of the aggregated tables.
        parallel: Number of worker processes (converted with ``int``).
    """
    samples = check_sample_files(samplefile)
    # exist_ok avoids the check-then-create race and supports nested paths.
    os.makedirs(processname, exist_ok=True)
    pool = mp.Pool(processes=int(parallel))
    try:
        jobs = []
        for sample in samples:
            # Each sample is indexed as (name, first fastq, second fastq).
            name = sample[0]
            path = os.path.join(processname, name)
            os.makedirs(path, exist_ok=True)
            # NOTE(review): paths are interpolated unquoted; whether that is
            # safe depends on how run_cmd executes the string -- confirm.
            if sample[2]:
                script = "baseq-RNA run_salmon -1 {} -2 {} -g {} -n {}".format(
                    sample[1], sample[2], genome, path)
            else:
                script = "baseq-RNA run_salmon -1 {} -g {} -n {}".format(
                    sample[1], genome, path)
            jobs.append((name, pool.apply_async(run_cmd, ("Salmon", script))))
        pool.close()
        pool.join()
        for name, job in jobs:
            try:
                job.get()
            except Exception as err:
                # apply_async hides worker exceptions unless the result is
                # fetched; report them but keep going as before.
                print("[warn] Salmon failed for sample {}: {}".format(
                    name, err))
    finally:
        # Make sure no worker processes are left behind if anything above
        # raised; harmless after a normal close()/join().
        pool.terminate()
    print("[info] The All samples Are Processed.... Start Aggregating...")
    build_tpm_table(processname, samplefile, processname)
Ejemplo n.º 6
0
def build_tpm_table(processdir, samplefile, name):
    """Aggregate per-sample gene quantification into TPM, count and QC tables.

    For every sample name obtained from ``check_sample_files(samplefile)``
    this reads ``<processdir>/<sample>/quant.genes.sf`` (header line
    skipped; field 1 is the gene key, field 4 is stored as TPM and field 5
    as count) and ``<processdir>/<sample>/aux_info/meta_info.json`` (keys
    ``num_processed``, ``num_mapped`` and ``percent_mapped``).

    Three files are written:

    * ``<name>_TPM.txt``: one row per gene whose summed TPM is positive.
    * ``<name>_Count.txt``: one row per gene whose summed count is positive.
    * ``<name>_QC.txt``: sample, reads, mapped, ratio and the number of
      genes with TPM >= 1.

    Args:
        processdir: Directory holding one sub-directory per sample.
        samplefile: Sample description handed to ``check_sample_files``.
        name: Prefix of the three output file names.
    """
    samples = check_sample_files(samplefile)
    sample_names = [sample[0] for sample in samples]
    tpm_file_path = "{}_TPM.txt".format(name)
    count_file_path = "{}_Count.txt".format(name)
    qc_file_path = "{}_QC.txt".format(name)
    # One placeholder per path; the old message silently dropped two of them.
    print("[info] The files will write to : {}, {}, {}".format(
        tpm_file_path, count_file_path, qc_file_path))

    tpm = {}
    count = {}
    qc = ["\t".join(['sample', 'reads', 'mapped', 'ratio', 'genecounts'])]
    for sample in sample_names:
        # build TPM table
        sample_TPM = []
        salmon_gene_path = os.path.join(processdir, sample, 'quant.genes.sf')
        with open(salmon_gene_path, 'r') as infile:
            infile.readline()  # skip the header line
            for line in infile:
                infos = line.split("\t")
                gene = infos[0]
                tpm_value = float(infos[3])
                sample_TPM.append(tpm_value)
                # NOTE(review): rows stay aligned with the header only if
                # every sample lists the same genes -- confirm upstream.
                tpm.setdefault(gene, []).append(tpm_value)
                count.setdefault(gene, []).append(float(infos[4]))

        # Genes Detected
        genes_TPM_1 = sum(1 for x in sample_TPM if x >= 1)

        # build QC data
        metainfo = os.path.join(processdir, sample, 'aux_info',
                                'meta_info.json')
        with open(metainfo, "r") as handle:
            qc_info = json.load(handle)
        qc_sample = [
            sample, qc_info["num_processed"], qc_info["num_mapped"],
            qc_info["percent_mapped"], genes_TPM_1
        ]
        qc.append("\t".join([str(x) for x in qc_sample]))

    header = "\t".join(["gene"] + sample_names) + "\n"

    # Write TPM
    with open(tpm_file_path, "w") as tpm_file:
        tpm_file.write(header)
        for gene, values in tpm.items():
            # Genes that are zero in every sample are left out.
            if sum(values) > 0:
                tpm_file.write(
                    "\t".join([gene] + [str(x) for x in values]) + "\n")

    # Write Counts
    with open(count_file_path, "w") as count_file:
        count_file.write(header)
        for gene, values in count.items():
            if sum(values) > 0:
                count_file.write(
                    "\t".join([gene] + [str(x) for x in values]) + "\n")

    # Write QC Table
    with open(qc_file_path, "w") as qc_file:
        qc_file.write("\n".join(qc))