def run_multiple_star(path, genome, outdir):
    """Write one STAR work script per sample plus two top-level driver scripts.

    For every sample returned by ``check_sample_files(path)`` a directory
    ``<outdir>/<sample name>`` is created (if missing) and a ``work.sh``
    containing the command string returned by ``run_star`` is written into it.
    Afterwards ``<outdir>/work.sh`` (one ``bash <script>`` line per sample) and
    ``<outdir>/qsub_work.sh`` (one ``qsub -cwd -l vf=8g,p=8 <script>`` line per
    sample) are written.

    Args:
        path: Sample description file, passed to ``check_sample_files``.
        genome: Genome argument forwarded unchanged to ``run_star``.
        outdir: Directory that receives the per-sample folders and the two
            driver scripts. Missing parent directories are created as well.

    Returns:
        None. The function only writes files and prints progress messages.
    """
    samples = check_sample_files(path)

    if not os.path.exists(outdir):
        # makedirs (not mkdir) so a nested outdir with a missing parent works.
        os.makedirs(outdir)
        print("[info] Create outdir in: {}".format(outdir))

    script_main_path = os.path.join(outdir, "work.sh")
    qsub_main_path = os.path.join(outdir, "qsub_work.sh")
    script_lists = []

    for sample in samples:
        # sample[0] is the sample name; sample[1] / sample[2] are forwarded to
        # run_star (presumably the first and second fastq path -- confirm
        # against check_sample_files).
        name = sample[0]
        # A separate local keeps the `path` argument intact.
        sample_dir = os.path.join(outdir, name)
        script_path = os.path.join(sample_dir, "work.sh")

        if not os.path.exists(sample_dir):
            os.makedirs(sample_dir)
            print("[info] Create outdir for sample {} in: {}".format(name, sample_dir))
        else:
            print("[info] Outdir for sample {} exists in: {}".format(name, sample_dir))

        # A falsy third field is replaced by an empty string before the call.
        second_input = sample[2] if sample[2] else ""
        script_cmd = run_star(sample[1], second_input, genome, sample_dir, False)

        with open(script_path, "w") as handle:
            handle.write("#!/bin/bash" + "\n" + script_cmd + "\n")
        print("[info] work script written in {}".format(script_path))
        script_lists.append(script_path)

    # write the main scripts: sequential bash driver and qsub submission list
    with open(script_main_path, "w") as handle:
        handle.write("\n".join(["bash " + x for x in script_lists]))
    with open(qsub_main_path, "w") as handle:
        handle.write("\n".join(["qsub -cwd -l vf=8g,p=8 " + x for x in script_lists]))
    print("[info] Main script written in {}".format(script_main_path))
def build_FPKM_table(processdir, samplefile, outpath):
    """Aggregate per-sample FPKM values and mapping statistics into two tables.

    For every sample listed by ``check_sample_files(samplefile)`` the file
    ``<processdir>/<sample>/genes.fpkm_tracking`` is read (header line
    skipped, gene name taken from column 5, FPKM from column 10) and
    ``<processdir>/<sample>/flagstat.txt`` is parsed for read counts (first
    token of line 1 = reads, first token of line 5 = mapped).

    Samples whose ``genes.fpkm_tracking`` file is missing are reported and
    skipped. They are left out of the FPKM header too, so header and value
    columns stay aligned.

    Outputs (``outpath`` is used as a plain string prefix):
        ``<outpath>fpkm.txt``: gene x sample FPKM matrix, restricted to genes
            whose summed FPKM is greater than zero.
        ``<outpath>qc.txt``: one row per processed sample with
            sample / reads / mapped / ratio / genecounts.

    Args:
        processdir: Directory containing one sub-directory per sample.
        samplefile: Sample description file, passed to ``check_sample_files``.
        outpath: Prefix prepended to ``fpkm.txt`` and ``qc.txt``.

    Returns:
        None.
    """
    samples = check_sample_files(samplefile)
    sample_names = [sample[0] for sample in samples]
    fpkm_file_path = outpath + "fpkm.txt"
    qc_file_path = outpath + "qc.txt"

    fpkm = {}
    # Only samples that were actually read; used for the FPKM header.
    loaded_samples = []
    qc = ["\t".join(['sample', 'reads', 'mapped', 'ratio', 'genecounts'])]

    for sample in sample_names:
        # build fpkm table
        gene_table_path = os.path.join(processdir, sample, 'genes.fpkm_tracking')
        if not os.path.exists(gene_table_path):
            print("[info] File not exists for {}".format(sample))
            continue
        loaded_samples.append(sample)

        sample_fpkm = []
        with open(gene_table_path, 'r') as infile:
            infile.readline()  # skip the header line
            for line in infile:
                infos = line.split("\t")
                gene = infos[4]
                value = float(infos[9])
                sample_fpkm.append(value)
                # NOTE(review): column alignment assumes every file lists the
                # same genes exactly once -- confirm for the upstream tool.
                fpkm.setdefault(gene, []).append(value)

        # Genes Detected: rows with FPKM >= 1
        genes_FPKM_1 = sum(1 for x in sample_fpkm if x >= 1)

        # build QC data
        metainfo = os.path.join(processdir, sample, 'flagstat.txt')
        with open(metainfo, "r") as handle:
            datas = [x.split(" ") for x in handle.readlines()]
        total_reads = int(datas[0][0])
        mapped_reads = int(datas[4][0])
        # Guard against an empty library instead of raising ZeroDivisionError.
        ratio = float(mapped_reads) / total_reads if total_reads else 0.0
        qc_sample = [sample, total_reads, mapped_reads, ratio, genes_FPKM_1]
        qc.append("\t".join([str(x) for x in qc_sample]))

    # Write FPKM
    with open(fpkm_file_path, "w") as fpkm_file:
        fpkm_file.write("\t".join(["gene"] + loaded_samples) + "\n")
        for gene, values in fpkm.items():
            if sum(values) > 0:
                fpkm_file.write("\t".join([gene] + [str(x) for x in values]) + "\n")
    print("[info] FPKM file: {}".format(fpkm_file_path))

    # Write QC Table
    with open(qc_file_path, "w") as qc_file:
        qc_file.write("\n".join(qc))
def QC(samplefile, fq1, fq2, out):
    """Build an Excel quality report for the given fastq samples.

    Samples are resolved through
    ``check_sample_files(samplefile, "sample", fq1, fq2)``. For each sample,
    ``fastq_basecontent_quality(sample[0], sample[1])`` is called; items 2 and
    3 of its result are written to the MeanQuality and BiasIndex columns, and
    items 0 and 1 are inserted as images in the BasePlot and QualityPlot
    columns of a worksheet named "Report".

    Args:
        samplefile: Sample description file, passed to ``check_sample_files``.
        fq1: Forwarded to ``check_sample_files``.
        fq2: Forwarded to ``check_sample_files``.
        out: Only echoed in the info message.

    Returns:
        None.
    """
    # NOTE(review): the workbook is always written to 'QC.xlsx' in the current
    # directory; `out` is printed but never used as a path -- confirm intent.
    print("[info] Qualtity Static of the Fastq Files ...")
    print("[info] The result file will be write to {}".format(out))
    from baseq.fastq.quality import fastq_basecontent_quality
    from .sample_file import check_sample_files

    samples = check_sample_files(samplefile, "sample", fq1, fq2)
    print(samples)

    import xlsxwriter
    workbook = xlsxwriter.Workbook('QC.xlsx')

    # Adjust the workbook's first (default) format.
    default_format = workbook.formats[0]
    default_format.set_font_size(12)
    default_format.set_font_name('arial')

    format_main = workbook.add_format({
        'bold': False,
        'font_size': 12,
        'font_name': 'arial'
    })
    format_header = workbook.add_format({
        'bold': True,
        'font_size': 15,
        'font_name': 'arial'
    })

    # prepare Page: wide image columns and the header row
    qcpage = workbook.add_worksheet("Report")
    for column_range in ('D:D', 'E:E'):
        qcpage.set_column(column_range, 40)
    header_cells = (
        ('A1', 'Sample'),
        ('B1', 'MeanQuality'),
        ('C1', 'BiasIndex'),
        ('D1', 'BasePlot'),
        ('E1', 'QualityPlot'),
    )
    for cell, title in header_cells:
        qcpage.write(cell, title, format_header)

    # build the Excel: one tall row per sample, below the header
    for idx, sample in enumerate(samples):
        print(idx, sample)
        report = fastq_basecontent_quality(sample[0], sample[1])
        row = idx + 1
        qcpage.set_row(row, 120)
        qcpage.write(row, 0, sample[0], format_main)
        qcpage.write(row, 1, report[2], format_main)
        qcpage.write(row, 2, report[3], format_main)
        # report[0] goes to column 3 (BasePlot), report[1] to column 4 (QualityPlot)
        for offset in (0, 1):
            qcpage.insert_image(row, 3 + offset, report[offset], {
                "x_scale": 0.7,
                "y_scale": 0.7,
                'x_offset': 5,
                'y_offset': 5
            })

    workbook.close()
def filter_polyAT(samplefile, seqfile, fq1, fq2, name, thread):
    """Filter every sample's reads with ``filter_fastq_pair_by_sequence``.

    Each sample from ``check_sample_files`` is submitted to a thread pool as
    ``filter_fastq_pair_by_sequence(sample[1], sample[2], seqfile, sample[0])``.
    The function waits for all jobs to finish and prints an error line for
    every job that raised, instead of silently dropping the exception.

    Args:
        samplefile: Sample description file, passed to ``check_sample_files``.
        seqfile: Forwarded unchanged to ``filter_fastq_pair_by_sequence``.
        fq1: Forwarded to ``check_sample_files``.
        fq2: Forwarded to ``check_sample_files``.
        name: Forwarded to ``check_sample_files`` as its second argument.
        thread: Number of worker threads (converted with ``int``).

    Returns:
        None.
    """
    print("[info] Filter the Reads with polyA/polyT...")
    from .filter_reads import filter_fastq_pair_by_sequence
    from baseq.fastq.sample_file import check_sample_files
    from concurrent.futures import ThreadPoolExecutor

    # NOTE(review): `name` is passed as the second argument to match the
    # (samplefile, name, fq1, fq2) call order used by QC in this file; before,
    # fq1/fq2 were shifted one position and `name` was ignored. Confirm
    # against check_sample_files' signature.
    samples = check_sample_files(samplefile, name, fq1, fq2)

    print("[info] Using the Multiple Threads: {}".format(thread))
    # The context manager waits for all submitted jobs and shuts the pool down.
    with ThreadPoolExecutor(int(thread)) as pool:
        jobs = [
            (sample[0],
             pool.submit(filter_fastq_pair_by_sequence,
                         sample[1], sample[2], seqfile, sample[0]))
            for sample in samples
        ]

    # Report failures; a Future otherwise swallows the worker's exception.
    for sample_name, job in jobs:
        error = job.exception()
        if error is not None:
            print("[error] Filtering failed for sample {}: {}".format(sample_name, error))
def run_multiple_salmons(samplefile, genome, processname, parallel):
    """Run ``baseq-RNA run_salmon`` for every sample, then aggregate results.

    A directory ``<processname>/<sample name>`` is created for each sample and
    the corresponding ``baseq-RNA run_salmon`` command line is dispatched to a
    process pool through ``run_cmd("Salmon", <command>)``. When all jobs are
    done, jobs that raised are reported and
    ``build_tpm_table(processname, samplefile, processname)`` is called.

    Args:
        samplefile: Sample description file, passed to ``check_sample_files``.
        genome: Value inserted after ``-g`` in the command line.
        processname: Output directory; also used as the table name prefix
            handed to ``build_tpm_table``.
        parallel: Number of worker processes (converted with ``int``).

    Returns:
        None.
    """
    samples = check_sample_files(samplefile)
    if not os.path.exists(processname):
        # makedirs so a nested process directory does not fail on a missing parent.
        os.makedirs(processname)

    pool = mp.Pool(processes=int(parallel))
    jobs = []
    for sample in samples:
        name = sample[0]
        sample_dir = os.path.join(processname, name)
        if not os.path.exists(sample_dir):
            os.makedirs(sample_dir)

        # A truthy third field adds the "-2" argument to the command.
        if sample[2]:
            script = "baseq-RNA run_salmon -1 {} -2 {} -g {} -n {}".format(
                sample[1], sample[2], genome, sample_dir)
        else:
            script = "baseq-RNA run_salmon -1 {} -g {} -n {}".format(
                sample[1], genome, sample_dir)
        jobs.append((name, pool.apply_async(run_cmd, ("Salmon", script))))

    pool.close()
    pool.join()

    # apply_async hides worker exceptions until .get() is called; surface them
    # as messages so a failed sample is visible before aggregation.
    for name, job in jobs:
        try:
            job.get()
        except Exception as error:
            print("[error] Salmon failed for sample {}: {}".format(name, error))

    print("[info] The All samples Are Processed.... Start Aggregating...")
    build_tpm_table(processname, samplefile, processname)
def build_tpm_table(processdir, samplefile, name):
    """Aggregate per-sample gene quantification into TPM, count and QC tables.

    For every sample listed by ``check_sample_files(samplefile)`` the file
    ``<processdir>/<sample>/quant.genes.sf`` is read (header line skipped;
    column 1 = gene, column 4 = TPM, column 5 = count) together with
    ``<processdir>/<sample>/aux_info/meta_info.json`` (keys ``num_processed``,
    ``num_mapped``, ``percent_mapped``).

    Outputs:
        ``<name>_TPM.txt``: gene x sample TPM matrix, genes with summed TPM > 0.
        ``<name>_Count.txt``: gene x sample count matrix, genes with summed
            count > 0.
        ``<name>_QC.txt``: one row per sample with
            sample / reads / mapped / ratio / genecounts, where genecounts is
            the number of rows with TPM >= 1.

    Args:
        processdir: Directory containing one sub-directory per sample.
        samplefile: Sample description file, passed to ``check_sample_files``.
        name: Prefix of the three output files.

    Returns:
        None.
    """
    samples = check_sample_files(samplefile)
    sample_names = [sample[0] for sample in samples]
    tpm_file_path = "{}_TPM.txt".format(name)
    count_file_path = "{}_Count.txt".format(name)
    qc_file_path = "{}_QC.txt".format(name)
    # One placeholder per path, so all three output files are reported.
    print("[info] The files will write to : {}, {}, {}".format(
        tpm_file_path, count_file_path, qc_file_path))

    tpm = {}
    count = {}
    qc = ["\t".join(['sample', 'reads', 'mapped', 'ratio', 'genecounts'])]

    for sample in sample_names:
        # build TPM table
        sample_TPM = []
        salmon_gene_path = os.path.join(processdir, sample, 'quant.genes.sf')
        with open(salmon_gene_path, 'r') as infile:
            infile.readline()  # skip the header line
            for line in infile:
                infos = line.split("\t")
                gene = infos[0]
                tpm_value = float(infos[3])
                count_value = float(infos[4])
                sample_TPM.append(tpm_value)
                tpm.setdefault(gene, []).append(tpm_value)
                count.setdefault(gene, []).append(count_value)

        # Genes Detected: rows with TPM >= 1
        genes_TPM_1 = sum(1 for x in sample_TPM if x >= 1)

        # build QC data
        metainfo = os.path.join(processdir, sample, 'aux_info', 'meta_info.json')
        with open(metainfo, "r") as handle:
            qc_info = json.load(handle)
        qc_sample = [
            sample,
            qc_info["num_processed"],
            qc_info["num_mapped"],
            qc_info["percent_mapped"],
            genes_TPM_1,
        ]
        qc.append("\t".join([str(x) for x in qc_sample]))

    # Write TPM and Counts: same layout, rows kept only when the sum is > 0
    header = "\t".join(["gene"] + sample_names) + "\n"
    for table_path, table in ((tpm_file_path, tpm), (count_file_path, count)):
        with open(table_path, "w") as table_file:
            table_file.write(header)
            for gene, values in table.items():
                if sum(values) > 0:
                    table_file.write("\t".join([gene] + [str(x) for x in values]) + "\n")

    # Write QC Table
    with open(qc_file_path, "w") as qc_file:
        qc_file.write("\n".join(qc))