def run_qc(project, code='qc', nthreads=1):
    """
    Runs quality control for every sample of a project, aggregates the
    results with MultiQC and archives them.

    Arguments:
    project  -- project object; this function reads project.objectId,
                project.paths[code] and project.relation('samples'), and
                writes project.files[code] before calling project.save()
    code     -- key into project.paths / project.files for this analysis
    nthreads -- thread count forwarded to the per-sample qc() call

    Returns: nothing. The MultiQC report path and the archive path are stored
    under project.files[code]['multiqc'] and project.files[code]['archive'].
    """
    print_with_flush('# starting qc for project {}'.format(project.objectId))

    # First, make sure that environment variables are set.
    os.environ['LC_ALL'] = 'C.UTF-8'
    os.environ['LANG'] = 'C.UTF-8'

    # Run the per-sample qc for each sample of the project.
    for sample in project.relation('samples').query():
        print_with_flush('# starting qc for sample {}'.format(sample.name))
        qc(sample, nthreads=nthreads)

    # Run multiqc for the entire project.
    args = ['multiqc', project.paths[code]]
    args += ['--ignore', '*.sam']
    args += ['--ignore', 'qc_out.txt']
    args += ['-o', project.paths[code]]
    args += ['-f']
    # This is a project-level step, so it is labelled with the tool name.
    # Using the loop variable `sample` here raised UnboundLocalError for a
    # project without samples, and otherwise used whichever sample was last.
    run_sys(args, prefix='multiqc')

    # Archive.
    archive_path = archive(project, code)

    # Record the generated files on the project and persist them.
    if code not in project.files:
        project.files[code] = {}
    project.files[code]['multiqc'] = os.path.join(project.paths[code],
                                                  'multiqc_report.html')
    project.files[code]['archive'] = archive_path
    project.save()

    print_with_flush('# done')
def samtools_index(sample, code='qc', nthreads=1):
    """
    Indexes the sample's sorted alignments with `samtools index`.

    The input is read from sample.files[code]['sortedAlignments'] and
    `nthreads - 1` is passed to the -@ option. Returns nothing.
    """
    bam_path = sample.files[code]['sortedAlignments']
    extra_threads = str(nthreads - 1)
    command = ['samtools', 'index', bam_path, '-@', extra_threads]
    run_sys(command, prefix=sample.name)
def fastqc(sample, code='qc'):
    """
    Runs `fastqc` on the sample's sorted alignments.

    The input is read from sample.files[code]['sortedAlignments'].
    Returns an empty dictionary; no output paths are recorded here.
    """
    command = ['fastqc', sample.files[code]['sortedAlignments']]
    run_sys(command, prefix=sample.name)

    return {}
def multiqc(sample, code='qc'):
    """
    Runs `multiqc` over the sample's output directory for this analysis.

    The directory sample.paths[code] is both the scanned input and the
    output location (-o); '*.sam' and 'qc_out.txt' are ignored and -f is
    passed. Returns a dictionary with the report path under 'multiqc'.
    """
    qc_dir = sample.paths[code]
    command = [
        'multiqc', qc_dir,
        '--ignore', '*.sam',
        '--ignore', 'qc_out.txt',
        '-o', qc_dir,
        '-f',
    ]
    run_sys(command, prefix=sample.name)

    report_path = os.path.join(qc_dir, 'multiqc_report.html')
    return {'multiqc': report_path}
def samtools_sort(sample, code='qc', nthreads=1):
    """
    Sorts the sample's alignments with `samtools sort`.

    Reads sample.files[code]['alignments'] and writes
    '<sample.name>_sorted.bam' into sample.paths[code], passing
    `nthreads - 1` to -@ and '2G' to -m. Returns a dictionary with the
    sorted file's path under 'sortedAlignments'.
    """
    source_path = sample.files[code]['alignments']
    target_path = os.path.join(sample.paths[code],
                               '{}_sorted.bam'.format(sample.name))

    command = [
        'samtools', 'sort', source_path,
        '-o', target_path,
        '-@', str(nthreads - 1),
        '-m', '2G',
    ]
    run_sys(command, prefix=sample.name)

    return {'sortedAlignments': target_path}
def geneBody_coverage(sample, code='qc'):
    """
    Runs geneBody_coverage.py on the sample's sorted alignments.

    Inputs are sample.files[code]['sortedAlignments'] (-i) and
    sample.reference.paths['bed'] (-r); the -o value is
    '<sample.name>_coverage' inside sample.paths[code]. Returns a dictionary
    holding that -o value under 'coverage'.
    """
    bam_path = sample.files[code]['sortedAlignments']
    annotation_path = sample.reference.paths['bed']
    out_path = os.path.join(sample.paths[code],
                            '{}_coverage'.format(sample.name))

    command = [
        'geneBody_coverage.py',
        '-i', bam_path,
        '-r', annotation_path,
        '-o', out_path,
    ]
    run_sys(command, prefix=sample.name)

    return {'coverage': out_path}
def bowtie2(sample, code='qc', nthreads=1):
    """
    Aligns the sample's reads with bowtie2 and saves the alignment summary.

    Arguments:
    sample   -- sample object; this function reads sample.name,
                sample.paths[code], sample.reference.paths['bowtieIndex'],
                sample.relation('reads'), sample.readType and, for paired-end
                samples, sample.readPairs
    code     -- key into sample.paths for this analysis
    nthreads -- value passed to bowtie2's --threads option

    bowtie2 is invoked with `-u 100000`. The SAM output goes to
    '<sample.name>_alignments.sam' and the summary extracted from the text
    returned by run_sys goes to '<sample.name>_align_info.txt', both inside
    sample.paths[code].

    Returns: dictionary with the SAM path under 'alignments' and the summary
    path under 'alignInfo'.
    """
    upto = 10**5

    # Various path variables.
    alignments_file = '{}_alignments.sam'.format(sample.name)
    alignments_path = os.path.join(sample.paths[code], alignments_file)
    info_file = '{}_align_info.txt'.format(sample.name)
    info_path = os.path.join(sample.paths[code], info_file)

    args = ['bowtie2', '-x', sample.reference.paths['bowtieIndex']]

    # Fetch all reads for this sample, keyed by objectId so that read pairs
    # (which are indexed by id below) can be resolved.
    reads = {read.objectId: read for read in sample.relation('reads').query()}

    # single/paired-end
    # NOTE(review): any other readType adds no input files to the command;
    # confirm whether that case can occur.
    if sample.readType == 'single':
        args += ['-U', ','.join(read.path for read in reads.values())]
    elif sample.readType == 'paired':
        m1 = []
        m2 = []
        for pair in sample.readPairs:
            m1.append(reads[pair[0]].path)
            m2.append(reads[pair[1]].path)
        args += ['-1', ','.join(m1)]
        args += ['-2', ','.join(m2)]

    args += ['-S', alignments_path]
    args += ['-u', str(upto)]
    args += ['--threads', str(nthreads)]
    args += ['--verbose']

    output = run_sys(args, prefix=sample.name)

    # Extract the alignment summary from the captured output. The opening
    # line is matched without the read count: requiring the `-u` cap in the
    # text left the info file empty whenever the sample held fewer reads
    # than the cap.
    first = 'reads; of these'
    last = 'overall alignment rate'
    found = False
    summary_lines = []
    for line in output.split('\n'):
        if not found and first in line:
            found = True

        if found:
            summary_lines.append(line)

        if last in line:
            break

    # Write the file.
    with open(info_path, 'w') as f:
        f.write(''.join(line + '\n' for line in summary_lines))

    # Return dictionary of files.
    return {'alignments': alignments_path,
            'alignInfo': info_path}
def tin(sample, code='qc'):
    """
    Runs tin.py on the sample's sorted alignments.

    Inputs are sample.files[code]['sortedAlignments'] (-i) and
    sample.reference.paths['bed'] (-r). The text returned by run_sys is
    written to '<sample.name>_tin.txt' inside sample.paths[code]. Returns a
    dictionary with that file's path under 'tin'.
    """
    bam_path = sample.files[code]['sortedAlignments']
    annotation_path = sample.reference.paths['bed']
    command = ['tin.py', '-i', bam_path, '-r', annotation_path]

    report_path = os.path.join(sample.paths[code],
                               '{}_tin.txt'.format(sample.name))

    captured = run_sys(command, prefix=sample.name)

    # Save the captured text next to the other qc results.
    with open(report_path, 'w') as handle:
        handle.write(captured)

    return {'tin': report_path}
def read_distribution(sample, code='qc'):
    """
    Runs read_distribution.py on the sample's sorted alignments.

    Inputs are sample.files[code]['sortedAlignments'] (-i) and
    sample.reference.paths['bed'] (-r). The text returned by run_sys is
    written to '<sample.name>_distribution.txt' inside sample.paths[code].
    Returns a dictionary with that file's path under 'distribution'.
    """
    bam_path = sample.files[code]['sortedAlignments']
    annotation_path = sample.reference.paths['bed']
    command = ['read_distribution.py', '-i', bam_path, '-r', annotation_path]

    captured = run_sys(command, prefix=sample.name)

    report_path = os.path.join(sample.paths[code],
                               '{}_distribution.txt'.format(sample.name))

    # Save the captured text next to the other qc results.
    with open(report_path, 'w') as handle:
        handle.write(captured)

    return {'distribution': report_path}
def build_kallisto(reference, nthreads=1):
    """
    Builds the kallisto index for a reference.

    Runs `kallisto index` on reference.paths['cdna'], writing the index to
    reference.paths['kallistoIndex'] and passing 'kallisto_log.txt' (inside
    reference.paths['root']) to run_sys as `file`. Any previous log file is
    removed first. `nthreads` is accepted but not used here.
    """
    print('building kallisto index', file=sys.stderr)

    # Start from a clean log.
    log_path = os.path.join(reference.paths['root'], 'kallisto_log.txt')
    if os.path.isfile(log_path):
        os.remove(log_path)

    transcripts_path = reference.paths['cdna']
    index_path = reference.paths['kallistoIndex']

    # The directory that will hold the index must exist beforehand.
    index_dir = os.path.dirname(index_path)
    os.makedirs(index_dir, exist_ok=True)

    command = ['kallisto', 'index', '-i', index_path, transcripts_path]
    run_sys(command, prefix='kallisto', file=log_path)

    # Reaching this point is treated as success
    # (presumably run_sys raises on failure -- confirm).
    with open(log_path, 'a') as log:
        log.write('# success')
def build_bowtie2(reference, nthreads=1):
    """
    Builds the bowtie2 index for a reference.

    Arguments:
    reference -- reference object; this function reads
                 reference.paths['root'], reference.paths['dna'] and
                 reference.paths['bowtieIndex']
    nthreads  -- value passed to bowtie2-build's --threads option

    Runs `bowtie2-build` and passes 'bowtie2_log.txt' (inside
    reference.paths['root']) to run_sys as `file`. Any previous log file is
    removed first. Returns nothing.
    """
    print('building bowtie index', file=sys.stderr)

    # Start from a clean log.
    logfile = os.path.join(reference.paths['root'], 'bowtie2_log.txt')
    if os.path.isfile(logfile):
        os.remove(logfile)

    dna_path = reference.paths['dna']
    out_path = reference.paths['bowtieIndex']

    # Make output directory.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    # Command-line arguments must be strings, so the thread count is
    # converted here, as the other helpers in this file already do.
    args = ['bowtie2-build', dna_path, out_path, '--threads', str(nthreads)]
    run_sys(args, prefix='bowtie2', file=logfile)

    # Reaching this point is treated as success
    # (presumably run_sys raises on failure -- confirm).
    with open(logfile, 'a') as f:
        f.write('# success')