def qc_check(i):
    """Run FastQC on the reads of sample ``i``.

    Reads the module-level ``in_dir``, ``out_dir`` and ``gz`` values. The R1
    files are always processed; the R2 files are processed only when at least
    one file name in the sample's input folder contains ``_R2``.

    Args:
        i: Sample name; used both as the sub-folder name and as the file-name
            prefix inside that folder.
    """
    sample_in = in_dir + "/" + i
    dir_listing = os.listdir(sample_in)
    # A non-empty list means the sample has second-mate (paired-end) files.
    r2_hits = [entry for entry in dir_listing if re.findall("_R2", entry)]

    sample_out = out_dir + '/' + i
    functions.make_sure_path_exists(sample_out)

    # The glob is expanded by the shell that os.system() spawns.
    r1_command = ("fastqc " + sample_in + "/" + i + "*_R1*.fastq" + gz
                  + " --outdir=" + sample_out + " --nogroup --extract ")
    os.system(r1_command)

    if r2_hits:
        r2_command = ("fastqc " + sample_in + "/" + i + "*_R2*.fastq" + gz
                      + " --outdir=" + sample_out + " --nogroup --extract")
        os.system(r2_command)
def qc_check(i):
    """Run FastQC on the R1 (and, if present, R2) reads of sample ``i``.

    Uses the module-level ``in_dir``, ``out_dir`` and ``gz`` values. Output is
    written to ``out_dir/<i>``, which is created if needed.

    Args:
        i: Sample name; used both as the sub-folder of ``in_dir`` and as the
            file-name prefix of the fastq files inside it.
    """
    allFiles = os.listdir(in_dir + "/" + i)
    # Paired-end samples are detected by the presence of "_R2" in a file name.
    pairedReads_temp = [x for x in allFiles if re.findall("_R2", x)]
    functions.make_sure_path_exists(out_dir + '/' + i)

    os.system("fastqc " + in_dir + "/" + i + "/" + i + "*_R1*.fastq" + gz
              + " --outdir=" + out_dir + "/" + i + " --nogroup --extract ")

    if pairedReads_temp:
        # Fix: the original read `in_dir + + i`, which applies unary plus to a
        # string (TypeError) and omits the "/" between the input directory and
        # the sample folder.
        os.system("fastqc " + in_dir + "/" + i + "/" + i + "*_R2*.fastq" + gz
                  + " --outdir=" + out_dir + "/" + i + " --nogroup --extract")
# All directories below are resolved against the current working directory.
path = os.getcwd()

# Number of parallel jobs, taken from the command line.
ncores = int(args.ncores)

# Sample names are read from the text file named on the command line.
sample_names_file = args.sample_names_file
sampleNames = functions.read_sample_names(sample_names_file)

# Input, output and report directories as paths under the working directory.
in_dir = '/'.join((path, args.in_dir))
out_dir = '/'.join((path, args.out_dir))
out_dir_report = '/'.join((path, args.out_dir_report))

# The report directory must exist before the R script writes into it.
functions.make_sure_path_exists(out_dir_report)

# Detect whether the input files are gzipped.
gz = functions.check_gz(in_dir)

# Run qc_check once per sample, ncores at a time.
fastqc_pool = Parallel(n_jobs=ncores)
fastqc_pool(delayed(qc_check)(sample) for sample in sampleNames)

# Number of reads per sample.
index_qc_command = "Rscript bin/indexQC.R " + in_dir + " " + out_dir_report
os.system(index_qc_command)
params_file = args.analysis_info_file

# Parse the parameters file once and reuse the result; the original re-read
# the file for every key it looked up.
# NOTE(review): assumes read_parameters_file() has no side effects beyond
# reading the file - confirm.
params = functions.read_parameters_file(params_file)
path = params['Working directory']
refGenome = params['Reference Genome']
strand = params['strand']
# get_strand() yields two values, unpacked into the two strand settings below.
strand_piccard, strand_htseq = functions.get_strand(strand)
gtfFile = params['GTF File']

# Everything after this point runs inside the project's working directory.
os.chdir(path)

# Read sample names text file
sampleNames = functions.read_sample_names()

# Input and output directories as given on the command line.
in_dir = args.in_dir
out_dir = args.out_dir
functions.make_sure_path_exists(out_dir)
mapping_summary_file = args.mapping_summary_file

# Detect if files are gz
gz = functions.check_gz(in_dir)

# Count command: one `counting` call per sample, seven at a time.
Parallel(n_jobs=7)(delayed(counting)(i) for i in sampleNames)

# QC
os.system("Rscript /usr/local/bin/countsLog_rnaseq.R " + out_dir + ' ' + mapping_summary_file)
# out_dir is deliberately passed twice, exactly as in the original command.
os.system("Rscript /usr/local/bin/library_proportion.R " + out_dir + ' ' + out_dir + ' ' + gtfFile)
params_file = args.analysis_info_file

# Parse the parameters file once and reuse the result; the original re-read
# the file for every key it looked up.
# NOTE(review): assumes read_parameters_file() has no side effects beyond
# reading the file - confirm.
params = functions.read_parameters_file(params_file)
path = params['Working directory']
refGenome = params['Reference Genome']
strand = params['strand']
# get_strand() yields two values, unpacked into the two strand settings below.
strand_piccard, strand_htseq = functions.get_strand(strand)
gtfFile = params['GTF File']

# Everything after this point runs inside the project's working directory.
os.chdir(path)

# Read sample names text file
sampleNames = functions.read_sample_names()

# Input and output directories as given on the command line.
in_dir = args.in_dir
out_dir = args.out_dir
functions.make_sure_path_exists(out_dir)
mapping_summary_file = args.mapping_summary_file

# Detect if files are gz
gz = functions.check_gz(in_dir)

# Count command: one `counting` call per sample, seven at a time.
Parallel(n_jobs=7)(delayed(counting)(i) for i in sampleNames)

# QC
os.system("Rscript /usr/local/bin/countsLog_rnaseq.R " + out_dir + ' ' + mapping_summary_file)
# out_dir is deliberately passed twice, exactly as in the original command.
os.system("Rscript /usr/local/bin/library_proportion.R " + out_dir + ' ' + out_dir + ' ' + gtfFile)
project_location = ai['project_location']
folders = os.listdir(project_location)
# Any entry whose name contains 'rawReads' means the reads were already moved.
readsFiles = [x for x in folders if re.findall('rawReads', x)]

# Collect the fastq.gz files that should be moved.
if args.in_dir == 'bcl2fastq_output':
    # in_dir names a key of the analysis info; its value is a folder under the
    # project location.
    allFiles = functions.get_filepaths(ai['project_location'] + '/' + ai[args.in_dir])
    fastq = [x for x in allFiles if re.findall("fastq.gz", x)]
    # Reads that bcl2fastq could not assign to a sample are left behind.
    fastq = [x for x in fastq if not re.findall('Undetermined', x)]
else:
    # in_dir is a directory path. os.path.join() supplies the separator, so the
    # path is right whether or not in_dir ends with '/'; the original plain
    # concatenation produced a wrong path without the trailing slash.
    allFiles = os.listdir(args.in_dir)
    fastq = [x for x in allFiles if re.findall("fastq.gz", x)]
    fastq = [os.path.join(args.in_dir, x) for x in fastq]

# Move reads
if not readsFiles:
    functions.make_sure_path_exists('rawReads')
    sampleDir = []
    for sample in sampleNames:
        # Literal substring match: sample names are not regular expressions, so
        # characters such as '.' or '+' in a name must not be interpreted.
        # NOTE(review): a name that is a prefix of another (S1 / S10) still
        # matches both sets of files, as it did before - confirm naming scheme.
        reads = [x for x in fastq if sample in x]
        # NOTE(review): nesting reconstructed from whitespace-collapsed source;
        # only the directory creation is assumed to be guarded - confirm.
        if sample not in sampleDir:
            functions.make_sure_path_exists('rawReads/' + sample)
        for r in reads:
            # The source path is double-quoted so spaces survive the shell.
            os.system('mv ' + '"' + r + '"' + ' rawReads/' + sample)
        sampleDir.append(sample)
else:
    # Parenthesised form is valid on both Python 2 and Python 3.
    print("rawReads/ already folder exists")
# Switch to the working directory named in the analysis parameters file.
params_file = args.analysis_info_file
path = functions.read_parameters_file(params_file)['Working directory']
os.chdir(path)

# Read sample names text file
sampleNames = functions.read_sample_names()

# Command-line settings: directories, read type and output-name suffix.
in_dir = args.in_dir
out_dir = args.out_dir
out_dir_plots = args.out_dir_plots
readType = args.readType
suffix_name = args.suffix_name

# Keep only the FastQC data files found under the input directory.
files = functions.get_filepaths(in_dir)
files = [found for found in files if re.findall("fastqc_data.txt", found)]

# Build the tables, one job per FastQC data file.
table_pool = Parallel(n_jobs=8)
table_pool(delayed(tables)(data_file) for data_file in files)

# Build the per-sample plots once the plot directory exists.
functions.make_sure_path_exists(out_dir_plots)
plot_pool = Parallel(n_jobs=8)
plot_pool(delayed(plots)(sample) for sample in sampleNames)

# Combined plots across all samples.
rscript_arguments = ' '.join([in_dir, 'sample_names.txt', readType, out_dir_plots, suffix_name])
os.system('Rscript /usr/local/bin/fastqc_plots_all_part2.R ' + rscript_arguments)
#os.system('ls rawReads/*/*fastqc | grep -v trimmed | grep ":" | sed \'s/://g\' > sample_names2.txt')
#os.system('fastqc_summary.py ./sample_names2.txt ./summary_fastqc.txt')
import pickle
import logging
from joblib import Parallel, delayed
import multiprocessing
import subprocess
#sys.path.insert(0,'/usr/local/bin/')
import functions
import argparse

__version__ = 'v01'
# created on 17/08/2016

if __name__ == '__main__':
    # Runs bcl2fastq on the run folder named in the analysis info file and
    # writes its output to <project_location>/<bcl2fastq_output>, with the
    # tool's stdout and stderr captured in <project_location>/bcl_log.txt.
    #
    # NOTE(review): the prog/description strings below still describe
    # 'analysis_info.py' and look copied from another script; they are left
    # unchanged because they are user-visible help text.
    parser = argparse.ArgumentParser(prog='analysis_info.py', description='Creates analysis_info.txt')
    parser.add_argument('-v', '--version', action='version', version='%(prog)s-' + __version__)
    parser.add_argument('--analysis_info_file', help='Text file with details of the analysis. Default=analysis_info.txt', default='analysis_info.txt')
    args = parser.parse_args()

    # Collect info from analysis_info_file
    ai = functions.read_analysis_info_file(args.analysis_info_file)
    functions.make_sure_path_exists(ai['project_location'] + '/' + ai['bcl2fastq_output'])

    # os.system() runs the command through /bin/sh. The bash-only '&>' operator
    # is parsed by a POSIX shell as '&' followed by '>', which backgrounds
    # bcl2fastq and leaves the log empty. '> file 2>&1' is the portable way to
    # send both streams to the log.
    os.system("bcl2fastq -R " + ai['run_folder']
              + " -o " + ai['project_location'] + '/' + ai['bcl2fastq_output']
              + " --no-lane-splitting --sample-sheet " + ai['run_samplesheet']
              + ' > ' + ai['project_location'] + '/bcl_log.txt' + ' 2>&1')
# The project location from the analysis info file becomes the working directory.
ai = functions.read_analysis_info_file(args.analysis_info_file)
path = ai['project_location']
os.chdir(path)

# Number of parallel jobs, taken from the analysis info file.
ncores = int(ai['ncores'])

# The sample names file is looked up inside the project location.
sample_names_file = '/'.join((path, args.sample_names_file))
sampleNames = functions.read_sample_names(sample_names_file)

# Input, output and report directories as paths under the project location.
in_dir = '/'.join((path, args.in_dir))
out_dir = '/'.join((path, args.out_dir))
out_dir_report = '/'.join((path, args.out_dir_report))

# The report directory must exist before the R script writes into it.
functions.make_sure_path_exists(out_dir_report)

# Detect whether the input files are gzipped.
gz = functions.check_gz(in_dir)

# Run qc_check once per sample, ncores at a time.
fastqc_pool = Parallel(n_jobs=ncores)
fastqc_pool(delayed(qc_check)(sample) for sample in sampleNames)

# Number of reads per sample.
index_qc_command = "/usr/bin/Rscript " + path + "/bin/indexQC.R " + in_dir + " " + out_dir_report
os.system(index_qc_command)
args = parser.parse_args()

# Switch to the working directory named in the analysis parameters file.
params_file = args.analysis_info_file
path = functions.read_parameters_file(params_file)['Working directory']
os.chdir(path)

# Read sample names text file
sampleNames = functions.read_sample_names()

# Command-line settings: directories, read type and output-name suffix.
in_dir = args.in_dir
out_dir = args.out_dir
out_dir_plots = args.out_dir_plots
readType = args.readType
suffix_name = args.suffix_name

# Keep only the FastQC data files found under the input directory.
files = functions.get_filepaths(in_dir)
files = [found for found in files if re.findall("fastqc_data.txt", found)]

# Build the tables, one job per FastQC data file.
table_pool = Parallel(n_jobs=8)
table_pool(delayed(tables)(data_file) for data_file in files)

# Build the per-sample plots once the plot directory exists.
functions.make_sure_path_exists(out_dir_plots)
plot_pool = Parallel(n_jobs=8)
plot_pool(delayed(plots)(sample) for sample in sampleNames)

# Combined plots across all samples.
rscript_arguments = ' '.join([in_dir, 'sample_names.txt', readType, out_dir_plots, suffix_name])
os.system('Rscript /usr/local/bin/fastqc_plots_all_part2.R ' + rscript_arguments)
#os.system('ls rawReads/*/*fastqc | grep -v trimmed | grep ":" | sed \'s/://g\' > sample_names2.txt')
#os.system('fastqc_summary.py ./sample_names2.txt ./summary_fastqc.txt')