def summariseGTFs(outfile):
    job_memory = str(PARAMS["Merge_memory"]) + "G"
    statement = ("python {}scripts/annotationSummaryTable.py "
                 "--gtf-dir combined_annotations.dir "
                 "--annot-output {} "
                 "--orf-output {}".format(
                     os.path.dirname(__file__).rstrip("pipelines"),
                     outfile,
                     "report.dir/orf_summary.tsv"))
    P.run(statement)
def run_report(clean=True,
               with_pipeline_status=True,
               pipeline_status_format="svg"):
    '''run cgatreport.

    This will also run ruffus to create an svg image of the pipeline
    status unless *with_pipeline_status* is set to False. The image
    will be saved into the export directory.
    '''

    params = P.get_params()

    if with_pipeline_status:
        targetdir = params["exportdir"]
        if not os.path.exists(targetdir):
            os.mkdir(targetdir)

        ruffus.pipeline_printout_graph(
            os.path.join(
                targetdir,
                "pipeline.%s" % pipeline_status_format),
            pipeline_status_format,
            ["full"],
            checksum_level=params["ruffus_checksums_level"])

    dirname, basename = os.path.split(P.get_caller().__file__)

    report_engine = params.get("report_engine", "cgatreport")
    assert report_engine in ('sphinxreport', 'cgatreport')

    docdir = os.path.join(dirname, "pipeline_docs",
                          iotools.snip(basename, ".py"))
    themedir = os.path.join(dirname, "pipeline_docs", "themes")
    relpath = os.path.relpath(docdir)
    trackerdir = os.path.join(docdir, "trackers")

    # use a fake X display in order to avoid windows popping up
    # from R plots.
    xvfb_command = iotools.which("xvfb-run")

    # permit multiple servers using -d option
    if xvfb_command:
        xvfb_command += " -d "
    else:
        xvfb_command = ""

    # if there is no DISPLAY variable set, xvfb runs, but
    # exits with error when killing process. Thus, ignore return
    # value.
    # print os.getenv("DISPLAY"), "command=", xvfb_command
    if not os.getenv("DISPLAY"):
        erase_return = "|| true"
    else:
        erase_return = ""

    if os.path.exists("conf.py"):
        conf_dir = os.path.abspath(".")
    else:
        conf_dir = os.path.join(os.path.dirname(__file__), "configuration")

    # in the current version, xvfb always returns with an error, thus
    # ignore these.
    erase_return = "|| true"

    if clean:
        clean = "rm -rf report _cache _static;"
    else:
        clean = ""

    # with sphinx >1.3.1 the PYTHONPATH needs to be set explicitly as
    # the virtual environment seems to be stripped. It is thus set to
    # the contents of the current sys.path
    syspath = ":".join(sys.path)

    statement = '''
    %(clean)s
    (export SPHINX_DOCSDIR=%(docdir)s;
     export SPHINX_THEMEDIR=%(themedir)s;
     export PYTHONPATH=%(syspath)s;
     %(xvfb_command)s
     %(report_engine)s-build
         --num-jobs=%(report_threads)s
     sphinx-build
         -b html
         -d %(report_doctrees)s
         -c %(conf_dir)s
         -j %(report_threads)s
         %(docdir)s %(report_html)s
     >& report.log %(erase_return)s )
    '''

    P.run(statement)

    E.info('the report is available at %s' % os.path.abspath(
        os.path.join(params['report_html'], "contents.html")))
def runGOFromFiles(outfile,
                   outdir,
                   fg_file,
                   bg_file=None,
                   go_file=None,
                   ontology_file=None,
                   samples=None,
                   minimum_counts=0,
                   pairs=False,
                   gene2name=None):
    """check for GO enrichment.

    The gene lists are supplied by files. This method is a wrapper
    for `runGO.py`.

    Arguments
    ---------
    outfile : string
        Output filename
    outdir : string
        Output directory for auxiliary files
    fg_file : string
        Gene list of the foreground.
    bg_file : string
        Gene list for the background. If None, all genes with GO
        annotations are used as background.
    go_file : string
        Filename with gene-to-GO assignments
    ontology_file : string
        Filename with ontology information.
    samples : int
        Number of samples for empirical FDR. If not given, use BH FDR.
    minimum_counts : int
        Minimum number of observations required in a GO category
        for it to be used.
    pairs : bool
        If True, each category for each pair of gene sets will be
        tested for differential enrichment.
    gene2name : string
        Filename with gene-to-genename mapping information.
    """
    if ontology_file is None:
        ontology_file = PARAMS.get("go_ontology", None)

    options = []
    if ontology_file:
        options.append("--filename-ontology=%(ontology_file)s" % locals())

    if bg_file is not None:
        options.append("--background-tsv-file=%(bg_file)s" % locals())

    if samples is not None:
        options.append("--fdr")
        options.append("--sample-size=%(samples)i" % locals())
        options.append("--fdr-method=empirical")
    else:
        options.append("--fdr")
        options.append("--fdr-method=BH")

    if pairs:
        options.append("--pairwise")

    if gene2name:
        options.append("--gene2name-map-tsv-file=%s" % gene2name)

    options = " ".join(options)
    statement = '''
    cgat runGO
    --filename-input=%(go_file)s
    --genes-tsv-file=%(fg_file)s
    --output-filename-pattern='%(outdir)s/%%(set)s.%%(go)s.%%(section)s'
    --min-counts=%(minimum_counts)i
    --log=%(outfile)s.log
    %(options)s
    > %(outfile)s'''

    P.run(statement)
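# Illustrative usage sketch (not part of the original pipeline): a minimal
# call to runGOFromFiles for a single foreground gene list.  The file names
# and the "go.dir" output directory are made-up examples only.
def runGOExample(infile, outfile):
    runGOFromFiles(outfile,
                   outdir="go.dir",
                   fg_file=infile,
                   bg_file=None,
                   go_file="go_assignments.tsv.gz",
                   samples=None,
                   minimum_counts=5)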
def cellrangerCount(infile, outfile):
    '''
    Execute the cellranger count pipeline for all samples.
    '''

    # set key parameters
    transcriptome = PARAMS["cellranger_transcriptome"]

    if transcriptome is None:
        raise ValueError('"cellranger_transcriptome" parameter not set'
                         ' in file "pipeline.yml"')

    if not os.path.exists(transcriptome):
        raise ValueError('The specified "cellranger_transcriptome"'
                         ' file does not exist')

    # set the maximum number of jobs for cellranger
    max_jobs = PARAMS["cellranger_maxjobs"]

    # parse the sample name and expected cell number from the input
    # file name ("<library_id>.<cellnumber>.<batch>.<suffix>")
    library_id, cellnumber, batch, trash = os.path.basename(infile).split(".")

    # build lists of the sample files
    seq_folders = []
    sample_ids = []

    # Parse the list of sequencing runs (i.e., paths) for the sample
    with open(infile, "r") as sample_list:
        for line in sample_list:
            seq_folder_path = line.strip()
            if seq_folder_path != "":
                seq_folders.append(seq_folder_path)
                sample_ids.append(os.path.basename(seq_folder_path))

    input_fastqs = ",".join(seq_folders)
    input_samples = ",".join(sample_ids)

    id_tag = library_id + "-count"
    log_file = id_tag + ".log"

    # send one job script to the slurm queue which arranges the cellranger
    # run; resources are hard-coded to ensure enough threads and memory
    job_threads = 6
    job_memory = "24000M"

    statement = (
        '''cellranger count
           --id %(id_tag)s
           --fastqs %(input_fastqs)s
           --sample %(input_samples)s
           --transcriptome %(transcriptome)s
           --expect-cells %(cellnumber)s
           --chemistry %(cellranger_chemistry)s
           --jobmode=slurm
           --maxjobs=%(max_jobs)s
           --nopreflight
           &> %(log_file)s
        ''')

    P.run(statement)

    IOTools.touch_file(outfile)
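# Illustrative sketch (assumption, not from the original pipeline): the input
# file read by cellrangerCount is named "<library_id>.<cellnumber>.<batch>.<suffix>"
# and lists one sequencing-run folder per line, for example:
#
#   sampleA.5000.batch1.txt
#   -----------------------
#   /data/runs/HGX7VDMXX/sampleA
#   /data/runs/HJK2LDMXX/sampleA
#
# The basename of each path is passed to cellranger as the --sample id, and
# the paths themselves are joined into the --fastqs argument.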
def subsetAndDownsample(infiles, outfile):
    '''
    Generate datasets that include subsets of the 10x samples.

    Optionally downsample UMI counts to normalise between samples.
    '''

    outdir = os.path.dirname(outfile)
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    agg_matrix_dir = os.path.join(os.path.dirname(infiles[0]),
                                  "agg.processed.dir")
    sample_table = pd.read_csv(infiles[1], sep="\t")

    subsets = [k.split("_", 1)[1] for k in PARAMS.keys()
               if k.startswith("datasets_")]

    # Titles of fields encoded in filenames
    name_field_titles = PARAMS["name_field_titles"]

    if PARAMS["downsampling_enabled"]:
        downsampling_function = PARAMS['downsampling_function']
    else:
        downsampling_function = "no"

    downsampling_apply = PARAMS["downsampling_apply"]

    job_memory = PARAMS["postprocess_memory"]

    statements = []

    for subset in subsets:

        if subset == "all":
            if not PARAMS["datasets_all"]:
                continue
            sample_ids = set(sample_table["sample_id"].values)
            sample_ids_str = ",".join(sample_ids)
        else:
            sample_ids = PARAMS["datasets" + "_" + subset]
            sample_ids_str = ",".join(
                [x.strip() for x in sample_ids.split(",")])

        out_dir = os.path.join(os.path.dirname(outfile), subset)

        tenx_dir = PARAMS["tenx_dir"]
        log_file = outfile.replace(".sentinel", "." + subset + ".log")

        statement = '''Rscript %(tenx_dir)s/R/cellranger_subsetAndDownsample.R
                       --tenxdir=%(agg_matrix_dir)s
                       --sampleids=%(sample_ids_str)s
                       --downsample=%(downsampling_function)s
                       --apply=%(downsampling_apply)s
                       --samplenamefields=%(name_field_titles)s
                       --outdir=%(out_dir)s
                       &> %(log_file)s
                    ''' % locals()

        statements.append(statement)

    P.run(statements)

    IOTools.touch_file(outfile)
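# Illustrative sketch (not part of the original pipeline): the task above
# derives its subset names from pipeline.yml keys of the form
# "datasets_<subset>", with "datasets_all" switching the full dataset on or
# off.  This helper mirrors that key parsing; the example values are made up.
def _example_subset_names(params):
    """Return the subset names encoded in 'datasets_*' configuration keys."""
    return [k.split("_", 1)[1] for k in params if k.startswith("datasets_")]

# _example_subset_names({"datasets_all": True, "datasets_wildtype": "s1,s2"})
# -> ["all", "wildtype"]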
def build_report():
    scriptloc = "/".join(os.path.dirname(
        sys.argv[0]).split("/")[0:-1]) + "/scripts/assembly_report.Rmd"
    statement = ('R -e "rmarkdown::render(\'{}\','
                 'output_file=\'{}/report.dir/assembly_report.html\')" '
                 '--args {}/contigs.dir/Contigs.Summary'.format(
                     scriptloc, os.getcwd(), os.getcwd()))
    P.run(statement)
def run(self, infile, outfile, params):

    options = []
    reference_fasta = params.reference_fasta
    reference_fasta_map = build_reference_fasta_map(
        params.reference_fasta_map)
    reference_label = None
    use_target_regions = True

    if params.reference_fasta:
        map_path2name = dict([(x[1], x[0])
                              for x in list(reference_fasta_map.items())])
        if params.reference_fasta == "auto":
            fasta = resolve_argument(
                list(reference_fasta_map.values()), ",").split(",")

            reference_fasta, diffs = get_reference_for_bam(
                infile,
                fastafiles=fasta)

            if reference_fasta:
                options.append("--ref-seq {}".format(reference_fasta))
                reference_label = map_path2name[reference_fasta]
            elif diffs:
                E.warn(
                    "attempted to detect reference fasta, but unable to do so. "
                    "diffs: {}".format(diffs))
            else:
                E.warn("sequence dict is empty, BAM likely to be empty. "
                       "target_regions will be ignored")
                use_target_regions = False
        else:
            options.append("--ref-seq {}".format(params.reference_fasta))
            reference_label = map_path2name.get(params.reference_fasta, None)

    if params.target_regions and use_target_regions:
        target_regions = get_associated_file(params,
                                             reference_label,
                                             "target_regions")
        # convert to 1-based coordinates and decompress
        if target_regions.endswith(".bed.gz"):
            target_regions = (
                "<(zcat {} "
                "| awk '{{printf(\"%%s\\t%%i\\t%%i\\n\", $1, $2+1, $3)}}')"
                .format(target_regions))
        options.append("--target-regions {}".format(target_regions))

    options = " ".join(options)

    if not os.path.exists(outfile + ".tmp"):
        try:
            retval = P.run("{params.path} stats "
                           "{self.options} "
                           "{options} "
                           "{infile} "
                           "2> {outfile}.log "
                           "> {outfile}.tmp; ".format(**locals()),
                           job_memory="16G")
        except OSError as e:
            E.warn("input file {} gave the following errors: {}".format(
                infile, str(e)))
            return None
    else:
        retval = None

    def split_output(lines):
        is_comment = True
        section, body = None, []
        for line in lines:
            if line.startswith("#"):
                if body:
                    yield section, body
                body = []
                is_comment = True
            else:
                # the following preserves new-line
                line = re.sub("\t#.*", "", line)
                fields = line[:-1].split("\t")
                section = fields[0]
                body.append(fields[1:])
                is_comment = False

        if body:
            yield section, body

    # split into separate files for upload
    with IOTools.open_file(outfile + ".tmp") as inf:
        for section, body in split_output(inf):
            try:
                tablename, columns = self._map_section_to_table[section]
            except KeyError:
                continue

            output_file = self.map_table_to_file(tablename, outfile)
            with IOTools.open_file(output_file, "w") as outf:
                if len(columns) > 1 and columns[1].startswith("VAR_"):
                    outf.write("{}\t{}\n".format(columns[0], columns[1][4:]))
                    for data in body:
                        outf.write("{}\t{}\n".format(
                            data[0], ",".join(data)))
                else:
                    outf.write("\t".join(columns) + "\n")
                    # remove first column, which contains the identifier
                    outf.write("\n".join(
                        ["\t".join(x) for x in body]) + "\n")

    os.rename(outfile + ".tmp", outfile)

    return retval
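# Illustrative sketch (not part of the original code): how split_output above
# partitions `samtools stats` output into (section, rows) pairs.  The input
# lines are a made-up two-section excerpt for demonstration only.
def _example_split_samtools_stats():
    example = [
        "# This file was produced by samtools stats\n",
        "SN\traw total sequences:\t1000\n",
        "SN\treads mapped:\t950\n",
        "# Insert sizes\n",
        "IS\t200\t10\t5\t5\t0\n",
    ]
    section, body = None, []
    for line in example:
        if line.startswith("#"):
            # a comment line closes the current section
            if body:
                yield section, body
            body = []
        else:
            fields = line[:-1].split("\t")
            section = fields[0]
            body.append(fields[1:])
    if body:
        yield section, body

# list(_example_split_samtools_stats())
# -> [('SN', [['raw total sequences:', '1000'], ['reads mapped:', '950']]),
#     ('IS', [['200', '10', '5', '5', '0']])]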
def mergeTables(outfile):
    statement = PipelineHumann2.humann2Merge(outfile, PARAMS)
    P.run(statement)
def normTables(infile, outfile):
    if re.search("coverage", infile):
        statement = "ln -s ../{} {}".format(infile, outfile)
    else:
        statement = PipelineHumann2.humann2Norm(infile, outfile, PARAMS)
    P.run(statement)
def flagstat(infile, outfile):
    statement = '''samtools flagstat %(infile)s > %(outfile)s'''
    P.run(statement)
def runHumann2(infile, outfile):
    job_threads = int(PARAMS["Humann2_threads"])
    job_memory = PARAMS["Humann2_memory"] + "G"
    statement = PipelineHumann2.humann2Call(infile, PARAMS)
    P.run(statement)
def idxstats(infile, outfile):
    statement = '''samtools idxstats %(infile)s > %(outfile)s'''
    P.run(statement)
def multiqc(infiles, outfile):
    statement = '''
    export LC_ALL=en_US.UTF-8 &&
    export LANG=en_US.UTF-8 &&
    multiqc . -f -n %(outfile)s
    '''
    P.run(statement)
def fastqc(infile, outfile):
    statement = '''fastqc %(infile)s > %(outfile)s.log'''
    P.run(statement)
def mapReads(infiles, outfile):
    '''Map reads to the genome using BWA.'''
    job_threads = PARAMS["bwa_threads"]
    m = PipelineMapping.BWA()
    statement = m.build((infiles, ), outfile)
    P.run(statement)
def picard_rmdup(infile, outfile):
    final_memory = str(int(params['picard_memory']) + 2) + 'g'
    statement = """picard -Xmx%(picard_memory)sg MarkDuplicates
                   REMOVE_DUPLICATES=true
                   I=%(infile)s
                   O=%(outfile)s
                   M=%(outfile)s.metrics &&
                   samtools index %(outfile)s"""
    P.run(statement, job_queue=params['q'], job_memory=final_memory)
def summariseContigs(infile, outfile):
    # summarise each contigs file
    statement = PipelineAssembly.SummariseContigs(infile, outfile)
    P.run(statement)
def filter_read(infile, outfile):
    statement = """samtools view -h -b -f1 -f2 -F 4 -F 0x100
                   %(infile)s chr{1..19} > %(outfile)s &&
                   samtools index %(outfile)s"""
    P.run(statement, job_queue=params['q'])
def indexBams(infile, outfile):
    '''index merged bams'''
    statement = f'''samtools index -b {infile} {outfile}'''
    P.run(statement)
def macs2(infiles, outfile):
    treatment, control = infiles
    name = outfile.replace('_peaks.narrowPeak', "").replace('peaks/', '')
    statement = """macs2 callpeak -t %(treatment)s -c %(control)s
                   -n %(name)s -f BAM -g %(macs2_genome)s --outdir peaks"""
    P.run(statement, job_queue=params["q"])
def run(self, infile, outfile, params):

    # TODO: bam_fastqc_sequence_length_distribution.tsv may
    # contain ranges such as '30-31'. Convert to beginning of
    # range like in this perl command:
    #
    # perl -p -i -e "s/\-\d+//"
    # *.dir/bam_fastqc.dir/bam_fastqc.tsv.bam_fastqc_sequence_length_distribution.tsv

    if infile.endswith(".gz"):
        prefix = IOTools.snip(os.path.basename(infile[:-3]))
    else:
        prefix = IOTools.snip(os.path.basename(infile))

    outdir = os.path.dirname(outfile)

    datafile = os.path.join(outdir,
                            "{}_fastqc".format(prefix),
                            "fastqc_data.txt")

    if not os.path.exists(datafile):
        if not os.path.exists(outdir):
            os.makedirs(outdir)

        retval = P.run(
            "{params.path} "
            "{params.options} "
            "--extract "
            "--outdir {outdir} "
            "{infile} "
            ">& {outfile} ".format(**locals()), **params._asdict())
    else:
        IOTools.touch_file(outfile)
        retval = None

    def _split_output(lines):
        body, header, section, status = [], None, None, None
        for line in lines:
            if line.startswith("##FastQC"):
                continue
            elif line.startswith("#"):
                header, body = line[1:-1].split("\t"), []
            elif line.startswith(">>END_MODULE"):
                yield section, header, body, status
                body, header, section, status = [], None, None, None
            elif line.startswith(">>"):
                section, status = line[2:-1].split("\t")
            else:
                fields = line[:-1].split("\t")
                body.append(fields)

    # split into separate files for upload
    summary_data = []
    with IOTools.open_file(datafile) as inf:
        for section, header, body, status in _split_output(inf):
            if len(body) == 0:
                continue

            summary_data.append((section, status))

            tablename = "{}_".format(self.name) + re.sub(
                " ", "_", section).lower()
            if tablename not in self.tablenames:
                raise ValueError(
                    "unknown tablename {}, expected one of {}".format(
                        tablename, self.tablenames))

            output_file = ".".join((outfile, tablename, "tsv"))
            with open(output_file, "w") as outf:
                outf.write("\t".join([x.lower() for x in header]) + "\n")
                # remove first column, which contains the identifier
                outf.write("\n".join(
                    ["\t".join(x) for x in body]) + "\n")

    output_file = ".".join(
        (outfile, "{}_summary".format(self.name), "tsv"))
    with IOTools.open_file(output_file, "w") as outf:
        outf.write("section\tstatus\n")
        for section, status in summary_data:
            outf.write("{}\t{}\n".format(section, status))

    return retval
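# Illustrative sketch (assumption, not part of the original code): a Python
# equivalent of the perl one-liner referenced in the TODO above, collapsing
# FastQC sequence-length ranges such as "30-31" to the start of the range.
# The default file name is only an example.
def _example_collapse_length_ranges(
        path="bam_fastqc_sequence_length_distribution.tsv"):
    import re
    with open(path) as inf:
        # remove the first "-<number>" on each line, mirroring s/\-\d+//
        lines = [re.sub(r"-\d+", "", line, count=1) for line in inf]
    with open(path, "w") as outf:
        outf.writelines(lines)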
def fastqc(infile, outfile):
    statement = "fastqc --nogroup -o fastqc %(infile)s > %(outfile)s.log"
    P.run(statement, job_queue=params["q"])
def postprocessAggrMatrix(infiles, outfile):
    '''
    Post-process the cellranger aggr count matrix.

    Batch, sample_name and aggregation ID metadata are added.

    Optionally cells with barcodes shared (within sequencing batch)
    between samples can be removed (known index hopping on Illumina 4000).
    '''

    outdir = os.path.dirname(outfile)
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    infile = infiles[0]
    sample_table = infiles[1]

    agg_dir = os.path.dirname(infile)
    out_dir = os.path.dirname(outfile)

    # Clean barcode hopping
    if PARAMS["postprocess_barcodes"]:
        hopping = "--hopping"
    else:
        hopping = ""

    # Additional options
    options = PARAMS["postprocess_options"]

    mexdir = PARAMS["postprocess_mexdir"]

    if mexdir is None:
        raise ValueError('"postprocess_mexdir" parameter not set'
                         ' in file "pipeline.yml"')

    tenxdir = os.path.join(agg_dir, mexdir)
    if not os.path.exists(tenxdir):
        raise ValueError('The specified "postprocess_mexdir"'
                         ' directory does not exist in directory ' + agg_dir)

    job_memory = PARAMS["postprocess_memory"]

    blacklist = PARAMS["postprocess_blacklist"]

    log_file = outfile.replace(".sentinel", ".log")

    statement = '''Rscript %(tenx_dir)s/R/cellranger_postprocessAggrMatrix.R
                   --tenxdir=%(tenxdir)s
                   --sampletable=%(sample_table)s
                   --samplenamefields=%(name_field_titles)s
                   --downsample=no
                   %(hopping)s
                   --blacklist=%(blacklist)s
                   %(options)s
                   --outdir=%(out_dir)s
                   &> %(log_file)s
                '''

    P.run(statement)

    IOTools.touch_file(outfile)
def multiqc(infiles, outfile):
    statement = """export LC_ALL=en_US.UTF-8;
                   export LANG=en_US.UTF-8;
                   multiqc fastqc/ -f -n %(outfile)s"""
    P.run(statement, job_queue=params["q"])
def combine_means(infiles, outfile):
    infiles = " ".join(infiles)
    statement = ("cat %(infiles)s "
                 "> %(outfile)s ".format(**locals()))
    P.run(statement)
def picard(infile, outfile):
    final_memory = str(int(params['picard_memory']) + 2) + 'g'
    statement = """picard -Xmx%(picard_memory)sg CollectAlignmentSummaryMetrics
                   R=%(picard_genome)s
                   I=%(infile)s
                   O=%(outfile)s"""
    P.run(statement, job_queue=params['q'], job_memory=final_memory)
def processReads(infile, outfiles):
    '''process reads from .fastq and other sequence files.
    '''
    trimmomatic_options = P.get_params()["trimmomatic_options"]

    if P.get_params()["auto_remove"]:
        trimmomatic_options = " ILLUMINACLIP:%s:%s:%s:%s:%s:%s " % (
            "contaminants.fasta",
            P.get_params()["trimmomatic_mismatches"],
            P.get_params()["trimmomatic_p_thresh"],
            P.get_params()["trimmomatic_c_thresh"],
            P.get_params()["trimmomatic_min_adapter_len"],
            P.get_params()["trimmomatic_keep_both_reads"]) + trimmomatic_options
    elif P.get_params()["trimmomatic_adapter"]:
        trimmomatic_options = " ILLUMINACLIP:%s:%s:%s:%s:%s:%s " % (
            P.get_params()["trimmomatic_adapter"],
            P.get_params()["trimmomatic_mismatches"],
            P.get_params()["trimmomatic_p_thresh"],
            P.get_params()["trimmomatic_c_thresh"],
            P.get_params()["trimmomatic_min_adapter_len"],
            P.get_params()["trimmomatic_keep_both_reads"]) + trimmomatic_options

    job_threads = P.get_params()["threads"]
    job_memory = "12G"

    track = re.match(REGEX_TRACK, infile).groups()[0]

    m = preprocess.MasterProcessor(
        save=P.get_params()["save"],
        summarize=P.get_params()["summarize"],
        threads=P.get_params()["threads"],
        qual_format=P.get_params()['qual_format'])

    for tool in P.as_list(P.get_params()["preprocessors"]):

        if tool == "fastx_trimmer":
            m.add(preprocess.FastxTrimmer(
                P.get_params()["fastx_trimmer_options"],
                threads=P.get_params()["threads"]))
        elif tool == "trimmomatic":
            m.add(preprocess.Trimmomatic(
                trimmomatic_options,
                threads=P.get_params()["threads"]))
        elif tool == "sickle":
            m.add(preprocess.Sickle(
                P.get_params()["sickle_options"],
                threads=P.get_params()["threads"]))
        elif tool == "trimgalore":
            m.add(preprocess.Trimgalore(
                P.get_params()["trimgalore_options"],
                threads=P.get_params()["threads"]))
        elif tool == "flash":
            m.add(preprocess.Flash(
                P.get_params()["flash_options"],
                threads=P.get_params()["threads"]))
        elif tool == "reversecomplement":
            m.add(preprocess.ReverseComplement(
                P.get_params()["reversecomplement_options"]))
        elif tool == "pandaseq":
            m.add(preprocess.Pandaseq(
                P.get_params()["pandaseq_options"],
                threads=P.get_params()["threads"]))
        elif tool == "cutadapt":
            cutadapt_options = P.get_params()["cutadapt_options"]
            if P.get_params()["auto_remove"]:
                cutadapt_options += " -a file:contaminants.fasta "
            m.add(preprocess.Cutadapt(
                cutadapt_options,
                threads=P.get_params()["threads"],
                untrimmed=P.get_params()['cutadapt_reroute_untrimmed'],
                process_paired=P.get_params()["cutadapt_process_paired"]))
        else:
            raise NotImplementedError("tool '%s' not implemented" % tool)

    statement = m.build((infile,), "processed.dir/trimmed-", track)

    P.run(statement)
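# Illustrative sketch (not part of the original pipeline): how the
# ILLUMINACLIP fragment assembled above looks for a concrete set of
# configuration values.  The parameter values are made up for the example.
def _example_illuminaclip_options():
    example_params = {
        "trimmomatic_adapter": "TruSeq3-PE.fa",
        "trimmomatic_mismatches": 2,
        "trimmomatic_p_thresh": 30,
        "trimmomatic_c_thresh": 10,
        "trimmomatic_min_adapter_len": 8,
        "trimmomatic_keep_both_reads": "true",
    }
    return " ILLUMINACLIP:%s:%s:%s:%s:%s:%s " % (
        example_params["trimmomatic_adapter"],
        example_params["trimmomatic_mismatches"],
        example_params["trimmomatic_p_thresh"],
        example_params["trimmomatic_c_thresh"],
        example_params["trimmomatic_min_adapter_len"],
        example_params["trimmomatic_keep_both_reads"])

# _example_illuminaclip_options()
# -> " ILLUMINACLIP:TruSeq3-PE.fa:2:30:10:8:true "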
def flagstats(infile, outfile):
    statement = """samtools flagstat %(infile)s > %(outfile)s"""
    P.run(statement, job_queue=params['q'])
def createGOFromGeneOntology(infile, outfile):
    """get GO assignments from Geneontology.org

    GO terms are mapped to ensembl gene names via uniprot identifiers.

    Configuration
    -------------
    geneontology_file
       Filename on geneontology database, e.g.,
       gene_association.goa_human.gz
    database_name
       Pipeline database name

    Arguments
    ---------
    infile : string
        Unused
    outfile : string
        Output filename
    """

    filename = os.path.join(os.path.dirname(outfile), "geneontology.goa.gz")
    if not os.path.exists(filename):
        statement = '''
        wget -O %(filename)s http://cvsweb.geneontology.org/cgi-bin/cvsweb.cgi/go/gene-associations/%(go_geneontology_file)s?rev=HEAD
        '''
        P.run(statement)

    # see http://www.geneontology.org/gene-associations/readme/goa.README
    Data = collections.namedtuple(
        "Data",
        "db db_object_id db_object_symbol qualifier goid dbreference evidence "
        "with_id aspect "
        "db_object_name synonym db_object_type "
        "taxon_id date assigned_by "
        "annotation_extension "
        "gene_product_form_id")

    dbh = sqlite3.connect(PARAMS["database_name"])
    cc = dbh.cursor()
    map_uniprot2ensembl = dict(
        cc.execute("SELECT DISTINCT gene_name, gene_id FROM transcript_info")
        .fetchall())
    map_goid2description = dict(
        cc.execute("SELECT DISTINCT go_id, description FROM go_assignments")
        .fetchall())

    aspect2name = {
        "P": "biol_process",
        "F": "mol_function",
        "C": "cell_location"}

    c = E.Counter()
    found_uniprot, found_genes, notfound_uniprot = set(), set(), set()
    outf = iotools.open_file(outfile, "w")
    outf.write("go_type\tgene_id\tgo_id\tdescription\tevidence\n")

    for line in iotools.open_file(filename):
        if line.startswith("!"):
            continue
        c.input += 1
        data = Data._make(line[:-1].split("\t"))

        if data.db_object_symbol in map_uniprot2ensembl:
            gene_id = map_uniprot2ensembl[data.db_object_symbol]
            found_uniprot.add(data.db_object_symbol)
            found_genes.add(gene_id)
            outf.write(
                "%s\t%s\t%s\t%s\t%s\n" %
                (aspect2name[data.aspect],
                 gene_id,
                 data.goid,
                 map_goid2description.get(data.goid, ""),
                 data.evidence))
            c.output += 1
        else:
            c.notfound += 1
            notfound_uniprot.add(data.db_object_symbol)

    c.found_genes = len(found_genes)
    c.found_uniprot = len(found_uniprot)
    c.notfound_uniprot = len(notfound_uniprot)

    E.info("%s" % str(c))
    E.info("not found=%s" % str(notfound_uniprot))

    outf.close()
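# Illustrative sketch (not part of the original pipeline): the task above
# writes a five-column TSV ("go_type", "gene_id", "go_id", "description",
# "evidence").  This helper shows how such a file could be read back into a
# gene_id -> set-of-GO-ids mapping; the default file name is an example only.
def _example_read_go_assignments(path="geneontology_go_assignments.tsv"):
    gene2go = collections.defaultdict(set)
    with open(path) as inf:
        header = inf.readline().rstrip("\n").split("\t")
        for line in inf:
            fields = dict(zip(header, line.rstrip("\n").split("\t")))
            gene2go[fields["gene_id"]].add(fields["go_id"])
    return gene2go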
def meganAnnot(infile, outfile):
    job_memory = str(PARAMS["Blast2lca_memory"]) + "G"
    job_threads = int(PARAMS["Blast2lca_threads"])
    # generate the call to blast2lca
    statement = PipelineAnnotate.runBlast2Lca(infile, outfile, PARAMS)
    P.run(statement)