"pipeline_annotations.py", on_error_raise=__name__ == "__main__") # link up with ancestral repeats PARAMS_ANCESTRAL_REPEATS = P.peekParameters(PARAMS["ancestral_repeats_dir"], "pipeline_ancestral_repeats.py") ################################################################### ################################################################### # Helper functions mapping tracks to conditions, etc ################################################################### import CGATPipelines.PipelineTracks as PipelineTracks # collect sra nd fastq.gz tracks TRACKS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory( glob.glob("*.gtf.gz"), "(\S+).gtf.gz", exclude=("repeats.gtf.gz", "introns.gtf.gz", "merged.gtf.gz")) TRACKS_CONTROL = PipelineTracks.Tracks( PipelineTracks.Sample).loadFromDirectory( ("repeats.gtf.gz", "introns.gtf.gz"), "(\S+).gtf.gz") TRACKS_META = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory( ("merged.gtf.gz", ), "(\S+).gtf.gz") TRACKS_GENESETS = PipelineTracks.Tracks( PipelineTracks.Sample).loadFromDirectory(("genes.gtf.gz", ), "(\S+).gtf.gz") # collection of all tracks including controls TRACKS_WITH_CONTROLS = TRACKS + TRACKS_CONTROL
PARAMS = P.PARAMS
PARAMS_ANNOTATIONS = P.peekParameters(
    PARAMS["annotations_dir"],
    "pipeline_annotations.py",
    on_error_raise=__name__ == "__main__")

###################################################################
###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
# load all tracks - exclude input/control tracks
Sample = PipelineTracks.Sample

# Raw string: "\S" is an invalid escape sequence in a plain literal.
TRACKS = PipelineTracks.Tracks(Sample).loadFromDirectory(
    glob.glob("*.bed.gz"),
    r"(\S+).bed.gz")

# file name belonging to each track
TRACKS_BEDFILES = ["%s.bed.gz" % x for x in TRACKS]

###################################################################
###################################################################
###################################################################
# if pipeline_conf.py exists: execute it to change the above assignments
if os.path.exists("pipeline_conf.py"):
    L.info("reading additional configuration from pipeline_conf.py")
    # execfile() only exists in Python 2; compile() + exec() runs the
    # file in this module's namespace on Python 2 and 3 alike, and the
    # with-block makes sure the file handle is closed again.
    with open("pipeline_conf.py") as _conf_file:
        exec(compile(_conf_file.read(), "pipeline_conf.py", "exec"))

###################################################################
###################################################################
###################################################################
#
import CGAT.Pipeline as P P.getParameters(["%s.ini" % os.path.splitext(__file__)[0], "pipeline.ini"]) PARAMS = P.PARAMS ################################################################### ################################################################### ################################################################### # Helper functions mapping tracks to conditions, etc ################################################################### # load all tracks - exclude input/control tracks Sample = PipelineTracks.Sample3 #TRACKS = PipelineTracks.Tracks( PipelineTracks.Sample3 ).loadFromDirectory( [x for x in glob.glob( "*.fastq.gz" ) if PARAMS["tracks_control"] not in x], "(\S+).fastq.gz" ) TRACKS = PipelineTracks.Tracks(Sample).loadFromDirectory( [x.replace("../", "") for x in glob.glob("*.export.txt.gz") if PARAMS["tracks_control"] not in x], "(\S+).export.txt.gz" ) +\ PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory( [x.replace("../", "") for x in glob.glob("*.sra") if PARAMS["tracks_control"] not in x], "(\S+).sra" ) +\ PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory( [x.replace("../", "") for x in glob.glob("*.fastq.gz") if PARAMS["tracks_control"] not in x], "(\S+).fastq.gz" ) +\ PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory( [x.replace("../", "") for x in glob.glob("*.fastq.1.gz") if PARAMS["tracks_control"] not in x], "(\S+).fastq.1.gz" ) +\ PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory( [x.replace("../", "")
import CGAT.Pipeline as Pipeline
PARAMS_PIPELINE = Pipeline.peekParameters(".", "pipeline_chipseq.py")

import CGATPipelines.PipelineTracks as PipelineTracks

Sample = PipelineTracks.Sample3

# read-file suffixes that define a track
suffixes = ["export.txt.gz", "sra", "fastq.gz", "fastq.1.gz", "csfasta.gz"]

# Concatenate one Tracks collection per suffix, starting from an empty
# collection.  Files with "input" in their path are skipped.
# The list is handed to sum() directly (wrapping a single list in
# itertools.chain added nothing) and the pattern is a raw string because
# "\S" is not a valid escape sequence in a plain string literal.
TRACKS = sum(
    [PipelineTracks.Tracks(Sample).loadFromDirectory(
        [x for x in glob.glob("%s/*.%s" % (DATADIR, s))
         if "input" not in x],
        r"%s/(\S+).%s" % (DATADIR, s))
     for s in suffixes],
    PipelineTracks.Tracks(Sample))

Sample.setDefault("asTable")

# aggregates over all tracks and over selected labels
ALL = PipelineTracks.Aggregate(TRACKS)
EXPERIMENTS = PipelineTracks.Aggregate(TRACKS, labels=("condition", "tissue"))
CONDITIONS = PipelineTracks.Aggregate(TRACKS, labels=("condition", ))
TISSUES = PipelineTracks.Aggregate(TRACKS, labels=("tissue", ))

############################################################################
# The following need to be parameterized in a config file
# TISSUES=["GM00855", "GM00861" ]
# CONDITIONS=["D3", "unstim" ]
# REPLICATES=["R1", "R2" ]
################################################################### ################################################################### ## parameterization EXPORTDIR = P.get('mapping_exportdir', P.get('exportdir', 'export')) DATADIR = P.get('mapping_datadir', P.get('datadir', '.')) DATABASE = P.get('mapping_backend', P.get('sql_backend', 'sqlite:///./csvdb')) ################################################################### # cf. pipeline_rnaseq.py # This should be automatically gleaned from pipeline_rnaseq.py ################################################################### import CGATPipelines.PipelineTracks as PipelineTracks TRACKS = PipelineTracks.Tracks( PipelineTracks.Sample ).loadFromDirectory( glob.glob( "%s/*.sra" % DATADIR), "%s/(\S+).sra" % DATADIR) +\ PipelineTracks.Tracks( PipelineTracks.Sample ).loadFromDirectory( glob.glob( "%s/*.fastq.gz" % DATADIR), "%s/(\S+).fastq.gz" % DATADIR ) +\ PipelineTracks.Tracks( PipelineTracks.Sample ).loadFromDirectory( glob.glob( "%s/*.fastq.1.gz" % DATADIR), "%s/(\S+).fastq.1.gz" % DATADIR ) +\ PipelineTracks.Tracks( PipelineTracks.Sample ).loadFromDirectory( glob.glob( "*.csfasta.gz" ), "(\S+).csfasta.gz" ) ########################################################################### ## tracks for the gene sets class GenesetTrack(PipelineTracks.Sample): attributes = ("geneset", ) GENESET_TRACKS = PipelineTracks.Tracks(GenesetTrack).loadFromDirectory(
###################################################################
###################################################################
## parameterization
EXPORTDIR = P['rnaseqtranscripts_exportdir']
DATADIR = P['rnaseqtranscripts_datadir']
DATABASE = P['rnaseqtranscripts_backend']

###################################################################
# cf. pipeline_rnaseq.py
# This should be automatically gleaned from pipeline_rnaseq.py
###################################################################
import CGATPipelines.PipelineTracks as PipelineTracks

# Tracks are built from the .bam files found in DATADIR.  Patterns are raw
# strings because "\S" is not a valid escape sequence in a plain string
# literal.
TRACKS = PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory(
    glob.glob("%s/*.bam" % DATADIR),
    r"%s/(\S+).bam" % DATADIR)

# aggregates over all tracks and over selected labels
ALL = PipelineTracks.Aggregate(TRACKS)
EXPERIMENTS = PipelineTracks.Aggregate(TRACKS, labels=("condition", "tissue"))
CONDITIONS = PipelineTracks.Aggregate(TRACKS, labels=("condition", ))
TISSUES = PipelineTracks.Aggregate(TRACKS, labels=("tissue", ))

# gene sets are the *.gtf.gz files in the working directory
GENESETS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("*.gtf.gz"),
    r"(\S+).gtf.gz")

###########################################################################
CUFFDIFF_LEVELS = ("gene", "isoform", "cds", "tss")

###########################################################################
## shorthand
# parameterization
# Section-specific options take precedence over the general ones, with a
# hard-coded fallback as the last resort.
EXPORTDIR = P.get('rnaseqdiffexpression_exportdir',
                  P.get('exportdir', 'export'))
DATADIR = P.get('rnaseqdiffexpression_datadir',
                P.get('datadir', '.'))
DATABASE = P.get('rnaseqdiffexpression_backend',
                 P.get('sql_backend', 'sqlite:///./csvdb'))
DATABASE_ANNOTATIONS = P['annotations_database']

###################################################################
# cf. pipeline_rnaseq.py
# This should be automatically gleaned from pipeline_rnaseq.py
###################################################################
# Patterns are raw strings because "\S" is not a valid escape sequence in
# a plain string literal.
# NOTE(review): the files are globbed inside DATADIR but the pattern does
# not contain DATADIR, so the captured group includes the directory
# prefix - confirm that this is intended.
TRACKS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("%s/*.bam" % DATADIR),
    r"(\S+).bam")

# aggregates over all tracks and over selected labels
ALL = PipelineTracks.Aggregate(TRACKS)
EXPERIMENTS = PipelineTracks.Aggregate(TRACKS, labels=("condition", "tissue"))
CONDITIONS = PipelineTracks.Aggregate(TRACKS, labels=("condition", ))
TISSUES = PipelineTracks.Aggregate(TRACKS, labels=("tissue", ))

# gene sets, designs and methods are taken from the working directory
GENESETS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("*.gtf.gz"),
    r"(\S+).gtf.gz")
DESIGNS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("design*.tsv"),
    r"(\S+).tsv")
METHODS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("*_stats.tsv"),
    r"(\S+)_stats.tsv")
###################################################
# Pipeline configuration
# load options from the config file
from CGATCore import Pipeline as P

# Candidate configuration files: next to this script (in a directory named
# after it), in the parent directory and in the working directory.
P.getParameters([
    "%s/pipeline.ini" % os.path.splitext(__file__)[0],
    "../pipeline.ini",
    "pipeline.ini",
])
PARAMS = P.PARAMS

###################################################################
# Helper functions mapping tracks to conditions, etc
# Patterns are raw strings because "\S" is not a valid escape sequence in
# a plain string literal.
GENESETS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("*.gtf.gz"),
    r"(\S+).gtf.gz")

TRACKS3 = PipelineTracks.Tracks(PipelineTracks.Sample3)
TRACKS = TRACKS3.loadFromDirectory(glob.glob("*.bam"), r"(\S+).bam")

REPLICATE = PipelineTracks.Aggregate(TRACKS, labels=("replicate", ))
TIME = PipelineTracks.Aggregate(TRACKS, labels=("condition", "tissue"))


def connect():
    '''connect to database.

    Use this method to connect to additional databases.

    Returns a database connection.
    '''
    dbh = sqlite3.connect(PARAMS["database_name"])
###################################################################
###################################################################
###################################################################
##
###################################################################
if os.path.exists("pipeline_conf.py"):
    L.info("reading additional configuration from pipeline_conf.py")
    # Run the optional configuration file in this module's namespace; the
    # with-block closes the file handle that was previously left open.
    with open("pipeline_conf.py") as _conf_file:
        exec(compile(_conf_file.read(), "pipeline_conf.py", 'exec'))

PARAMS = P.getParameters()

###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
# Raw string: "\S" is not a valid escape sequence in a plain literal.
TRACKS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("*.gtf.gz"),
    r"(\S+).gtf.gz",
    exclude=(".mapped.gtf.gz", ))

#####################################################################
#####################################################################
#####################################################################


@transform(TRACKS.getTracks("%s.gtf.gz"),
           suffix(".gtf.gz"),
           '.psl.gz')
def convertGtf2Psl(infile, outfile):
    """convert a gtf to a psl file.

    This method only takes features of type 'exon' and
    skips all contigs that are not in the genome sequence
    (for example the variant human chromosomes).
    """
"*.sra", "*.export.txt.gz", "*.csfasta.gz", "*.csfasta.F3.gz", ) SEQUENCEFILES = tuple([os.path.join(DATADIR, suffix_name) for suffix_name in SEQUENCESUFFIXES]) SEQUENCEFILES_REGEX = regex( r"(\S+)-(\S+)-(\S+).(?P<suffix>fastq.1.gz|fastq.gz|sra)") Sample = PipelineTracks.AutoSample Sample.attributes = ('tissue', 'condition', 'replicate') TRACKS = PipelineTracks.Tracks(Sample).loadFromDirectory( [y for x in SEQUENCESUFFIXES for y in glob.glob(x)], "(\S+).(fastq.1.gz|fastq.gz|sra)") EXPERIMENTS = PipelineTracks.Aggregate(TRACKS, labels=("tissue", "condition")) CONDITIONS = PipelineTracks.Aggregate(TRACKS, labels=("condition", )) REPLICATES = PipelineTracks.Aggregate(TRACKS, labels=("replicate", )) ######################################################################### # summarise read 3' ######################################################################### @follows(mkdir("sequence_characteristics.dir")) @transform(SEQUENCEFILES, SEQUENCEFILES_REGEX, r"sequence_characteristics.dir/\1-\2-\3.\g<suffix>_start.tsv")
###################################################
# load options from the config file
import CGAT.Pipeline as P
P.getParameters("pipeline.ini")
PARAMS = P.PARAMS

###################################################################
###################################################################
## Helper functions mapping tracks to conditions, etc
###################################################################
import CGATPipelines.PipelineTracks as PipelineTracks

# collect fastq.gz tracks (single-end and first-in-pair files)
# Patterns are raw strings because "\S" is not a valid escape sequence in
# a plain string literal.
TRACKS = (
    PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory(
        glob.glob("*.fastq.gz"),
        r"(\S+).fastq.gz")
    + PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory(
        glob.glob("*.fastq.1.gz"),
        r"(\S+).fastq.1.gz")
)

ALL = PipelineTracks.Sample3()
EXPERIMENTS = PipelineTracks.Aggregate(TRACKS, labels=("condition", "tissue"))
CONDITIONS = PipelineTracks.Aggregate(TRACKS, labels=("condition", ))
TISSUES = PipelineTracks.Aggregate(TRACKS, labels=("tissue", ))

###################################################################
## Global flags
###################################################################
ASSEMBLERS = P.asList(PARAMS["general_assemblers"])

# True if at least one of the listed assemblers was requested.
# NOTE(review): "ibda" may be a misspelling of "idba" - confirm against
# the values used in the configuration file before changing it.
METAGENOME = any(
    assembler in ASSEMBLERS
    for assembler in ("meta-velvet", "ibda", "cortex_var"))

ASSEMBLERS = P.asList(PARAMS["assemblers"])
PARAMS = P.PARAMS PARAMS_ANNOTATIONS = P.peekParameters(PARAMS["annotations_dir"], "pipeline_annotations.py") ########################################################################## ########################################################################## # Helper functions mapping tracks to conditions, etc ########################################################################## import CGATPipelines.PipelineTracks as PipelineTracks Sample = PipelineTracks.AutoSample # define tracks based on all samples in .bamfile that are not input or index TRACKS = PipelineTracks.Tracks(Sample).loadFromDirectory( glob.glob(os.path.join(PARAMS["location_bamfiles"], "*.bam")), "(\S+).bam", exclude=[".+input.+"]) @files(None, None) def printTracks(infile, outfile): P.warn("\n\n\n\nprinting tracks:") for track in EXPERIMENTS: print "\t" print track def get_peak_caller_parameters(peak_caller_id): """ Returns a dictionary of config file parameters for the chosen peak caller (an attempt to keep access to PARAMS out of associated pipeline script).
]) PARAMS = P.PARAMS PARAMS_ANNOTATIONS = P.peekParameters(PARAMS["annotations_dir"], "pipeline_annotations.py") ################################################################### ################################################################### # Helper functions mapping tracks to conditions, etc ################################################################### import CGATPipelines.PipelineTracks as PipelineTracks Sample = PipelineTracks.AutoSample # collect sra nd fastq.gz tracks TRACKS = PipelineTracks.Tracks(Sample).loadFromDirectory( glob.glob("*.bam"), "(\S+).bam") # group by experiment (assume that last field is a replicate identifier) EXPERIMENTS = PipelineTracks.Aggregate(TRACKS, labels=("condition", "tissue")) GENESETS = PipelineTracks.Tracks(Sample).loadFromDirectory( glob.glob("*.gtf.gz"), "(\S+).gtf.gz") ################################################################### ################################################################### ################################################################### def connect(): '''connect to database.
PARAMS = P.PARAMS
PARAMS_ANNOTATIONS = P.peekParameters(
    PARAMS["annotations_dir"],
    "pipeline_annotations.py")

# hand the configuration on to the helper module
PipelineiCLIP.PARAMS = PARAMS
PipelineiCLIP.PARAMS_ANNOTATIONS = PARAMS_ANNOTATIONS

# the project source directory is the parent of this script's directory
PARAMS["project_src"] = os.path.join(os.path.dirname(__file__), "..")

###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
import CGATPipelines.PipelineTracks as PipelineTracks

# define some tracks if needed
# The track name is the third tab-separated column of sample_table.tsv.
TRACKS = PipelineTracks.Tracks(PipelineTracks.Sample3)
with IOTools.openFile("sample_table.tsv") as _sample_table:
    for line in _sample_table:
        # Drop the line terminator first so that it cannot end up in the
        # track name when the third column is the last one on the line.
        track = line.rstrip("\r\n").split("\t")[2]
        TRACKS.tracks.append(PipelineTracks.Sample3(filename=track))

###################################################################


def connect():
    '''connect to database.

    Use this method to connect to additional databases.

    Returns a database connection.
    '''
    dbh = sqlite3.connect(PARAMS["database"])
# add configuration values from associated pipelines
#
# 1. pipeline_annotations: any parameters will be added with the
#    prefix "annotations_". The interface will be updated with
#    "annotations_dir" to point to the absolute path names.
PARAMS.update(P.peekParameters(
    PARAMS["annotations_dir"],
    "pipeline_annotations.py",
    on_error_raise=__name__ == "__main__",
    prefix="annotations_",
    update_interface=True))

# define some tracks if needed
# Raw string: "\S" is not a valid escape sequence in a plain literal.
TRACKS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("*.ini"),
    r"(\S+).ini")


# --------------------------< utility functions >---------------------------- #

def connect():
    '''Connect to database.

    Use this method to connect to additional databases.

    Returns an sqlite3 database handle.
    '''
    dbh = sqlite3.connect(PARAMS["database"])

    # make the annotations database available under the "annotations" schema
    statement = '''ATTACH DATABASE '%s' as annotations''' % (
        PARAMS["annotations_database"])
    cc = dbh.cursor()
    cc.execute(statement)
PipelineMedip.PARAMS = PARAMS

###################################################################
###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
# load all tracks - exclude input/control tracks
Sample = PipelineTracks.Sample3

# NOTE(review): "cfastq.1.gz" looks like a misspelling of "fastq.1.gz";
# left unchanged because it alters which files become tracks - confirm.
suffixes = ["export.txt.gz", "sra", "fastq.gz", "cfastq.1.gz", "csfasta.gz"]

# Concatenate one Tracks collection per suffix, starting from an empty
# collection.  Files whose name contains PARAMS["tracks_control"] are
# skipped.  The list is handed to sum() directly (wrapping a single list
# in itertools.chain added nothing) and the pattern is a raw string
# because "\S" is not a valid escape sequence in a plain string literal.
TRACKS = sum(
    [PipelineTracks.Tracks(Sample).loadFromDirectory(
        [x for x in glob.glob("*.%s" % s)
         if PARAMS["tracks_control"] not in x],
        r"(\S+).%s" % s)
     for s in suffixes],
    PipelineTracks.Tracks(Sample))

###################################################################
###################################################################
###################################################################
# if pipeline_conf.py exists: execute it to change the above assignments
if os.path.exists("pipeline_conf.py"):
    L.info("reading additional configuration from pipeline_conf.py")
    # The with-block closes the file handle that was previously left open.
    with open("pipeline_conf.py") as _conf_file:
        exec(compile(_conf_file.read(), "pipeline_conf.py", 'exec'))

###################################################################
###################################################################
###################################################################
# define aggregates
from CGATCore import Pipeline as P
import CGATPipelines.PipelineTracks as PipelineTracks

# load options from the config file
# Candidate configuration files: next to this script (in a directory named
# after it, with the ".py" extension cut off), in the parent directory and
# in the working directory.
P.getParameters([
    "%s/pipeline.ini" % __file__[:-len(".py")],
    "../pipeline.ini",
    "pipeline.ini",
])
PARAMS = P.PARAMS
PARAMS_ANNOTATIONS = P.peekParameters(
    PARAMS["annotations_dir"],
    "pipeline_annotations.py")

Sample = PipelineTracks.Sample

# Tracks are built from every path starting with "medip_".
# Raw string: "\S" is not a valid escape sequence in a plain literal.
TRACKS = PipelineTracks.Tracks(Sample).loadFromDirectory(
    glob.glob("medip_*"),
    r"medip_(\S+)")


def connect():
    '''connect to database.

    This method also attaches to helper databases.
    '''
    dbh = sqlite3.connect(PARAMS["database_name"])

    # make the annotations database available under the "annotations" schema
    statement = '''ATTACH DATABASE '%s' as annotations''' % (
        PARAMS["annotations_database"])
    cc = dbh.cursor()
    cc.execute(statement)
    cc.close()
PipelineMotifs.PARAMS = PARAMS ################################################################### ################################################################### ################################################################### # Helper functions mapping tracks to conditions, etc ################################################################### # load all tracks - exclude input/control tracks # determine the location of the input files (reads). DATADIR = PARAMS.get('input', '.') if not os.path.exists(DATADIR): raise OSError('data directory %s does not exists') Sample = PipelineTracks.Sample TRACKS = PipelineTracks.Tracks(Sample).loadFromDirectory( glob.glob(os.path.join(DATADIR, "*.bed.gz")), "(\S+).bed.gz") BEDFILES = [os.path.join(DATADIR, "%s.bed.gz") % x for x in TRACKS] # create an indicator target @transform(BEDFILES, suffix(".gz"), ".gz") def BedFiles(infile, outfile): pass BAMFILES = glob.glob(os.path.join(DATADIR, "*.bam")) def getAssociatedBAMFiles(track): '''return a list of BAM files associated with a track.
###################################################################
###################################################################
# parameterization
# Section-specific options take precedence over the general ones, with a
# hard-coded fallback as the last resort.
EXPORTDIR = P.get('readqc_exportdir', P.get('exportdir', 'export'))
DATADIR = P.get('readqc_datadir', P.get('datadir', '.'))
DATABASE = P.get('readqc_backend', P.get('sql_backend', 'sqlite:///./csvdb'))

###################################################################
# cf. pipeline_rnaseq.py
# This should be automatically gleaned from pipeline_rnaseq.py
###################################################################
import CGATPipelines.PipelineTracks as PipelineTracks

# One Tracks collection per read-file suffix, concatenated.  Patterns are
# raw strings because "\S" is not a valid escape sequence in a plain
# string literal.
# NOTE(review): the patterns do not contain DATADIR although the first
# three globs do, and the csfasta files are looked up in the working
# directory - confirm that both are intended.
TRACKS = (
    PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
        glob.glob("%s/*.sra" % DATADIR),
        r"(\S+).sra")
    + PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
        glob.glob("%s/*.fastq.gz" % DATADIR),
        r"(\S+).fastq.gz")
    + PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
        glob.glob("%s/*.fastq.1.gz" % DATADIR),
        r"(\S+).fastq.1.gz")
    + PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
        glob.glob("*.csfasta.gz"),
        r"(\S+).csfasta.gz")
)

###########################################################################


class ReadqcTracker(TrackerSQL):
    '''Define convenience tracks for plots'''

    def __init__(self, *args, **kwargs):
        # always use the database configured for this report as backend
        TrackerSQL.__init__(self, *args, backend=DATABASE, **kwargs)
dbh = sqlite3.connect(PARAMS["database_name"]) statement = '''ATTACH DATABASE '%s' as annotations''' % ( PARAMS["annotations_database"]) cc = dbh.cursor() cc.execute(statement) cc.close() return dbh class MySample(PipelineTracks.Sample): attributes = tuple(PARAMS["attributes"].split(",")) TRACKS = PipelineTracks.Tracks(MySample).loadFromDirectory( glob.glob("*.bam"), "(\S+).bam") Sample = PipelineTracks.AutoSample DESIGNS = PipelineTracks.Tracks(Sample).loadFromDirectory( glob.glob("*.design.tsv"), "(\S+).design.tsv") ################################################################### ################################################################### ################################################################### # DEXSeq workflow ################################################################### @mkdir("results.dir") @files(PARAMS["annotations_interface_geneset_all_gtf"], "geneset_flat.gff") def buildGff(infile, outfile):