def mergeAndLoadTrackComparisons(infiles, outfile):
    """Concatenate tomtom track-vs-track comparison files into one table.

    Both track names are parsed from each filename and stored in the
    indexed columns ``track1`` and ``track2``.
    """
    filename_re = ".+/(.+)_to_(.+).tomtom"
    P.concatenateAndLoad(infiles, outfile,
                         regex_filename=filename_re,
                         cat="track1,track2",
                         options="-i track1 -i track2")
def loadFractionReadsSpliced(infiles, outfile):
    """Load the fractions of spliced reads into a single db table."""
    # sample_id is parsed from the filename and indexed.
    load_kwargs = dict(regex_filename=".*/.*/(.*).fraction.spliced",
                       cat="sample_id",
                       options='-i "sample_id"')
    P.concatenateAndLoad(infiles, outfile, **load_kwargs)
def loadGeneProfiles(infiles, outfile):
    """Load gene profiles into one table, keyed by factor, condition,
    replicate and interval (all parsed from the filename and indexed).

    Fix: the regex was a plain string literal containing "\\-" and "\\.",
    which are invalid Python escape sequences (DeprecationWarning, and a
    SyntaxError in future versions). A raw string keeps the regex
    byte-identical while silencing the warning.
    """
    P.concatenateAndLoad(infiles, outfile,
                         regex_filename=r'.+/(.+)\-(.+)\-(.+)\.(.+).tsv.gz',
                         cat="factor,condition,rep,interval",
                         options="-i factor -i condition -i rep -i interval")
def loadDistances(infiles, outfile):
    """Load pairwise distance tables, keyed by the two compared files.

    BUG FIX: options was ``"-i Track, -i File2"`` — the stray comma would
    be passed through as part of the column name, and ``Track`` is not a
    column this load creates (``cat`` yields File1/File2). Index the two
    real key columns instead.
    """
    P.concatenateAndLoad(infiles, outfile,
                         regex_filename=".+/(.+)_vs_(.+).tsv.gz",
                         cat="File1,File2",
                         options="-i File1 -i File2")
def loadAlignmentSummaryMetrics(infiles, outfile):
    """Load the alignment summary metrics into a single table in the db."""
    # sample_id comes from the filename and is indexed.
    P.concatenateAndLoad(
        infiles,
        outfile,
        regex_filename=".*/.*/(.*).alignment.summary.metrics",
        cat="sample_id",
        options='-i "sample_id"')
def loadCounts(infiles, outfile):
    """Merge feature-count tables from all inputs into one db table."""
    index_options = "-i track,gene_id"
    P.concatenateAndLoad(infiles, outfile,
                         regex_filename="(.+).tsv.gz",
                         options=index_options)
def loadClusterContextStats(infiles, outfile):
    """Concatenate per-cluster context stats and load them into one table."""
    # Matches either the *sig_bases or the *clusters flavour of the file.
    pattern = "clusters.dir/(.+)(?:sig_bases|clusters).context_stats.tsv.gz"
    P.concatenateAndLoad(infiles, outfile, regex_filename=pattern)
def loadCollectRnaSeqMetrics(infiles, outfile):
    """Load the RNA-seq metrics into a single table in the db."""
    kwargs = dict(regex_filename=".*/.*/(.*).rnaseq.metrics",
                  cat="sample_id",
                  options='-i "sample_id"')
    P.concatenateAndLoad(infiles, outfile, **kwargs)
def loadThreePrimeBias(infiles, outfile):
    """Load the three-prime-bias metrics into a single table in the db."""
    P.concatenateAndLoad(
        infiles, outfile,
        regex_filename=".*/.*/(.*).three.prime.bias",
        cat="sample_id",
        options='-i "sample_id"')
def loadCopyNumber(infiles, outfile):
    """Load the copy-number estimations into the database."""
    # High-memory SQL job (see PARAMS["sql_himem"]); indexed on gene_id.
    P.concatenateAndLoad(infiles, outfile,
                         regex_filename=".*/(.*).copynumber",
                         options='-i "gene_id"',
                         job_memory=PARAMS["sql_himem"])
def loadReproducibility(infiles, outfile):
    """Concatenate per-experiment reproducibility tables and load them."""
    P.concatenateAndLoad(infiles, outfile,
                         cat="Experiment",
                         regex_filename=".+/(.+)-agg.reproducibility.tsv.gz",
                         options="-i Track -i fold -i level")
def loadDedupedUMIStats(infiles, outfile):
    """Load the per-track UMI stats of deduplicated reads into one table."""
    umi_stats_re = ".+/(.+).umi_stats.tsv.gz"
    P.concatenateAndLoad(infiles, outfile,
                         regex_filename=umi_stats_re,
                         cat="track",
                         options="-i track -i UMI")
def load_dedup_kmers(infiles, outfile):
    """Load k-mer count tables, keyed by factor, tag, replicate and k.

    Fix: the regex was a plain literal containing "\\.", an invalid Python
    escape sequence — use a raw string (regex is byte-identical).

    NOTE(review): options indexes "kmer" although the cat= columns are
    factor,tag,replicate,k — presumably "kmer" is a column inside the
    data files themselves; confirm against the upstream task.
    """
    P.concatenateAndLoad(
        infiles, outfile,
        regex_filename=r"kmers.dir/(.+)-(.+)-(.+)\.([0-9]+mers).tsv.gz",
        cat="factor,tag,replicate,k",
        options="-i factor -i tag -i replicate -i kmer")
def loadCrosslinkedBasesCount(infiles, outfile):
    """Load the per-track crosslinked-base counts into one table.

    The input files carry no header line, so one is supplied here.
    """
    P.concatenateAndLoad(infiles, outfile,
                         regex_filename=".+/(.+).count_bases",
                         header="track,count",
                         cat="track",
                         has_titles=False)
def loadTranscriptClassification(infiles, outfile):
    """Load per-track transcript classifications into one table.

    BUG FIX: the implicitly concatenated option strings had no separating
    whitespace, producing "...-i gene_id-i match_gene_id-i ..." — the
    fused tokens could never name real columns, so those indexes were
    never created. Separating spaces restored.
    """
    P.concatenateAndLoad(infiles, outfile,
                         regex_filename=".+/(.+).class.gz",
                         options="-i transcript_id -i gene_id "
                                 "-i match_gene_id -i match_transcript_id "
                                 "-i source --quick")
def merge_mismatch_counts(infiles, outfile):
    """Load the results of mismatch counting into the database."""
    # tissue (CB or FC) and replicate are parsed from the filename.
    P.concatenateAndLoad(
        infiles, outfile,
        regex_filename="mismatches.dir/(CB|FC)-(.+).tsv.gz",
        cat="tissue,replicate",
        options="-i tissue -i replicate -i gene_id")
def loadSpikeVsGenome(infiles, outfile):
    """Load the counts of reads uniquely mapping to genome and spike-ins,
    plus the spike-in fraction, into a single db table."""
    unique_reads_re = ".*/.*/(.*).uniq.mapped.reads"
    P.concatenateAndLoad(infiles, outfile,
                         regex_filename=unique_reads_re,
                         cat="sample_id",
                         options='-i "sample_id"')
def load_sig_exon_counts(infiles, outfile):
    """Load significant exon counts, keyed by dedup method and track."""
    # Headerless inputs; column names supplied explicitly.
    load_kwargs = dict(
        regex_filename="dedup_(.+).dir/(.+).exon_sig_count.tsv.gz",
        cat="method,track",
        has_titles=False,
        header="method,track,name,count",
        options="-i method -i track")
    P.concatenateAndLoad(infiles, outfile, **load_kwargs)
def loadNspliced(infiles, outfile):
    """Load per-track spliced-read counts into one headerless-input table."""
    P.concatenateAndLoad(infiles, outfile,
                         regex_filename=".+/(.+).nspliced.txt",
                         cat="track",
                         has_titles=False,
                         header="track,nspliced")
def load_node_counts(infiles, outfile):
    """Load the number of counts per cluster distribution.

    Only output if the method was "cluster", as it will be the same
    irrespective of the network method used.
    """
    P.concatenateAndLoad(infiles, outfile,
                         regex_filename=".+/(.+)_nodes.tsv",
                         has_titles=False,
                         header="track,category,count")
def loadClusterCounts(infiles, outfile):
    """Load per-sample/replicate cluster counts into a single table."""
    # replicate is R<n>, "union" or "reproducible"; inputs have no header.
    P.concatenateAndLoad(
        infiles, outfile,
        regex_filename=".+/(.+).(R[0-9]+|union|reproducible).cluster_count",
        header="sample,replicate,count",
        cat="sample,replicate",
        has_titles=False)
def loadFeatureCounts(infiles, outfile):
    """Merge per-track feature counts into one gene_id-indexed table."""
    # Headerless inputs; high-memory SQL job (PARAMS["sql_himem"]).
    load_kwargs = dict(regex_filename=".*/(.*).counts.gz",
                       has_titles=False,
                       cat="track",
                       header="track,gene_id,counts",
                       options='-i "gene_id"',
                       job_memory=PARAMS["sql_himem"])
    P.concatenateAndLoad(infiles, outfile, **load_kwargs)
def mergeAllQuants(infiles, outfile):
    """Concatenate all aggregate quantification (.sf) files and load them.

    BUG FIX: the implicitly concatenated option strings lacked separating
    whitespace ("...-i EffectiveLength-i TPM..."), fusing the index
    options into tokens that match no column. Spaces restored.

    NOTE(review): job_memory appears unused locally — presumably picked
    up from the caller's namespace by the pipeline runner; confirm.
    """
    job_memory = "6G"
    P.concatenateAndLoad(
        infiles, outfile,
        regex_filename="quantification.dir/(.*-.*-.*)_agg-agg-agg.sf",
        options="-i Name -i Length -i EffectiveLength "
                "-i TPM -i NumReads -i track "
                "-i source --quick")
def loadTranscriptProfiles(infiles, outfile):
    '''Concatenate and load the transcript profiles.

    Retain sample name as column "track".

    Fix: the regex was a plain literal containing "\\S", an invalid
    Python escape sequence — use a raw string (regex byte-identical).
    '''
    regex = (r"transcriptprofiles.dir/(\S+).transcriptprofile.gz."
             "geneprofileabsolutedistancefromthreeprimeend.matrix.tsv.gz")
    # Point at the actual matrix files derived from each listed input.
    infiles = [x + ".geneprofileabsolutedistancefromthreeprimeend"
               ".matrix.tsv.gz" for x in infiles]
    P.concatenateAndLoad(infiles, outfile, regex_filename=regex)
def loadEstimateLibraryComplexity(infiles, outfile):
    '''load the complexity metrics to a single table in the db'''
    # Library complexity is only computed for paired-end data; PAIRED is
    # a module-level flag defined elsewhere in the pipeline.
    if PAIRED:
        P.concatenateAndLoad(infiles, outfile,
                             regex_filename=".*/.*/(.*).library.complexity",
                             cat="sample_id",
                             options='-i "sample_id"')
    else:
        # Placeholder output so the pipeline target is still created for
        # single-end data. NOTE(review): P.run() presumably picks up
        # `statement` (and %(outfile)s) from the caller's locals — this
        # is the convention the surrounding functions also rely on.
        statement = '''echo "Not compatible with SE data" > %(outfile)s'''
        P.run()
def loadContextStats(infiles, outfile):
    """Load reference-context stats into one table.

    Saturation runs carry an extra "subset" key parsed from the filename;
    other runs are keyed by track only (detected from the first input's
    path).

    Fix: the saturation regex was a plain literal containing "\\.", an
    invalid Python escape sequence — use a raw string (byte-identical).
    """
    if "saturation" in infiles[0]:
        regex_filename = r".+/(.+-.+-.+)\.([0-9]+\.[0-9]+).reference_context.tsv"
        cat = "track,subset"
    else:
        regex_filename = ".+/(.+).reference_context.tsv"
        cat = "track"
    P.concatenateAndLoad(infiles, outfile,
                         regex_filename=regex_filename,
                         cat=cat)
def loadCramQuality(infiles, outfile):
    '''Load the quality scores for the different cells into the
    database (summarized table).'''
    # infiles is a nested collection; keep only the .quality files.
    quality_files = [fn
                     for group in infiles
                     for fn in group
                     if fn.endswith(".quality")]
    P.concatenateAndLoad(quality_files, outfile,
                         regex_filename="validate.cram.dir/(.*).quality",
                         cat="track",
                         has_titles=False,
                         header="cramID,number_reads,cram_quality_score")
def loadUtronIDs(infiles, outfile):
    """Load utron ID tables into one table, keyed by track.

    All outputs except all_utrons_ids.load carry an additional
    match_transcript_id column.

    BUG FIX: the conditional extension was ``options += "-i match_..."``
    with no leading space, fusing into "...-i transcript_id-i match_..."
    so neither index was usable. Leading space restored. The regex is
    also made a raw string ("\\." is an invalid plain-literal escape).
    """
    header = "track,transcript_id"
    options = "-i track -i transcript_id"
    if not outfile == "all_utrons_ids.load":
        header += ",match_transcript_id"
        options += " -i match_transcript_id"
    P.concatenateAndLoad(infiles, outfile,
                         regex_filename=r".+/(.+)\..+\.ids.gz",
                         has_titles=False,
                         cat="track",
                         header=header,
                         options=options)
def loadInsertSizeMetrics(infiles, outfile):
    '''load the insert size metrics to a single table'''
    # Insert sizes only exist for paired-end data; PAIRED is a
    # module-level flag defined elsewhere in the pipeline.
    if PAIRED:
        # Each element of infiles is a pair; element [0] is the summary
        # file (element [1], the histogram, is loaded by
        # loadInsertSizeHistograms below).
        picard_summaries = [x[0] for x in infiles]
        P.concatenateAndLoad(picard_summaries, outfile,
                             regex_filename=(".*/.*/(.*)"
                                             ".insert.size.metrics.summary"),
                             cat="sample_id",
                             options='')
    else:
        # Placeholder output so the pipeline target is still created.
        # NOTE(review): P.run() presumably picks up `statement` (and
        # %(outfile)s) from the caller's locals — confirm against the
        # pipeline framework.
        statement = '''echo "Not compatible with SE data" > %(outfile)s '''
        P.run()
def loadInsertSizeHistograms(infiles, outfile):
    '''load the histograms to a single table'''
    # Histograms only exist for paired-end data; PAIRED is a module-level
    # flag defined elsewhere in the pipeline.
    if PAIRED:
        # Each element of infiles is a pair; element [1] is the histogram
        # file (element [0] is the summary, loaded separately).
        picard_histograms = [x[1] for x in infiles]
        P.concatenateAndLoad(picard_histograms, outfile,
                             regex_filename=(".*/.*/(.*)"
                                             ".insert.size.metrics.histogram"),
                             cat="sample_id",
                             options='-i "insert_size" -e')
    else:
        # Placeholder output so the pipeline target is still created.
        # NOTE(review): P.run() presumably picks up `statement` (and
        # %(outfile)s) from the caller's locals — confirm against the
        # pipeline framework.
        statement = '''echo "Not compatible with SE data" > %(outfile)s '''
        P.run()
def load_dexseq(infiles, outfile):
    # Normalise the per-contrast "log2fold_<name>" column to a plain
    # "log2fold" so all files share one schema before concatenation.
    # NOTE(review): '\S' is used both as a (deprecated) non-raw Python
    # escape and as a sed regex — the latter relies on GNU sed
    # extensions; confirm on the target platform.
    statement = " checkpoint;".join(
        [" sed 's/log2fold_\S+/log2fold/' %s > %s.tmp;" % (f, f)
         for f in infiles])
    # P.run() executes `statement` picked up from the caller's locals.
    P.run()
    # Load the renamed temporary copies, then clean them up.
    infiles = ["%s.tmp" % f for f in infiles]
    P.concatenateAndLoad(infiles, outfile,
                         regex_filename=".+/(.+).dexseq.tsv.tmp",
                         options="-i groupID -i featureID -i track -i padj",
                         job_memory="6G")
    for f in infiles:
        os.unlink(f)
def loadAnomolies(infiles, outfile):
    """Concatenate per-track anomaly tables and load them into one table."""
    anomaly_re = ".+/(.+).anomolies.tsv.gz"
    P.concatenateAndLoad(infiles, outfile,
                         regex_filename=anomaly_re,
                         options="-i track -i probe")
def loadInteractionCountMetrics(infiles, outfile):
    """Load the metrics files that accompany each interaction-count file."""
    # Map each counts file (.tsv.gz) onto its sibling .metrics.tsv file.
    metric_files = [re.sub(".tsv.gz", ".metrics.tsv", fn) for fn in infiles]
    P.concatenateAndLoad(metric_files, outfile,
                         regex_filename=".+/(.+).metrics.tsv")
def loadInteractionCounts(infiles, outfile):
    """Load interaction counts, indexed on track and both fragments."""
    P.concatenateAndLoad(
        infiles,
        outfile,
        regex_filename=".+/(.+).tsv.gz",
        options="-i track -i Frag1 -i Frag2")
def load_exon_counts(infiles, outfile):
    """Load exon counts into one table, keyed by dedup method and track."""
    load_kwargs = dict(
        regex_filename="dedup_(.+).dir/(.+).exon_count.tsv.gz",
        cat="method,track",
        options="-i method -i track -i gene_id")
    P.concatenateAndLoad(infiles, outfile, **load_kwargs)
def load_base_level_reproducibility(infiles, outfile):
    """Load base-level reproducibility tables, keyed by dedup method."""
    P.concatenateAndLoad(infiles, outfile,
                         regex_filename="dedup_(.+).dir/.+.rep",
                         cat="method",
                         options="-i method")