def loadMutectExtendedOutput(infile, outfile):
    '''Load the mutect call-stats table that accompanies a SNP VCF.

    The table filename is obtained from `infile` by replacing
    ".mutect.snp.vcf" with "_call_stats.out".  The table is passed to
    `P.load` together with a request for an index on "contig,position".
    '''
    call_stats = infile.replace(".mutect.snp.vcf", "_call_stats.out")
    index_columns = "contig,position"
    P.load(call_stats,
           outfile,
           options="--add-index=%(indices)s" % {"indices": index_columns})
def loadTranscriptSummary(infile, outfile):
    '''Summarize binding information per transcript.

    One row is written for every transcript id found in
    annotations.transcript_info.  Each row carries a 0/1 flag for the
    "tata" and the "cpg" table, telling whether the transcript id
    occurs in that table.  The rows go to a temporary file that is
    handed to `P.load` and removed afterwards.  `infile` is not read.
    '''
    dbh = connect()
    target_table = P.toTable(outfile)
    cursor = dbh.cursor()

    # sqlite can not do full outer join, so the presence/absence
    # matrix is assembled in python instead
    cursor.execute(
        """DROP TABLE IF EXISTS %(table)s""" % {"table": target_table})

    rows = cursor.execute(
        "SELECT DISTINCT(transcript_id) FROM annotations.transcript_info"
    ).fetchall()
    transcript_ids = [row[0] for row in rows]

    tmpf = P.getTempFile()

    source_tables = ("tata", "cpg")
    members = []
    for source in source_tables:
        hits = cursor.execute(
            "SELECT DISTINCT(transcript_id) FROM %(table)s" %
            {"table": source}).fetchall()
        members.append({hit[0] for hit in hits})

    tmpf.write("transcript_id\t%s\n" % "\t".join(source_tables))
    for transcript_id in transcript_ids:
        flags = "\t".join(
            str(int(transcript_id in found)) for found in members)
        tmpf.write("%s\t%s\n" % (transcript_id, flags))
    tmpf.close()

    P.load(tmpf.name, outfile)
    os.unlink(tmpf.name)
def loadSleuthTable(infile, outfile, transcript_info, gene_biotypes,
                    database, annotations_database):
    '''Join a transcript-level table with transcript annotations and load it.

    `infile` is read as a tab-separated table and indexed by its
    "transcript_id" column.  Transcript annotations are selected from
    the table annotations.<basename of transcript_info>, restricted to
    the comma-separated `gene_biotypes` when that argument is not
    empty, and joined onto the table by index.  The joined table is
    written to a temporary file, loaded with an index on
    transcript_id, and the temporary file is removed.
    '''
    tmpfile = P.getTempFilename("/ifs/scratch/")

    table = os.path.basename(transcript_info)

    where_cmd = ""
    if gene_biotypes:
        biotype_tests = ["gene_biotype = '%s'" % biotype
                         for biotype in gene_biotypes.split(",")]
        where_cmd = "WHERE " + " OR ".join(biotype_tests)

    select = (
        "SELECT DISTINCT transcript_id, transcript_biotype, "
        "gene_id, gene_name FROM annotations.%(table)s %(where_cmd)s"
    ) % {"table": table, "where_cmd": where_cmd}

    results = pd.read_table(infile, sep="\t")
    results.set_index("transcript_id", drop=True, inplace=True)

    annotations = pd.read_sql(select,
                              connect(database, annotations_database))
    annotations.set_index("transcript_id", drop=False, inplace=True)

    results.join(annotations).to_csv(tmpfile, sep="\t", index=True)

    P.load(tmpfile, outfile, options="--add-index=transcript_id")
    os.unlink(tmpfile)
def loadNCG(outfile):
    '''Load NCG into database.

    The input table is the file named by PARAMS["cancergenes_table"];
    an index on "symbol" is requested from `P.load`.
    '''
    # infile = "/ifs/projects/proj053/backup/NCG/cancergenes2016.tsv"
    P.load(infile=PARAMS["cancergenes_table"],
           outfile=outfile,
           options="--add-index=symbol")
def loadExonValidation(infiles, outfile):
    '''Load merged and per-file exon validation stats.

    All `infiles` are passed to `P.mergeAndLoad` with the suffix
    ".exon.validation.tsv.gz".  In addition, for every input file the
    companion file "<infile>.overrun.gz" is loaded with a log file
    named "<infile without suffix>_overrun.load".

    Parameters
    ----------
    infiles : list
        Input filenames with exon validation stats
    outfile : str
        Output filename
    '''
    suffix = ".exon.validation.tsv.gz"

    P.mergeAndLoad(infiles, outfile, suffix=suffix)

    for stats_file in infiles:
        overrun_log = "%s_overrun.load" % P.snip(stats_file, suffix)
        P.load(stats_file + ".overrun.gz", overrun_log)
def loadSleuthTableGenes(infile, outfile, gene_info, gene_biotypes,
                         database, annotations_database):
    '''Join a gene-level table with gene annotations and load it.

    `infile` is read as a tab-separated table and indexed by its
    "test_id" column (the column itself is kept).  Gene ids and names
    are selected from the table annotations.<basename of gene_info>,
    restricted to the comma-separated `gene_biotypes` when that
    argument is not empty, and joined onto the table by index.  The
    joined table is written to a temporary file, loaded with an index
    on gene_id, and the temporary file is removed.
    '''
    tmpfile = P.getTempFilename("/ifs/scratch/")

    table = os.path.basename(gene_info)

    where_cmd = ""
    if gene_biotypes:
        biotype_tests = ["gene_biotype = '%s'" % biotype
                         for biotype in gene_biotypes.split(",")]
        where_cmd = "WHERE " + " OR ".join(biotype_tests)

    select = (
        "SELECT DISTINCT gene_id, gene_name "
        "FROM annotations.%(table)s %(where_cmd)s"
    ) % {"table": table, "where_cmd": where_cmd}

    results = pd.read_table(infile, sep="\t")
    results.set_index("test_id", drop=False, inplace=True)

    annotations = pd.read_sql(select,
                              connect(database, annotations_database))
    annotations.set_index("gene_id", drop=False, inplace=True)

    results.join(annotations).to_csv(tmpfile, sep="\t", index=True)

    P.load(tmpfile, outfile, options="--add-index=gene_id")
    os.unlink(tmpfile)
def loadQcMeasures(infile, outfile):
    '''Load QC measures via `P.load`, requesting an index on "track".'''
    index_option = "--add-index=track"
    P.load(infile=infile, outfile=outfile, options=index_option)
def loadSailfishCounts(infile, outfile):
    '''Load the Sailfish gene counts table via `P.load` with default options.'''
    P.load(infile=infile, outfile=outfile)
def loadSampleInfo(infile, outfile):
    '''Load the sample table via `P.load`.

    The options name the columns format, barcode, track and lanes and
    request indices on barcode and track.
    '''
    load_options = (
        "--header-names=format,barcode,track,lanes -i barcode -i track")
    P.load(infile, outfile, options=load_options)
def loadSailfishCounts(infile, outfile):
    '''Load the Sailfish gene counts table via `P.load` with default options.'''
    P.load(infile=infile, outfile=outfile)
def loadDistances(infile, outfile):
    '''Load annotations.

    `infile` is handed to `P.load`.  The options request indices on
    gene_id and closest_id and map both columns to strings.
    '''
    # The table name formerly computed from `outfile` here was never
    # used, so the dead assignment has been dropped.
    P.load(
        infile,
        outfile,
        "--add-index=gene_id --map=gene_id:str --add-index=closest_id --map=closest_id:str"
    )
def loadPicardCoverageStats(infiles, outfile):
    '''import coverage statistics into database.

    For every input file, lines that start with "#" and blank lines are
    discarded.  The first remaining line of the first file becomes the
    header and the second remaining line of each file becomes that
    track's row; a "track" column is prepended to both.  The track name
    is the file's basename without the ".cov" suffix.

    Arguments
    ---------
    infiles : list
        Filenames of files with picard metric information. Each file
        corresponds to a different track.
    outfile : string
        Logfile. The table name will be derived from `outfile`.

    Raises
    ------
    IndexError
        If a file has fewer than two non-comment, non-blank lines.
    '''
    outf = P.getTempFile(".")

    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".cov")
        # the context manager closes each input file; the handle was
        # previously left open
        with open(f, "r") as inf:
            lines = [x for x in inf
                     if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        outf.write("%s\t%s" % (track, lines[1]))

    outf.close()
    P.load(outf.name, outfile,
           options="--ignore-empty --add-index=track")
    os.unlink(outf.name)
def loadCuffNormClassic(infile, outfile):
    '''load the fpkm table from cuffnorm into the database

    The table loaded is "genes.fpkm_table" in the directory that
    contains `infile`; an index on tracking_id is requested.
    '''
    # os.path.join keeps the path relative when `infile` has no
    # directory part; plain concatenation with "/" pointed at the
    # filesystem root in that case.
    fpkm_table = os.path.join(os.path.dirname(infile), "genes.fpkm_table")
    P.load(fpkm_table, outfile, options='-i "tracking_id"')
def loadGCContent(infile, outfile):
    '''Load the GC content results via `P.load`, requesting an index on "id".'''
    index_option = "--add-index=id"
    P.load(infile=infile, outfile=outfile, options=index_option)
def loadAnnotations(infile, outfile):
    '''Load variant annotations into database.

    The options map gene_id to a string and base_qualities to text and
    request an index on gene_id.
    '''
    flags = (
        "--map=gene_id:str ",
        "--add-index=gene_id ",
        "--map=base_qualities:text ",
    )
    P.load(infile, outfile, options="".join(flags))
def loadSailfishTpm(infile, outfile):
    '''Load the Sailfish TPM estimates table via `P.load` with default options.'''
    P.load(infile=infile, outfile=outfile)
def loadSailfishTpm(infile, outfile):
    '''Load the Sailfish TPM estimates table via `P.load` with default options.'''
    P.load(infile=infile, outfile=outfile)
def loadNCG(outfile):
    '''Load NCG into database.

    The input table is the file named by PARAMS["cancergenes_table"];
    an index on "symbol" is requested from `P.load`.
    '''
    # infile = "/ifs/projects/proj053/backup/NCG/cancergenes2016.tsv"
    P.load(infile=PARAMS["cancergenes_table"],
           outfile=outfile,
           options="--add-index=symbol")
def loadExonValidation(infiles, outfile):
    '''Load merged and per-file exon validation stats.

    All `infiles` are passed to `P.mergeAndLoad` with the suffix
    ".exon.validation.tsv.gz".  In addition, for every input file the
    companion file "<infile>.overrun.gz" is loaded with a log file
    named "<infile without suffix>_overrun.load".

    Parameters
    ----------
    infiles : list
        Input filenames with exon validation stats
    outfile : str
        Output filename
    '''
    suffix = ".exon.validation.tsv.gz"

    P.mergeAndLoad(infiles, outfile, suffix=suffix)

    for stats_file in infiles:
        overrun_log = "%s_overrun.load" % P.snip(stats_file, suffix)
        P.load(stats_file + ".overrun.gz", overrun_log)
def loadMutectExtendedOutput(infile, outfile):
    '''Load the mutect call-stats table that accompanies a SNP VCF.

    The table filename is obtained from `infile` by replacing
    ".mutect.snp.vcf" with "_call_stats.out".  The table is passed to
    `P.load` together with a request for an index on "contig,position".
    '''
    call_stats = infile.replace(".mutect.snp.vcf", "_call_stats.out")
    index_columns = "contig,position"
    P.load(call_stats,
           outfile,
           options="--add-index=%(indices)s" % {"indices": index_columns})
def loadAnnotations(infile, outfile):
    '''Load variant annotations into database.

    The options map gene_id to a string and base_qualities to text and
    request an index on gene_id.
    '''
    flags = (
        "--map=gene_id:str ",
        "--add-index=gene_id ",
        "--map=base_qualities:text ",
    )
    P.load(infile, outfile, options="".join(flags))
def loadIdxstats(infiles, outfile):
    '''take list of file paths to samtools idxstats output files
    and merge to create single dataframe containing mapped reads per
    contig for each track. This dataframe is then loaded into
    database.

    Loads tables into the database
        * idxstats_reads_per_chromosome

    Input files that do not exist are skipped with a warning.

    Arguments
    ---------
    infiles : list
        list where each element is a string of the filename containing
        samtools idxstats output. Filename format is expected to be
        'sample.idxstats'
    outfile : string
        Logfile. The table name will be derived from `outfile`.
    '''
    outf = P.getTempFile(".")
    dfs = []
    for f in infiles:
        track = P.snip(f, ".idxstats").split('/')[-1]

        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue

        # reformat idx stats
        df = pandas.read_csv(f, sep='\t', header=None)
        df.columns = ['region', 'length', 'mapped', 'unmapped']

        # calc total reads mapped & unmapped
        total_reads = df.unmapped.sum() + df.mapped.sum()
        total_mapped_reads = df.mapped.sum()

        reformatted_df = pandas.DataFrame(
            [['total_mapped_reads', total_mapped_reads],
             ['total_reads', total_reads],
             ['track', track]],
            columns=(['region', 'mapped']))

        # DataFrame.append was removed in pandas 2.0; pandas.concat
        # with ignore_index=True is the equivalent call
        df = pandas.concat([df, reformatted_df], ignore_index=True)
        df.set_index('region', inplace=True)

        df1 = df[['mapped']].T
        # set track as index
        df1.set_index('track', inplace=True)
        dfs.append(df1)

    # merge dataframes into single table
    master_df = pandas.concat(dfs)
    master_df.drop('*', axis=1, inplace=True)
    # transform dataframe to avoid reaching column limit
    master_df = master_df.T
    master_df.to_csv(outf, sep='\t', index=True)
    outf.close()

    # NOTE(review): after the transpose the written table appears to
    # have one row per region and one column per track, so the index
    # requested on "track" may not match any column - confirm.
    P.load(outf.name,
           outfile,
           options="--ignore-empty --add-index=track")
    os.unlink(outf.name)
def loadExonValidation(infiles, outfile):
    """Merge exon validation stats into one table and load overrun tables.

    All `infiles` are passed to `mergeAndLoad` with the suffix
    ".exon.validation.tsv.gz".  For every input file the companion file
    "<infile>.overrun.gz" is then loaded with a log file named
    "<infile without suffix>_overrun.load".
    """
    suffix = ".exon.validation.tsv.gz"

    mergeAndLoad(infiles, outfile, suffix=suffix)

    for stats_file in infiles:
        overrun_log = "%s_overrun.load" % P.snip(stats_file, suffix)
        P.load(stats_file + ".overrun.gz", overrun_log)
def loadEnrichmentOfTFBS(infile, outfile):
    '''Load the enrichment results via `P.load`, requesting an index on "matrix_id".'''
    index_option = "--add-index=matrix_id"
    P.load(infile=infile, outfile=outfile, options=index_option)
def loadExonValidation(infiles, outfile):
    '''Merge exon validation stats into one table and load overrun tables.

    All `infiles` are passed to `mergeAndLoad` with the suffix
    ".exon.validation.tsv.gz".  For every input file the companion file
    "<infile>.overrun.gz" is then loaded with a log file named
    "<infile without suffix>_overrun.load".
    '''
    suffix = ".exon.validation.tsv.gz"

    mergeAndLoad(infiles, outfile, suffix=suffix)

    for stats_file in infiles:
        overrun_log = "%s_overrun.load" % P.snip(stats_file, suffix)
        P.load(stats_file + ".overrun.gz", overrun_log)
def loadMetaInformation(infile, outfile):
    '''Load the meta information table via `P.load`.

    The options map id, sample_id and experiment_id to integers and
    request an index on each of the three columns.
    '''
    flags = (
        "--map=id:int ",
        "--map=sample_id:int ",
        "--map=experiment_id:int ",
        "--add-index=id ",
        "--add-index=experiment_id ",
        "--add-index=sample_id ",
    )
    P.load(infile, outfile, options="".join(flags))
def loadFimo(infile, outfile):
    '''Load a FIMO result table via `P.load`.

    The -H option passes the column names pattern_name, sequence_name,
    start, stop, strand, score, p_value, q_value and matched_sequence.
    '''
    header_option = (
        '-H "pattern_name,sequence_name,start,stop,strand,'
        'score,p_value,q_value,matched_sequence" '
    )
    P.load(infile, outfile, options=header_option)
def loadBioProspector(infile, outfile):
    '''Load results from bioprospector.

    A logo image is built for every motif into the "bioprospector"
    directory below PARAMS["exportdir"].  Every match is written to a
    temporary table with the columns id, motif, start, end, strand and
    arrangement; the table is loaded and the temporary file removed.

    The arrangement is "ER" for strands "+-"/"-+", "DR" for
    "++"/"--" and "SM" otherwise, followed by a distance that is
    forced to 0 for "SM".
    '''
    export_dir = os.path.abspath(PARAMS["exportdir"])
    target_path = os.path.join(export_dir, "bioprospector")

    # the directory may already exist
    try:
        os.makedirs(target_path)
    except OSError:
        pass

    track = infile[:-len(".bioprospector")]

    results = Bioprospector.parse(IOTools.openFile(infile, "r"))

    tmpfile = P.getTempFile()
    tmpfile.write("id\tmotif\tstart\tend\tstrand\tarrangement\n")

    for motif_index, motifs in enumerate(results):
        logo_file = os.path.join(
            target_path, "%s_%02i.png" % (track, motif_index))
        sequences = [hit.sequence for hit in motifs.matches]
        Bioprospector.build_logo(sequences, logo_file)

        for match in motifs.matches:
            block_gap = abs(
                match.start + match.width1 - (match.end - match.width2))

            if match.strand in ("+-", "-+"):
                kind = "ER"
            elif match.strand in ("++", "--"):
                kind = "DR"
            else:
                kind = "SM"
            gap = 0 if kind == "SM" else block_gap
            arrangement = "%s%i" % (kind, gap)

            first_strand = match.strand[0]
            # keep only what follows the last underscore of the id
            match_id = re.sub(".*_", "", match.id)
            tmpfile.write(
                "%s\t%i\t%i\t%i\t%s\t%s\n" %
                (match_id, motif_index, match.start, match.end,
                 first_strand, arrangement))
    tmpfile.close()

    load_options = ("--add-index=id "
                    "--add-index=motif "
                    "--add-index=id,motif "
                    "--allow-empty-file "
                    "--map=base_qualities:text")
    P.load(tmpfile.name, outfile, options=load_options)

    os.unlink(tmpfile.name)
def loadBioProspector(infile, outfile):
    '''Load results from bioprospector.

    A logo image is built for every motif into the "bioprospector"
    directory below PARAMS["exportdir"].  Every match is written to a
    temporary table with the columns id, motif, start, end, strand and
    arrangement; the table is loaded and the temporary file removed.

    The arrangement is "ER" for strands "+-"/"-+", "DR" for
    "++"/"--" and "SM" otherwise, followed by a distance that is
    forced to 0 for "SM".
    '''
    export_dir = os.path.abspath(PARAMS["exportdir"])
    target_path = os.path.join(export_dir, "bioprospector")

    # the directory may already exist
    try:
        os.makedirs(target_path)
    except OSError:
        pass

    track = infile[:-len(".bioprospector")]

    results = Bioprospector.parse(IOTools.openFile(infile, "r"))

    tmpfile = P.getTempFile()
    tmpfile.write("id\tmotif\tstart\tend\tstrand\tarrangement\n")

    for motif_index, motifs in enumerate(results):
        logo_file = os.path.join(
            target_path, "%s_%02i.png" % (track, motif_index))
        sequences = [hit.sequence for hit in motifs.matches]
        Bioprospector.build_logo(sequences, logo_file)

        for match in motifs.matches:
            block_gap = abs(
                match.start + match.width1 - (match.end - match.width2))

            if match.strand in ("+-", "-+"):
                kind = "ER"
            elif match.strand in ("++", "--"):
                kind = "DR"
            else:
                kind = "SM"
            gap = 0 if kind == "SM" else block_gap
            arrangement = "%s%i" % (kind, gap)

            first_strand = match.strand[0]
            # keep only what follows the last underscore of the id
            match_id = re.sub(".*_", "", match.id)
            tmpfile.write(
                "%s\t%i\t%i\t%i\t%s\t%s\n" %
                (match_id, motif_index, match.start, match.end,
                 first_strand, arrangement))
    tmpfile.close()

    load_options = ("--add-index=id "
                    "--add-index=motif "
                    "--add-index=id,motif "
                    "--allow-empty-file "
                    "--map=base_qualities:text")
    P.load(tmpfile.name, outfile, options=load_options)

    os.unlink(tmpfile.name)
def loadIdxstats(infiles, outfile):
    '''take list of file paths to samtools idxstats output files
    and merge to create single dataframe containing mapped reads per
    contig for each track. This dataframe is then loaded into
    database.

    Loads tables into the database
        * idxstats_reads_per_chromosome

    Input files that do not exist are skipped with a warning.

    Arguments
    ---------
    infiles : list
        list where each element is a string of the filename containing
        samtools idxstats output. Filename format is expected to be
        'sample.idxstats'
    outfile : string
        Logfile. The table name will be derived from `outfile`.
    '''
    outf = P.getTempFile(".")
    dfs = []
    for f in infiles:
        track = P.snip(f, ".idxstats").split('/')[-1]

        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue

        # reformat idx stats
        df = pandas.read_csv(f, sep='\t', header=None)
        df.columns = ['region', 'length', 'mapped', 'unmapped']

        # calc total reads mapped & unmapped
        total_reads = df.unmapped.sum() + df.mapped.sum()
        total_mapped_reads = df.mapped.sum()

        reformatted_df = pandas.DataFrame(
            [['total_mapped_reads', total_mapped_reads],
             ['total_reads', total_reads],
             ['track', track]],
            columns=(['region', 'mapped']))

        # DataFrame.append was removed in pandas 2.0; pandas.concat
        # with ignore_index=True is the equivalent call
        df = pandas.concat([df, reformatted_df], ignore_index=True)
        df.set_index('region', inplace=True)

        df1 = df[['mapped']].T
        # set track as index
        df1.set_index('track', inplace=True)
        dfs.append(df1)

    # merge dataframes into single table
    master_df = pandas.concat(dfs)
    master_df.drop('*', axis=1, inplace=True)
    # transform dataframe to avoid reaching column limit
    master_df = master_df.T
    master_df.to_csv(outf, sep='\t', index=True)
    outf.close()

    # NOTE(review): after the transpose the written table appears to
    # have one row per region and one column per track, so the index
    # requested on "track" may not match any column - confirm.
    P.load(outf.name,
           outfile,
           options="--ignore-empty --add-index=track")
    os.unlink(outf.name)
def loadTranscriptProfile(infiles, outfile,
                          suffix="transcript_profile",
                          tablename=None):
    '''Load transcript profiles into one table.

    For every input file the matrix
    "<infile>.geneprofileabsolutedistancefromthreeprimeend.matrix.tsv.gz"
    is read, its "none" column is renamed to the track name (the
    basename of the input file without ".transcriptprofile.gz") and
    the columns area, counts and background are dropped.  The matrices
    are merged on bin, region and region_bin and the merged table is
    loaded with an index on bin.

    Arguments
    ---------
    infiles : list
        Filenames, one per track, from which the matrix filenames are
        built.
    outfile : string
        Logfile.
    suffix : string
        Used as the table name when `tablename` is unset.
    tablename : string
        Tablename to use.
    '''
    if not tablename:
        tablename = "%s" % suffix

    outf = P.getTempFile(".")

    merged = None
    for infile in infiles:
        matrix_file = "%s%s" % (
            str(infile),
            ".geneprofileabsolutedistancefromthreeprimeend.matrix.tsv.gz")
        track = P.snip(os.path.basename(infile), ".transcriptprofile.gz")

        profile = pd.read_csv(matrix_file, sep="\t")
        profile.rename(columns={'none': track}, inplace=True)
        profile.drop(["area", "counts", "background"],
                     axis=1, inplace=True)

        if merged is None:
            # the first matrix seeds the merged table
            merged = profile
        else:
            merged = profile.merge(merged,
                                   on=["bin", "region", "region_bin"],
                                   how="left")

    merged.to_csv(outf, sep="\t", index=False)
    outf.close()

    P.load(infile=outf.name,
           outfile=outfile,
           tablename=tablename,
           options="--add-index=bin")

    os.unlink(outf.name)
def loadCPCResults(infile, outfile):
    '''Load the results of the cpc analysis via `P.load`.

    The options name the columns transcript_id, feature, C_NC and
    CP_score and request an index on transcript_id.
    '''
    flags = (
        "--header-names=transcript_id,feature,C_NC,CP_score ",
        "--add-index=transcript_id",
    )
    P.load(infile, outfile, options="".join(flags))
def load_chunk_annotations(infile, outfile):
    '''Load chunk annotations and create a joint gene/exon index.

    `infile` is loaded with separate indices on gene_id and exon_id.
    Afterwards the index "<table>_joint" on (gene_id,exon_id) is
    dropped if present and created again, where <table> is
    `P.toTable(outfile)`.
    '''
    P.load(infile, outfile, "-i gene_id -i exon_id")

    tablename = P.toTable(outfile)
    index_script = (
        "DROP INDEX IF EXISTS %(tablename)s_joint; "
        "CREATE INDEX %(tablename)s_joint ON "
        "%(tablename)s(gene_id,exon_id)"
    ) % {"tablename": tablename}

    connect().executescript(index_script)
def loadVariantAnnotation(infile, outfile):
    '''Load VCF annotations into database.

    `infile` must end in "indels.annotated.filtered.tsv" or
    "mutect.snp.annotated.filtered.tsv".  Both kinds of table are
    loaded with an index on "CHROM,POS,SNPEFF_GENE_NAME".

    Raises
    ------
    ValueError
        If `infile` has neither of the two suffixes.  Previously the
        index columns were left undefined in that case and the
        function failed while formatting the options.
    '''
    known_suffixes = ("indels.annotated.filtered.tsv",
                      "mutect.snp.annotated.filtered.tsv")
    if not infile.endswith(known_suffixes):
        raise ValueError(
            "unexpected annotation file %s: expected a name ending in "
            "one of %s" % (infile, ", ".join(known_suffixes)))

    indices = "CHROM,POS,SNPEFF_GENE_NAME"
    P.load(infile, outfile, options="--add-index=%s" % indices)
def loadVariantAnnotation(infile, outfile):
    '''Load VCF annotations into database.

    `infile` must end in "indels.annotated.filtered.tsv" or
    "mutect.snp.annotated.filtered.tsv".  Both kinds of table are
    loaded with an index on "CHROM,POS,SNPEFF_GENE_NAME".

    Raises
    ------
    ValueError
        If `infile` has neither of the two suffixes.  Previously the
        index columns were left undefined in that case and the
        function failed while formatting the options.
    '''
    known_suffixes = ("indels.annotated.filtered.tsv",
                      "mutect.snp.annotated.filtered.tsv")
    if not infile.endswith(known_suffixes):
        raise ValueError(
            "unexpected annotation file %s: expected a name ending in "
            "one of %s" % (infile, ", ".join(known_suffixes)))

    indices = "CHROM,POS,SNPEFF_GENE_NAME"
    P.load(infile, outfile, options="--add-index=%s" % indices)
def loadCPCResults(infile, outfile):
    '''Load the results of the cpc analysis via `P.load`.

    The options name the columns transcript_id, feature, C_NC and
    CP_score and request an index on transcript_id.
    '''
    flags = (
        "--header-names=transcript_id,feature,C_NC,CP_score ",
        "--add-index=transcript_id",
    )
    P.load(infile, outfile, options="".join(flags))
def loadPolyphenMap(infile, outfile):
    '''Load polyphen input data.

    The table loaded is "<infile>.map".  Indices are requested on
    snp_id, (track,transcript_id), (contig,pos), protein_id and
    transcript_id.
    '''
    map_file = infile + ".map"
    flags = (
        "--add-index=snp_id ",
        "--add-index=track,transcript_id ",
        "--add-index=contig,pos ",
        "--add-index=protein_id ",
        "--add-index=transcript_id ",
    )
    P.load(map_file, outfile, options="".join(flags))
def loadPolyphenMap(infile, outfile):
    '''Load polyphen input data.

    The table loaded is "<infile>.map".  Indices are requested on
    snp_id, (track,transcript_id), (contig,pos), protein_id and
    transcript_id.
    '''
    map_file = infile + ".map"
    flags = (
        "--add-index=snp_id ",
        "--add-index=track,transcript_id ",
        "--add-index=contig,pos ",
        "--add-index=protein_id ",
        "--add-index=transcript_id ",
    )
    P.load(map_file, outfile, options="".join(flags))
def loadTranscriptProfile(infiles, outfile,
                          suffix="transcript_profile",
                          tablename=None):
    '''Load transcript profiles into one table.

    For every input file the matrix
    "<infile>.geneprofileabsolutedistancefromthreeprimeend.matrix.tsv.gz"
    is read, its "none" column is renamed to the track name (the
    basename of the input file without ".transcriptprofile.gz") and
    the columns area, counts and background are dropped.  The matrices
    are merged on bin, region and region_bin and the merged table is
    loaded with an index on bin.

    Arguments
    ---------
    infiles : list
        Filenames, one per track, from which the matrix filenames are
        built.
    outfile : string
        Logfile.
    suffix : string
        Used as the table name when `tablename` is unset.
    tablename : string
        Tablename to use.
    '''
    if not tablename:
        tablename = "%s" % suffix

    outf = P.getTempFile(".")

    merged = None
    for infile in infiles:
        matrix_file = "%s%s" % (
            str(infile),
            ".geneprofileabsolutedistancefromthreeprimeend.matrix.tsv.gz")
        track = P.snip(os.path.basename(infile), ".transcriptprofile.gz")

        profile = pd.read_csv(matrix_file, sep="\t")
        profile.rename(columns={'none': track}, inplace=True)
        profile.drop(["area", "counts", "background"],
                     axis=1, inplace=True)

        if merged is None:
            # the first matrix seeds the merged table
            merged = profile
        else:
            merged = profile.merge(merged,
                                   on=["bin", "region", "region_bin"],
                                   how="left")

    merged.to_csv(outf, sep="\t", index=False)
    outf.close()

    P.load(infile=outf.name,
           outfile=outfile,
           tablename=tablename,
           options="--add-index=bin")

    os.unlink(outf.name)
def build_db(infiles, outfile):
    '''Store data generated throughout the pipeline in the database.

    `infiles` is handed to `P.load` unchanged.  According to the
    original notes, the structure of data tables and database is meant
    for compatibility with the shiny app, and the tables recorded are
    merged_filter_summary, merged_qc_summary, merged_taxonomy,
    merged_abundance_id and the yml table.
    '''
    P.load(infile=infiles, outfile=outfile)
def loadPermuteMATS(infile, outfile):
    '''Load the rMATS permutation summary table via `P.load`.

    Parameters
    ----------
    infile : str
        file containing summary table of rMATS permutation results
    outfile : str
        .load file
    '''
    P.load(infile=infile, outfile=outfile)
def loadCollateMATS(infile, outfile):
    '''Load the rMATS summary table via `P.load`.

    Parameters
    ----------
    infile : str
        file containing summary table of rMATS results
    outfile : str
        .load file
    '''
    P.load(infile=infile, outfile=outfile)
def loadPermuteMATS(infile, outfile):
    '''Load the rMATS permutation summary table via `P.load`.

    Parameters
    ----------
    infile : str
        file containing summary table of rMATS permutation results
    outfile : str
        .load file
    '''
    P.load(infile=infile, outfile=outfile)
def loadCollateMATS(infile, outfile):
    '''Load the rMATS summary table via `P.load`.

    Parameters
    ----------
    infile : str
        file containing summary table of rMATS results
    outfile : str
        .load file
    '''
    P.load(infile=infile, outfile=outfile)
def loadCountReads(infiles, outfile,
                   suffix="nreads",
                   pipeline_suffix=".nreads",
                   tablename=None):
    '''load read counts.

    For every line of every input file the second tab-separated field
    is written as the read count of that file's track.  Input files
    that do not exist are skipped with a warning.

    Arguments
    ---------
    infiles : list
        Filenames of files with number of reads per sample. Each file
        corresponds to a different track.
    outfile : string
        Logfile.
    suffix : string
        Suffix to append to table name.
    pipeline_suffix : string
        Suffix to remove from track name.
    tablename : string
        Tablename to use. If unset, the table name will be derived
        from `outfile` and suffix as ``toTable(outfile) + "_" +
        suffix``.

    Raises
    ------
    IndexError
        If a line of an input file has no second tab-separated field.
    '''
    if not tablename:
        tablename = "%s_%s" % (P.toTable(outfile), suffix)

    outf = P.getTempFile(".")
    outf.write("%s\t%s\n" % ("track", "nreads"))

    for filename in infiles:
        track = P.snip(os.path.basename(filename), pipeline_suffix)

        if not os.path.exists(filename):
            E.warn("File %s missing" % filename)
            continue

        # the context manager closes the input file, which was
        # previously left open
        with IOTools.openFile(filename, "r") as inf:
            for line in inf:
                # strip the line terminator first: when the count is
                # the last field it used to carry the newline with it,
                # producing a blank line after every row
                count = line.rstrip("\n").split("\t")[1]
                outf.write("%s\t%s\n" % (track, count))

    outf.close()

    P.load(infile=outf.name,
           outfile=outfile,
           tablename=tablename,
           options="--add-index=track")

    os.unlink(outf.name)
def loadManualAnnotations(infile, outfile):
    '''Load a manual annotation list as a two-column table.

    The annotation name is `infile` without its "_annotations.tsv"
    suffix.  A temporary table is written whose header is
    "<annotation>\\tgene_id" and in which every line of `infile` is
    prefixed with the annotation name.  The table is loaded with an
    index on gene_id and the temporary file is removed.
    '''
    tmp = P.getTempFilename(".")

    annotation = P.snip(infile, "_annotations.tsv")

    with IOTools.openFile(tmp, "w") as outf:
        outf.write("%s\tgene_id\n" % annotation)
        with IOTools.openFile(infile, "r") as inf:
            # input lines keep their own line terminator
            for gene_line in inf:
                outf.write("%s\t%s" % (annotation, gene_line))

    P.load(tmp, outfile, options="--add-index=gene_id")
    os.unlink(tmp)
def loadManualAnnotations(infile, outfile):
    '''Load a manual annotation list as a two-column table.

    The annotation name is `infile` without its "_annotations.tsv"
    suffix.  A temporary table is written whose header is
    "<annotation>\\tgene_id" and in which every line of `infile` is
    prefixed with the annotation name.  The table is loaded with an
    index on gene_id and the temporary file is removed.
    '''
    tmp = P.getTempFilename(".")

    annotation = P.snip(infile, "_annotations.tsv")

    with IOTools.openFile(tmp, "w") as outf:
        outf.write("%s\tgene_id\n" % annotation)
        with IOTools.openFile(infile, "r") as inf:
            # input lines keep their own line terminator
            for gene_line in inf:
                outf.write("%s\t%s" % (annotation, gene_line))

    P.load(tmp, outfile, options="--add-index=gene_id")
    os.unlink(tmp)
def mergeEffects(infiles, outfile):
    '''load transcript effects into single table.

    The per-track ".effects.gz" files are concatenated into
    "effects.txt" with a leading "track" column and loaded.  The same
    is done for the companion files "<infile>.<suffix>.gz" for each
    suffix in cds, intron, splicing, translation and genes; these are
    written to "effects.<suffix>.txt" and loaded into the table
    "<table>_<suffix>", where <table> is `P.toTable(outfile)`.

    Missing input files are skipped with a warning and empty ones are
    skipped silently.  The header is taken from the first file that
    has any lines.

    Fixes over the previous version: the merged file is loaded under
    the name it was written to ("effects.txt", not "effect.txt"), the
    misspelt `tabelname` no longer raises NameError, gzip files are
    read in text mode so that lines are str rather than bytes, and all
    file handles are closed.
    '''
    tablename = P.toTable(outfile)

    def merge_tables(merged_name, extension):
        '''Concatenate `f + extension` for all tracks into `merged_name`.'''
        first = True
        with open(merged_name, "w") as outf:
            for f in infiles:
                track = P.snip(os.path.basename(f), ".effects.gz")
                source = f + extension

                if not os.path.exists(source):
                    E.warn("File %s missing" % source)
                    continue

                # "rt": text mode, so lines format cleanly with %s
                with gzip.open(source, "rt") as inf:
                    lines = inf.readlines()
                if not lines:
                    continue

                if first:
                    outf.write("%s\t%s" % ("track", lines[0]))
                    first = False

                for line in lines[1:]:
                    outf.write("%s\t%s" % (track, line))

    merge_tables("effects.txt", "")
    P.load("effects.txt", outfile, options="--add-index=transcript_id")

    for suffix in ("cds", "intron", "splicing", "translation", "genes"):
        merged_name = 'effects.' + suffix + '.txt'
        merge_tables(merged_name, "." + suffix + ".gz")

        P.load(merged_name,
               outfile,
               tablename=tablename + "_" + suffix,
               options="--add-index=transcript_id "
               "--allow-empty-file "
               "--ignore-column=seq_na "
               "--ignore-column=seq_aa")
def load_last_exon_chunks(infile, outfile):
    '''Load gene and exon_ids for last exons into database.

    Every entry of the GTF file `infile` contributes one row holding
    its gene_id and its exon_id attribute with ";" characters
    removed.  The rows are written to a shared temporary file that is
    loaded and then removed.
    '''
    from CGAT import GTF

    with P.getTempFile(shared=True) as tmpfile:
        tmpfn = tmpfile.name
        tmpfile.write("gene_id\tchunk_id\n")

        for exon in GTF.iterator(IOTools.openFile(infile)):
            exon_id = re.sub(";", "", exon["exon_id"])
            tmpfile.write("\t".join((exon.gene_id, exon_id)) + "\n")

    # NOTE(review): the header names the second column "chunk_id"
    # while the index is requested on "exon_id" - confirm which name
    # is intended.
    P.load(tmpfn, outfile, options="-i gene_id -i exon_id")

    os.unlink(tmpfn)
def mergeEffects(infiles, outfile):
    '''load transcript effects into single table.

    The per-track ".effects.gz" files are concatenated into
    "effects.txt" with a leading "track" column and loaded.  The same
    is done for the companion files "<infile>.<suffix>.gz" for each
    suffix in cds, intron, splicing, translation and genes; these are
    written to "effects.<suffix>.txt" and loaded into the table
    "<table>_<suffix>", where <table> is `P.toTable(outfile)`.

    Missing input files are skipped with a warning and empty ones are
    skipped silently.  The header is taken from the first file that
    has any lines.

    Fixes over the previous version: the merged file is loaded under
    the name it was written to ("effects.txt", not "effect.txt"), the
    misspelt `tabelname` no longer raises NameError, gzip files are
    read in text mode so that lines are str rather than bytes, and all
    file handles are closed.
    '''
    tablename = P.toTable(outfile)

    def merge_tables(merged_name, extension):
        '''Concatenate `f + extension` for all tracks into `merged_name`.'''
        first = True
        with open(merged_name, "w") as outf:
            for f in infiles:
                track = P.snip(os.path.basename(f), ".effects.gz")
                source = f + extension

                if not os.path.exists(source):
                    E.warn("File %s missing" % source)
                    continue

                # "rt": text mode, so lines format cleanly with %s
                with gzip.open(source, "rt") as inf:
                    lines = inf.readlines()
                if not lines:
                    continue

                if first:
                    outf.write("%s\t%s" % ("track", lines[0]))
                    first = False

                for line in lines[1:]:
                    outf.write("%s\t%s" % (track, line))

    merge_tables("effects.txt", "")
    P.load("effects.txt", outfile, options="--add-index=transcript_id")

    for suffix in ("cds", "intron", "splicing", "translation", "genes"):
        merged_name = 'effects.' + suffix + '.txt'
        merge_tables(merged_name, "." + suffix + ".gz")

        P.load(merged_name,
               outfile,
               tablename=tablename + "_" + suffix,
               options="--add-index=transcript_id "
               "--allow-empty-file "
               "--ignore-column=seq_na "
               "--ignore-column=seq_aa")
def loadCountReads(infiles, outfile,
                   suffix="nreads",
                   pipeline_suffix=".nreads",
                   tablename=None):
    '''load read counts.

    For every line of every input file the second tab-separated field
    is written as the read count of that file's track.  Input files
    that do not exist are skipped with a warning.

    Arguments
    ---------
    infiles : list
        Filenames of files with number of reads per sample. Each file
        corresponds to a different track.
    outfile : string
        Logfile.
    suffix : string
        Suffix to append to table name.
    pipeline_suffix : string
        Suffix to remove from track name.
    tablename : string
        Tablename to use. If unset, the table name will be derived
        from `outfile` and suffix as ``toTable(outfile) + "_" +
        suffix``.

    Raises
    ------
    IndexError
        If a line of an input file has no second tab-separated field.
    '''
    if not tablename:
        tablename = "%s_%s" % (P.toTable(outfile), suffix)

    outf = P.getTempFile(".")
    outf.write("%s\t%s\n" % ("track", "nreads"))

    for filename in infiles:
        track = P.snip(os.path.basename(filename), pipeline_suffix)

        if not os.path.exists(filename):
            E.warn("File %s missing" % filename)
            continue

        # the context manager closes the input file, which was
        # previously left open
        with IOTools.openFile(filename, "r") as inf:
            for line in inf:
                # strip the line terminator first: when the count is
                # the last field it used to carry the newline with it,
                # producing a blank line after every row
                count = line.rstrip("\n").split("\t")[1]
                outf.write("%s\t%s\n" % (track, count))

    outf.close()

    P.load(infile=outf.name,
           outfile=outfile,
           tablename=tablename,
           options="--add-index=track")

    os.unlink(outf.name)
def loadMATS(infile, outfile):
    '''load RMATS results into relational database

    Loads rMATS results into relational database.
    Continues if table empty: when loading fails, `outfile` is touched
    instead so that the task still produces its output.

    Parameters
    ----------
    infile: term:`tsv` file containing one type
        of rMATS results.
    outfile: .load file
    '''
    try:
        P.load(infile, outfile)
    except Exception:
        # Exception rather than a bare except, so that
        # KeyboardInterrupt and SystemExit still propagate instead of
        # being turned into a touched output file.
        P.touch(outfile)
def loadClusterCounts(infiles, outfile):
    '''Find the number of significant clusters found in each sample.

    For every input file the number of lines is recorded together with
    the method and track parsed from its name, which is expected to
    look like "dedup_<method>.dir/<track>.clusters.bedgraph".  The
    rows (method, track, count) are written to a shared temporary
    file, loaded, and the temporary file is removed.

    Raises
    ------
    AttributeError
        If a filename does not match the expected pattern.
    '''
    tmp = P.getTempFilename(shared=True)

    results = []
    for infile in infiles:
        count = IOTools.getNumLines(infile)
        # raw string: same pattern as before, but "\." is no longer an
        # invalid escape sequence in a normal string literal
        method, track = re.match(
            r"dedup_(.+).dir/(.+)\.clusters.bedgraph", infile).groups()
        results.append((method, track, count))

    IOTools.writeLines(tmp, results, header=["method", "track", "count"])

    P.load(tmp, outfile)
    os.unlink(tmp)
def loadMATS(infile, outfile):
    '''load RMATS results into relational database

    Loads rMATS results into relational database.
    Continues if table empty: when loading fails, `outfile` is touched
    instead so that the task still produces its output.

    Parameters
    ----------
    infile: term:`tsv` file containing one type
        of rMATS results.
    outfile: .load file
    '''
    try:
        P.load(infile, outfile)
    except Exception:
        # Exception rather than a bare except, so that
        # KeyboardInterrupt and SystemExit still propagate instead of
        # being turned into a touched output file.
        P.touch(outfile)
def loadMemeSummary(infiles, outfile):
    '''Load the list of tracks that have non-empty meme output.

    A one-column table with the header "track" is written, holding for
    every non-empty input file its name without the ".meme" suffix.
    The table is loaded and the temporary file removed.
    '''
    outf = P.getTempFile(".")
    outf.write("track\n")

    for meme_file in infiles:
        if not IOTools.isEmpty(meme_file):
            outf.write("%s\n" % P.snip(meme_file, ".meme"))
    outf.close()

    summary_file = outf.name
    P.load(summary_file, outfile)
    os.unlink(summary_file)
def loadMotifInformation(infiles, outfile):
    '''Load the list of motifs that have non-empty motif files.

    A one-column table with the header "motif" is written, holding for
    every non-empty input file its name without the ".motif" suffix.
    The table is loaded with "--allow-empty-file" and the temporary
    file removed.
    '''
    outf = P.getTempFile(".")
    outf.write("motif\n")

    for motif_file in infiles:
        if not IOTools.isEmpty(motif_file):
            outf.write("%s\n" % P.snip(motif_file, ".motif"))
    outf.close()

    motif_table = outf.name
    P.load(motif_table, outfile, "--allow-empty-file")
    os.unlink(motif_table)
def loadMemeSummary(infiles, outfile):
    '''Load the list of tracks that have non-empty meme output.

    A one-column table with the header "track" is written, holding for
    every non-empty input file its name without the ".meme" suffix.
    The table is loaded and the temporary file removed.
    '''
    outf = P.getTempFile(".")
    outf.write("track\n")

    for meme_file in infiles:
        if not IOTools.isEmpty(meme_file):
            outf.write("%s\n" % P.snip(meme_file, ".meme"))
    outf.close()

    summary_file = outf.name
    P.load(summary_file, outfile)
    os.unlink(summary_file)
def loadMotifInformation(infiles, outfile):
    '''Load the list of motifs that have non-empty motif files.

    A one-column table with the header "motif" is written, holding for
    every non-empty input file its name without the ".motif" suffix.
    The table is loaded with "--allow-empty-file" and the temporary
    file removed.
    '''
    outf = P.getTempFile(".")
    outf.write("motif\n")

    for motif_file in infiles:
        if not IOTools.isEmpty(motif_file):
            outf.write("%s\n" % P.snip(motif_file, ".motif"))
    outf.close()

    motif_table = outf.name
    P.load(motif_table, outfile, "--allow-empty-file")
    os.unlink(motif_table)