def calculateFalsePositiveRate(infiles, outfile):
    '''compute taxonomy false positive and true positive rates
    at several relative abundance cutoffs.'''
    # connect to database
    dbh = sqlite3.connect(PARAMS["database"])
    cc = dbh.cursor()

    levels = ["phylum", "class", "order", "family", "genus", "species"]
    tablename_true = P.toTable(infiles[0])

    # get corresponding estimate file
    tablename_estimate = P.toTable(os.path.basename(
        [inf for inf in infiles[1:]
         if os.path.basename(inf)[len("metaphlan_"):] ==
         os.path.basename(infiles[0])][0]))

    outf = open(outfile, "w")
    track = P.snip(os.path.basename(infiles[0]), ".taxonomy.relab.load")
    for level in levels:
        for cutoff in [0, 1]:
            true_set = set()
            estimate_set = set()
            for taxa in cc.execute("""SELECT taxa FROM %s
                                      WHERE level == '%s' AND relab > %f"""
                                   % (tablename_true, level,
                                      float(cutoff) / 100)):
                true_set.add(taxa[0])
            for taxa in cc.execute("""SELECT taxon FROM %s
                                      WHERE taxon_level == '%s'
                                      AND rel_abundance > %f"""
                                   % (tablename_estimate, level,
                                      float(cutoff))):
                estimate_set.add(taxa[0])
            total_true = len(true_set)
            total_estimate = len(estimate_set)
            tp = true_set.intersection(estimate_set)
            fp = estimate_set.difference(true_set)

            fp_rate = float(len(fp)) / total_estimate
            tp_rate = float(len(tp)) / total_true
            outf.write("%s\t%f\t%f\t%s\t%s\n"
                       % (level, fp_rate, tp_rate, track, str(cutoff)))
    outf.close()
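
# Note: fp_rate and tp_rate above divide by total_estimate and total_true,
# which raises ZeroDivisionError when either set is empty at a given cutoff.
# A minimal guarded variant (hypothetical helper, not part of the pipeline):
def _safeRate(hits, total):
    '''return len(hits) / total, or 0.0 when total is zero.'''
    return float(len(hits)) / total if total else 0.0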
def estimateCopyNumber(infiles, outfile, params):
    """Estimate copy number based on ERCC spike in concentrations.

    Expects the location of the directory containing the R code
    as a single parameter."""
    infile, cuffnorm_load, ercc_load = infiles
    code_dir = params[0]

    cuffnorm_table = P.toTable(cuffnorm_load)
    ercc_table = P.toTable(ercc_load)

    track = outfile.split("/")[-1][:-len(".spike.norm")]
    plotname = outfile + ".png"

    # col_name = track.replace("-","_") + "_0"
    col_name = re.sub(r"[-.]", "_", track) + "_0"

    # connect to the database
    con = sqlite3.connect(PARAMS["database_name"])

    # retrieve the spike-in data
    statement = (
        """select e.gene_id, %(col_name)s as FPKM, copies_per_cell
           from %(ercc_table)s e
           inner join %(cuffnorm_table)s c
           on e.gene_id = c.tracking_id
        """ % locals())
    # spikedf = PU.fetch_DataFrame(statement, PARAMS["database"])
    spikedf = pd.read_sql(statement, con)
    # rspikedf = pdcom.convert_to_r_dataframe(spikedf)

    # retrieve the data to normalise
    statement = (
        """select tracking_id as gene_id, %(col_name)s as FPKM
           from %(cuffnorm_table)s
        """ % locals())
    fpkms = pd.read_sql(statement, con)
    # rfpkms = pdcom.convert_to_r_dataframe(fpkms)

    script_dir = os.path.dirname(os.path.realpath(sys.argv[0]))

    r = R.r
    rscript = os.path.join(code_dir, PARAMS["rsource"])
    r.source(rscript)
    plotname, outfile = [os.path.abspath(x) for x in [plotname, outfile]]
    r.normalise_to_spikes(spikedf, fpkms, plotname, outfile, track)
def estimateCopyNumber(infiles, outfile, params):
    '''Estimate copy number based on ERCC spike in concentrations.

    Expects the location of the directory containing the R code
    as a single parameter.'''
    infile, cuffnorm_load, ercc_load = infiles
    code_dir = params[0]

    cuffnorm_table = P.toTable(cuffnorm_load)
    ercc_table = P.toTable(ercc_load)

    track = outfile.split("/")[-1][:-len(".spike.norm")]
    plotname = outfile + ".png"

    # col_name = track.replace("-","_") + "_0"
    col_name = re.sub(r"[-.]", "_", track) + "_0"

    # connect to the database
    con = sqlite3.connect(PARAMS["database_name"])

    # retrieve the spike-in data
    statement = '''select e.gene_id, %(col_name)s as FPKM, copies_per_cell
                   from %(ercc_table)s e
                   inner join %(cuffnorm_table)s c
                   on e.gene_id = c.tracking_id
                ''' % locals()
    # spikedf = PU.fetch_DataFrame(statement, PARAMS["database"])
    spikedf = pd.read_sql(statement, con)
    # rspikedf = pdcom.convert_to_r_dataframe(spikedf)

    # retrieve the data to normalise
    statement = '''select tracking_id as gene_id, %(col_name)s as FPKM
                   from %(cuffnorm_table)s
                ''' % locals()
    fpkms = pd.read_sql(statement, con)
    # rfpkms = pdcom.convert_to_r_dataframe(fpkms)

    script_dir = os.path.dirname(os.path.realpath(sys.argv[0]))

    r = R.r
    rscript = os.path.join(code_dir, PARAMS["rsource"])
    r.source(rscript)
    plotname, outfile = [os.path.abspath(x) for x in [plotname, outfile]]
    r.normalise_to_spikes(spikedf, fpkms, plotname, outfile, track)
def createViewMapping(infile, outfile):
    '''create view in database for alignment stats.

    This view aggregates all information on a per-track basis.

    The table is built from the following tracks:
       mapping_stats
       bam_stats
    '''
    tablename = P.toTable(outfile)

    # cannot create views across multiple databases, so use a table
    view_type = "TABLE"

    dbhandle = connect()
    Database.executewait(
        dbhandle,
        "DROP %(view_type)s IF EXISTS %(tablename)s" % locals())

    statement = '''
    CREATE %(view_type)s %(tablename)s AS
    SELECT * FROM bam_stats AS b
    '''
    Database.executewait(dbhandle, statement % locals())
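
# Hedged sketch: the docstring of createViewMapping lists mapping_stats, but
# the CREATE statement only copies bam_stats. Assuming both tables share a
# "track" column (an assumption, not verified against the schema), the
# per-track join the docstring describes could read:
def createViewMappingJoined(infile, outfile):
    tablename = P.toTable(outfile)
    dbhandle = connect()
    statement = '''
    CREATE TABLE %(tablename)s AS
    SELECT b.*, m.*
    FROM bam_stats AS b
    INNER JOIN mapping_stats AS m ON b.track = m.track
    '''
    Database.executewait(dbhandle, statement % locals())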
def filterByCoverage(infiles, outfile):

    fcoverage = PARAMS["coverage_filter"]
    contig_file = infiles[0]
    dbh = sqlite3.connect(
        os.path.join(PARAMS["results_resultsdir"], PARAMS["database_name"]))
    cc = dbh.cursor()
    contigs = set()
    for infile in infiles[1:]:
        dirsplit = infile.split("/")
        infile = os.path.join(
            PARAMS["results_resultsdir"],
            dirsplit[-2].split(".dir")[0] + "-" + dirsplit[-1])
        tablename = P.toTable(os.path.basename(infile))
        if P.snip(contig_file, ".fa") == P.snip(os.path.basename(infile),
                                                ".coverage.load"):
            statement = """SELECT contig_id, ave
                           FROM (SELECT contig_id, AVG(coverage) AS ave
                                 FROM %s GROUP BY contig_id)
                           WHERE ave > %i""" % (tablename,
                                                PARAMS["coverage_filter"])
            for data in cc.execute(statement).fetchall():
                contigs.add(data[0])
    outf = open(outfile, "w")
    print(contigs)
    for fasta in FastaIterator.iterate(IOTools.openFile(contig_file)):
        identifier = fasta.title.split(" ")[0]
        if identifier in contigs:
            outf.write(">%s\n%s\n" % (identifier, fasta.sequence))
    outf.close()
def buildGeneOntology(infile, outfile):
    '''create an output file akin to GO ontology files to be
    used with GO.py
    '''
    table = P.toTable(infile)
    columns = ("cpg", "tata")
    dbh = connect()
    cc = dbh.cursor()

    outf = IOTools.openFile(outfile, "w")
    outf.write("go_type\tgene_id\tgo_id\tdescription\tevidence\n")

    i = 1
    for c in columns:
        cc.execute(
            "SELECT DISTINCT gene_id FROM %(table)s WHERE %(c)s" % locals())
        outf.write("".join(
            ["promotor\t%s\tGO:%07i\twith_%s\tNA\n" % (x[0], i, c)
             for x in cc]))
        i += 1
        cc.execute(
            "SELECT DISTINCT gene_id FROM %(table)s WHERE %(c)s = 0"
            % locals())
        outf.write("".join(
            ["promotor\t%s\tGO:%07i\twithout_%s\tNA\n" % (x[0], i, c)
             for x in cc]))
        i += 1

    outf.close()
def loadPicardAlignStats(infiles, outfile):
    '''Merge Picard alignment stats into single table and load into SQLite.'''
    tablename = P.toTable(outfile)
    outf = P.getTempFile()
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".dedup.alignstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [x for x in open(f, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))
    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                   | python %(scriptsdir)s/csv2db.py
                         --add-index=track
                         --table=%(tablename)s
                   > %(outfile)s'''
    P.run()
    os.unlink(tmpfilename)
def loadProteinStats(infile, outfile):
    '''load protein statistics to database.

    The *infile* is an ENSEMBL peptide file.

    Remove empty sequences (see for example
    transcript:ENSMUST00000151316, ENSMUSP00000118372)
    '''
    table = P.toTable(outfile)

    statement = '''
    gunzip < %(infile)s
    | python %(scriptsdir)s/fasta2fasta.py
          --method=filter
          --filter-method=min-length=1
    | python %(scriptsdir)s/fasta2table.py
          --log=%(outfile)s
          --sequence-type=aa
          --section=length
          --section=hid
          --section=aa
          --regex-identifier="(\S+)"
    | sed "s/^id/protein_id/"
    | python %(scriptsdir)s/csv2db.py %(csv2db_options)s
          --add-index=protein_id
          --map=protein_id:str
          --table=%(table)s
    > %(outfile)s'''
    P.run()
def loadSummariseReadsContributingToTranscripts(infile, outfile):
    '''loads the summary of reads contributing to transcripts'''
    tablename = P.toTable(outfile.replace("/", "_"))
    statement = '''python %(scriptsdir)s/csv2db.py -t %(tablename)s
                   --log=%(outfile)s.log
                   < %(infile)s > %(outfile)s'''
    P.run()
def loadRepeats(infile, outfile):
    """load genomic locations of repeats into database.

    This method loads the genomic coordinates (contig, start, end)
    and the repeat name into the database.

    Arguments
    ---------
    infile : string
        Input filename in :term:`gff` with repeat annotations.
    outfile : string
        Output filename with logging information. The table name is
        derived from outfile.
    """
    load_statement = P.build_load_statement(
        P.toTable(outfile),
        options="--add-index=class "
        "--header-names=contig,start,stop,class")

    statement = """zcat %(infile)s
    | cgat gff2bed --set-name=class
    | grep -v "#"
    | cut -f1,2,3,4
    | %(load_statement)s
    > %(outfile)s"""
    P.run()
def numberGenesDetectedFeatureCounts(infile, outfile):
    '''Count the number of genes detected by featureCounts
    at counts > 0 in each sample.'''

    table = P.toTable(infile)
    attach = '''attach "%(ANN_DATABASE)s" as anndb''' % globals()

    statement = '''select distinct h.*, gene_biotype from %(table)s h
                   inner join anndb.gene_info i
                   on h.gene_id = i.gene_id
                ''' % locals()

    melted_df = DB.fetch_DataFrame(statement, DATABASE, attach)

    grouped_df = melted_df.groupby(["gene_biotype", "track"])
    agg_df = grouped_df.agg(
        {"counts": lambda x: np.sum([1 for y in x if y > 0])})
    agg_df.reset_index(inplace=True)

    count_df = pd.pivot_table(agg_df, index="track",
                              values="counts", columns="gene_biotype")
    count_df["total"] = count_df.apply(np.sum, 1)
    count_df["sample_id"] = count_df.index
    count_df.to_csv(outfile, index=False, sep="\t")
def loadTranscripts(infile, outfile):
    '''load transcripts from a GTF file into the database.

    The table will be indexed on ``gene_id`` and ``transcript_id``

    Arguments
    ---------
    infile : string
        ENSEMBL geneset in :term:`gtf` format.
    outfile : string
        Logfile. The table name is derived from `outfile`.
    '''
    load_statement = P.build_load_statement(
        P.toTable(outfile),
        options="--add-index=gene_id "
        "--add-index=transcript_id "
        "--allow-empty-file ")

    statement = '''
    gunzip < %(infile)s
    | cgat gtf2tsv
    | %(load_statement)s
    > %(outfile)s'''
    P.run()
def loadPeptideSequences(infile, outfile):
    """load ENSEMBL peptide file into database

    This method removes empty sequences (see for example
    transcript:ENSMUST00000151316, ENSMUSP00000118372)

    The created table contains the columns ``protein_id``, ``length``
    and ``sequence``.

    Arguments
    ---------
    infile : string
        ENSEMBL ``.pep.all.fa.gz`` file in :term:`fasta` format
    outfile : string
        filename with logging information. The tablename is
        derived from ``outfile``.
    """
    load_statement = P.build_load_statement(
        P.toTable(outfile),
        options="--add-index=protein_id "
        "--map=protein_id:str")

    statement = """gunzip < %(infile)s
    | perl -p -e 'if (/^>/) { s/ .*// };'
    | python %(scriptsdir)s/fasta2fasta.py
          --method=filter
          --filter-method=min-length=1
    | python %(scriptsdir)s/fasta2table.py
          --section=length
          --section=sequence
    | perl -p -e 's/id/protein_id/'
    | %(load_statement)s
    > %(outfile)s"""

    P.run()
def loadmiRNATranscripts(infile, outfile):
    '''load transcripts from a GFF3 file into the database.

    Arguments
    ---------
    infile : string
        ENSEMBL geneset in :term:`gff3` format.
    outfile : string
        Logfile. The table name is derived from `outfile`.
    '''
    job_memory = PARAMS["job_memory"]

    load_statement = P.build_load_statement(
        P.toTable(outfile),
        options="--allow-empty-file "
        "--header-names=feature,Name")

    statement = '''
    export LANG=en_GB.UTF-8 &&
    zcat %(infile)s
    | cgat gtf2tsv --is-gff3 --attributes-as-columns 2> /dev/null
    | grep -v "#"
    | cut -f3,12
    | %(load_statement)s
    > %(outfile)s'''
    P.run()
def loadRepeats(infile, outfile):
    """load genomic locations of repeats into database.

    This method loads the genomic coordinates (contig, start, end)
    and the repeat name into the database.

    Arguments
    ---------
    infile : string
        Input filename in :term:`gff` with repeat annotations.
    outfile : string
        Output filename with logging information. The table name is
        derived from outfile.
    """
    job_memory = PARAMS["job_memory"]

    load_statement = P.build_load_statement(
        P.toTable(outfile),
        options="--add-index=class "
        "--header-names=contig,start,stop,class")

    statement = """zcat %(infile)s
    | cgat gff2bed --set-name=class
    | grep -v "#"
    | cut -f1,2,3,4
    | %(load_statement)s
    > %(outfile)s"""
    P.run()
def loadPicardGCStats(infiles, outfile):
    '''Merge Picard GC stats into single table and load into SQLite.'''
    tablename = P.toTable(outfile)
    outf = P.getTempFile()
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".gcstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [x for x in open(f, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        outf.write("%s\t%s" % (track, lines[1]))
    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                   | python %(scriptsdir)s/csv2db.py %(csv2db_options)s
                         --add-index=track
                         --table=%(tablename)s
                   > %(outfile)s'''
    P.run()
    os.unlink(tmpfilename)
def numberGenesDetectedFeatureCounts(infile, outfile):
    '''Count the number of genes detected by featureCounts
    at counts > 0 in each sample.'''

    table = P.toTable(infile)
    attach = '''attach "%(ANN_DATABASE)s" as anndb''' % globals()

    statement = '''select distinct h.*, gene_biotype from %(table)s h
                   inner join anndb.gene_info i
                   on h.gene_id = i.gene_id
                ''' % locals()

    melted_df = DB.fetch_DataFrame(statement, DATABASE, attach)

    grouped_df = melted_df.groupby(["gene_biotype", "track"])
    agg_df = grouped_df.agg(
        {"counts": lambda x: np.sum([1 for y in x if y > 0])})
    agg_df.reset_index(inplace=True)

    count_df = pd.pivot_table(agg_df, index="track",
                              values="counts", columns="gene_biotype")
    count_df["total"] = count_df.apply(np.sum, 1)
    count_df["sample_id"] = count_df.index
    count_df.to_csv(outfile, index=False, sep="\t")
def numberGenesDetectedCufflinks(infile, outfile):
    '''Count the number of genes detected at copy number > 0
    in each sample.'''

    table = P.toTable(infile)
    attach = '''attach "%(ANN_DATABASE)s" as anndb''' % globals()

    statement = '''select distinct c.*, gene_biotype from %(table)s c
                   inner join anndb.gene_info i
                   on c.tracking_id = i.gene_id
                ''' % locals()

    df = DB.fetch_DataFrame(statement, DATABASE, attach)

    # snip off the cufflinks replicate field
    df.columns = [x[:-len("_0")] if x.endswith("_0") else x
                  for x in df.columns]

    melted_df = pd.melt(df, id_vars=["tracking_id", "gene_biotype"])

    grouped_df = melted_df.groupby(["gene_biotype", "variable"])
    agg_df = grouped_df.agg(
        {"value": lambda x: np.sum([1 for y in x if y > 0])})
    agg_df.reset_index(inplace=True)

    count_df = pd.pivot_table(agg_df, index="variable",
                              values="value", columns="gene_biotype")
    count_df["total"] = count_df.apply(np.sum, 1)
    count_df["sample_id"] = count_df.index
    count_df.to_csv(outfile, index=False, sep="\t")
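
# Aside: the counting lambda used in the detection functions above is
# equivalent to the vectorised pandas expression ``(x > 0).sum()``. A minimal
# self-contained sketch with toy data (hypothetical, not part of the pipeline):
def _demoDetectedGeneCount():
    import pandas as pd
    toy = pd.DataFrame({"gene_biotype": ["pc", "pc", "linc"],
                        "variable": ["s1", "s1", "s1"],
                        "value": [0.0, 2.5, 1.1]})
    # count entries with value > 0 per (biotype, sample) group
    return toy.groupby(["gene_biotype", "variable"]).agg(
        {"value": lambda x: (x > 0).sum()})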
def loadTranscriptStats(infile, outfile):
    """compute and load transcript properties into database.

    The method calls :doc:`gtf2table` with the following counters:

    * length - gene/exon lengths
    * position - gene position
    * composition-na - gene nucleotide composition

    Arguments
    ---------
    infile : string
        ENSEMBL geneset in :term:`gtf` format.
    outfile : string
        Logfile. The table name is derived from `outfile`.
    """
    load_statement = P.build_load_statement(
        P.toTable(outfile),
        options="--add-index=gene_id "
        "--add-index=transcript_id "
        "--map=gene_id:str")

    statement = """
    gunzip < %(infile)s
    | python %(scriptsdir)s/gtf2table.py
          --log=%(outfile)s.log
          --genome=%(genome_dir)s/%(genome)s
          --reporter=transcripts
          --counter=position
          --counter=length
          --counter=composition-na
    | %(load_statement)s
    > %(outfile)s"""
    P.run()
def loadEditDistances(infile, outfile):
    '''Load distributions of edit distances as output by umi_tools dedup.'''
    load_smt = P.build_load_statement(
        P.toTable(outfile),
        options="-i edit_distance")
    statement = '''sed s/unique/_unique/g %(infile)s
                   | %(load_smt)s
                   > %(outfile)s'''
    P.run()
def numberGenesDetectedCufflinks(infile, outfile):
    '''Count the number of genes detected at copy number > 0
    in each sample.'''

    table = P.toTable(infile)
    attach = '''attach "%(ANN_DATABASE)s" as anndb''' % globals()

    statement = '''select distinct c.*, gene_biotype from %(table)s c
                   inner join anndb.gene_info i
                   on c.tracking_id = i.gene_id
                ''' % locals()

    df = DB.fetch_DataFrame(statement, DATABASE, attach)

    # snip off the cufflinks replicate field
    df.columns = [x[:-len("_0")] if x.endswith("_0") else x
                  for x in df.columns]

    melted_df = pd.melt(df, id_vars=["tracking_id", "gene_biotype"])

    grouped_df = melted_df.groupby(["gene_biotype", "variable"])
    agg_df = grouped_df.agg(
        {"value": lambda x: np.sum([1 for y in x if y > 0])})
    agg_df.reset_index(inplace=True)

    count_df = pd.pivot_table(agg_df, index="variable",
                              values="value", columns="gene_biotype")
    count_df["total"] = count_df.apply(np.sum, 1)
    count_df["sample_id"] = count_df.index
    count_df.to_csv(outfile, index=False, sep="\t")
def loadPicardAlignStats(infiles, outfile):
    '''Merge Picard alignment stats into single table and load into SQLite.'''
    # Join data for all tracks into single file
    outf = open("tophat/tophat.dir/picard_align_stats.tsv", "w")
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".alignstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [x for x in open(f, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))
    outf.close()
    tmpfilename = outf.name

    # Load into database
    tablename = P.toTable(outfile)
    statement = '''cat %(tmpfilename)s
                   | python %(scriptsdir)s/csv2db.py
                         --add-index=track
                         --table=%(tablename)s
                   > %(outfile)s'''
    P.run()
def loadTranscriptInformation(infile, outfile, only_proteincoding=False):
    '''load transcript information from a gtf file.

    *infile* is an ENSEMBL gtf file.
    '''
    table = P.toTable(outfile)

    if only_proteincoding:
        filter_cmd = """python %(scriptsdir)s/gtf2gtf.py
        --method=filter --filter-method=proteincoding""" % PARAMS
    else:
        filter_cmd = "cat"

    statement = '''zcat < %(infile)s
    | awk '$3 == "CDS"'
    | grep "transcript_id"
    | python %(scriptsdir)s/gtf2gtf.py
          --method=sort --sort-order=gene+transcript
    | python %(scriptsdir)s/gtf2tsv.py
          --attributes-as-columns --output-only-attributes -v 0
    | python %(toolsdir)s/csv_cut.py --remove exon_id exon_number
    | %(pipeline_scriptsdir)s/hsort 1 | uniq
    | python %(scriptsdir)s/csv2db.py %(csv2db_options)s
          --add-index=transcript_id
          --add-index=gene_id
          --add-index=protein_id
          --add-index=gene_name
          --map=transcript_name:str
          --map=gene_name:str
          --table=%(table)s
    > %(outfile)s'''
    P.run()
def loadPicardDuplicateStats(infiles, outfile):
    '''Merge Picard duplicate stats into single table and load into SQLite.'''
    # Join data for all tracks into single file
    outf = open('tophat/tophat.dir/dupstats.txt', 'w')
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".dedup.stats")
        statfile = f
        lines = [x for x in open(statfile, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        outf.write("%s\t%s" % (track, lines[1]))
    outf.close()
    tmpfilename = outf.name

    # Load into database
    tablename = P.toTable(outfile)
    statement = '''cat %(tmpfilename)s
                   | python %(scriptsdir)s/csv2db.py
                         --add-index=track
                         --table=%(tablename)s
                   > %(outfile)s'''
    P.run()
def loadCountSingleAndMultiExonLincRNA(infile, outfile):
    '''load the counts for the multi and single exon lincRNA'''
    tablename = P.toTable(outfile.replace("/", "_")) + ".count"
    statement = '''python %(scriptsdir)s/csv2db.py -t %(tablename)s
                   --log=%(outfile)s.log
                   < %(infile)s > %(outfile)s'''
    P.run()
def mergeAndLoad(infiles, outfile, suffix):
    '''load categorical tables (two columns) into a database.

    The tables are merged and entered row-wise.
    '''
    header = ",".join([P.tablequote(P.snip(x, suffix)) for x in infiles])
    if suffix.endswith(".gz"):
        filenames = " ".join(
            ["<( zcat %s | cut -f 1,2 )" % x for x in infiles])
    else:
        filenames = " ".join(
            ["<( cat %s | cut -f 1,2 )" % x for x in infiles])

    tablename = P.toTable(outfile)

    statement = """python %(scriptsdir)s/combine_tables.py
                       --header-names=%(header)s
                       --missing-value=0
                       --ignore-empty
                       %(filenames)s
                   | perl -p -e "s/bin/track/"
                   | python %(scriptsdir)s/table2table.py --transpose
                   | python %(scriptsdir)s/csv2db.py
                       --add-index=track
                       --table=%(tablename)s
                   > %(outfile)s
                """
    P.run()
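
# Usage sketch for mergeAndLoad (hypothetical file names): each input is a
# two-column categorical table; after the transpose every input contributes
# one row, keyed by track, to the merged table.
#
#   mergeAndLoad(["setA.counts.tsv.gz", "setB.counts.tsv.gz"],
#                "merged_counts.load",
#                ".counts.tsv.gz")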
def loadNumberExonsLengthSummaryStats(infile, outfile):
    '''load the table of exon counts and transcript lengths'''
    tablename = P.toTable(outfile.replace("/", "_")) + "_stats"
    statement = '''python %(scriptsdir)s/csv2db.py -t %(tablename)s
                   --log=%(outfile)s.log
                   < %(infile)s > %(outfile)s'''
    P.run()
def exportMotifLocations(infiles, outfile):
    '''export motif locations. There will be a bed-file per motif.

    Overlapping motif matches in different tracks will be merged.
    '''
    dbh = connect()
    cc = dbh.cursor()

    motifs = [x[0] for x in
              cc.execute("SELECT motif FROM motif_info").fetchall()]

    for motif in motifs:
        tmpf = P.getTempFile(".")
        for infile in infiles:
            table = P.toTable(infile)
            track = P.snip(table, "_mast")
            for x in cc.execute(
                    """SELECT contig, start, end, '%(track)s', evalue
                       FROM %(table)s
                       WHERE motif = '%(motif)s' AND start IS NOT NULL"""
                    % locals()):
                tmpf.write("\t".join(map(str, x)) + "\n")
        tmpf.close()

        outfile = os.path.join(
            PARAMS["exportdir"], "motifs", "%s.bed.gz" % motif)
        tmpfname = tmpf.name
        statement = '''mergeBed -i %(tmpfname)s -nms | gzip > %(outfile)s'''
        P.run()

        os.unlink(tmpf.name)
def loadPicardDuplicateStats(infiles, outfile):
    '''Merge Picard duplicate stats into single table and load into SQLite.'''
    # Join data for all tracks into single file
    outf = open('tophat/tophat.dir/dupstats.txt', 'w')
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".dedup.stats")
        statfile = f
        lines = [x for x in open(statfile, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        outf.write("%s\t%s" % (track, lines[1]))
    outf.close()
    tmpfilename = outf.name

    # Load into database
    tablename = P.toTable(outfile)
    statement = '''cat %(tmpfilename)s
                   | python %(scriptsdir)s/csv2db.py
                         --add-index=track
                         --table=%(tablename)s
                   > %(outfile)s'''
    P.run()
def loadSummarizedContextStats(infiles,
                               outfile,
                               suffix=".contextstats.tsv.gz"):
    """merge output from :func:`summarizeTagsWithinContext` and load
    into database.

    Arguments
    ---------
    infiles : list
        List of filenames in :term:`tsv` format. The files should end
        in suffix.
    outfile : string
        Output filename, the table name is derived from `outfile`.
    suffix : string
        Suffix to remove from filename for track name.
    """
    header = ",".join([P.snip(os.path.basename(x), suffix)
                       for x in infiles])
    filenames = " ".join(infiles)

    load_statement = P.build_load_statement(
        P.toTable(outfile),
        options="--add-index=track")

    statement = """cgat combine_tables
                       --header-names=%(header)s
                       --missing-value=0
                       --skip-titles
                       %(filenames)s
                   | perl -p -e "s/bin/track/; s/\?/Q/g"
                   | cgat table2table --transpose
                   | %(load_statement)s
                   > %(outfile)s
                """
    P.run()
def loadSummariseReadsContributingToTranscripts(infile, outfile):
    '''loads the summary of reads contributing to transcripts'''
    tablename = P.toTable(outfile.replace("/", "_"))
    statement = '''cgat csv2db -t %(tablename)s
                   --log=%(outfile)s.log
                   < %(infile)s > %(outfile)s'''
    P.run()
def loadNumberExonsLengthSummaryStats(infile, outfile):
    '''load the table of exon counts and transcript lengths'''
    tablename = P.toTable(outfile.replace("/", "_")) + "_stats"
    statement = '''cgat csv2db -t %(tablename)s
                   --log=%(outfile)s.log
                   < %(infile)s > %(outfile)s'''
    P.run()
def createViewMapping(infile, outfile):
    """create view in database for alignment stats.

    This view aggregates all information on a per-track basis.

    The table is built from the following tracks:
       mapping_stats
       bam_stats
    """
    tablename = P.toTable(outfile)

    # cannot create views across multiple databases, so use a table
    view_type = "TABLE"

    dbhandle = connect()
    Database.executewait(
        dbhandle,
        "DROP %(view_type)s IF EXISTS %(tablename)s" % locals())

    statement = """
    CREATE %(view_type)s %(tablename)s AS
    SELECT * FROM bam_stats AS b
    """
    Database.executewait(dbhandle, statement % locals())
def loadPicardGCStats(infiles, outfile):
    '''Merge Picard GC stats into single table and load into SQLite.'''
    tablename = P.toTable(outfile)
    outf = P.getTempFile()
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".gcstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [x for x in open(f, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        outf.write("%s\t%s" % (track, lines[1]))
    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                   | cgat csv2db %(csv2db_options)s
                         --add-index=track
                         --table=%(tablename)s
                   > %(outfile)s'''
    P.run()
    os.unlink(tmpfilename)
def mergeAndLoad(infiles, outfile, suffix):
    """load categorical tables (two columns) into a database.

    The tables are merged and entered row-wise.
    """
    header = ",".join([P.tablequote(P.snip(x, suffix)) for x in infiles])
    if suffix.endswith(".gz"):
        filenames = " ".join(
            ["<( zcat %s | cut -f 1,2 )" % x for x in infiles])
    else:
        filenames = " ".join(
            ["<( cat %s | cut -f 1,2 )" % x for x in infiles])

    tablename = P.toTable(outfile)

    statement = """python %(scriptsdir)s/combine_tables.py
                       --header-names=%(header)s
                       --missing-value=0
                       --ignore-empty
                       %(filenames)s
                   | perl -p -e "s/bin/track/"
                   | python %(scriptsdir)s/table2table.py --transpose
                   | python %(scriptsdir)s/csv2db.py
                       --add-index=track
                       --table=%(tablename)s
                   > %(outfile)s
                """
    P.run()
def loadPicardDuplicateStats(infiles, outfile):
    '''Merge Picard duplicate stats into single table and load into SQLite.'''
    tablename = P.toTable(outfile)

    outf = open('dupstats.txt', 'w')
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".dedup.bam")
        statfile = P.snip(f, ".bam") + ".dupstats"
        if not os.path.exists(statfile):
            E.warn("File %s missing" % statfile)
            continue
        lines = [x for x in open(statfile, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        outf.write("%s\t%s" % (track, lines[1]))
    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                   | cgat csv2db
                         --add-index=track
                         --table=%(tablename)s
                   > %(outfile)s'''
    P.run()
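
# The "merge Picard stats files, keep one header, prefix each row with its
# track" loop recurs across the loadPicard*Stats functions above. A hedged
# refactoring sketch (hypothetical helper, not part of the pipeline) that
# the variants could delegate to:
def mergePicardStats(infiles, outf, strip_suffix, keep_all_rows=False):
    """Write merged Picard stats to the open file handle *outf*."""
    first = True
    for f in infiles:
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        track = P.snip(os.path.basename(f), strip_suffix)
        lines = [x for x in open(f, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))  # header once
            first = False
        # histogram-style outputs keep every data row, metrics keep one
        rows = lines[1:] if keep_all_rows else lines[1:2]
        for line in rows:
            outf.write("%s\t%s" % (track, line))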
def loadTranscriptStats(infile, outfile):
    '''compute and load transcript properties into database.

    The method calls :doc:`gtf2table` with the following counters:

    * length - gene/exon lengths
    * position - gene position
    * composition-na - gene nucleotide composition

    Arguments
    ---------
    infile : string
        ENSEMBL geneset in :term:`gtf` format.
    outfile : string
        Logfile. The table name is derived from `outfile`.
    '''
    load_statement = P.build_load_statement(
        P.toTable(outfile),
        options="--add-index=gene_id "
        "--add-index=transcript_id "
        "--map=gene_id:str")

    statement = '''
    gunzip < %(infile)s
    | python %(scriptsdir)s/gtf2table.py
          --log=%(outfile)s.log
          --genome=%(genome_dir)s/%(genome)s
          --reporter=transcripts
          --counter=position
          --counter=length
          --counter=composition-na
    | %(load_statement)s
    > %(outfile)s'''
    P.run()
def loadGeneStats(infile, outfile):
    """compute and load gene statistics to database.

    Gene statistics are computed by :doc:`gtf2table` with the
    following counters:

    * length - gene/exon lengths
    * position - gene position
    * composition-na - gene nucleotide composition

    Parameters
    ----------
    infile : string
        A :term:`gtf` file which is output from :meth:`buildGenes`
    outfile : string
        A log file. The table name is derived from `outfile`.
        e.g. bam_stats.load
    """
    load_statement = P.build_load_statement(
        P.toTable(outfile),
        options="--add-index=gene_id "
        "--map=gene_name:str")

    statement = '''
    gunzip < %(infile)s
    | python %(scriptsdir)s/gtf2table.py
          --log=%(outfile)s.log
          --genome=%(genome_dir)s/%(genome)s
          --counter=position
          --counter=length
          --counter=composition-na
    | %(load_statement)s
    > %(outfile)s'''
    P.run()
def filterByCoverage(infiles, outfile):

    fcoverage = PARAMS["coverage_filter"]
    contig_file = infiles[0]
    dbh = sqlite3.connect(
        os.path.join(PARAMS["results_resultsdir"], PARAMS["database"]))
    cc = dbh.cursor()
    contigs = set()
    for infile in infiles[1:]:
        dirsplit = infile.split("/")
        infile = os.path.join(
            PARAMS["results_resultsdir"],
            dirsplit[-2].split(".dir")[0] + "-" + dirsplit[-1])
        tablename = P.toTable(os.path.basename(infile))
        if P.snip(contig_file, ".fa") == P.snip(os.path.basename(infile),
                                                ".coverage.load"):
            statement = """SELECT contig_id, ave
                           FROM (SELECT contig_id, AVG(coverage) AS ave
                                 FROM %s GROUP BY contig_id)
                           WHERE ave > %i""" % (tablename,
                                                PARAMS["coverage_filter"])
            for data in cc.execute(statement).fetchall():
                contigs.add(data[0])
    outf = open(outfile, "w")
    print(contigs)
    for fasta in FastaIterator.iterate(IOTools.openFile(contig_file)):
        identifier = fasta.title.split(" ")[0]
        if identifier in contigs:
            outf.write(">%s\n%s\n" % (identifier, fasta.sequence))
    outf.close()
def loadTranscripts(infile, outfile):
    '''load transcripts from a GTF file into the database.

    The table will be indexed on ``gene_id`` and ``transcript_id``

    Arguments
    ---------
    infile : string
        ENSEMBL geneset in :term:`gtf` format.
    outfile : string
        Logfile. The table name is derived from `outfile`.
    '''
    load_statement = P.build_load_statement(
        P.toTable(outfile),
        options="--add-index=gene_id "
        "--add-index=transcript_id "
        "--allow-empty-file ")

    statement = '''
    gunzip < %(infile)s
    | python %(scriptsdir)s/gtf2tsv.py
    | %(load_statement)s
    > %(outfile)s'''
    P.run()
def loadPicardAlignStats(infiles, outfile):
    '''Merge Picard alignment stats into single table and load into SQLite.'''
    tablename = P.toTable(outfile)
    outf = P.getTempFile()
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".dedup.alignstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [x for x in open(f, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))
    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                   | cgat csv2db
                         --add-index=track
                         --table=%(tablename)s
                   > %(outfile)s'''
    P.run()
    os.unlink(tmpfilename)
def loadGeneStats(infile, outfile):
    """compute and load gene statistics to database.

    Gene statistics are computed by :doc:`gtf2table` with the
    following counters:

    * length - gene/exon lengths
    * position - gene position
    * composition-na - gene nucleotide composition

    Parameters
    ----------
    infile : string
        A :term:`gtf` file which is output from :meth:`buildGenes`
    outfile : string
        A log file. The table name is derived from `outfile`.
        e.g. bam_stats.load
    """
    load_statement = P.build_load_statement(
        P.toTable(outfile),
        options="--add-index=gene_id "
        "--map=gene_name:str")

    statement = '''
    gunzip < %(infile)s
    | python %(scriptsdir)s/gtf2table.py
          --log=%(outfile)s.log
          --genome=%(genome_dir)s/%(genome)s
          --counter=position
          --counter=length
          --counter=composition-na
    | %(load_statement)s
    > %(outfile)s'''
    P.run()
def loadCodingPotential(infile, outfile):
    '''load coding potential annotations into the database.'''
    table = P.toTable(outfile)

    statement = '''
    gunzip < %(infile)s
    | cgat csv2db %(csv2db_options)s
          --allow-empty-file
          --add-index=gene_id
          --map=gene_id:str
          --table=%(table)s
    > %(outfile)s'''
    P.run()

    # set the is_coding flag
    dbhandle = sqlite3.connect(PARAMS["database_name"])
    Database.executewait(
        dbhandle,
        '''ALTER TABLE %(table)s ADD COLUMN is_coding INTEGER''' % locals())
    Database.executewait(
        dbhandle,
        '''UPDATE %(table)s SET is_coding = (result == 'coding')''' % locals())
    dbhandle.commit()
def buildGeneOntology(infile, outfile):
    '''create an output file akin to GO ontology files to be
    used with GO.py
    '''
    table = P.toTable(infile)
    columns = ("cpg", "tata")
    dbh = connect()
    cc = dbh.cursor()

    outf = IOTools.openFile(outfile, "w")
    outf.write("go_type\tgene_id\tgo_id\tdescription\tevidence\n")

    i = 1
    for c in columns:
        cc.execute(
            "SELECT DISTINCT gene_id FROM %(table)s WHERE %(c)s" % locals())
        outf.write("".join(
            ["promotor\t%s\tGO:%07i\twith_%s\tNA\n" % (x[0], i, c)
             for x in cc]))
        i += 1
        cc.execute(
            "SELECT DISTINCT gene_id FROM %(table)s WHERE %(c)s = 0"
            % locals())
        outf.write("".join(
            ["promotor\t%s\tGO:%07i\twithout_%s\tNA\n" % (x[0], i, c)
             for x in cc]))
        i += 1

    outf.close()
def loadTranscriptSummary(infile, outfile):
    '''summarize binding information per transcript.'''

    dbh = connect()

    table = P.toTable(outfile)

    cc = dbh.cursor()
    # SQLite cannot do a full outer join
    cc.execute("""DROP TABLE IF EXISTS %(table)s""" % locals())

    transcripts = [x[0] for x in cc.execute(
        "SELECT DISTINCT(transcript_id) FROM annotations.transcript_info"
    ).fetchall()]

    tmpf = P.getTempFile()

    tables = ("tata", "cpg")
    titles = tables

    vals = []
    # use a loop variable distinct from the output table name above
    for tablename in tables:
        t = set([x[0] for x in cc.execute(
            "SELECT DISTINCT(transcript_id) FROM %(tablename)s"
            % locals()).fetchall()])
        vals.append(t)

    tmpf.write("transcript_id\t%s\n" % "\t".join(titles))

    for transcript_id in transcripts:
        tmpf.write("%s\t%s\n" % (transcript_id, "\t".join(
            [str(int(transcript_id in v)) for v in vals])))

    tmpf.close()

    P.load(tmpf.name, outfile)

    os.unlink(tmpf.name)
def exportMotifLocations(infiles, outfile):
    '''export motif locations. There will be a bed-file per motif.

    Overlapping motif matches in different tracks will be merged.
    '''
    dbh = connect()
    cc = dbh.cursor()

    motifs = [x[0] for x in
              cc.execute("SELECT motif FROM motif_info").fetchall()]

    for motif in motifs:
        tmpf = P.getTempFile(".")
        for infile in infiles:
            table = P.toTable(infile)
            track = P.snip(table, "_mast")
            for x in cc.execute(
                    """SELECT contig, start, end, '%(track)s', evalue
                       FROM %(table)s
                       WHERE motif = '%(motif)s' AND start IS NOT NULL"""
                    % locals()):
                tmpf.write("\t".join(map(str, x)) + "\n")
        tmpf.close()

        outfile = os.path.join(
            PARAMS["exportdir"], "motifs", "%s.bed.gz" % motif)
        tmpfname = tmpf.name
        statement = '''mergeBed -i %(tmpfname)s -nms | gzip > %(outfile)s'''
        P.run()

        os.unlink(tmpf.name)
def loadCountSingleAndMultiExonLincRNA(infile, outfile):
    '''load the counts for the multi and single exon lincRNA'''
    tablename = P.toTable(outfile.replace("/", "_")) + ".count"
    statement = '''cgat csv2db -t %(tablename)s
                   --log=%(outfile)s.log
                   < %(infile)s > %(outfile)s'''
    P.run()
def loadPeptideSequences(infile, outfile):
    '''load ENSEMBL peptide file into database

    This method removes empty sequences (see for example
    transcript:ENSMUST00000151316, ENSMUSP00000118372)

    The created table contains the columns ``protein_id``, ``length``
    and ``sequence``.

    Arguments
    ---------
    infile : string
        ENSEMBL ``.pep.all.fa.gz`` file in :term:`fasta` format
    outfile : string
        filename with logging information. The tablename is
        derived from ``outfile``.
    '''
    load_statement = P.build_load_statement(
        P.toTable(outfile),
        options="--add-index=protein_id "
        "--map=protein_id:str")

    statement = '''gunzip < %(infile)s
    | perl -p -e 'if (/^>/) { s/ .*// };'
    | python %(scriptsdir)s/fasta2fasta.py
          --method=filter
          --filter-method=min-length=1
    | python %(scriptsdir)s/fasta2table.py
          --section=length
          --section=sequence
    | perl -p -e 's/id/protein_id/'
    | %(load_statement)s
    > %(outfile)s'''

    P.run()
def loadAlignmentStats(infiles, outfile):
    '''merge alignment stats into single tables.'''
    tablename = P.toTable(outfile)
    outf = P.getTempFile()
    first = True
    for f in infiles:
        track = P.snip(f, ".bam.stats")
        fn = f + ".alignment_summary_metrics"
        if not os.path.exists(fn):
            E.warn("file %s missing" % fn)
            continue
        lines = [x for x in open(fn, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        outf.write("%s\t%s" % (track, lines[1]))
    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                   | python %(scriptsdir)s/csv2db.py
                         --add-index=track
                         --table=%(tablename)s
                   > %(outfile)s'''
    P.run()

    for suffix, column in (("quality_by_cycle_metrics", "cycle"),
                           ("quality_distribution_metrics", "quality")):
        # some files might be missing - bugs in Picard
        xfiles = [x for x in infiles
                  if os.path.exists("%s.%s" % (x, suffix))]

        header = ",".join([P.snip(x, ".bam.stats") for x in xfiles])
        filenames = " ".join(["%s.%s" % (x, suffix) for x in xfiles])

        tname = "%s_%s" % (tablename, suffix)

        statement = """python %(scriptsdir)s/combine_tables.py
                           --missing-value=0
                           %(filenames)s
                       | python %(scriptsdir)s/csv2db.py
                           --header-names=%(column)s,%(header)s
                           --replace-header
                           --add-index=track
                           --table=%(tname)s
                       >> %(outfile)s
                    """
        P.run()

    os.unlink(tmpfilename)
def loadPicardHistogram(infiles, outfile, suffix, column,
                        pipeline_suffix=".picard_stats",
                        tablename=False):
    '''extract a histogram from a picard output file and load
    it into database.

    Arguments
    ---------
    infiles : string
        Filenames of files with picard metric information. Each file
        corresponds to a different track.
    outfile : string
        Logfile.
    suffix : string
        Suffix to append to table name.
    column : string
        Column name to take from the histogram.
    pipeline_suffix : string
        Suffix to remove from track name.
    tablename : string
        Tablename to use. If unset, the table name will be derived
        from `outfile` and suffix as ``toTable(outfile) + "_" +
        suffix``.
    '''
    if not tablename:
        tablename = "%s_%s" % (P.toTable(outfile), suffix)
        tablename = tablename.replace("_metrics", "_histogram")

    # some files might be missing
    xfiles = [x for x in infiles if os.path.exists("%s.%s" % (x, suffix))]

    if len(xfiles) == 0:
        E.warn("no files for %s" % tablename)
        return

    header = ",".join([P.snip(os.path.basename(x), pipeline_suffix)
                       for x in xfiles])
    filenames = " ".join(["%s.%s" % (x, suffix) for x in xfiles])

    # there might be a variable number of columns in the tables
    # only take the first ignoring the rest
    load_statement = P.build_load_statement(
        tablename,
        options="--add-index=track "
        " --header-names=%s,%s"
        " --allow-empty-file"
        " --replace-header" % (column, header))

    statement = """python %(scriptsdir)s/combine_tables.py
                       --regex-start="## HISTOGRAM"
                       --missing-value=0
                       --take=2
                       %(filenames)s
                   | %(load_statement)s
                   >> %(outfile)s
                """
    P.run()
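
# Usage sketch for loadPicardHistogram (hypothetical arguments): load the
# insert-size histogram for a set of Picard stats files. The suffix and
# column names follow Picard's conventions but are assumptions here, not
# taken from this pipeline:
#
#   loadPicardHistogram(
#       ["sampleA.picard_stats", "sampleB.picard_stats"],
#       "picard_stats.load",
#       suffix="insert_size_metrics",
#       column="insert_size")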
def buildDMRStats(infiles, outfile):
    '''compute differential methylation stats.'''
    tablenames = [P.toTable(x) for x in infiles]
    method = P.snip(outfile, "_stats.tsv")
    PipelineMedip.buildDMRStats(tablenames, method, outfile,
                                dbhandle=connect())
def loadPicardHistogram(infiles, outfile, suffix, column,
                        pipeline_suffix=".picard_stats",
                        tablename=False):
    '''extract a histogram from a picard output file and load
    it into database.

    Arguments
    ---------
    infiles : string
        Filenames of files with picard metric information. Each file
        corresponds to a different track.
    outfile : string
        Logfile.
    suffix : string
        Suffix to append to table name.
    column : string
        Column name to take from the histogram.
    pipeline_suffix : string
        Suffix to remove from track name.
    tablename : string
        Tablename to use. If unset, the table name will be derived
        from `outfile` and suffix as ``toTable(outfile) + "_" +
        suffix``.
    '''
    if not tablename:
        tablename = "%s_%s" % (P.toTable(outfile), suffix)
        tablename = tablename.replace("_metrics", "_histogram")

    # some files might be missing
    xfiles = [x for x in infiles if os.path.exists("%s.%s" % (x, suffix))]

    if len(xfiles) == 0:
        E.warn("no files for %s" % tablename)
        return

    header = ",".join([P.snip(os.path.basename(x), pipeline_suffix)
                       for x in xfiles])
    filenames = " ".join(["%s.%s" % (x, suffix) for x in xfiles])

    # there might be a variable number of columns in the tables
    # only take the first ignoring the rest
    load_statement = P.build_load_statement(
        tablename,
        options="--add-index=track "
        " --header-names=%s,%s"
        " --allow-empty-file"
        " --replace-header" % (column, header))

    statement = """cgat combine_tables
                       --regex-start="## HISTOGRAM"
                       --missing-value=0
                       --take=2
                       %(filenames)s
                   | %(load_statement)s
                   >> %(outfile)s
                """
    P.run()
def loadAlignmentStats(infiles, outfile):
    '''merge alignment stats into single tables.'''
    tablename = P.toTable(outfile)
    outf = P.getTempFile()
    first = True
    for f in infiles:
        track = P.snip(f, ".bam.stats")
        fn = f + ".alignment_summary_metrics"
        if not os.path.exists(fn):
            E.warn("file %s missing" % fn)
            continue
        lines = [x for x in open(fn, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        outf.write("%s\t%s" % (track, lines[1]))
    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                   | python %(scriptsdir)s/csv2db.py
                         --add-index=track
                         --table=%(tablename)s
                   > %(outfile)s'''
    P.run()

    for suffix, column in (("quality_by_cycle_metrics", "cycle"),
                           ("quality_distribution_metrics", "quality")):
        # some files might be missing - bugs in Picard
        xfiles = [x for x in infiles
                  if os.path.exists("%s.%s" % (x, suffix))]

        header = ",".join([P.snip(x, ".bam.stats") for x in xfiles])
        filenames = " ".join(["%s.%s" % (x, suffix) for x in xfiles])

        tname = "%s_%s" % (tablename, suffix)

        statement = """python %(scriptsdir)s/combine_tables.py
                           --missing-value=0
                           %(filenames)s
                       | python %(scriptsdir)s/csv2db.py
                           --header-names=%(column)s,%(header)s
                           --replace-header
                           --add-index=track
                           --table=%(tname)s
                       >> %(outfile)s
                    """
        P.run()

    os.unlink(tmpfilename)
def loadCoveredCpGs(infile, outfile):
    dbh = connect()
    tablename = P.toTable(outfile)

    # note the doubled %% so that scriptsdir survives the local
    # interpolation below and is filled in later by P.run()
    statement = '''cat %(infile)s
                   | python %%(scriptsdir)s/csv2db.py
                         --table %(tablename)s --retry --ignore-empty
                   > %(outfile)s''' % locals()
    P.run()
def load_chunk_annotations(infile, outfile):
    P.load(infile, outfile, "-i gene_id -i exon_id")

    tablename = P.toTable(outfile)
    # add a composite index so joint gene_id/exon_id lookups are fast
    connect().executescript(
        '''DROP INDEX IF EXISTS %(tablename)s_joint;
           CREATE INDEX %(tablename)s_joint
           ON %(tablename)s(gene_id, exon_id)''' % locals())
def loadCoveredCpGs(infile, outfile):
    dbh = connect()
    tablename = P.toTable(outfile)

    statement = '''cat %(infile)s
                   | cgat csv2db
                         --table %(tablename)s --retry --ignore-empty
                   > %(outfile)s''' % locals()
    P.run()