def loadGeneStats(infile, outfile):
    """Compute per-gene statistics and load them into the database.

    The statistics are produced by ``cgat gtf2table`` with these counters:

    * length - gene/exon lengths
    * position - gene position
    * composition-na - gene nucleotide composition

    Parameters
    ----------
    infile : string
        A gzip-compressed :term:`gtf` file, the output of
        :meth:`buildGenes`.
    outfile : string
        A log file. The table name is derived from `outfile`,
        e.g. bam_stats.load
    """
    # The local names ``statement`` and ``load_statement`` must stay as they
    # are: the command template refers to them and P.run() is called without
    # arguments, so it presumably resolves them from this frame - TODO confirm.
    load_statement = P.build_load_statement(
        tablename=P.toTable(outfile),
        options="--add-index=gene_id --map=gene_name:str")

    statement = """
    gunzip < %(infile)s
    | cgat gtf2table
    --log=%(outfile)s.log
    --genome=%(genome_dir)s/%(genome)s
    --counter=position
    --counter=length
    --counter=composition-na
    | %(load_statement)s
    > %(outfile)s"""

    P.run()
def loadPeptideSequences(infile, outfile):
    """Load an ENSEMBL peptide file into the database.

    Empty sequences are removed (see for example
    transcript:ENSMUST00000151316, ENSMUSP00000118372).

    The created table contains the columns ``protein_id``, ``length``
    and ``sequence``.

    Arguments
    ---------
    infile : string
        ENSEMBL ``.pep.all.fa.gz`` file in :term:`fasta` format
    outfile : string
        filename with logging information. The tablename is
        derived from ``outfile``.
    """
    # Fix: the two option literals used to be concatenated without a
    # separator, yielding the single bogus token
    # "--add-protein_id--map=protein_id:str". They are now two options;
    # "--add-protein_id" is taken to be a garbled "--add-index=protein_id",
    # matching the "--add-index=..." options used by the other loaders.
    load_statement = P.build_load_statement(
        P.toTable(outfile),
        options="--add-index=protein_id "
        "--map=protein_id:str")

    # NOTE(review): the perl guard `if ("^>")` tests a constant string and is
    # therefore always true; it looks like /^>/ was intended. The command is
    # left unchanged here - confirm before changing it.
    # P.run() takes no arguments; it presumably picks up ``statement`` and
    # ``load_statement`` from this frame - TODO confirm.
    statement = """gunzip < %(infile)s
    | perl -p -e 'if ("^>") { s/ .*//};'
    | python %(scriptsdir)s/fasta2fasta.py
    --method=filter
    --filter-method=min-length=1
    | python %(scriptsdir)s/fasta2table.py
    --section=length
    --section=sequence
    | perl -p -e 's/id/protein_id/'
    | %(load_statement)s
    > %(outfile)s"""

    P.run()
def loadGeneStats(infile, outfile):
    """Compute per-gene statistics and load them into the database.

    The statistics are produced by the ``gtf2table.py`` script
    (:doc:`gtf2table`) with these counters:

    * length - gene/exon lengths
    * position - gene position
    * composition-na - gene nucleotide composition

    Parameters
    ----------
    infile : string
        A gzip-compressed :term:`gtf` file, the output of
        :meth:`buildGenes`.
    outfile : string
        A log file. The table name is derived from `outfile`,
        e.g. bam_stats.load
    """
    # ``statement`` and ``load_statement`` keep their names on purpose: the
    # template references them and P.run() receives no arguments, so it
    # presumably reads them from this frame - TODO confirm.
    load_statement = P.build_load_statement(
        tablename=P.toTable(outfile),
        options="--add-index=gene_id --map=gene_name:str")

    statement = """
    gunzip < %(infile)s
    | python %(scriptsdir)s/gtf2table.py
    --log=%(outfile)s.log
    --genome=%(genome_dir)s/%(genome)s
    --counter=position
    --counter=length
    --counter=composition-na
    | %(load_statement)s
    > %(outfile)s"""

    P.run()
def loadPeptideSequences(infile, outfile):
    """Load an ENSEMBL peptide file into the database.

    Empty sequences are removed (see for example
    transcript:ENSMUST00000151316, ENSMUSP00000118372).

    The created table contains the columns ``protein_id``, ``length``
    and ``sequence``.

    Arguments
    ---------
    infile : string
        ENSEMBL ``.pep.all.fa.gz`` file in :term:`fasta` format
    outfile : string
        filename with logging information. The tablename is
        derived from ``outfile``.
    """
    # Fix: the two option literals used to be concatenated without a
    # separator, yielding the single bogus token
    # "--add-protein_id--map=protein_id:str". They are now two options;
    # "--add-protein_id" is taken to be a garbled "--add-index=protein_id",
    # matching the "--add-index=..." options used by the other loaders.
    load_statement = P.build_load_statement(
        P.toTable(outfile),
        options="--add-index=protein_id "
        "--map=protein_id:str")

    # NOTE(review): the perl guard `if ("^>")` tests a constant string and is
    # therefore always true; it looks like /^>/ was intended. The command is
    # left unchanged here - confirm before changing it.
    # P.run() takes no arguments; it presumably picks up ``statement`` and
    # ``load_statement`` from this frame - TODO confirm.
    statement = '''gunzip < %(infile)s
    | perl -p -e 'if ("^>") { s/ .*//};'
    | cgat fasta2fasta
    --method=filter
    --filter-method=min-length=1
    | cgat fasta2table
    --section=length
    --section=sequence
    | perl -p -e 's/id/protein_id/'
    | %(load_statement)s
    > %(outfile)s'''

    P.run()
def loadTranscriptStats(infile, outfile):
    '''Compute per-transcript properties and load them into the database.

    Runs the ``gtf2table.py`` script (:doc:`gtf2table`) with the
    ``transcripts`` reporter and these counters:

    * length - gene/exon lengths
    * position - gene position
    * composition-na - gene nucleotide composition

    Arguments
    ---------
    infile : string
        ENSEMBL geneset in :term:`gtf` format (read through ``gunzip``).
    outfile : string
        Logfile. The table name is derived from `outfile`.
    '''
    # Local names are significant: the template references
    # ``load_statement`` and P.run() is called without arguments, so it
    # presumably reads ``statement`` from this frame - TODO confirm.
    load_statement = P.build_load_statement(
        tablename=P.toTable(outfile),
        options=("--add-index=gene_id "
                 "--add-index=transcript_id "
                 "--map=gene_id:str"))

    # The trailing backslashes are Python line continuations inside the
    # string literal, i.e. those line breaks are not part of the command.
    statement = '''
    gunzip < %(infile)s |\
    python %(scriptsdir)s/gtf2table.py \
          --log=%(outfile)s.log \
          --genome=%(genome_dir)s/%(genome)s \
          --reporter=transcripts \
          --counter=position \
          --counter=length \
          --counter=composition-na
    | %(load_statement)s
    > %(outfile)s'''

    P.run()
def loadEditDistances(infile, outfile):
    """Load distributions of edit distances as output by umi_tools dedup.

    Every occurrence of ``unique`` in `infile` is rewritten to ``_unique``
    with ``sed`` before the data is piped into the load command; the
    output of the load command is written to `outfile`.
    """
    # ``load_smt`` and ``statement`` must keep these names: the template
    # references ``load_smt`` and P.run() takes no arguments, so it
    # presumably reads both from this frame - TODO confirm.
    load_smt = P.build_load_statement(
        tablename=P.toTable(outfile),
        options="-i edit_distance")

    statement = """
    sed s/unique/_unique/g %(infile)s
    | %(load_smt)s
    > %(outfile)s
    """

    P.run()
def loadRepeats(infile, outfile):
    '''Load genomic locations of repeats into the database.

    The genomic coordinates (contig, start, stop) and the repeat class
    are taken from the first four columns of the ``cgat gff2bed`` output
    and loaded with an index on ``class``.

    Arguments
    ---------
    infile : string
        Input filename in :term:`gff` with repeat annotations
        (read with ``zcat``).
    outfile : string
        Output filename with logging information. The table name is
        derived from outfile.
    '''
    # Not referenced below; presumably consumed by P.run() as the job's
    # memory request - TODO confirm. Do not rename or remove.
    job_memory = PARAMS["job_memory"]

    load_statement = P.build_load_statement(
        tablename=P.toTable(outfile),
        options="--add-index=class --header-names=contig,start,stop,class")

    statement = '''zcat %(infile)s
    | cgat gff2bed --set-name=class
    | grep -v "#"
    | cut -f1,2,3,4
    | %(load_statement)s
    > %(outfile)s'''

    P.run()
def loadTranscripts(infile, outfile):
    """Load transcripts from a GTF file into the database.

    The GTF file is converted to a table with ``gtf2tsv.py``. The table
    is indexed on ``gene_id`` and ``transcript_id``; an empty input is
    permitted (``--allow-empty-file``).

    Arguments
    ---------
    infile : string
        ENSEMBL geneset in :term:`gtf` format (read through ``gunzip``).
    outfile : string
        Logfile. The table name is derived from `outfile`.
    """
    # Names ``statement``/``load_statement`` are fixed: P.run() takes no
    # arguments and presumably reads them from this frame - TODO confirm.
    # The trailing blank in the options string is kept from the original.
    load_statement = P.build_load_statement(
        tablename=P.toTable(outfile),
        options="--add-index=gene_id --add-index=transcript_id "
        "--allow-empty-file ")

    statement = """
    gunzip < %(infile)s
    | python %(scriptsdir)s/gtf2tsv.py
    | %(load_statement)s
    > %(outfile)s"""

    P.run()
def loadmiRNATranscripts(infile, outfile):
    """Load transcripts from a GFF3 file into the database.

    The file is converted with ``cgat gtf2tsv --is-gff3``; columns 3 and
    12 of that output are loaded under the headers ``feature`` and
    ``Name``. Lines containing ``#`` are dropped.

    Arguments
    ---------
    infile : string
        ENSEMBL geneset in :term:`gff3` format (read with ``zcat``).
    outfile : string
        Logfile. The table name is derived from `outfile`.
    """
    # Not referenced below; presumably consumed by P.run() as the job's
    # memory request - TODO confirm. Do not rename or remove.
    job_memory = PARAMS["job_memory"]

    load_statement = P.build_load_statement(
        tablename=P.toTable(outfile),
        options="--allow-empty-file --header-names=feature,Name")

    statement = """
    export LANG=en_GB.UTF-8 &&
    zcat %(infile)s
    | cgat gtf2tsv --is-gff3 --attributes-as-columns 2> /dev/null
    | grep -v "#"
    | cut -f3,12
    |%(load_statement)s
    > %(outfile)s"""

    P.run()
def loadGO(infile, outfile, tablename):
    '''Import GO results into a single table.

    All ``*.overall`` files in the directory ``<infile>.dir`` are
    concatenated with ``cat_tables.py`` and loaded into `tablename`,
    indexed on ``category`` and ``goid``. If that directory does not
    exist, `outfile` is only touched and nothing is loaded.
    '''
    # ``indir`` is referenced by the command template, so the name is fixed.
    indir = infile + ".dir"

    if os.path.exists(indir):
        load_statement = P.build_load_statement(
            tablename=tablename,
            options="--allow-empty-file --add-index=category "
            "--add-index=goid ")

        # P.run() takes no arguments; it presumably reads ``statement``
        # and the other locals from this frame - TODO confirm.
        statement = """
        python %(toolsdir)s/cat_tables.py %(indir)s/*.overall
        | %(load_statement)s
        > %(outfile)s
        """
        P.run()
    else:
        # no results directory: just create the output marker
        P.touch(outfile)
def loadTranscriptStats(infile, outfile):
    """Compute per-transcript properties and load them into the database.

    Runs ``cgat gtf2table`` (:doc:`gtf2table`) with the ``transcripts``
    reporter and these counters:

    * length - gene/exon lengths
    * position - gene position
    * composition-na - gene nucleotide composition

    Arguments
    ---------
    infile : string
        ENSEMBL geneset in :term:`gtf` format (read through ``gunzip``).
    outfile : string
        Logfile. The table name is derived from `outfile`.
    """
    # Local names are significant: the template references
    # ``load_statement`` and P.run() is called without arguments, so it
    # presumably reads ``statement`` from this frame - TODO confirm.
    load_statement = P.build_load_statement(
        tablename=P.toTable(outfile),
        options=("--add-index=gene_id "
                 "--add-index=transcript_id "
                 "--map=gene_id:str"))

    # The trailing backslashes are Python line continuations inside the
    # string literal, i.e. those line breaks are not part of the command.
    statement = """
    gunzip < %(infile)s |\
    cgat gtf2table \
          --log=%(outfile)s.log \
          --genome=%(genome_dir)s/%(genome)s \
          --reporter=transcripts \
          --counter=position \
          --counter=length \
          --counter=composition-na
    | %(load_statement)s
    > %(outfile)s"""

    P.run()
def loadPicardHistogram(infiles, outfile, suffix, column,
                        pipeline_suffix=".picard_stats",
                        tablename=False):
    """Extract a histogram from picard output files and load it.

    For every track whose ``<infile>.<suffix>`` file exists, the section
    following ``## HISTOGRAM`` is combined with ``combine_tables.py`` and
    the result is loaded into the database. Output of the load command is
    appended to `outfile`.

    Arguments
    ---------
    infiles : string
        Filenames of files with picard metric information. Each file
        corresponds to a different track.
    outfile : string
        Logfile.
    suffix : string
        Suffix to append to table name.
    column : string
        Column name to take from the histogram.
    pipeline_suffix : string
        Suffix to remove from track name.
    tablename : string
        Tablename to use. If unset, the table name will be derived
        from `outfile` and suffix as ``toTable(outfile) + "_" + suffix``,
        with ``_metrics`` replaced by ``_histogram``.
    """
    if not tablename:
        derived = "%s_%s" % (P.toTable(outfile), suffix)
        tablename = derived.replace("_metrics", "_histogram")

    # some files might be missing - keep only tracks with a histogram file
    available = [track for track in infiles
                 if os.path.exists("%s.%s" % (track, suffix))]
    if not available:
        E.warn("no files for %s" % tablename)
        return

    header = ",".join(
        P.snip(os.path.basename(track), pipeline_suffix)
        for track in available)
    # ``filenames`` is referenced by the command template below.
    filenames = " ".join("%s.%s" % (track, suffix) for track in available)

    # there might be a variable number of columns in the tables
    # only take the first ignoring the rest
    load_statement = P.build_load_statement(
        tablename,
        options=("--add-index=track "
                 " --header-names=%s,%s"
                 " --allow-empty-file"
                 " --replace-header") % (column, header))

    # P.run() takes no arguments; it presumably reads ``statement`` and
    # the other locals from this frame - TODO confirm.
    statement = '''python %(scriptsdir)s/combine_tables.py
    --regex-start="## HISTOGRAM"
    --missing-value=0
    --take=2
    %(filenames)s
    | %(load_statement)s
    >> %(outfile)s
    '''

    P.run()
def loadPicardHistogram(infiles, outfile, suffix, column,
                        pipeline_suffix=".picard_stats",
                        tablename=False):
    """Extract a histogram from picard output files and load it.

    For every track whose ``<infile>.<suffix>`` file exists, the section
    following ``## HISTOGRAM`` is combined with ``cgat combine_tables``
    and the result is loaded into the database. Output of the load
    command is appended to `outfile`.

    Arguments
    ---------
    infiles : string
        Filenames of files with picard metric information. Each file
        corresponds to a different track.
    outfile : string
        Logfile.
    suffix : string
        Suffix to append to table name.
    column : string
        Column name to take from the histogram.
    pipeline_suffix : string
        Suffix to remove from track name.
    tablename : string
        Tablename to use. If unset, the table name will be derived
        from `outfile` and suffix as ``toTable(outfile) + "_" + suffix``,
        with ``_metrics`` replaced by ``_histogram``.
    """
    if not tablename:
        derived = "%s_%s" % (P.toTable(outfile), suffix)
        tablename = derived.replace("_metrics", "_histogram")

    # some files might be missing - keep only tracks with a histogram file
    available = [track for track in infiles
                 if os.path.exists("%s.%s" % (track, suffix))]
    if not available:
        E.warn("no files for %s" % tablename)
        return

    header = ",".join(
        P.snip(os.path.basename(track), pipeline_suffix)
        for track in available)
    # ``filenames`` is referenced by the command template below.
    filenames = " ".join("%s.%s" % (track, suffix) for track in available)

    # there might be a variable number of columns in the tables
    # only take the first ignoring the rest
    load_statement = P.build_load_statement(
        tablename,
        options=("--add-index=track "
                 " --header-names=%s,%s"
                 " --allow-empty-file"
                 " --replace-header") % (column, header))

    # P.run() takes no arguments; it presumably reads ``statement`` and
    # the other locals from this frame - TODO confirm.
    statement = '''cgat combine_tables
    --regex-start="## HISTOGRAM"
    --missing-value=0
    --take=2
    %(filenames)s
    | %(load_statement)s
    >> %(outfile)s
    '''

    P.run()
def loadGeneInformation(infile, outfile, only_proteincoding=False):
    """Load gene-related attributes from a :term:`gtf` file into the database.

    This method takes transcript-associated features from an
    :term:`gtf` file and collects the gene-related attributes in the
    9th column of the gtf file, ignoring exon_id, transcript_id,
    transcript_name, protein_id and exon_number.

    Arguments
    ---------
    infile : string
        ENSEMBL geneset in :term:`gtf` format (read with ``zcat``).
    outfile : string
        Output filename, contains logging information. The
        table name is derived from the filename of outfile.
    only_proteincoding : bool
        If True, only consider protein coding genes.
    """
    # Not referenced below; presumably consumed by P.run() as the job's
    # memory request - TODO confirm. Do not rename or remove.
    job_memory = "4G"
    table = P.toTable(outfile)

    # ``filter_cmd`` is spliced into the pipeline below; "cat" is the
    # pass-through used when no filtering is requested.
    if only_proteincoding:
        filter_cmd = (
            "python %(scriptsdir)s/gtf2gtf.py "
            "--method=filter --filter-method=proteincoding" % PARAMS)
    else:
        filter_cmd = "cat"

    # Fix: a blank was missing after "--add-index=gene_name", so the last
    # two options were fused into "--add-index=gene_name--map=gene_name:str".
    load_statement = P.build_load_statement(
        table,
        options="--add-index=gene_id "
        "--add-index=gene_name "
        "--map=gene_name:str")

    # P.run() takes no arguments; it presumably reads ``statement`` and
    # the other locals from this frame - TODO confirm.
    statement = """
    zcat %(infile)s
    | %(filter_cmd)s
    | grep "transcript_id"
    | python %(scriptsdir)s/gtf2gtf.py
    --method=sort --sort-order=gene+transcript
    | python %(scriptsdir)s/gtf2tsv.py
    --attributes-as-columns --output-only-attributes -v 0
    | python %(toolsdir)s/csv_cut.py
    --remove exon_id transcript_id transcript_name protein_id exon_number
    | %(pipeline_scriptsdir)s/hsort 1
    | uniq
    | %(load_statement)s
    > %(outfile)s"""

    P.run()
def loadGeneInformation(infile, outfile, only_proteincoding=False):
    """Load gene-related attributes from a :term:`gtf` file into the database.

    This method takes transcript-associated features from an
    :term:`gtf` file and collects the gene-related attributes in the
    9th column of the gtf file, ignoring exon_id, transcript_id,
    transcript_name, protein_id and exon_number.

    Arguments
    ---------
    infile : string
        ENSEMBL geneset in :term:`gtf` format (read with ``zcat``).
    outfile : string
        Output filename, contains logging information. The
        table name is derived from the filename of outfile.
    only_proteincoding : bool
        If True, only consider protein coding genes.
    """
    # Not referenced below; presumably consumed by P.run() as the job's
    # memory request - TODO confirm. Do not rename or remove.
    job_memory = "4G"
    table = P.toTable(outfile)

    # ``filter_cmd`` is spliced into the pipeline below; "cat" is the
    # pass-through used when no filtering is requested. The former
    # "% PARAMS" was dropped: the template has no placeholders, so the
    # interpolation was a no-op.
    if only_proteincoding:
        filter_cmd = ("cgat gtf2gtf "
                      "--method=filter --filter-method=proteincoding")
    else:
        filter_cmd = "cat"

    # Fix: a blank was missing after "--add-index=gene_name", so the last
    # two options were fused into "--add-index=gene_name--map=gene_name:str".
    load_statement = P.build_load_statement(
        table,
        options="--add-index=gene_id "
        "--add-index=gene_name "
        "--map=gene_name:str")

    # P.run() takes no arguments; it presumably reads ``statement`` and
    # the other locals from this frame - TODO confirm.
    statement = '''
    zcat %(infile)s
    | %(filter_cmd)s
    | grep "transcript_id"
    | cgat gtf2gtf
    --method=sort --sort-order=gene+transcript
    | cgat gtf2tsv
    --attributes-as-columns --output-only-attributes -v 0
    | python %(toolsdir)s/csv_cut.py
    --remove exon_id transcript_id transcript_name protein_id exon_number
    | %(pipeline_scriptsdir)s/hsort 1
    | uniq
    | %(load_statement)s
    > %(outfile)s'''

    P.run()
def loadMotifSequenceComposition(infile, outfile):
    """Compute sequence composition of sequences used for ab-initio search.

    `infile` is fed on stdin to ``fasta2table.py --section=na`` and the
    resulting table is piped into the load command. The table name is
    derived from `outfile`.
    """
    # Names ``statement``/``load_statement`` are fixed: P.run() takes no
    # arguments and presumably reads them from this frame - TODO confirm.
    load_statement = P.build_load_statement(tablename=P.toTable(outfile))

    statement = """
    python %(scriptsdir)s/fasta2table.py
    --section=na
    --log=%(outfile)s
    < %(infile)s
    | %(load_statement)s
    > %(outfile)s"""

    P.run()
def loadTranscriptInformation(infile, outfile, only_proteincoding=False):
    """Load transcript-related attributes from a :term:`gtf` file.

    Only ``CDS`` features that carry a ``transcript_id`` are used. Their
    attributes (9th column of the gtf file) are turned into table
    columns, ignoring exon_id and exon_number, and the de-duplicated
    rows are loaded into the database.

    Arguments
    ---------
    infile : string
        ENSEMBL geneset in :term:`gtf` format (read with ``zcat``).
    outfile : string
        Output filename, contains logging information. The
        table name is derived from the filename of outfile.
    only_proteincoding : bool
        Accepted for interface compatibility.
        NOTE(review): the command below never used the protein-coding
        filter (it selects CDS features with awk instead), so this flag
        currently has no effect - confirm whether that is intended.
    """
    table = P.toTable(outfile)

    # Fix: blanks were missing between the option literals, fusing four
    # options into one token
    # ("--add-index=gene_name--add-index=protein_id--add-index=...").
    load_statement = P.build_load_statement(
        table,
        options="--add-index=gene_id "
        "--add-index=gene_name "
        "--add-index=protein_id "
        "--add-index=transcript_id "
        "--map=gene_name:str")

    # P.run() takes no arguments; it presumably reads ``statement`` and
    # the other locals from this frame - TODO confirm.
    statement = '''zcat < %(infile)s
    | awk '$3 == "CDS"'
    | grep "transcript_id"
    | cgat gtf2gtf
    --method=sort --sort-order=gene+transcript
    | cgat gtf2tsv
    --attributes-as-columns --output-only-attributes -v 0
    | python %(toolsdir)s/csv_cut.py --remove exon_id exon_number
    | %(pipeline_scriptsdir)s/hsort 1
    | uniq
    | %(load_statement)s
    > %(outfile)s'''

    P.run()
def loadPolyphen(infile, outfile):
    """Load polyphen results.

    The gzip-compressed `infile` is cleaned with perl (``o_acc`` is
    renamed to ``protein_id``, runs of blanks and a leading ``#`` are
    removed) and loaded with indices on ``snp_id`` and ``protein_id``.
    The table name is derived from `outfile`.
    """
    # Names ``statement``/``load_statement`` are fixed: P.run() takes no
    # arguments and presumably reads them from this frame - TODO confirm.
    load_statement = P.build_load_statement(
        tablename=P.toTable(outfile),
        options="--add-index=snp_id --add-index=protein_id "
        "--map=effect:str")

    statement = """
    gunzip
    < %(infile)s
    | perl -p -e "s/o_acc/protein_id/; s/ +//g; s/^#//;"
    | %(load_statement)s
    > %(outfile)s
    """

    P.run()
def loadProteinStats(infile, outfile):
    """Compute and load protein sequence properties into the database.

    The method computes amino acid composition, length, and hash
    for each peptide sequence.

    The method calls :doc:`fasta2table` with the following counters:

    * length - protein sequence length
    * hid - protein sequence hash identifier
    * aa - protein sequence composition

    Arguments
    ---------
    infile : string
        Filename of ENSEMBL peptide file in :term:`fasta` format
        (read through ``gunzip``).
    outfile : string
        Logfile. The table name is derived from `outfile`.
    """
    load_statement = P.build_load_statement(
        P.toTable(outfile),
        options="--add-index=protein_id "
        "--map=protein_id:str")

    # the awk statement truncates ids ENSPXXX.1 to ENSPXXX
    # necessary for downstream compatibility (e.g. seleno list)
    #
    # Fix: the template is now a raw string. It contains "\." and "\S",
    # which are invalid escape sequences in an ordinary literal (Python
    # warns about them); the resulting command text is unchanged.
    # P.run() takes no arguments; it presumably reads ``statement`` and
    # ``load_statement`` from this frame - TODO confirm.
    statement = r'''
    gunzip
    < %(infile)s
    | cgat fasta2fasta
    --method=filter
    --filter-method=min-length=1
    | awk 'match($0, /(>ENS[A-Z]+[0-9]+)(\.[0-9])*(.*)/, a) {print a[1], a[3]}
    !/^>/ {print}'
    | cgat fasta2table
    --log=%(outfile)s
    --sequence-type=aa
    --section=length
    --section=hid
    --section=aa
    --regex-identifier="(\S+)"
    | sed "s/^id/protein_id/"
    | %(load_statement)s
    > %(outfile)s'''

    P.run()
def loadPolyphen(infile, outfile):
    """Load polyphen results.

    The gzip-compressed `infile` is cleaned with perl (``o_acc`` is
    renamed to ``protein_id``, runs of blanks and a leading ``#`` are
    removed) and loaded with indices on ``snp_id`` and ``protein_id``.
    The table name is derived from `outfile`.
    """
    # Names ``statement``/``load_statement`` are fixed: P.run() takes no
    # arguments and presumably reads them from this frame - TODO confirm.
    load_statement = P.build_load_statement(
        tablename=P.toTable(outfile),
        options="--add-index=snp_id --add-index=protein_id "
        "--map=effect:str")

    statement = """
    gunzip
    < %(infile)s
    | perl -p -e "s/o_acc/protein_id/; s/ +//g; s/^#//;"
    | %(load_statement)s
    > %(outfile)s
    """

    P.run()
def loadProteinStats(infile, outfile):
    """Compute and load protein sequence properties into the database.

    The method computes amino acid composition, length, and hash
    for each peptide sequence.

    The method calls :doc:`fasta2table` with the following counters:

    * length - protein sequence length
    * hid - protein sequence hash identifier
    * aa - protein sequence composition

    Arguments
    ---------
    infile : string
        Filename of ENSEMBL peptide file in :term:`fasta` format
        (read through ``gunzip``).
    outfile : string
        Logfile. The table name is derived from `outfile`.
    """
    load_statement = P.build_load_statement(
        P.toTable(outfile),
        options="--add-index=protein_id "
        "--map=protein_id:str")

    # The awk step rewrites header lines, keeping the leading
    # ">letters+digits" identifier and dropping an optional ".<digit>"
    # version suffix; sequence lines are passed through unchanged.
    #
    # Fix: the template is now a raw string. It contains "\." and "\S",
    # which are invalid escape sequences in an ordinary literal (Python
    # warns about them); the resulting command text is unchanged.
    # P.run() takes no arguments; it presumably reads ``statement`` and
    # ``load_statement`` from this frame - TODO confirm.
    statement = r'''
    gunzip
    < %(infile)s
    | cgat fasta2fasta
    --method=filter
    --filter-method=min-length=1
    | awk 'match($0, /(>[a-zA-Z]+[0-9]+)(\.[0-9])*(.*)/, a) {print a[1], a[3]}
    !/^>/ {print}'
    | cgat fasta2table
    --log=%(outfile)s
    --sequence-type=aa
    --section=length
    --section=hid
    --section=aa
    --regex-identifier="(\S+)"
    | sed "s/^id/protein_id/"
    | %(load_statement)s
    > %(outfile)s'''

    P.run()
def loadGeneCoordinates(infile, outfile):
    '''Merge transcripts into per-gene genomic coordinates and load them.

    The gzip-compressed `infile` is passed through
    ``gtf2gtf.py --method=merge-transcripts`` and ``gtf2tsv.py``; the
    table is indexed on ``gene_id``. The table name is derived from
    `outfile`.
    '''
    # TS. remove transcript_id column as this is now meaningless
    # (``statement``/``load_statement`` keep their names: P.run() takes no
    # arguments and presumably reads them from this frame - TODO confirm.)
    load_statement = P.build_load_statement(
        tablename=P.toTable(outfile),
        options="--add-index=gene_id --ignore-column=transcript_id "
        "--allow-empty-file ")

    statement = '''
    gunzip < %(infile)s
    | python %(scriptsdir)s/gtf2gtf.py
    --method=merge-transcripts
    | python %(scriptsdir)s/gtf2tsv.py
    | %(load_statement)s
    > %(outfile)s'''

    P.run()
def loadGeneCoordinates(infile, outfile):
    """Merge transcripts into per-gene genomic coordinates and load them.

    The gzip-compressed `infile` is passed through
    ``gtf2gtf.py --method=merge-transcripts`` and ``gtf2tsv.py``; the
    table is indexed on ``gene_id``. The table name is derived from
    `outfile`.
    """
    # TS. remove transcript_id column as this is now meaningless
    # (``statement``/``load_statement`` keep their names: P.run() takes no
    # arguments and presumably reads them from this frame - TODO confirm.)
    load_statement = P.build_load_statement(
        tablename=P.toTable(outfile),
        options="--add-index=gene_id --ignore-column=transcript_id "
        "--allow-empty-file ")

    statement = """
    gunzip < %(infile)s
    | python %(scriptsdir)s/gtf2gtf.py
    --method=merge-transcripts
    | python %(scriptsdir)s/gtf2tsv.py
    | %(load_statement)s
    > %(outfile)s"""

    P.run()
def loadProteinStats(infile, outfile):
    """Compute and load protein sequence properties into the database.

    The method computes amino acid composition, length, and hash
    for each peptide sequence.

    The method calls :doc:`fasta2table` with the following counters:

    * length - protein sequence length
    * hid - protein sequence hash identifier
    * aa - protein sequence composition

    Arguments
    ---------
    infile : string
        Filename of ENSEMBL peptide file in :term:`fasta` format
        (read through ``gunzip``).
    outfile : string
        Logfile. The table name is derived from `outfile`.
    """
    load_statement = P.build_load_statement(
        P.toTable(outfile),
        options="--add-index=protein_id "
        "--map=protein_id:str")

    # Fix: the template is now a raw string. It contains "\S", which is
    # an invalid escape sequence in an ordinary literal (Python warns
    # about it); the resulting command text is unchanged.
    # P.run() takes no arguments; it presumably reads ``statement`` and
    # ``load_statement`` from this frame - TODO confirm.
    statement = r'''
    gunzip
    < %(infile)s
    | python %(scriptsdir)s/fasta2fasta.py
    --method=filter
    --filter-method=min-length=1
    | python %(scriptsdir)s/fasta2table.py
    --log=%(outfile)s
    --sequence-type=aa
    --section=length
    --section=hid
    --section=aa
    --regex-identifier="(\S+)"
    | sed "s/^id/protein_id/"
    | %(load_statement)s
    > %(outfile)s'''

    P.run()
def loadProteinStats(infile, outfile):
    """Compute and load protein sequence properties into the database.

    The method computes amino acid composition, length, and hash
    for each peptide sequence.

    The method calls :doc:`fasta2table` with the following counters:

    * length - protein sequence length
    * hid - protein sequence hash identifier
    * aa - protein sequence composition

    Arguments
    ---------
    infile : string
        Filename of ENSEMBL peptide file in :term:`fasta` format
        (read through ``gunzip``).
    outfile : string
        Logfile. The table name is derived from `outfile`.
    """
    load_statement = P.build_load_statement(
        P.toTable(outfile),
        options="--add-index=protein_id " "--map=protein_id:str"
    )

    # Fix: the template is now a raw string. It contains "\S", which is
    # an invalid escape sequence in an ordinary literal (Python warns
    # about it); the resulting command text is unchanged.
    # P.run() takes no arguments; it presumably reads ``statement`` and
    # ``load_statement`` from this frame - TODO confirm.
    statement = r"""
    gunzip
    < %(infile)s
    | python %(scriptsdir)s/fasta2fasta.py
    --method=filter
    --filter-method=min-length=1
    | python %(scriptsdir)s/fasta2table.py
    --log=%(outfile)s
    --sequence-type=aa
    --section=length
    --section=hid
    --section=aa
    --regex-identifier="(\S+)"
    | sed "s/^id/protein_id/"
    | %(load_statement)s
    > %(outfile)s"""

    P.run()
def loadTranscript2Gene(infile, outfile):
    '''Build a transcript-to-gene map from a gtf file and load it.

    The map is produced by ``gtf2tsv.py --output-map=transcript2gene``
    and loaded with indices on ``gene_id`` and ``transcript_id``.

    Arguments
    ---------
    infile : string
        ENSEMBL geneset in :term:`gtf` format (read through ``gunzip``).
    outfile : string
        Logfile. The table name is derived from `outfile`.
    '''
    # Names ``statement``/``load_statement`` are fixed: P.run() takes no
    # arguments and presumably reads them from this frame - TODO confirm.
    # The trailing blank in the options string is kept from the original.
    load_statement = P.build_load_statement(
        tablename=P.toTable(outfile),
        options="--add-index=gene_id --add-index=transcript_id ")

    statement = '''
    gunzip < %(infile)s
    | python %(scriptsdir)s/gtf2tsv.py
    --output-map=transcript2gene -v 0
    | %(load_statement)s
    > %(outfile)s'''

    P.run()
def loadGeneCoordinates(infile, outfile):
    """Merge transcripts into per-gene genomic coordinates and load them.

    The gzip-compressed `infile` is passed through
    ``cgat gtf2gtf --method=merge-transcripts`` and ``cgat gtf2tsv``;
    the table is indexed on ``gene_id``. The table name is derived from
    `outfile`.
    """
    # TS. remove transcript_id column as this is now meaningless
    # (``statement``/``load_statement`` keep their names: P.run() takes no
    # arguments and presumably reads them from this frame - TODO confirm.)
    load_statement = P.build_load_statement(
        tablename=P.toTable(outfile),
        options="--add-index=gene_id --ignore-column=transcript_id "
        "--allow-empty-file ")

    statement = """
    gunzip < %(infile)s
    | cgat gtf2gtf
    --method=merge-transcripts
    | cgat gtf2tsv
    | %(load_statement)s
    > %(outfile)s"""

    P.run()
def loadDapars(infiles, outfile):
    """Munge the DaPars output and load it into the database.

    The per-track DaPars tables are concatenated with
    ``combine_tables.py``; ``|`` characters are turned into tabs and the
    ``Gene`` header is expanded to ``transcript_id``, ``gene_id``,
    ``chrom`` and ``strand`` so that transcript and gene ids end up in
    separate columns. The table is indexed on ``track``, ``gene_id``
    and ``transcript_id``.
    """
    # Names are fixed: the template references ``infiles`` and
    # ``load_statement``, and P.run() takes no arguments, so it presumably
    # reads them from this frame - TODO confirm.
    load_statement = P.build_load_statement(
        tablename=P.toTable(outfile),
        options="-i track -i gene_id -i transcript_id")

    # the template expects a blank-separated list of filenames
    infiles = " ".join(infiles)

    statement = """python %(scriptsdir)s/combine_tables.py
    --cat=track
    --use-file-prefix
    --regex-filename='dapars_out.dir/(.+)/dapars_out'
    %(infiles)s -L %(outfile)s
    | sed 's/[|]/\\t/g'
    | sed '1!b;s/Gene/transcript_id\\tgene_id\\tchrom\\tstrand/'
    | %(load_statement)s
    > %(outfile)s"""

    P.run()
def loadTranscript2Gene(infile, outfile):
    """Build a transcript-to-gene map from a gtf file and load it.

    The map is produced by ``gtf2tsv.py --output-map=transcript2gene``
    and loaded with indices on ``gene_id`` and ``transcript_id``.

    Arguments
    ---------
    infile : string
        ENSEMBL geneset in :term:`gtf` format (read through ``gunzip``).
    outfile : string
        Logfile. The table name is derived from `outfile`.
    """
    # Names ``statement``/``load_statement`` are fixed: P.run() takes no
    # arguments and presumably reads them from this frame - TODO confirm.
    # The trailing blank in the options string is kept from the original.
    load_statement = P.build_load_statement(
        tablename=P.toTable(outfile),
        options="--add-index=gene_id --add-index=transcript_id ")

    statement = """
    gunzip < %(infile)s
    | python %(scriptsdir)s/gtf2tsv.py
    --output-map=transcript2gene -v 0
    | %(load_statement)s
    > %(outfile)s"""

    P.run()
def loadmiRNATranscripts(infile, outfile):
    """Load transcripts from a GFF3 file into the database.

    The file is converted with ``cgat gtf2tsv --is-gff3``; columns 3 and
    12 of that output are loaded under the headers ``feature`` and
    ``Name``. Lines containing ``#`` are dropped.

    Arguments
    ---------
    infile : string
        ENSEMBL geneset in :term:`gff3` format (read with ``zcat``).
    outfile : string
        Logfile. The table name is derived from `outfile`.
    """
    # Names ``statement``/``load_statement`` are fixed: P.run() takes no
    # arguments and presumably reads them from this frame - TODO confirm.
    load_statement = P.build_load_statement(
        tablename=P.toTable(outfile),
        options="--allow-empty-file --header-names=feature,Name")

    statement = """
    export LANG=en_GB.UTF-8 &&
    zcat %(infile)s
    | cgat gtf2tsv --is-gff3 --attributes-as-columns 2> /dev/null
    | grep -v "#"
    | cut -f3,12
    |%(load_statement)s
    > %(outfile)s"""

    P.run()
def loadBigWigStats(infiles, outfile):
    """Merge and load the bigwig summary for all wiggle files.

    ``bigWigInfo`` is run on every input file (via process
    substitution), the outputs are merged with ``cgat combine_tables``,
    transposed, and loaded into the database indexed on ``track``.

    Parameters
    ----------
    infiles : list
        Input filenames in :term:`bigwig` format
    outfile : string
        Output filename, the table name is derived from `outfile`.
    """
    # One "<( ... )" process substitution per file; perl turns the first
    # ":" of each line into a tab and strips blanks and commas.
    # ``data`` and ``headers`` are referenced by the template below.
    data = " ".join(
        '<( bigWigInfo %s | perl -p -e "s/:/\\t/; s/ //g; s/,//g")' % x
        for x in infiles)
    # track names: file basenames without the ".bw" suffix
    headers = ",".join(P.snip(os.path.basename(x), ".bw") for x in infiles)

    load_statement = P.build_load_statement(
        tablename=P.toTable(outfile),
        options="--add-index=track")

    # P.run() takes no arguments; it presumably reads ``statement`` and
    # the other locals from this frame - TODO confirm.
    statement = """cgat combine_tables
    --header-names=%(headers)s
    --skip-titles
    --missing-value=0
    --ignore-empty
    %(data)s
    | perl -p -e "s/bin/track/"
    | cgat table2table --transpose
    | %(load_statement)s
    > %(outfile)s
    """

    P.run()
def loadSummarizedContextStats(infiles,
                               outfile,
                               suffix=".contextstats.tsv.gz"):
    """Merge output from :func:`summarizeTagsWithinContex` and load it
    into the database.

    The tables are merged with ``cgat combine_tables``, ``bin`` is
    renamed to ``track``, literal ``?`` characters are replaced by ``Q``,
    and the transposed table is loaded indexed on ``track``.

    Arguments
    ---------
    infiles : list
        List of filenames in :term:`tsv` format. The files should end
        in suffix.
    outfile : string
        Output filename, the table name is derived from `outfile`.
    suffix : string
        Suffix to remove from filename for track name.
    """
    # ``header`` and ``filenames`` are referenced by the template below.
    header = ",".join(P.snip(os.path.basename(x), suffix) for x in infiles)
    filenames = " ".join(infiles)

    load_statement = P.build_load_statement(
        P.toTable(outfile),
        options="--add-index=track")

    # Fix: the template is now a raw string. It contains "\?", which is
    # an invalid escape sequence in an ordinary literal (Python warns
    # about it); the resulting command text is unchanged.
    # P.run() takes no arguments; it presumably reads ``statement`` and
    # the other locals from this frame - TODO confirm.
    statement = r"""cgat combine_tables
    --header-names=%(header)s
    --missing-value=0
    --skip-titles
    %(filenames)s
    | perl -p -e "s/bin/track/; s/\?/Q/g"
    | cgat table2table --transpose
    | %(load_statement)s
    > %(outfile)s
    """

    P.run()
def loadTranscripts(infile, outfile):
    """Load transcripts from a GTF file into the database.

    The GTF file is converted to a table with ``gtf2tsv.py``. The table
    is indexed on ``gene_id`` and ``transcript_id``; an empty input is
    permitted (``--allow-empty-file``).

    Arguments
    ---------
    infile : string
        ENSEMBL geneset in :term:`gtf` format (read through ``gunzip``).
    outfile : string
        Logfile. The table name is derived from `outfile`.
    """
    # Names ``statement``/``load_statement`` are fixed: P.run() takes no
    # arguments and presumably reads them from this frame - TODO confirm.
    # The trailing blank in the options string is kept from the original.
    load_statement = P.build_load_statement(
        tablename=P.toTable(outfile),
        options="--add-index=gene_id --add-index=transcript_id "
        "--allow-empty-file ")

    statement = """
    gunzip < %(infile)s
    | python %(scriptsdir)s/gtf2tsv.py
    | %(load_statement)s
    > %(outfile)s"""

    P.run()
def loadHypergeometricAnalysis(infile, outfile):
    """Load GO (hypergeometric analysis) results.

    Tables loaded, with ``<track>`` derived from `outfile`:

    * ``hypergeometric_<track>_summary`` - from `infile`
    * ``hypergeometric_<track>_<section>`` - for the sections
      ``results``, ``parameters`` and ``withgenes``, concatenated from
      ``hypergeometric.dir/<track>.tsv.dir/*.<section>``
    * ``hypergeometric_<track>_<ontology>_l2fold`` / ``_l10pvalue`` /
      ``_l10qvalue`` - for every ontology found in the summary table,
      read from ``<infile>.dir``
    """
    track = P.toTable(outfile)
    tablename = 'hypergeometric_%s_summary' % track
    P.load(infile, outfile, tablename=tablename)

    dbh = connect()
    ontologies = [
        x[0] for x in Database.executewait(
            dbh,
            '''SELECT DISTINCT ontology FROM %s''' % tablename).fetchall()
    ]

    # NOTE(review): ``genelists`` is not used below. The query is kept so
    # that behaviour is unchanged (it still fails if the summary table has
    # no ``genelist`` column) - confirm whether it can be dropped.
    genelists = [
        x[0] for x in Database.executewait(
            dbh,
            '''SELECT DISTINCT genelist FROM %s''' % tablename).fetchall()
    ]

    # output files from runGO.py
    sections = ('results', 'parameters', 'withgenes')

    for section in sections:
        tablename = 'hypergeometric_%s_%s' % (track, section)
        load_statement = P.build_load_statement(tablename=tablename)

        # Fix: the template is now a raw string. It contains "\S", which
        # is an invalid escape sequence in an ordinary literal (Python
        # warns about it); the resulting command text is unchanged.
        # P.run() takes no arguments; it presumably reads ``statement``,
        # ``track``, ``section`` and ``load_statement`` from this frame
        # - TODO confirm.
        statement = r'''
        python %(scriptsdir)s/combine_tables.py
        --cat=track
        --regex-filename="hypergeometric.dir/%(track)s.tsv.dir/(\S+).%(section)s"
        hypergeometric.dir/%(track)s.tsv.dir/*.%(section)s
        | %(load_statement)s
        >> %(outfile)s'''
        P.run()

    for ontology in ontologies:

        fn = os.path.join(infile + ".dir", "all_alldesc.%s.l2fold" % ontology)

        # Only the l2fold file is checked; when it is missing the whole
        # ontology is skipped, otherwise all three files are loaded.
        if not os.path.exists(fn):
            E.warn("file %s does not exist" % fn)
            continue

        P.load(fn,
               outfile,
               tablename='hypergeometric_%s_%s_l2fold' % (track, ontology),
               options='--allow-empty-file')

        fn = os.path.join(infile + ".dir",
                          "all_alldesc.%s.l10pvalue" % ontology)

        P.load(fn,
               outfile,
               tablename='hypergeometric_%s_%s_l10pvalue' % (track, ontology),
               options='--allow-empty-file')

        fn = os.path.join(infile + ".dir",
                          "all_alldesc.%s.l10qvalue" % ontology)

        P.load(fn,
               outfile,
               tablename='hypergeometric_%s_%s_l10qvalue' % (track, ontology),
               options='--allow-empty-file')
def loadBAMStats(infiles, outfile):
    """Load output of :func:`buildBAMStats` into the database.

    Three kinds of tables are loaded:

    * the summary table (columns 1 and 2 of every input file, merged
      and transposed, indexed on ``track``),
    * ``<table>_nm`` and ``<table>_nh`` from the ``.nm`` / ``.nh``
      companion files,
    * ``<table>_mapq`` from the ``.mapq`` companion files.

    Arguments
    ---------
    infiles : string
        Input files, output from :func:`buildBAMStats`.
    outfile : string
        Logfile. The table name will be derived from `outfile`.
    """
    # Names ``header``, ``filenames``, ``suffix``, ``load_statement`` and
    # ``statement`` are fixed: the templates reference them and P.run()
    # takes no arguments, so it presumably reads them from this frame
    # - TODO confirm.
    header = ",".join(
        P.snip(os.path.basename(x), ".readstats") for x in infiles)
    # summary: only the first two columns of each file are used
    filenames = " ".join("<( cut -f 1,2 < %s)" % x for x in infiles)
    tablename = P.toTable(outfile)

    E.info("loading bam stats - summary")
    load_statement = P.build_load_statement(
        tablename,
        options="--add-index=track " " --allow-empty-file")
    statement = '''cgat combine_tables
    --header-names=%(header)s
    --missing-value=0
    --ignore-empty
    %(filenames)s
    | perl -p -e "s/bin/track/"
    | cgat table2table --transpose
    | %(load_statement)s
    > %(outfile)s'''
    P.run()

    for suffix in ("nm", "nh"):
        E.info("loading bam stats - %s" % suffix)
        filenames = " ".join("%s.%s" % (x, suffix) for x in infiles)
        load_statement = P.build_load_statement(
            "%s_%s" % (tablename, suffix),
            options="--allow-empty-file")
        statement = '''cgat combine_tables
        --header-names=%(header)s
        --skip-titles
        --missing-value=0
        --ignore-empty
        %(filenames)s
        | perl -p -e "s/bin/%(suffix)s/"
        | %(load_statement)s
        >> %(outfile)s
        '''
        P.run()

    # load mapping qualities, there are two columns per row
    # 'all_reads' and 'filtered_reads'
    # Here, only filtered_reads are used (--take=3)
    suffix = "mapq"
    E.info("loading bam stats - %s" % suffix)
    filenames = " ".join("%s.%s" % (x, suffix) for x in infiles)
    load_statement = P.build_load_statement(
        "%s_%s" % (tablename, suffix),
        options=" --allow-empty-file")
    statement = '''cgat combine_tables
    --header-names=%(header)s
    --skip-titles
    --missing-value=0
    --ignore-empty
    --take=3
    %(filenames)s
    | perl -p -e "s/bin/%(suffix)s/"
    | %(load_statement)s
    >> %(outfile)s
    '''
    P.run()
def loadHypergeometricAnalysis(infile, outfile):
    """Load GO (hypergeometric analysis) results.

    Tables loaded, with ``<track>`` derived from `outfile`:

    * ``hypergeometric_<track>_summary`` - from `infile`
    * ``hypergeometric_<track>_<section>`` - for the sections
      ``results``, ``parameters`` and ``withgenes``, concatenated from
      ``hypergeometric.dir/<track>.tsv.dir/*.<section>``
    * ``hypergeometric_<track>_<ontology>_l2fold`` / ``_l10pvalue`` /
      ``_l10qvalue`` - for every ontology found in the summary table,
      read from ``<infile>.dir``
    """
    track = P.toTable(outfile)
    tablename = 'hypergeometric_%s_summary' % track
    P.load(infile, outfile, tablename=tablename)

    dbh = connect()
    ontologies = [x[0] for x in Database.executewait(
        dbh,
        '''SELECT DISTINCT ontology FROM %s''' % tablename).fetchall()]

    # NOTE(review): ``genelists`` is not used below. The query is kept so
    # that behaviour is unchanged (it still fails if the summary table has
    # no ``genelist`` column) - confirm whether it can be dropped.
    genelists = [x[0] for x in Database.executewait(
        dbh,
        '''SELECT DISTINCT genelist FROM %s''' % tablename).fetchall()]

    # output files from runGO.py
    sections = ('results', 'parameters', 'withgenes')

    for section in sections:
        tablename = 'hypergeometric_%s_%s' % (track, section)
        load_statement = P.build_load_statement(
            tablename=tablename)

        # Fix: the template is now a raw string. It contains "\S", which
        # is an invalid escape sequence in an ordinary literal (Python
        # warns about it); the resulting command text is unchanged.
        # P.run() takes no arguments; it presumably reads ``statement``,
        # ``track``, ``section`` and ``load_statement`` from this frame
        # - TODO confirm.
        statement = r'''
        python %(scriptsdir)s/combine_tables.py
        --cat=track
        --regex-filename="hypergeometric.dir/%(track)s.tsv.dir/(\S+).%(section)s"
        hypergeometric.dir/%(track)s.tsv.dir/*.%(section)s
        | %(load_statement)s
        >> %(outfile)s'''
        P.run()

    for ontology in ontologies:

        fn = os.path.join(infile + ".dir", "all_alldesc.%s.l2fold" % ontology)

        # Only the l2fold file is checked; when it is missing the whole
        # ontology is skipped, otherwise all three files are loaded.
        if not os.path.exists(fn):
            E.warn("file %s does not exist" % fn)
            continue

        P.load(fn,
               outfile,
               tablename='hypergeometric_%s_%s_l2fold' % (track, ontology),
               options='--allow-empty-file')

        fn = os.path.join(
            infile + ".dir", "all_alldesc.%s.l10pvalue" % ontology)

        P.load(fn,
               outfile,
               tablename='hypergeometric_%s_%s_l10pvalue' % (track, ontology),
               options='--allow-empty-file')

        fn = os.path.join(
            infile + ".dir", "all_alldesc.%s.l10qvalue" % ontology)

        P.load(fn,
               outfile,
               tablename='hypergeometric_%s_%s_l10qvalue' % (track, ontology),
               options='--allow-empty-file')