def loadSummarizedContextStats(infiles,
                               outfile,
                               suffix=".contextstats.tsv.gz"):
    """merge output from :func:`summarizeTagsWithinContext` and load
    into database.

    Arguments
    ---------
    infiles : list
        List of filenames in :term:`tsv` format. The files should end
        in `suffix`.
    outfile : string
        Output filename, the table name is derived from `outfile`.
    suffix : string
        Suffix to remove from filename for track name.

    """

    header = ",".join([P.snip(os.path.basename(x), suffix)
                       for x in infiles])
    filenames = " ".join(infiles)

    load_statement = P.build_load_statement(
        P.to_table(outfile),
        options="--add-index=track")

    statement = """cgat combine_tables
    --header-names=%(header)s
    --missing-value=0
    --skip-titles
    %(filenames)s
    | perl -p -e "s/bin/track/; s/\?/Q/g"
    | cgat table2table --transpose
    | %(load_statement)s
    > %(outfile)s
    """
    P.run(statement)


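# Usage sketch (illustrative only): in a ruffus-based pipeline this loader is
# typically wired up as a merge task over the per-track context-stats files.
# The decorator, task name and output file name below are assumptions, not
# part of this module.
#
#   @merge(summarizeTagsWithinContext, "context_stats.load")
#   def loadContextStats(infiles, outfile):
#       loadSummarizedContextStats(infiles, outfile)

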
def loadRepeats(infile, outfile):
    """load genomic locations of repeats into database.

    This method loads the genomic coordinates (contig, start, end)
    and the repeat name into the database.

    Arguments
    ---------
    infile : string
        Input filename in :term:`gff` with repeat annotations.
    outfile : string
        Output filename with logging information. The table name is
        derived from `outfile`.

    """
    load_statement = P.build_load_statement(
        P.to_table(outfile),
        options="--add-index=class "
        "--header-names=contig,start,stop,class")

    statement = """zcat %(infile)s
    | cgat gff2bed --set-name=class
    | grep -v "#"
    | cut -f1,2,3,4
    | %(load_statement)s
    > %(outfile)s"""
    P.run(statement, job_memory=PARAMS["job_memory"])


def loadGO(infile, outfile, tablename):
    """import GO results into the database.

    This method concatenates all results from a GO analysis and
    uploads them into a single table.

    """
    indir = infile + ".dir"

    if not os.path.exists(indir):
        iotools.touch_file(outfile)
        return

    load_statement = P.build_load_statement(
        tablename=tablename,
        options="--allow-empty-file "
        "--add-index=category "
        "--add-index=goid ")

    statement = '''
    python %(toolsdir)s/cat_tables.py %(indir)s/*.overall
    | %(load_statement)s
    > %(outfile)s
    '''
    P.run(statement)


def loadGeneInformation(infile, outfile, only_proteincoding=False,
                        job_memory="4G"):
    '''load gene-related attributes from :term:`gtf` file into database.

    This method takes transcript-associated features from an
    :term:`gtf` file and collects the gene-related attributes in the
    9th column of the gtf file, ignoring exon_id, transcript_id,
    transcript_name, protein_id and exon_number.

    Arguments
    ---------
    infile : string
        ENSEMBL geneset in :term:`gtf` format.
    outfile : string
        Output filename, contains logging information. The
        table name is derived from the filename of outfile.
    only_proteincoding : bool
        If True, only consider protein coding genes.
    job_memory : string
        Memory to allocate for the job.

    '''
    table = P.to_table(outfile)

    if only_proteincoding:
        filter_cmd = ("cgat gtf2gtf "
                      "--method=filter --filter-method=proteincoding")
    else:
        filter_cmd = "cat"

    load_statement = P.build_load_statement(
        table,
        options="--add-index=gene_id "
        "--add-index=gene_name "
        "--map=gene_name:str")

    statement = '''
    zcat %(infile)s
    | %(filter_cmd)s
    | grep "transcript_id"
    | cgat gtf2gtf --method=sort --sort-order=gene+transcript
    | cgat gtf2tsv --attributes-as-columns --output-only-attributes -v 0
    | cgat csv-cut
      --remove exon_id transcript_id transcript_name protein_id exon_number
    | (read h; echo "$h"; sort )
    | uniq
    | %(load_statement)s
    > %(outfile)s'''

    P.run(statement, job_memory=job_memory)


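# Usage sketch (illustrative only): loading gene attributes restricted to
# protein-coding genes. The file names below are assumptions.
#
#   loadGeneInformation("geneset.gtf.gz",
#                       "gene_info.load",
#                       only_proteincoding=True,
#                       job_memory="8G")

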
def loadMotifSequenceComposition(infile, outfile):
    '''compute the sequence composition of the sequences used for the
    ab-initio motif search and load it into the database.'''

    load_statement = P.build_load_statement(
        P.to_table(outfile))

    statement = '''
    cgat fasta2table
    --section=na
    --log=%(outfile)s
    < %(infile)s
    | %(load_statement)s
    > %(outfile)s'''

    P.run(statement)


def loadFimo(infile, outfile):
    '''load FIMO results into the database.'''

    to_cluster = True

    tablename = os.path.basename(outfile).replace(
        ".load", "").replace(".", "_")
    options = ('-H "pattern_name,sequence_name,start,stop,strand,'
               'score,p_value,q_value,matched_sequence" ')

    statement = []
    statement.append(f'''cat {infile} | ''')
    statement.append(
        P.build_load_statement(tablename, options=options, retry=True))
    statement.append(f''' > {outfile}''')
    statement = ' '.join(statement)

    P.run(statement)


def loadPolyphen(infile, outfile):
    '''load polyphen results.'''

    load_statement = P.build_load_statement(
        P.to_table(outfile),
        options="--add-index=snp_id "
        "--add-index=protein_id "
        "--map=effect:str")

    statement = '''
    gunzip
    < %(infile)s
    | perl -p -e "s/o_acc/protein_id/; s/ +//g; s/^#//;"
    | %(load_statement)s
    > %(outfile)s
    '''
    P.run(statement)


def loadDremeTomTom(infile, outfile):
    '''load dreme tomtom results'''

    to_cluster = True

    tablename = os.path.basename(outfile).replace(
        ".load", "").replace(".", "_")
    options = ('-H "query_id,target_id,optimal_offset,p_value,e_value,'
               'q_value,overlap,query_consensus,targe_consensus,orientation" ')

    statement = []
    statement.append('''cat %(infile)s | ''')
    statement.append(
        P.build_load_statement(tablename, options=options, retry=True))
    statement.append(''' > %(outfile)s''')
    statement = ' '.join(statement)

    P.run(statement)


def loadPeaks(infile, outfile):
    '''Load input peaks to merge w/ MAST results by peak_id'''

    tablename = os.path.basename(outfile).replace(
        ".load", "").replace(".", "_")
    options = '-H "contig,start,end,peak_id,score" '

    statement = []
    statement.append(f'''cat {infile} | cut -f1-5 - |''')
    statement.append(
        P.build_load_statement(tablename, options=options, retry=True))
    statement.append(f''' > {outfile}''')
    statement = ' '.join(statement)

    to_cluster = True

    P.run(statement)


def loadMemeTomTom(infile, outfile):
    '''load meme tomtom results'''

    # cgat default is not to submit P.load() jobs to cluster
    # this is hardcoded in P.load()
    tablename = os.path.basename(outfile).replace(
        ".load", "").replace(".", "_")
    options = ('-H "query_id,target_id,optimal_offset,p_value,e_value,'
               'q_value,overlap,query_consensus,targe_consensus,orientation" ')

    statement = []
    statement.append('''cat %(infile)s | ''')
    statement.append(
        P.build_load_statement(tablename, options=options, retry=True))
    statement.append(''' > %(outfile)s''')
    statement = ' '.join(statement)

    to_cluster = True

    P.run(statement)


def loadMast(infile, outfile):
    '''load MAST results.

    SECTION I of the MAST output contains the high-scoring sequences
    (for the input motifs), ranked by increasing e-value (up to a
    maximum of 10).
    '''

    tmp_dir = "$SCRATCH_DIR"
    tablename = "MAST_" + outfile.split("/")[-2].replace(".", "_")
    options = '-H "peak_id,e_value,length" '

    statement = []
    statement.append(f'''tmp=`mktemp -p {tmp_dir}` &&
    sed -n '/^SECTION I:/,/^SECTION II:/p' {infile} |
    grep "^[a-zA-Z0-9].*[0-9]$" - |
    tr -s "[[:blank:]]" "\\t" > $tmp &&
    cat $tmp ''')
    statement.append(
        P.build_load_statement(tablename, options=options, retry=True))
    statement = ' | '.join(statement) + f''' > {outfile} && rm $tmp'''

    P.run(statement)


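# Usage sketch (illustrative only): the table name is derived from the parent
# directory of `outfile`, so for "mast.dir/sampleA.mast/mast.load" the results
# would be loaded into a table called "MAST_sampleA_mast". The paths below are
# assumptions.
#
#   loadMast("mast.dir/sampleA.mast/mast.txt",
#            "mast.dir/sampleA.mast/mast.load")

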
def loadmiRNATranscripts(infile, outfile):
    '''load transcripts from a GFF3 file into the database.

    Arguments
    ---------
    infile : string
        ENSEMBL geneset in :term:`gff3` format.
    outfile : string
        Logfile. The table name is derived from `outfile`.

    '''
    load_statement = P.build_load_statement(
        P.to_table(outfile),
        options="--allow-empty-file "
        "--header-names=feature,Name")

    statement = '''
    export LANG=en_GB.UTF-8 &&
    zcat %(infile)s
    | cgat gtf2tsv --is-gff3 --attributes-as-columns 2> /dev/null
    | grep -v "#"
    | cut -f3,12
    | %(load_statement)s
    > %(outfile)s'''
    P.run(statement, job_memory=PARAMS["job_memory"])


def getMotifIDs(infile, outfile):
    '''Create motif_ID:TF lookup table'''

    tmp_dir = "$SCRATCH_DIR"
    tablename = os.path.basename(infile).replace(".txt", "").replace(".", "_")
    filename = outfile.replace(".load", ".txt")
    options = '-H "pattern_name,TF" '

    statement = []
    # use sed to remove problematic chars e.g. ".", "(" etc. from motif names
    statement.append(f'''tmp=`mktemp -p {tmp_dir}` &&
    grep "^MOTIF" {infile} |
    sed 's/MOTIF //' |
    sed 's/(//' |
    sed 's/)//' |
    sed 's/\.//' |
    sed 's/\///' |
    sed 's/\\///' |
    sed 's/://' |
    sed 's/_//' |
    tr -s "[[:blank:]]" "\\t" > $tmp &&
    cat $tmp | ''')
    statement.append(
        P.build_load_statement(tablename, options=options, retry=True))
    statement.append(f''' > {outfile} && mv $tmp {filename}''')
    statement = ' '.join(statement)

    to_cluster = True

    P.run(statement)


def loadBigWigStats(infiles, outfile):
    '''merge and load bigwig summary for all wiggle files.

    Summarise and merge bigwig files for all samples and load into a
    table called bigwig_stats.

    Parameters
    ----------
    infiles : list
        Input filenames in :term:`bigwig` format.
    outfile : string
        Output filename, the table name is derived from `outfile`.
    '''
    data = " ".join(
        ['<( bigWigInfo %s | perl -p -e "s/:/\\t/; s/ //g; s/,//g")' % x
         for x in infiles])
    headers = ",".join([P.snip(os.path.basename(x), ".bw")
                        for x in infiles])

    load_statement = P.build_load_statement(
        P.to_table(outfile),
        options="--add-index=track")

    statement = '''cgat combine_tables
    --header-names=%(headers)s
    --skip-titles
    --missing-value=0
    --ignore-empty
    %(data)s
    | perl -p -e "s/bin/track/"
    | cgat table2table --transpose
    | %(load_statement)s
    > %(outfile)s
    '''

    P.run(statement)


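# Usage sketch (illustrative only): this loader shells out to the UCSC
# `bigWigInfo` utility, which must be available on the PATH. The file names
# below are assumptions.
#
#   loadBigWigStats(["sampleA.bw", "sampleB.bw"], "bigwig_stats.load")

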
def loadTranscripts(infile, outfile):
    '''load transcripts from a GTF file into the database.

    The table will be indexed on ``gene_id`` and ``transcript_id``.

    Arguments
    ---------
    infile : string
        ENSEMBL geneset in :term:`gtf` format.
    outfile : string
        Logfile. The table name is derived from `outfile`.

    '''
    load_statement = P.build_load_statement(
        P.to_table(outfile),
        options="--add-index=gene_id "
        "--add-index=transcript_id "
        "--allow-empty-file ")

    statement = '''
    gunzip < %(infile)s
    | cgat gtf2tsv -f
    | %(load_statement)s
    > %(outfile)s'''
    P.run(statement, job_memory=PARAMS["job_highmemory"])


def loadFimo(infile, outfile):
    '''load fimo results from tsv'''

    fimo_result = infile.replace(".fimo.log", "/fimo.txt")

    # escape clause if fimo fails to find motifs
    if len(pd.read_csv(fimo_result, sep="\t")) == 0:
        # still create the output file so the task is marked complete
        statement = f'''echo "file {fimo_result} empty" > {outfile}'''
    else:
        tablename = '_'.join(
            [outfile.split("/")[1], "fimo_results"]).replace(".", "_")
        opts = ('-H "pattern_name,sequence_name,start,stop,strand,score,'
                'p_value,q_value,matched_sequence" ')

        statement = []
        statement.append(f'''grep -v "#" {fimo_result} | ''')
        statement.append(
            P.build_load_statement(tablename, options=opts, retry=True))
        statement.append(f''' > {outfile}''')
        statement = ' '.join(statement)

    to_cluster = True

    P.run(statement, job_memory="4G")


def loadBAMStats(infiles, outfile):
    '''load output of :func:`buildBAMStats` into database.

    Arguments
    ---------
    infiles : list
        Input files, output from :func:`buildBAMStats`.
    outfile : string
        Logfile. The table name will be derived from `outfile`.
    '''

    header = ",".join([P.snip(os.path.basename(x), ".readstats")
                       for x in infiles])
    filenames = " ".join(["<( cut -f 1,2 < %s)" % x for x in infiles])
    tablename = P.to_table(outfile)

    load_statement = P.build_load_statement(
        tablename,
        options="--add-index=track "
        " --allow-empty-file")

    E.info("loading bam stats - summary")
    statement = """cgat combine_tables
    --header-names=%(header)s
    --missing-value=0
    --ignore-empty
    %(filenames)s
    | perl -p -e "s/bin/track/"
    | cgat table2table --transpose
    | %(load_statement)s
    > %(outfile)s"""
    P.run(statement, to_cluster=False)

    for suffix in ("nm", "nh"):
        E.info("loading bam stats - %s" % suffix)
        filenames = " ".join(["%s.%s" % (x, suffix) for x in infiles])

        load_statement = P.build_load_statement(
            "%s_%s" % (tablename, suffix),
            options="--allow-empty-file")

        statement = """cgat combine_tables
        --header-names=%(header)s
        --skip-titles
        --missing-value=0
        --ignore-empty
        %(filenames)s
        | perl -p -e "s/bin/%(suffix)s/"
        | %(load_statement)s
        >> %(outfile)s """
        P.run(statement, to_cluster=False)

    # load mapping qualities, there are two columns per row
    # 'all_reads' and 'filtered_reads'
    # Here, only filtered_reads are used (--take=3)
    for suffix in ("mapq",):
        E.info("loading bam stats - %s" % suffix)
        filenames = " ".join(["%s.%s" % (x, suffix) for x in infiles])

        load_statement = P.build_load_statement(
            "%s_%s" % (tablename, suffix),
            options=" --allow-empty-file")

        statement = """cgat combine_tables
        --header-names=%(header)s
        --skip-titles
        --missing-value=0
        --ignore-empty
        --take=3
        %(filenames)s
        | perl -p -e "s/bin/%(suffix)s/"
        | %(load_statement)s
        >> %(outfile)s """
        P.run(statement, to_cluster=False)


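# Usage sketch (illustrative only): `infiles` are the ".readstats" outputs of
# buildBAMStats; the loader additionally expects the companion
# ".readstats.nm", ".readstats.nh" and ".readstats.mapq" files next to each
# input. The file names below are assumptions.
#
#   loadBAMStats(["sampleA.readstats", "sampleB.readstats"],
#                "bam_stats.load")

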
def loadPicardHistogram(infiles, outfile, suffix, column,
                        pipeline_suffix=".picard_stats", tablename=False):
    '''extract a histogram from a picard output file and load it into
    the database.

    Arguments
    ---------
    infiles : list
        Filenames of files with picard metric information. Each file
        corresponds to a different track.
    outfile : string
        Logfile.
    suffix : string
        Suffix to append to table name.
    column : string
        Column name to take from the histogram.
    pipeline_suffix : string
        Suffix to remove from track name.
    tablename : string
        Tablename to use. If unset, the table name will be derived
        from `outfile` and suffix as ``to_table(outfile) + "_" +
        suffix``.
    '''

    if not tablename:
        tablename = "%s_%s" % (P.to_table(outfile), suffix)
        tablename = tablename.replace("_metrics", "_histogram")

    # some files might be missing
    xfiles = [x for x in infiles if os.path.exists("%s.%s" % (x, suffix))]

    if len(xfiles) == 0:
        E.warn("no files for %s" % tablename)
        return

    header = ",".join([P.snip(os.path.basename(x), pipeline_suffix)
                       for x in xfiles])
    filenames = " ".join(["%s.%s" % (x, suffix) for x in xfiles])

    # there might be a variable number of columns in the tables
    # only take the first ignoring the rest
    load_statement = P.build_load_statement(
        tablename,
        options="--add-index=track "
        " --header-names=%s,%s"
        " --allow-empty-file"
        " --replace-header" % (column, header))

    statement = """cgat combine_tables
    --regex-start="## HISTOGRAM"
    --missing-value=0
    --take=2
    %(filenames)s
    | %(load_statement)s
    >> %(outfile)s
    """
    P.run(statement, to_cluster=False)


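# Usage sketch (illustrative only): loading the insert-size histogram emitted
# by Picard CollectInsertSizeMetrics. The suffix, column and file names below
# are assumptions about how the metrics were generated; for each input the
# loader looks for "<infile>.<suffix>".
#
#   loadPicardHistogram(["sampleA.picard_stats", "sampleB.picard_stats"],
#                       "picard_insert_size_metrics.load",
#                       suffix="insert_size_metrics",
#                       column="insert_size")

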
def loadHypergeometricAnalysis(infile, outfile):
    '''load GO results.'''

    track = P.to_table(outfile)
    tablename = 'hypergeometric_%s_summary' % track
    P.load(infile, outfile, tablename=tablename)

    dbh = connect()
    ontologies = [x[0] for x in Database.executewait(
        dbh,
        '''SELECT DISTINCT ontology FROM %s''' % tablename).fetchall()]

    genelists = [x[0] for x in Database.executewait(
        dbh,
        '''SELECT DISTINCT genelist FROM %s''' % tablename).fetchall()]

    # output files from runGO.py
    sections = ('results', 'parameters', 'withgenes')

    for section in sections:
        tablename = 'hypergeometric_%s_%s' % (track, section)
        load_statement = P.build_load_statement(
            tablename=tablename)

        statement = '''
        cgat combine_tables
        --cat=track
        --regex-filename="hypergeometric.dir/%(track)s.tsv.dir/(\S+).%(section)s"
        hypergeometric.dir/%(track)s.tsv.dir/*.%(section)s
        | %(load_statement)s
        >> %(outfile)s'''
        P.run(statement)

    for ontology in ontologies:

        fn = os.path.join(infile + ".dir", "all_alldesc.%s.l2fold" % ontology)

        if not os.path.exists(fn):
            E.warn("file %s does not exist" % fn)
            continue

        P.load(fn,
               outfile,
               tablename='hypergeometric_%s_%s_l2fold' % (track, ontology),
               options='--allow-empty-file')

        fn = os.path.join(
            infile + ".dir", "all_alldesc.%s.l10pvalue" % ontology)

        P.load(fn,
               outfile,
               tablename='hypergeometric_%s_%s_l10pvalue' % (track, ontology),
               options='--allow-empty-file')

        fn = os.path.join(
            infile + ".dir", "all_alldesc.%s.l10qvalue" % ontology)

        P.load(fn,
               outfile,
               tablename='hypergeometric_%s_%s_l10qvalue' % (track, ontology),
               options='--allow-empty-file')