def importReference(infile, outfile):
    '''import reference domains.'''

    track = re.sub("[.].*", "", os.path.basename(infile))
    tablename_domains = "nrdb40_%s_domains" % track
    tablename_families = "nrdb40_%s_families" % track
    filename_families = re.sub("domains", "families", infile)

    statement = '''
    python %(scriptsdir)s/DomainsReference.py
           --Database=%(database)s
           --domains=%(database)s.%(tablename_domains)s_src
           --families=%(database)s.%(tablename_families)s_src
           --mapped_domains=%(database)s.%(tablename_domains)s
           --mapped_families=%(database)s.%(tablename_families)s
           --input=%(infile)s
           --descriptions=%(filename_families)s
           --source=%(database)s.%(eval_tablename_adda_nids)s
           Create UpdateDomains MakeNonRedundantClone
    > %(outfile)s
    '''
    P.run()
def buildOverlapTable(infiles, outfile):
    '''calculate overlap between the different sources of domains.'''

    infiles = " ".join(infiles)

    statement = '''
    python %(scriptsdir)s/set_diff.py --add-percent %(infiles)s
    > %(outfile)s
    '''
    P.run()
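# All tasks in this pipeline build a shell ``statement`` containing
# ``%(name)s`` placeholders and then call P.run().  P.run() is assumed to
# resolve those placeholders from the caller's local variables and the
# global PARAMS dictionary before executing the command.  The helper below
# is a minimal sketch of that substitution for illustration only; the real
# resolution happens inside Adda.Pipeline and may differ in detail.
def _interpolate_statement(statement, local_vars, params):
    '''return `statement` with %(key)s placeholders filled from
    `local_vars`, falling back to `params` for anything not defined
    locally (illustrative sketch only).'''
    substitutions = dict(params)
    substitutions.update(local_vars)
    return statement % substitutions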
def indexGraph(infile, outfile):
    '''index graph and store in compressed format.'''
    cmd = "index"
    to_cluster = True
    job_options = "-l mem_free=50G"
    statement = ADDA_STATEMENT
    P.run()
def indexSequences(infile, outfile):
    '''index sequence database and map to internal identifiers.'''
    cmd = "sequences"
    to_cluster = True
    statement = ADDA_STATEMENT
    P.run()
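# ADDA_STATEMENT is not defined in this excerpt.  The tasks that set a local
# ``cmd`` and then use ``statement = ADDA_STATEMENT`` assume it is a shared
# command-line template in which ``%(cmd)s`` selects the ADDA step to run, so
# each task only needs to set ``cmd`` before calling P.run().  The
# commented-out assignment below is a purely hypothetical placeholder showing
# the expected shape of such a template; the real executable name and options
# will differ.
# ADDA_STATEMENT = '''adda %(cmd)s >& %(outfile)s.log'''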
def collectADDASequences(infile, outfile):
    '''unpack adda sequences.'''
    if infile.endswith(".gz"):
        statement = '''gunzip < %(infile)s > %(outfile)s'''
    else:
        statement = '''ln -s %(infile)s %(outfile)s'''
    P.run()
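# Note: ``ln -s %(infile)s %(outfile)s`` links to whatever path string is in
# ``infile``; if that path is relative and ``outfile`` lives in a different
# directory, the link will dangle.  The variant below resolves the source to
# an absolute path first.  It is a defensive sketch only, not part of the
# original pipeline, and the function name is hypothetical.
def collectADDASequencesAbs(infile, outfile):
    '''unpack adda sequences, linking via an absolute path.'''
    if infile.endswith(".gz"):
        statement = '''gunzip < %(infile)s > %(outfile)s'''
    else:
        infile = os.path.abspath(infile)
        statement = '''ln -s %(infile)s %(outfile)s'''
    P.run()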
def exportResults(infile, outfile):
    '''export Adda results.'''

    statement = '''
    tar -cvzf %(outfile)s
        %(output_result)s
        %(output_families)s
        %(output_summary)s
    '''
    P.run()
def reindexSequences(infile, outfile):
    '''rebuild the adda sequence database from adda.nids.'''

    database = outfile[:-len(".fasta")]

    statement = '''
    awk '!/^nid/ { printf(">%%s\\n%%s\\n", $1, $5)};' < %(infile)s
    | python %(scriptsdir)s/IndexedFasta.py %(database)s -
    > %(outfile)s.log
    '''
    P.run()
def buildBlatIndex(infiles, outfile):
    '''build blat index.'''

    infiles = " ".join(infiles)

    statement = '''
    blat -dots=100
         -prot
         -makeOoc=%(outfile)s
         -minIdentity=%(map_min_identity)i
         %(infiles)s %(outfile)s.log < /dev/null >> %(outfile)s.log
    '''
    P.run()
def collectTargetSequences(infiles, outfile):
    '''extract new sequences from input.'''

    filename_target, filename_adda = infiles

    statement = '''
    python %(scriptsdir)s/map_fasta2fasta.py
           --filename-reference=%(filename_adda)s
           --output-filename-pattern=target.%%s
           %(filename_target)s
    > %(outfile)s.log
    '''
    P.run()
def splitSequenceFile(infile, outfiles):

    # patch ruffus bug
    if type(infile) == type(list()):
        infile = infile[0]

    statement = '''
    perl %(scriptsdir)s/split_fasta.pl -a blat.dir/chunk_%%s.fasta %(map_chunksize)i
    < %(infile)s > split.log
    '''
    P.run()
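# The ``type(infile) == type(list())`` check above works around ruffus
# occasionally passing a one-element list where a single filename is
# expected.  A small reusable helper capturing the same normalisation
# (illustrative only; not part of the original pipeline):
def _first_if_list(x):
    '''return x[0] if x is a list, otherwise return x unchanged.'''
    return x[0] if isinstance(x, list) else x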
def buildIndirectDomains(infiles, outfile):
    '''collect domains that were mapped indirectly via BLAT.'''

    infiles = " ".join(infiles)

    statement = '''
    cat %(infiles)s
    | python %(scriptsdir)s/substitute_tokens.py
             --apply=target.new2new.map
             --column=1
             --invert
             --filter
    > %(outfile)s
    '''
    P.run()
def exportPfam(infile, outfile):
    '''export Adda results.'''

    outdir = time.strftime("%Y_%m_%d", time.localtime(time.time()))

    statement = '''
    mkdir %(outdir)s;
    awk '!/^nid/ {printf("%%s\\n%%s\\n", $1, $5);}' < %(output_nids)s > %(outdir)s/adda.fasta;
    ln -s ../adda.result %(outdir)s/adda.result;
    tar -cvzf %(outfile)s %(outdir)s;
    rm -rf %(outdir)s
    '''
    P.run()
def buildMappingCoverage(infiles, outfile):
    '''compute coverage of target sequences with ADDA domains.'''

    filename_domains, filename_lengths = infiles

    statement = '''
    python %(scriptsdir)s/adda2coverage.py
           --log=%(outfile)s.log
           --filename-lengths=%(filename_lengths)s
           --output-filename-pattern="%(outfile)s_%%s"
    < %(filename_domains)s
    > %(outfile)s
    '''
    P.run()
def importADDAResults(infile, outfile):
    '''import ADDA results.'''

    statement = '''
    python %(scriptsdir)s/DomainsAdda.py
           --Database=%(database)s
           --domains=%(database)s.nrdb40_%(tablename_adda)s_domains
           --families=%(database)s.nrdb40_%(tablename_adda)s_families
           --input=%(infile)s
           --source=%(database)s.%(eval_tablename_adda_nids)s
           Create Finalize UpdateDomains
    > %(outfile)s
    '''
    P.run()
def mapDomains(infile, outfile):
    '''map ADDA domains onto target sequences via blat matches.'''

    to_cluster = True
    job_options = "-l mem_free=4000M"

    statement = '''bsub -q normal -R"select[mem>4000] rusage[mem=4000]" -M 4000000 -o %(infile)s.out2 -I
                   "gunzip < %(infile)s
                   | python %(scriptsdir)s/map_blat2adda.py
                            --filename-domains=adda.results
                            --output-filename-pattern="%(outfile)s.%%s"
                            --log=%(outfile)s.log
                            --verbose=2
                   > %(outfile)s"
    '''
    P.run()
def buildDirectDomains(infiles, outfile):
    '''collect domains that could be transferred without mapping.'''

    x, filename_domains = infiles

    statement = '''gunzip < %(filename_domains)s
                   | python %(scriptsdir)s/substitute_tokens.py
                            --apply=target.new2old.map
                            --invert
                            --column=1
                            --filter
                   > %(outfile)s
    '''
    P.run()
def mapDomains(infile, outfile):
    '''map ADDA domains onto target sequences via blat matches.'''

    to_cluster = True
    job_options = "-l mem_free=4000M"

    statement = '''gunzip < %(infile)s
                   | python %(scriptsdir)s/map_blat2adda.py
                            --filename-domains=<( gunzip < %(map_filename_domains)s)
                            --output-filename-pattern="%(outfile)s.%%s"
                            --log=%(outfile)s.log
                            --verbose=2
                   > %(outfile)s
    '''
    P.run()
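# Note: ``<( gunzip ... )`` above is bash process substitution.  If P.run()
# executes statements through a plain POSIX sh, that construct will fail.
# The variant below is an sh-compatible sketch that decompresses the domain
# file to a temporary file first; it is illustrative only and the function
# name is hypothetical.
def mapDomainsNoProcessSubstitution(infile, outfile):
    '''map ADDA domains via blat matches without bash process substitution.'''
    to_cluster = True
    job_options = "-l mem_free=4000M"
    statement = '''
    gunzip < %(map_filename_domains)s > %(outfile)s.domains.tmp;
    gunzip < %(infile)s
    | python %(scriptsdir)s/map_blat2adda.py
             --filename-domains=%(outfile)s.domains.tmp
             --output-filename-pattern="%(outfile)s.%%s"
             --log=%(outfile)s.log
             --verbose=2
    > %(outfile)s;
    rm -f %(outfile)s.domains.tmp
    '''
    P.run()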
def runBlat(infiles, outfile):
    '''run a blat job.'''

    to_cluster = True
    infile, fasta = infiles

    statement = '''
    blat -prot -ooc=5.ooc -noHead
         -minIdentity=%(map_min_identity)i
         %(fasta)s %(infile)s stdout
    | gzip > %(outfile)s
    '''
    P.run()
def runBlat(infiles, outfile):
    '''run a blat job.'''

    to_cluster = True
    infile, fasta = infiles

    statement = '''bsub -q normal -R"select[mem>4000] rusage[mem=4000]" -M 4000000 -o %(fasta)s.out -I
                   "blat -prot -ooc=5.ooc -noHead
                         -minIdentity=%(map_min_identity)i
                         %(fasta)s %(infile)s %(infile)s.out
                    && cat %(infile)s.out | gzip > %(outfile)s"
    '''
    P.run()
def importADDAIntermediateResults(infile, outfile):
    '''import the segmentation segments.

    Nids are translated.
    '''

    table = outfile[:-len(".import")]

    statement = '''
    python %(scriptsdir)s/adda_translate.py --nids=%(eval_filename_adda_nids)s < %(infile)s
    | python %(scriptsdir)s/csv2db.py %(csv2db_options)s
             --database=%(database)s
             --table=%(table)s
             --index=nid
    > %(outfile)s
    '''
    P.run()
def importSequences(infile, outfile):
    '''import sequences into database.

    This command will also create the database.
    '''

    statement = '''
    mysql %(load_mysql_options)s -e "DROP DATABASE IF EXISTS %(load_database)s"
    '''
    P.run()

    statement = '''
    mysql %(load_mysql_options)s -e "CREATE database %(load_database)s"
    '''
    P.run()

    statement = '''
    perl -p -e "s/nid/adda_nid/; s/pid/nid/" < %(infile)s
    | python %(scriptsdir)s/csv2db.py %(load_csv2db_options)s
             --database=%(load_database)s
             --table=%(load_tablename_adda_nrdb)s
             --map=nid:int
             --index=nid
    > %(outfile)s
    '''
    P.run()
def importSequences(infile, outfile):
    '''import sequences.

    This command will also create the database.
    '''

    statement = '''
    mysql %(mysql_options)s -e "DROP DATABASE IF EXISTS %(load_database)s"
    '''
    P.run()

    statement = '''
    mysql %(mysql_options)s -e "CREATE database %(load_database)s"
    '''
    P.run()

    table = outfile[:-len(".import")]

    statement = '''
    perl -p -e "s/nid/adda_nid/; s/pid/nid/" < %(infile)s
    | python %(scriptsdir)s/csv2db.py %(csv2db_options)s
             --database=%(database)s
             --table=%(table)s
             --index=nid
    > %(outfile)s
    '''
    P.run()
def annotateADDA(infile, outfile):
    '''annotate ADDA families with reference families.'''

    track = outfile[:-len(".annotations")]

    statement = '''
    python %(scriptsdir)s/OutputStatisticsClustering.py
           --Database=%(database)s
           --domains=%(database)s.nrdb40_%(tablename_adda)s_domains
           --families=%(database)s.nrdb40_%(tablename_adda)s_families
           --max_family=%(eval_max_family_size)i
           --min_evidence=2
           --min_units=2
           --ref_domains=%(database)s.nrdb40_%(track)s_domains
           --ref_families=%(database)s.nrdb40_%(track)s_families
           --full-table
           Annotation
    > %(outfile)s
    '''
    P.run()

    statement = '''
    perl %(scriptsdir)s/calculate_selectivity.pl < %(outfile)s > %(outfile)s.selectivity
    '''
    P.run()

    statement = '''
    perl %(scriptsdir)s/calculate_sensitivity.pl < %(outfile)s > %(outfile)s.sensitivity
    '''
    P.run()
def evaluateDomains(infile, outfile):
    '''benchmark domains.

    The domain benchmark checks if the appropriate domains have been
    selected by the optimisation method.
    '''

    track = outfile[:-len("_domains.eval")]

    statement = '''
    python %(scriptsdir)s/evaluate_domain_boundaries.py
           --database=%(database)s
           --reference=%(database)s.nrdb40_%(track)s_domains
           --parts=%(database)s.%(eval_tablename_domains)s
           --output-filename-pattern=%(outfile)s.%%s
           --switch
           --skip-repeats
           --bin-size=1
    > %(outfile)s
    '''
    P.run()
def evaluateSegments(infile, outfile):
    '''evaluate ADDA segments against the reference.

    The tree benchmark checks whether the segmentation trees contain
    the appropriate reference domains.
    '''

    track = outfile[:-len("_segments.eval")]

    statement = '''
    python %(scriptsdir)s/evaluate_domain_boundaries.py
           --database=%(database)s
           --reference=%(database)s.nrdb40_%(track)s_domains
           --trees=%(database)s.%(eval_tablename_segments)s
           --output-filename-pattern=%(outfile)s.%%s
           --switch
           --skip-repeats
           --no-full-length
           --bin-size=1
    > %(outfile)s
    '''
    P.run()
def computeParameters(infile, outfile):
    '''compute ADDA model parameters (fit step).'''
    cmd = "fit"
    statement = ADDA_STATEMENT
    P.run()
def clusterDomains(infile, outfile):
    '''cluster domains into families.'''
    cmd = "cluster"
    statement = ADDA_STATEMENT
    P.run()
def buildAddaSummary(infile, outfile):
    '''build summary statistics for the ADDA run.'''
    cmd = "summary"
    statement = ADDA_STATEMENT
    P.run()
def buildFamilies(infile, outfile):
    '''build the ADDA family output.'''
    cmd = "families"
    statement = ADDA_STATEMENT
    P.run()
for command line help.

Code
----

"""

import sys, tempfile, optparse, shutil, itertools, csv, math, random, re, glob, os
import fileinput, collections, gzip
import sqlite3

import Adda.Experiment as E
import Adda.Pipeline as P
from ruffus import *
from Adda import IndexedFasta, FastaIterator, IOTools, AddaIO

PARAMS = P.getParameters("adda.ini")


@files(PARAMS["eval_filename_alignment_graph"],
       ("alignment_graph.gz", "alignment.stats"))
def annotateAlignmentGraph(infile, outfiles):
    '''read the alignment graph and output a translated version with
    reference domain information added.
    '''

    outfile, outfile_stats = outfiles

    # collect benchmark domains
    E.info("reading benchmark domains")
    benchmark_domains = AddaIO.readMapNid2Domains(
def computeMSTComponents(infile, outfile):
    '''compute connected components of the minimum spanning tree.'''
    cmd = "mst-components"
    statement = ADDA_STATEMENT
    P.run()
def alignDomains(infile, outfile):
    '''align domains.'''
    cmd = "align"
    statement = ADDA_STATEMENT
    P.run()