def mgs(config, name):
    """Prepare the 10.mgs step directory and return its shell commands.

    For every group name listed in ``[param] group`` of *config* a
    sub-directory is created under ``<dirname(config)>/<name>``.  Groups whose
    ``material/<group>_group.list`` satisfies the sample thresholds get their
    group list copied in and contribute the MGS commands; the others get a
    ``Sample_not_enough.log`` file instead.

    :param config: path of the pipeline config file.
    :param name: step directory name under the config directory.
    :return: list of shell command strings (not executed here).
    """
    print(gettime("start 10.mgs"))
    commands = []
    main_dir = os.path.dirname(config)
    work_dir = '%s/%s' % (main_dir, name)
    material_dir = '%s/material' % main_dir
    if not os.path.isdir(work_dir):
        mkdir(work_dir)

    config_group = ConfigParser()
    config_group.read(config)
    # Split on commas and/or whitespace.  The previous pattern ended with an
    # empty alternative, which splits between every character on
    # Python >= 3.7, and produced empty names for input such as "A , B".
    raw_groups = config_group.get('param', 'group')
    group = [g for g in re.split(r'\s*,\s*|\s+', raw_groups.strip()) if g]

    for subgroup_name in group:
        subgroup_dir = "%s/%s" % (work_dir, subgroup_name)
        mkdir(subgroup_dir)
        _, min_sample_num_in_groups, sample_num_total, group_num = parse_group(
            "%s/%s_group.list" % (material_dir, subgroup_name))
        if min_sample_num_in_groups >= 5 and sample_num_total >= 20 and group_num == 2:
            os.system("cp %s/%s_group.list %s/%s/group.list"
                      % (material_dir, subgroup_name, work_dir, subgroup_name))
            # NOTE(review): these commands iterate over every directory that
            # holds a group.list, yet they are appended once per qualifying
            # group (the collapsed original places them between `if` and
            # `else`) -- confirm whether they should be emitted only once.
            commands.append("## mgs start")
            commands.append('ls | while read a; do if [ -f "$a/group.list" ];then python %s/full_MGS_llf.py -p ../../06.gene_profile/gene.profile -g $a/group.list -d $a/; fi; done' % (bin_mgs_default_dir))
            commands.append('ls | while read a; do if [ -f "$a/group.list" ];then cd $a;sh work.sh;cd -; fi; done')
            commands.append('ls | while read a; do if [ -f "$a/group.list" ];then python %s/mgs_taxonomy.py -i $a/pathway/ -g ../05.gene_catalog/gene_catalog.fna -o $a/taxonomy/ --group $a/group.list; fi; done' % (bin_mgs_default_dir))
            commands.append('ls | while read a; do if [ -f "$a/group.list" ];then cd $a/taxonomy/;sh mgs_taxonomy.sh;cd -; fi; done')
        else:
            # `with` guarantees the log is flushed and closed; the original
            # wrote `log.close` without calling it.
            with open("%s/Sample_not_enough.log" % subgroup_dir, "w+") as log:
                log.write("min_sample_num_in_groups >= 5 and sample_num_total >= 20 and group_num == 2")
    return commands
def gene_profile_pre(config, name):
    """Return the shell commands that prepare the 06.gene_profile step.

    Calls ``mkdir`` for ``<dirname(config)>/<name>`` and its ``database/``
    sub-directory; the commands themselves are only collected, not run.

    :param config: path of the pipeline config file (only its directory is used).
    :param name: step directory name under the config directory.
    :return: list of shell command strings in execution order.
    """
    print(gettime("start 06.gene_profile_pre"))
    step_dir = '%s/%s' % (os.path.dirname(config), name)
    mkdir(step_dir)
    mkdir("%s/database/" % step_dir)
    twice = (step_dir, step_dir)

    commands = [
        "cp %s/../01.clean_reads/clean_reads.list %s/clean_reads.list" % twice,
        "## build index",
        # The two entries below are emitted as shell comments (leading '#').
        "#ln -s %s/../05.gene_catalog/gene_catalog.fna %s/database/" % twice,
        "#2bwt-builder %s/database/gene_catalog.fna" % step_dir,
        "python %s/genebuild.py -d %s" % (bin_gene_profile_default_dir, step_dir),
        "/data_center_03/USER/zhongwd/bin/qsge --queue all.q --memery 10G --jobs 1 --prefix BI --lines 1 shell/2bwt_builder.sh",
        "## calculate gene abundance",
        "perl %s/geneabundance.pl %s/clean_reads.list database/gene_catalog.fna %s/../05.gene_catalog/gene_catalog.length %s/"
        % (bin_gene_profile_default_dir, step_dir, step_dir, step_dir),
        "/data_center_03/USER/zhongwd/bin/qsge --queue all.q --memery 10G --jobs 50 --prefix MA --lines 1 shell/match.sh",
        "/data_center_03/USER/zhongwd/bin/qsge --queue all.q --memery 10G --jobs 10 --prefix AB --lines 2 shell/abun.sh",
    ]
    print(gettime("end 06.gene_profile_pre"))
    return commands
def gene_catalog(config, name):
    """Return the shell commands of the 05.gene_catalog step.

    The commands translate the catalog to protein, gzip the FASTA files,
    produce catalog statistics and a length histogram, and split the protein
    catalog.  Nothing is executed or created here.

    :param config: path of the pipeline config file (only its directory is used).
    :param name: step directory name under the config directory.
    :return: list of shell command strings in execution order.
    """
    print(gettime("start 05.gene_catalog"))
    step_dir = '%s/%s' % (os.path.dirname(config), name)
    twice = (step_dir, step_dir)
    tool_args = (tool_default_dir,) + twice
    catalog_bin_args = (bin_gene_catalog_default_dir,) + twice

    commands = [
        command_default + "perl %s/cds2pep.pl %s/gene_catalog.fna %s/gene_catalog.faa" % tool_args,
        "gzip -c %s/redundant.gene_catalog.fna > %s/redundant.gene_catalog.fna.gz" % twice,
        "gzip -c %s/gene_catalog.fna > %s/gene_catalog.fna.gz" % twice,
        "gzip -c %s/gene_catalog.faa > %s/gene_catalog.faa.gz" % twice,
        "## info of gene catalog",
        command_default + "perl %s/gene_catalog.stat.pl < %s/gene_catalog.fna > %s/gene_catalog.stat.tsv" % catalog_bin_args,
        "perl %s/lengthfasta.pl %s/gene_catalog.fna > %s/gene_catalog.length" % tool_args,
        "Rscript %s/../04.gene_predict/gene.histogram.R %s/gene_catalog.length %s/gene_catalog.length.histogram.pdf" % catalog_bin_args,
        "convert -density 300 %s/gene_catalog.length.histogram.pdf %s/gene_catalog.length.histogram.png" % twice,
        "## split gene catalog",
        "perl %s/cutfasta.pl %s/gene_catalog.faa 10 > %s/gene_catalog.split.list" % tool_args,
    ]
    print(gettime("end 05.gene_catalog"))
    return commands
def gene_predict(config, name):
    """Return the post-processing shell commands of the 04.gene_predict step.

    Reads ``[param] ins_list`` from *config*, calls ``mkdir`` for the step
    directory and its ``histogram/`` sub-directory, and collects the commands
    for ORF statistics, compression, translation and length histograms.

    :param config: path of the pipeline config file.
    :param name: step directory name under the config directory.
    :return: list of shell command strings in execution order.
    """
    print(gettime("start 04.gene_predict"))
    step_dir = '%s/%s' % (os.path.dirname(config), name)

    parser = ConfigParser()
    parser.read(config)
    # The value is not used below; the lookup is kept so that a missing
    # section/option still raises exactly as before.
    parser.get("param", "ins_list")
    mkdir(step_dir)

    commands = [
        "ls gene/*fna | perl %s/stat.pl > orf.stat.tsv" % bin_gene_predict_default_dir,
        "ls gff/*gff | sed 's/.gff//g' | while read a ; do gzip -c $a.gff > $a.gff.gz;done",
        "ls gene/*fna | sed 's/.fna//g' | while read a ; do perl %s/cds2pep.pl $a.fna $a.faa; gzip -c $a.fna > $a.fna.gz; gzip -c $a.faa > $a.faa.gz; done" % tool_default_dir,
        "## histogram",
    ]
    mkdir("%s/histogram/" % step_dir)
    commands.extend([
        "cut -f 1 gene.list | while read a; do /data_center_03/USER/zhongwd/bin/lengthfasta gene/$a.gene.fna > histogram/$a.gene.length; done",
        "cut -f 1 gene.list | while read a; do Rscript %s/gene.histogram.R histogram/$a.gene.length histogram/$a.gene.histogram.pdf; done" % bin_gene_predict_default_dir,
        "cut -f 1 gene.list | while read a; do convert -density 300 histogram/$a.gene.histogram.pdf histogram/$a.gene.histogram.png; done",
    ])
    print(gettime("end 04.gene_predict"))
    return commands
def use_old_version(config, name):
    """Return the work directory and commands of the "old version" taxon profile.

    Calls ``mkdir`` for
    ``<dirname(config)>/<name>/preprocess_for_taxon_profile/old_version`` and
    its ``profile`` sub-directory, then collects (without running) the
    commands for abundance calculation, species profiling and read-use rate.

    :param config: path of the pipeline config file (only its directory is used).
    :param name: step directory name under the config directory.
    :return: ``(work_dir, commands)``.
    """
    print(gettime('start create old version step script'))
    main_dir = os.path.dirname(config)
    work_dir = '%s/%s/preprocess_for_taxon_profile/old_version' % (main_dir, name)
    mkdir(work_dir, '%s/profile' % work_dir)

    commands = [
        '## calculate abundance',
        'cp %s/01.clean_reads/clean_reads.list ./' % main_dir,
        '%s/speciesabundance.pl %s/01.clean_reads/clean_reads.list .' % (bin_dir, main_dir),
        'nohup /data_center_03/USER/zhongwd/bin/qsge --queue all.q --memery 15G --jobs 10 --prefix MA --lines 1 shell/match.sh &',
        'nohup /data_center_03/USER/zhongwd/bin/qsge --queue all.q --memery 15G --jobs 10 --prefix AB --lines 2 shell/abun.sh &',
        '## form species profile',
        'ls alignment/*/*root.abundance >abund.list',
        'python %s/02_taxonomy.py -i abund.list' % bin_dir,
        'rm abund.list',
        'for i in all phylum class order family genus species; do ls alignment/*/*$i.abundance |perl %s/201_profile - >profile/$i.profile; done' % bin_dir,
        'num=1;for i in phylum class order family genus species; do let num=num+1; python %s/201_profile_convert.py -i profile/$i.profile -o profile/otu_table_L$num.txt; done' % bin_dir,
        'ls profile/* | while read a; do cp $a ../../taxon_profile; done',
        '## reads use rate',
        # The read-use-rate commands are emitted as shell comments (leading '#').
        '#ls alignment/*/*.MATCH.logs >match_logs.list',
        '#python %s/201_use_rate.py -i match_logs.list -o use_rate.stat.tsv -clean %s/00.raw_reads/qc_stat.tsv' % (bin_dir, main_dir),
        '#rm match_logs.list',
    ]
    return work_dir, commands
def use_other_method(config, name):
    """Placeholder for an alternative assembly method.

    Calls ``mkdir`` for ``<dirname(config)>/<name>/preprocess_for_assembly/other``
    and prints a notice that the method is not complete; no commands are
    produced.

    :param config: path of the pipeline config file (only its directory is used).
    :param name: step directory name under the config directory.
    :return: ``(work_dir, [])``.
    """
    print(gettime('start create other step script'))
    target_dir = '%s/%s/preprocess_for_assembly/other' % (os.path.dirname(config), name)
    mkdir(target_dir)
    print('This method is not complete,please select other method!')
    return target_dir, []
def use_kraken2_method(config, name):
    """Placeholder for the kraken2 taxon-profile method.

    Calls ``mkdir`` for
    ``<dirname(config)>/<name>/preprocess_for_taxon_profile/kraken2`` and
    prints a notice that the method is not complete; no commands are produced.

    :param config: path of the pipeline config file (only its directory is used).
    :param name: step directory name under the config directory.
    :return: ``(work_dir, [])``.
    """
    print(gettime('start create kraken2 step script'))
    target_dir = '%s/%s/preprocess_for_taxon_profile/kraken2' % (os.path.dirname(config), name)
    mkdir(target_dir)
    print('This method is not complete,please select other method!')
    return target_dir, []
def eggnog(config, name):
    """Return the shell commands of the 08.eggnog annotation step.

    Collects the commands that merge the blat hits, annotate them against
    eggNOG and count functional categories.  When the sample list parsed from
    ``[param] sample_name`` holds at most 10 samples, per-sample counting
    commands are added.  One block of per-group commands is added for every
    name in ``[param] group``; the matching ``group/<name>/`` directory is
    passed to ``mkdir``.

    :param config: path of the pipeline config file.
    :param name: step directory name under the config directory.
    :return: list of shell command strings (not executed here).
    """
    # The original printed "end 08.eggnog" here, so the log showed two "end"
    # lines and no start marker for this step.
    print(gettime("start 08.eggnog"))
    commands = []
    main_dir = os.path.dirname(config)
    work_dir = '%s/%s' % (main_dir, name)
    mkdir(work_dir)

    commands.append("rm %s/blat/all.m8" % work_dir)
    commands.append("cat %s/blat/* > %s/blat/all.m8" % (work_dir, work_dir))
    commands.append(command_default + "python %s/701_pick_blast_m8.py -i %s/blat/all.m8 -o %s/eggnog.m8"
                    % (bin_kegg_default_dir, work_dir, work_dir))
    commands.append(command_default + "perl %s/03_get_annot_info.pl %s/eggnog.m8 /data_center_02/Database/eggNOGv4.0/all.members.txt /data_center_02/Database/eggNOGv4.0/all.description.txt /data_center_02/Database/eggNOGv4.0/all.funccat.txt %s/eggnog.m8.tab"
                    % (bin_eggnog_default_dir, work_dir, work_dir))
    commands.append("perl %s/04_get_count.pl %s/eggnog.m8.tab /data_center_02/Database/eggNOGv4.0/eggnogv4.funccats.txt %s/eggnog.tab"
                    % (bin_eggnog_default_dir, work_dir, work_dir))
    commands.append(command_default + "perl /data_center_07/Project/RY2015K16A01-1/08.eggnog/bin/eggnog.annotation.pl < %s/eggnog.m8.tab > %s/eggnog.anno.tsv"
                    % (work_dir, work_dir))

    # Read the group names and the sample list from the config.
    config_gene = ConfigParser()
    config_gene.read(config)
    group = re.split(r"\s+|\t|,\s*|,\t+", config_gene.get("param", "group"))
    sample_names = config_gene.get("param", "sample_name")
    _, _, sample_num_total, _ = parse_group(sample_names)

    if sample_num_total <= 10:
        mkdir("%s/samples" % work_dir)
        commands.append("cut -f 1 %s/../01.clean_reads/clean_reads.list | while read a ; do cut -f 1 %s/../06.gene_profile/alignment/$a/$a.gene.abundance > %s/samples/$a.gene.list; done"
                        % (work_dir, work_dir, work_dir))
        commands.append("ls %s/samples/*gene.list | sed 's/.gene.list//g'|while read a; do perl %s/04_get_countlist.pl %s/eggnog.m8.tab /data_center_02/Database/eggNOGv4.0/eggnogv4.funccats.txt $a.gene.list $a.eggnog.tab;done"
                        % (work_dir, bin_eggnog_default_dir, work_dir))
        commands.append("ls %s/samples/*.eggnog.tab | sed 's/.eggnog.tab//g' | while read a;do cut -f 3,4 $a.eggnog.tab > $a.eggnog.count.tab; done"
                        % (work_dir))
        commands.append("ls %s/samples/*.eggnog.count.tab | /data_center_03/USER/zhongwd/bin/profile - > %s/eggnog.count.tab"
                        % (work_dir, work_dir))
        commands.append("Rscript /data_center_04/Projects/pichongbingdu/pair_reads/08.eggnog/NOG.R %s/eggnog.count.tab"
                        % work_dir)

    # NOTE(review): the per-group loop is assumed to run regardless of the
    # sample-count check above (original indentation was lost) -- confirm.
    for subgroup_name in group:
        subgroup = '%s/material/%s_group.list' % (main_dir, subgroup_name)
        work_dir_01 = "%s/group/%s/" % (work_dir, subgroup_name)
        mkdir(work_dir_01)
        commands.append("## ----------------------------------%s----------------------" % (subgroup_name))
        commands.append("cd %s; perl /data_center_06/Project/pracrice/yehaocheng_20160120/08.eggnog/bin/profile2list.pl %s %s/../06.gene_profile/gene.profile; cd -"
                        % (work_dir_01, subgroup, work_dir))
        commands.append("ls %s/*gene.list | sed 's/.gene.list//g'|while read a; do perl %s/04_get_countlist.pl %s/eggnog.m8.tab /data_center_02/Database/eggNOGv4.0/eggnogv4.funccats.txt $a.gene.list $a.eggnog.tab;done"
                        % (work_dir_01, bin_eggnog_default_dir, work_dir))
        commands.append("ls %s/*.eggnog.tab | sed 's/.eggnog.tab//g' | while read a;do cut -f 3,4 $a.eggnog.tab > $a.eggnog.count.tab; done"
                        % (work_dir_01))
        commands.append("ls %s/*.eggnog.count.tab | /data_center_03/USER/zhongwd/bin/profile - > %s/eggnog.count.tab"
                        % (work_dir_01, work_dir_01))
        commands.append("cd %s;Rscript /data_center_04/Projects/pichongbingdu/pair_reads/08.eggnog/NOG.R eggnog.count.tab;cd -"
                        % (work_dir_01))
        commands.append("convert -density 300 %s/NOG.pdf %s/NOG.png" % (work_dir_01, work_dir_01))

    print(gettime("end 08.eggnog"))
    return commands
def ardb(config, name):
    """Return the shell commands of the 09.ardb annotation step.

    First collects the commands that build ``gene2ardb.tsv`` and the ARDB
    type/class profiles, then, for every name in ``[param] group``, the
    commands for class and type differential analysis, bar plots and the
    extra plots.  The per-group ``01.class_diff/`` and ``02.type_diff/``
    directories are passed to ``mkdir``.

    :param config: path of the pipeline config file.
    :param name: step directory name under the config directory.
    :return: list of shell command strings (not executed here).
    """
    print(gettime("start 09.ardb"))
    commands = []
    main_dir = os.path.dirname(config)
    work_dir = '%s/%s' % (main_dir, name)
    mkdir(work_dir)

    commands.append("## blat mapping")
    commands.append("cat blat/* > all.m8")
    commands.append("pick_blast_m8 all.m8 > ardb.m8")
    commands.append("cut -f 2 ardb.m8 | search - /data_center_03/Project/AS/16_ARDB/old/ardbAnno1.0_modify_db07/tabs/ardb.tab | paste ardb.m8 - | cut -f 1,13- > gene2ardb.tsv")
    commands.append("classprofile -i gene2ardb.tsv -p ../06.gene_profile/gene.profile -f 3 > ardb.type.profile")
    commands.append("classprofile -i gene2ardb.tsv -p ../06.gene_profile/gene.profile -f 4 > ardb.class.profile")
    commands.append("Rscript /data_center_07/Project/RY2015K16A01-1/09.ardb/bin/ardb.barplot.r\n")
    commands.append("(echo -e 'Gene ID\tProtein name\tType\tClass\tDescription'; cat gene2ardb.tsv) > ardb.anno.tsv")

    # groups
    config_gene = ConfigParser()
    config_gene.read(config)
    group = re.split(r"\s+|\t|,\s*|,\t+", config_gene.get("param", "group"))
    for subgroup_name in group:
        subgroup = '%s/material/%s_group.list' % (main_dir, subgroup_name)
        _, _, _, group_num = parse_group(subgroup)
        commands.append("## ----------------------------------%s----------------------" % (subgroup_name))

        # diff
        work_dir_901 = "%s/group/%s/01.class_diff/" % (work_dir, subgroup_name)
        mkdir(work_dir_901)
        work_dir_902 = "%s/group/%s/02.type_diff/" % (work_dir, subgroup_name)
        mkdir(work_dir_902)

        commands.append("#01 diff class")
        commands.append(command_default + "python %s/t08_diff.py -i %s/ardb.class.profile -g %s -o %s"
                        % (tool_default_dir, work_dir, subgroup, work_dir_901))
        commands.append(command_default + "python %s/t09_diff_boxplot.py -i %s/diff.marker.filter.profile.tsv -p %s/diff.marker.filter.tsv -g %s -o %s/diff_boxplot/"
                        % (tool_default_dir, work_dir_901, work_dir_901, subgroup, work_dir_901))

        commands.append("#02 diff type")
        # Fixed: the type diff used to read ardb.class.profile (copied from
        # step #01), so 02.type_diff/ merely repeated the class analysis.
        commands.append(command_default + "python %s/t08_diff.py -i %s/ardb.type.profile -g %s -o %s"
                        % (tool_default_dir, work_dir, subgroup, work_dir_902))
        commands.append(command_default + "python %s/t09_diff_boxplot.py -i %s/diff.marker.filter.profile.tsv -p %s/diff.marker.filter.tsv -g %s -o %s/diff_boxplot/"
                        % (tool_default_dir, work_dir_902, work_dir_902, subgroup, work_dir_902))

        commands.append("#03 function_barplot")
        commands.append(command_default + "Rscript %s/710_level1_barplot.R %s/ardb.class.profile %s/group/%s/ardb.class.pdf Class %s"
                        % (bin_ardb_default_dir, work_dir, work_dir, subgroup_name, subgroup))
        commands.append("convert -density 300 %s/group/%s/ardb.class.pdf %s/group/%s/ardb.class.png"
                        % (work_dir, subgroup_name, work_dir, subgroup_name))
        commands.append(command_default + "Rscript %s/710_level1_barplot.R %s/ardb.type.profile %s/group/%s/ardb.type.pdf Type %s"
                        % (bin_ardb_default_dir, work_dir, work_dir, subgroup_name, subgroup))
        commands.append("convert -density 300 %s/group/%s/ardb.type.pdf %s/group/%s/ardb.type.png"
                        % (work_dir, subgroup_name, work_dir, subgroup_name))

        if group_num == 2:
            commands.append("#04 dimond swarm")
            commands.append(command_default + "Rscript %s/dimond_swarm.R %s/ardb.type.profile %s %s/group/%s/dimond_swarm.pdf"
                            % (bin_ardb_default_dir, work_dir, subgroup, work_dir, subgroup_name))
            commands.append("convert -density 300 %s/group/%s/dimond_swarm.pdf %s/group/%s/dimond_swarm.png"
                            % (work_dir, subgroup_name, work_dir, subgroup_name))

        # NOTE(review): step #05 is assumed to apply to every group, not only
        # to two-group comparisons (original indentation was lost) -- confirm.
        commands.append("#05 top ardb")
        commands.append(command_default + "Rscript %s/top_ardb.R %s/ardb.type.profile %s %s/group/%s/top_ardb.pdf"
                        % (bin_ardb_default_dir, work_dir, subgroup, work_dir, subgroup_name))
        commands.append("convert -density 300 %s/group/%s/top_ardb.pdf %s/group/%s/top_ardb.png"
                        % (work_dir, subgroup_name, work_dir, subgroup_name))

    print(gettime("end 09.ardb"))
    return commands
def ardb_pre(config, name):
    """Return the blat-mapping commands that precede the 09.ardb step.

    Calls ``mkdir`` for ``<dirname(config)>/<name>``; the commands are only
    collected, not run.

    :param config: path of the pipeline config file (only its directory is used).
    :param name: step directory name under the config directory.
    :return: list of shell command strings in execution order.
    """
    print(gettime("start 09.ardb_pre"))
    step_dir = '%s/%s' % (os.path.dirname(config), name)
    mkdir(step_dir)
    commands = [
        "## blat mapping",
        "cp /data_center_03/Project/AS/16_ARDB/db.list ./",
        "perl %s/blatprot.pl db.list %s/../05.gene_catalog/gene_catalog.split.list %s/" % (tool_default_dir, step_dir, step_dir),
        "nohup /data_center_03/USER/zhongwd/bin/qsge --queue all.q --memery 5G --jobs 10 --prefix AR --lines 1 --getmem shell/blat.sh &",
    ]
    print(gettime("end 09.ardb_pre"))
    return commands
def kegg_pre(config, name):
    """Return the blat-mapping commands that precede the 07.kegg step.

    Calls ``mkdir`` for ``<dirname(config)>/<name>``; the commands are only
    collected, not run.

    :param config: path of the pipeline config file (only its directory is used).
    :param name: step directory name under the config directory.
    :return: list of shell command strings in execution order.
    """
    print(gettime("start 07.kegg_pre"))
    step_dir = '%s/%s' % (os.path.dirname(config), name)
    mkdir(step_dir)
    commands = [
        "## blat mapping",
        "perl %s/blatprot.pl /data_center_01/home/NEOLINE/zwd/project/PMO/LiuLin-ascites-stool/07.kegg/db.list %s/../05.gene_catalog/gene_catalog.split.list %s/" % (tool_default_dir, step_dir, step_dir),
        "nohup /data_center_03/USER/zhongwd/bin/qsge --queue all.q --memery 6G --jobs 10 --prefix KEGG --lines 1 shell/blat.sh &",
    ]
    print(gettime("end 07.kegg_pre"))
    return commands
def clean_reads(config, name):
    """Return the shell commands of the clean-reads step.

    Calls ``mkdir`` for ``<dirname(config)>/<name>`` and collects a background
    ``merge.py`` call plus an ``awk`` command that keeps the first seven
    columns of the raw-read QC tables.

    :param config: path of the pipeline config file (only its directory is used).
    :param name: step directory name under the config directory.
    :return: list of shell command strings (not executed here).
    """
    # The banners used to say "raw_reads" (copied from raw_reads()), which
    # made the two steps indistinguishable in the log.
    print(gettime("start clean_reads"))
    commands = []
    main_dir = os.path.dirname(config)
    work_dir = '%s/%s' % (main_dir, name)
    mkdir(work_dir)
    # NOTE(review): this is the only visible function using `bin_defdir`;
    # raw_reads() uses `bin_default_dir` -- confirm the name is defined.
    commands.append('nohup python %s/merge.py -l %s/material/sample.list -c %s/ &'
                    % (bin_defdir, main_dir, work_dir))
    commands.append('awk -F "\\t" \'{print $1"\\t"$2"\\t"$3"\\t"$4"\\t"$5"\\t"$6"\\t"$7}\' %s/00.raw_reads/qc_*.stat.tsv > %s/qc_stat.tsv'
                    % (main_dir, work_dir))
    print(gettime("end clean_reads"))
    return commands
def gene_catalog_pre(config, name):
    """Return the commands that build the non-redundant gene catalog.

    Calls ``mkdir`` for ``<dirname(config)>/<name>``; the commands (concatenate
    predicted genes, then run ``cd-hit.pl``) are only collected, not run.

    :param config: path of the pipeline config file (only its directory is used).
    :param name: step directory name under the config directory.
    :return: list of shell command strings in execution order.
    """
    print(gettime("start 05.gene_catalog_pre"))
    step_dir = '%s/%s' % (os.path.dirname(config), name)
    mkdir(step_dir)
    commands = [
        "## build gene catalog",
        "cat %s/../04.gene_predict/gene/*.fna > %s/redundant.gene_catalog.fna" % (step_dir, step_dir),
        "perl %s/cd-hit.pl %s/redundant.gene_catalog.fna %s/gene_catalog.fna 20" % (bin_gene_catalog_default_dir, step_dir, step_dir),
    ]
    print(gettime("end 05.gene_catalog_pre"))
    return commands
def cazy_pre(config, name):
    """Return the blat-mapping commands that precede the 12.cazy step.

    Calls ``mkdir`` for ``<dirname(config)>/<name>``; the commands are only
    collected, not run.

    :param config: path of the pipeline config file (only its directory is used).
    :param name: step directory name under the config directory.
    :return: list of shell command strings in execution order.
    """
    print(gettime("start 12.cazy_pre"))
    step_dir = '%s/%s' % (os.path.dirname(config), name)
    mkdir(step_dir)
    # NOTE(review): this block reads `tools_dir`, whereas the sibling *_pre
    # functions read `tool_default_dir` -- confirm both names exist.
    commands = [
        "## blat mapping",
        "perl %s/blatprot.pl /data_center_09/Project/lixr/00.DATA/CAZY_DB/db.list %s/../05.gene_catalog/gene_catalog.split.list %s" % (tools_dir, step_dir, step_dir),
        "nohup /data_center_03/USER/zhongwd/bin/qsge --queue all.q --memery 6G --jobs 10 --prefix CAZY --lines 1 shell/blat.sh &",
    ]
    print(gettime("end 12.cazy_pre"))
    return commands
def eggnog_pre(config, name):
    """Return the blat-mapping commands that precede the 08.eggnog step.

    Calls ``mkdir`` for ``<dirname(config)>/<name>``; the commands are only
    collected, not run.

    :param config: path of the pipeline config file (only its directory is used).
    :param name: step directory name under the config directory.
    :return: list of shell command strings in execution order.
    """
    # The original printed "end 08.eggnog_pre" here, so the log showed two
    # "end" lines and no start marker for this step.
    print(gettime("start 08.eggnog_pre"))
    commands = []
    work_dir = '%s/%s' % (os.path.dirname(config), name)
    mkdir(work_dir)
    commands.append("## blat mapping")
    commands.append(
        "perl %s/blatprot.pl /data_center_06/Project/pracrice/yehaocheng_20160120/08.eggnog/db.list %s/../05.gene_catalog/gene_catalog.split.list %s"
        % (tool_default_dir, work_dir, work_dir))
    commands.append(
        "nohup /data_center_03/USER/zhongwd/bin/qsge --queue all.q --memery 6G --jobs 10 --prefix NOG --lines 1 %s/shell/blat.sh &"
        % work_dir)
    print(gettime("end 08.eggnog_pre"))
    return commands
def gene_predict_pre(config, name):
    """Return the commands that launch gene prediction (04.gene_predict).

    Reads ``[param] ins_list`` from *config*, calls ``mkdir`` for
    ``<dirname(config)>/<name>`` and collects the ``GenePredict.pl`` call and
    the background job submission.

    :param config: path of the pipeline config file.
    :param name: step directory name under the config directory.
    :return: list of shell command strings in execution order.
    """
    print(gettime("start 04.gene_predict_pre"))
    step_dir = '%s/%s' % (os.path.dirname(config), name)

    parser = ConfigParser()
    parser.read(config)
    # The value is not used below; the lookup is kept so that a missing
    # section/option still raises exactly as before.
    parser.get("param", "ins_list")
    mkdir(step_dir)

    commands = [
        "## gene_predict",
        "perl %s/GenePredict.pl -s %s/../03.assembly/scaftigs.list -l 100 -d %s" % (bin_gene_predict_default_dir, step_dir, step_dir),
        "nohup /data_center_03/USER/zhongwd/bin/qsge --queue all.q --memery 1G --jobs 10 --prefix GP --lines 2 %s/shell/predict.sh &" % step_dir,
    ]
    print(gettime("end 04.gene_predict_pre"))
    return commands
def html(config, sh_file, name):
    """Prepare the HTML report directory and return the report commands.

    The directory that contains *sh_file* is emptied with ``rm -rf``,
    re-populated with the result/html/json structure folders and the
    structure files referenced by ``const``, and the python3 commands that
    build the report are returned.

    :param config: path of the pipeline config file; ``[html] group`` and
        ``[param] group_dir`` are read from it.
    :param sh_file: path of the step's shell script; its directory is the
        report work directory.
    :param name: unused; kept for interface compatibility.
    :return: list of shell command strings (not executed here).
    :raises ValueError: if the work directory resolves to the filesystem root.
    """
    print(gettime("start html"))
    commands = []

    # Resolve to an absolute directory.  With a bare file name
    # os.path.dirname() returns '' and the original work_dir became '/',
    # i.e. the cleanup below ran `rm -rf //*`.
    script_dir = os.path.dirname(os.path.abspath(sh_file))
    if script_dir == os.path.abspath(os.sep):
        raise ValueError("refusing to use the filesystem root as html work dir: %r" % sh_file)
    work_dir = '%s/' % script_dir
    if glob.glob(work_dir + '/*'):
        os.system('rm -rf %s/*' % work_dir)

    # Read the group names.
    config_gene = ConfigParser()
    config_gene.read(config)
    groups = re.split(r"\s+|\t|,\t+|,\s*", config_gene.get("html", "group"))
    group_dir = config_gene.get("param", "group_dir").strip()

    # Prepare the directory layout and the structure config files.
    os.system("mkdir -p %s/result/result/" % work_dir)
    os.system("mkdir -p %s/result/html/" % work_dir)
    os.system("cp -r %s/json_structure/html_material/ %s/result/html/" % (bin_html_default_dir, work_dir))
    os.system("mkdir %s/data/" % work_dir)
    os.system("mkdir %s/result_structure/" % work_dir)
    os.system("mkdir %s/html_structure/" % work_dir)
    os.system("mkdir %s/json_structure/" % work_dir)
    os.system("cp %s %s/result_structure/" % (const.result_structure, work_dir))
    os.system("cp %s %s/html_structure/" % (const.html_structure, work_dir))
    os.system("cp %s %s/json_structure/" % (const.json_structure, work_dir))

    commands.append("/data_center_01/home/mas/python3.6/bin/python3 %s/result_structure/check_result_structure.py -g %s -c %s/result_structure/result_structure -o %s/ -so %s/result_structure/result_structure.new"
                    % (bin_html_default_dir, config, work_dir, work_dir, work_dir))
    # Emitted as shell comments: optional variant that also writes the
    # ".extra" structure file.
    commands.append("# 复制标准结果额外的文件夹\n# /data_center_01/home/mas/python3.6/bin/python3 %s/result_structure/check_result_structure.py -g %s -c %s/result_structure/result_structure -o %s/ -so %s/result_structure/result_structure.new -eo %s/result_structure/result_structure.extra"
                    % (bin_html_default_dir, config, work_dir, work_dir, work_dir, work_dir))
    commands.append("/data_center_01/home/mas/python3.6/bin/python3 %s/result_structure/cp_result_structure.py -c %s/result_structure/result_structure.new -so %s/result/result/ -do %s/data/"
                    % (bin_html_default_dir, work_dir, work_dir, work_dir))
    commands.append("/data_center_01/home/mas/python3.6/bin/python3 %s/html_structure/check_html_structure.py -c %s/html_structure/html_structure -p %s -o %s/html_structure/html_config/ -os html_structure -g %s"
                    % (bin_html_default_dir, work_dir, config, work_dir, group_dir))
    for group in groups:
        commands.append("/data_center_01/home/mas/python3.6/bin/python3 %s/html_structure/cp_html_structure.py -c %s/html_structure/html_config/%s_html_structure -o %s/result/html/html_material/images/%s/"
                        % (bin_html_default_dir, work_dir, group, work_dir, group))
    commands.append("/data_center_01/home/mas/python3.6/bin/python3 %s/json_structure/00.getJson.py -p %s -c %s/json_structure/json_structure -g %s -o %s/json_structure/json_structure.json -r %s/result/html/html_material/images/"
                    % (bin_html_default_dir, config, work_dir, group_dir, work_dir, work_dir))
    commands.append("/data_center_01/home/mas/python3.6/bin/python3 %s/json_structure/parse_html.py -j %s/json_structure/json_structure.json -t %s/json_structure/html_templates/ -o %s/result/html/"
                    % (bin_html_default_dir, work_dir, bin_html_default_dir, work_dir))
    print(gettime("end html"))
    return commands
def cag(config, name):
    """Prepare the 11.cag step: one ``cag_pre.sh`` script per eligible group.

    For every group name in ``[param] group`` a sub-directory is created under
    ``<dirname(config)>/<name>``.  Groups whose
    ``material/<group>_group.list`` holds fewer than 20 samples get a
    ``Sample_not_enough.log``; the others get their group list copied in and a
    ``cag_pre.sh`` script written.

    :param config: path of the pipeline config file.
    :param name: step directory name under the config directory.
    :return: None (scripts are written to disk instead of being returned).
    """
    print(gettime("start 11.cag"))
    main_dir = os.path.dirname(config)
    work_dir = '%s/%s' % (main_dir, name)
    material_dir = '%s/material' % main_dir
    if not os.path.isdir(work_dir):
        mkdir(work_dir)

    config_group = ConfigParser()
    config_group.read(config)
    # Split on commas and/or whitespace.  The previous pattern ended with an
    # empty alternative, which splits between every character on
    # Python >= 3.7, and produced empty names for input such as "A , B".
    raw_groups = config_group.get('param', 'group')
    group = [g for g in re.split(r'\s*,\s*|\s+', raw_groups.strip()) if g]

    for subgroup_name in group:
        subgroup_dir = "%s/%s" % (work_dir, subgroup_name)
        mkdir(subgroup_dir)
        _, _, sample_num_total, _ = parse_group(
            "%s/%s_group.list" % (material_dir, subgroup_name))
        if sample_num_total < 20:
            # `with` guarantees the log is flushed and closed; the original
            # wrote `log.close` without calling it.
            with open("%s/Sample_not_enough.log" % subgroup_dir, "w+") as log:
                log.write("The minimum sample size (20) is not met.")
            continue

        os.system("cp %s/%s_group.list %s/%s/group.list"
                  % (material_dir, subgroup_name, work_dir, subgroup_name))
        grp_sh = [
            "python %s/full_CAG.py -p %s/../06.gene_profile/gene.profile -d %s/%s -g %s/%s/group.list"
            % (bin_cag_default_dir, work_dir, work_dir, subgroup_name, work_dir, subgroup_name),
            "python %s/cag_taxonomy.py -i %s/%s/outfile/cag -g %s/../05.gene_catalog/gene_catalog.fna -o %s/%s/taxonomy/"
            % (bin_cag_default_dir, work_dir, subgroup_name, work_dir, work_dir, subgroup_name),
            "python %s/cag_exe_sequence.py -d %s/%s" % (bin_cag_default_dir, work_dir, subgroup_name),
            "\n",
        ]
        with open('%s/cag_pre.sh' % subgroup_dir, 'w') as outf:
            outf.write('\n'.join(grp_sh))
    print(gettime("end 11.cag"))
def raw_reads(config, name):
    """Return the QC command of the raw-reads step.

    Calls ``mkdir`` for ``<dirname(config)>/<name>`` and collects a single
    ``QC_main.py`` invocation; nothing is executed here.

    :param config: path of the pipeline config file, also passed to ``-p``.
    :param name: step directory name under the config directory.
    :return: list containing the one shell command string.
    """
    print(gettime("start raw_reads"))
    main_dir = os.path.dirname(config)
    mkdir('%s/%s' % (main_dir, name))
    qc_command = ('python %s/QC_main.py -b %s/material/batch.list -c %s/material/config.list -p %s'
                  % (bin_default_dir, main_dir, main_dir, config))
    # A Q20/Q30 statistics block (Q20_Q30_stat*.py) was commented out here in
    # the original source and is still not emitted.
    print(gettime("end raw_reads"))
    return [qc_command]
def use_snakemake_method(config, name):
    """Set up the snakemake-based taxon profile and return its commands.

    Writes project-specific ``config.yaml`` and ``cluster.yaml`` (derived from
    the templates ``const.config_yaml`` / ``const.cluster_yaml``) into the
    work directory, copies the Snakefile, calls ``mkdir`` for the log and
    profile directories, and collects the shell commands.

    :param config: path of the pipeline config file (only its directory is used).
    :param name: step directory name under the config directory.
    :return: ``(work_dir, commands)``.
    """
    print(gettime('start create snakemake step script'))
    commands = []
    main_dir = os.path.dirname(config)
    work_dir = '%s/%s/preprocess_for_taxon_profile/snakemake_method' % (main_dir, name)
    mkdir(work_dir)

    # Update config.yaml.  safe_load replaces yaml.load(inf): calling load
    # without a Loader can construct arbitrary objects and raises TypeError
    # on PyYAML >= 6.
    with open(const.config_yaml, 'r') as inf:
        data = yaml.safe_load(inf)
    data['clean_reads_dir'] = '%s/01.clean_reads' % main_dir
    data['clean_reads_list'] = '%s/clean_reads.list' % work_dir
    data['outdir'] = '%s/alignment' % work_dir
    with open('%s/config.yaml' % work_dir, 'w') as outf:
        yaml.dump(data, outf, default_flow_style=False)

    # Update cluster.yaml: point every rule's qsub log at this work dir.
    with open(const.cluster_yaml, 'r') as inf:
        data = yaml.safe_load(inf)
    data['__default__']['qsublog'] = '%s/log/' % work_dir
    data['align']['qsublog'] = '%s/log/align/' % work_dir
    data['abund']['qsublog'] = '%s/log/abund/' % work_dir
    data['abund_profile']['qsublog'] = '%s/log/' % work_dir
    with open('%s/cluster.yaml' % work_dir, 'w') as outf:
        yaml.dump(data, outf, default_flow_style=False)

    # Prepare files and directories.
    os.system('cp %s %s/Snakefile' % (const.snakemake, work_dir))
    mkdir('%s/log/align/' % work_dir, '%s/log/abund/' % work_dir, '%s/profile' % work_dir)

    commands.append('cp %s/01.clean_reads/clean_reads.list .' % main_dir)
    commands.append('## calculate abundance')
    commands.append('source activate /data_center_03/USER/huangy/soft/MAIN/anaconda2/envs/gutbio')
    commands.append('snakemake --cluster-config cluster.yaml --cluster \'qsub -o {cluster.qsublog} -e {cluster.qsublog} -l vf={cluster.vf} -q {cluster.queue}\' -j 40 --nolock')
    commands.append('source deactivate')
    commands.append('## form species profile')
    commands.append('ls alignment/*/*root.abundance >abund.list')
    commands.append('python %s/02_taxonomy.py -i abund.list' % bin_dir)
    commands.append('rm abund.list')
    commands.append('for i in all phylum class order family genus species; do ls alignment/*/*$i.abundance |perl %s/201_profile - >profile/$i.profile; done' % bin_dir)
    commands.append('num=1;for i in phylum class order family genus species; do let num=num+1; python %s/201_profile_convert.py -i profile/$i.profile -o profile/otu_table_L$num.txt; done' % bin_dir)
    commands.append('ls profile/* | while read a; do cp $a ../../taxon_profile; done')
    commands.append('## reads use rate')
    # The read-use-rate commands are emitted as shell comments (leading '#').
    commands.append('#ls alignment/*/*.MATCH.logs >match_logs.list')
    commands.append('#python %s/201_use_rate.py -i match_logs.list -o use_rate.stat.tsv -clean %s/00.raw_reads/qc_stat.tsv' % (bin_dir, main_dir))
    commands.append('#rm match_logs.list')
    return work_dir, commands
def assembly_soapdenove(config, name):
    """Return the post-assembly commands for the SOAPdenovo route.

    Collects the commands that extract scaftigs, pick the best assembly per
    sample, draw length histograms and package the results.  Only the
    ``histogram/`` sub-directory is passed to ``mkdir`` here.

    :param config: path of the pipeline config file (only its directory is used).
    :param name: step directory name under the config directory.
    :return: ``(work_dir, commands)``.
    """
    print(gettime("start 03.assembly soapdenove method"))
    main_dir = os.path.dirname(config)
    work_dir = '%s/%s/preprocess_for_assembly/soapdenove' % (main_dir, name)

    # Best contigs per sample.
    commands = [
        "## best contigs",
        'ls assembly/*/*/*scafSeq |while read a; do perl %s/scaftigs.pl $a 500 ${a%%.*}.scaftigs.fna ${a%%.*}.scaftigs.stat; done' % bin_dir,
        "/data_center_03/USER/zhongwd/bin/list assembly/*/* >%s/list.txt" % work_dir,
        "python %s/best_scaftigs_selecter.py -i %s/list.txt -o %s/best_scaftigs" % (bin_dir, work_dir, work_dir),
        "rm %s/list.txt" % work_dir,
        "/data_center_03/USER/zhongwd/bin/list best_scaftigs/*stat | perl %s/stat.pl > %s/scaftigs.best.stat.tsv" % (bin_dir, work_dir),
        "## histogram",
    ]
    mkdir("%s/histogram/" % work_dir)
    commands += [
        "ls best_scaftigs/*.scaftigs.fna | sed 's#best_scaftigs/\(.*\).fna#\\1#g' | while read a; do lengthfasta best_scaftigs/$a.fna >histogram/$a.length; done",
        "ls histogram/*.scaftigs.length |while read a; do Rscript %s/scaftigs_length.R $a ${a%%.*}.histogram.pdf; done" % bin_dir,
        "ls histogram/*.pdf |while read a; do convert -density 300 $a ${a%%.*}.png; done",
        "## upload",
        "ls best_scaftigs/*fna |while read a ; do gzip -c $a >${a%%.*}.fna.gz; done",
        "md5sum best_scaftigs/*.gz > best_scaftigs/scaftigs.md5",
        'ls best_scaftigs/*scaftigs.fna | while read a;do b=${a##*/};echo -e "${b%%.*}\\t`pwd $a`/$a";done > ../../scaftigs.list',
    ]
    print(gettime("end 03.assembly"))
    return work_dir, commands
def use_megahit_version(config, name):
    """Return the work directory and commands that launch a megahit assembly.

    Calls ``mkdir`` for
    ``<dirname(config)>/<name>/preprocess_for_assembly/megahit``; the commands
    are only collected, not run.

    :param config: path of the pipeline config file (only its directory is used).
    :param name: step directory name under the config directory.
    :return: ``(work_dir, commands)``.
    """
    print(gettime('start create megahit step script'))
    main_dir = os.path.dirname(config)
    work_dir = '%s/%s/preprocess_for_assembly/megahit' % (main_dir, name)
    mkdir(work_dir)
    # Reference run: /data_center_11/Project/wenpp/01.wujianrong_20180822/03.assembly/assembly_megahit
    commands = [
        "## assembly",
        "cp %s/01.clean_reads/clean_reads.list %s" % (main_dir, work_dir),
        "perl %s/megahit_shell_maker.pl -l clean_reads.list -d %s" % (bin_dir, work_dir),
        "nohup /data_center_03/USER/zhongwd/bin/qsge --queue neo.q --memery 30G --jobs 2 --lines 1 --prefix megahit shell/assembly.sh &",
    ]
    print(gettime("end assembly_pre"))
    return work_dir, commands
def use_soapdenove_method(config, name):
    """Return the work directory and commands that launch a SOAPdenovo assembly.

    Calls ``mkdir`` for
    ``<dirname(config)>/<name>/preprocess_for_assembly/soapdenove`` and reads
    ``[param] ins_list`` from *config*, which is passed to the shell maker
    via ``-i``.

    :param config: path of the pipeline config file.
    :param name: step directory name under the config directory.
    :return: ``(work_dir, commands)``.
    """
    print(gettime('start create soapdenove step script'))
    main_dir = os.path.dirname(config)
    work_dir = '%s/%s/preprocess_for_assembly/soapdenove' % (main_dir, name)
    mkdir(work_dir)

    parser = ConfigParser()
    parser.read(config)
    insert_sizes = parser.get("param", "ins_list")

    commands = [
        "## assembly",
        "cp %s/01.clean_reads/clean_reads.list %s/" % (main_dir, work_dir),
        "perl %s/soapdenovo_shell_maker.pl -l clean_reads.list -i %s -minkmer 51 -maxkmer 63 -b 4 -d %s/" % (bin_dir, insert_sizes, work_dir),
        "nohup /data_center_03/USER/zhongwd/bin/qsge --queue big.q:all.q:all.q:all.q --memery 100G:5G:10G:3G --jobs 2 --lines 4 --prefix AS shell/assembly.sh &",
    ]
    print(gettime("end assembly_pre"))
    return work_dir, commands
def gene_profile(config, name):
    """Assemble the shell commands of the ``06.gene_profile`` step.

    The commands are only collected as strings.  As a side effect the output
    directories (``01.alpha_diversity/``, ``group/`` and, per group,
    ``01.alpha_diversity``, ``02.anosim`` and ``03.diff``) are created with
    ``mkdir``.

    The group names come from the ``group`` option of the ``[param]`` section
    of ``config``; for each name the group file is
    ``<dirname(config)>/material/<name>_group.list``.

    Args:
        config: Path of the pipeline configuration file.
        name: Name of the step directory below the configuration directory.

    Returns:
        The ordered list of shell command strings.
    """
    print(gettime("start 06.gene_profile"))
    project_root = os.path.dirname(config)
    work_dir = '%s/%s' % (project_root, name)

    commands = [
        "## calculate gene abundance",
        "ls %s/alignment/*/*abundance |perl %s/profile.pl - > %s/gene.profile"
        % (work_dir, tool_default_dir, work_dir),
        "## 01.alpha diversity",
    ]
    mkdir("%s/01.alpha_diversity/" % work_dir)
    commands.append(
        command_default +
        "perl %s/shannon.pl %s/gene.profile %s/01.alpha_diversity/gene.alpha.div.tsv"
        % (tool_default_dir, work_dir, work_dir))

    settings = ConfigParser()
    settings.read(config)
    group_names = re.split("\s+|\t+|,\s*|,\t+", settings.get("param", "group"))
    mkdir("%s/group" % work_dir)

    for subgroup_name in group_names:
        group_list = '%s/material/%s_group.list' % (project_root, subgroup_name)
        # The four counts are unpacked but no active step below reads them.
        (sample_num_in_groups, min_sample_num_in_groups,
         sample_num_total, group_num) = parse_group(group_list)
        commands.append(
            "## ----------------------------------%s----------------------"
            % (subgroup_name))

        # 01. alpha diversity plots (the gene-number plots were added on
        # 2018-10-25).
        alpha_dir = "%s/group/%s/01.alpha_diversity" % (work_dir, subgroup_name)
        mkdir(alpha_dir)
        commands.extend([
            "##01.alpha diversity",
            command_default +
            "Rscript %s/gene.alpha.div.R %s/01.alpha_diversity/gene.alpha.div.tsv %s %s/gene.alpha.div.pdf"
            % (bin_gene_profile_default_dir, work_dir, group_list, alpha_dir),
            "convert -density 300 %s/gene.alpha.div.pdf %s/gene.alpha.div.png"
            % (alpha_dir, alpha_dir),
            command_default +
            "python %s/gene.alpha.div.py -i %s/01.alpha_diversity/gene.alpha.div.tsv -g %s -o %s/gene.num.tvs"
            % (bin_gene_profile_default_dir, work_dir, group_list, alpha_dir),
            command_default +
            "Rscript %s/01.alpha_diversity.gene.num.R %s/gene.num.tvs %s/gene.num.pdf"
            % (bin_gene_profile_default_dir, alpha_dir, alpha_dir),
            "convert -density 300 %s/gene.num.pdf %s/gene.num.png"
            % (alpha_dir, alpha_dir),
        ])

        # 02. anosim
        anosim_dir = "%s/group/%s/02.anosim" % (work_dir, subgroup_name)
        mkdir(anosim_dir)
        commands.extend([
            "##02.anosim",
            command_default +
            "python %s/t04_anosim.py -i %s/gene.profile -g %s -o %s"
            % (tool_default_dir, work_dir, group_list, anosim_dir),
        ])

        # 03. differential genes.  A LEfSe step used to sit here; it is
        # disabled (the old note reads "Error: protect(): protection stack
        # overflow"), and "03.diff" took over its slot.
        diff_dir = "%s/group/%s/03.diff" % (work_dir, subgroup_name)
        commands.append("##03.diff")
        mkdir(diff_dir)
        commands.extend([
            command_default +
            "python %s/t08_diff.py -i %s/gene.profile -g %s -o %s/"
            % (tool_default_dir, work_dir, group_list, diff_dir),
            '''awk -F "\\t" '{print $1"\\t"$7"\\t"$8"\\t"$9}' %s/diff.marker.filter.tsv|sed '1a Gene ID\\tP-value\\tQ-value\\tGroup'|sed '1d' > %s/diff.stat.tsv'''
            % (diff_dir, diff_dir),
            "##03.diff/diff_boxplot",
            command_default +
            "python %s/t09_diff_boxplot.py -i %s/diff.marker.filter.profile.tsv -p %s/diff.marker.filter.tsv -g %s -o %s/diff_boxplot/"
            % (tool_default_dir, diff_dir, diff_dir, group_list, diff_dir),
            "## diff_qvalue",
            "Rscript %s/qvalue.R %s/diff.marker.tsv %s/qvalue.pdf"
            % (bin_gene_profile_default_dir, diff_dir, diff_dir),
            "convert -density 300 %s/qvalue.pdf %s/qvalue.png"
            % (diff_dir, diff_dir),
        ])

        # The former 04.mgs and 05.cag sub-steps (full_MGS.py / full_CAG.py
        # and their taxonomy scripts) are disabled in this function.

    print(gettime("end 06.gene_profile"))
    return commands
def taxon(config, name):
    """Run the ``samples`` and ``group`` generators for the taxonomy step.

    Both helpers are called with the same ``(config, name)`` pair, first
    ``samples`` and then ``group``.  Their return values are discarded, so
    this function returns ``None``.

    Args:
        config: Path of the pipeline configuration file.
        name: Name of the step directory.
    """
    # Log message previously read "nalysis".
    print(gettime('start create analysis step script'))
    samples(config, name)
    group(config, name)
def read_params(args):
    """Parse the launcher's command line.

    The only option is the required ``--config FILE``, stored as
    ``config_path`` on the returned namespace.

    Args:
        args: Not consulted.  ``parse_args()`` is called without arguments,
            so argparse reads ``sys.argv[1:]`` regardless of what is passed.

    Returns:
        The ``argparse.Namespace`` produced by ``parse_args()``.
    """
    cli = argparse.ArgumentParser(
        description='''The initial run script of metagene ''')
    cli.add_argument(
        '--config',
        dest='config_path',
        metavar='FILE',
        type=str,
        required=True,
        help="config file for metagenome pipeline")
    return cli.parse_args()


if __name__ == '__main__':
    print(gettime("start"))
    step_names_order = const.step_names_order
    params = read_params(sys.argv)
    config_path = params.config_path  # name of the configuration file

    # Parser class used to read and modify the configuration file.
    # NOTE(review): other functions in this file call ``ConfigParser()``
    # directly; only one of the two spellings can match the import - confirm.
    config = ConfigParser.ConfigParser()
    config.read(config_path)  # load the configuration file

    work_dir = config.get('param', 'work_dir')
    step_names = re.split(',\s*|,\t+|\t|\s+',
                          config.get('step', 'step_names_order'))
    step_names_all = step_names_order.split(",")
    steps = []
    # Announce every requested step that is a known step name.
    for i, name in enumerate(step_names):
        if name in step_names_all:
            print(gettime("start create %s step script" % name))
def kegg(config, name):
    """Assemble the shell commands of the ``07.kegg`` step.

    The commands are only collected as strings, in the order in which they
    are meant to be written to the step script.  As a side effect the output
    directories are created with ``mkdir``.

    The group names come from the ``group`` option of the ``[param]`` section
    of ``config``; for each name the group file is
    ``<dirname(config)>/material/<name>_group.list`` and ``parse_group``
    supplies the sample/group counts that gate the individual analyses.
    When a threshold is not met, ``samp_num_enough(directory, message)`` is
    called instead of emitting commands (presumably it records the reason in
    that directory - confirm against its definition).

    Args:
        config: Path of the pipeline configuration file.
        name: Name of the step directory below the configuration directory.

    Returns:
        The ordered list of shell command strings.
    """
    print gettime("start 07.kegg")
    commands = []
    work_dir = '%s/%s' % (os.path.dirname(config), name)

    # ---- whole-catalog annotation: merge blat hits, pick hits, map to KO ----
    commands.append("## whole kegg analysis")
    commands.append("rm %s/blat/all.m8"%work_dir)
    commands.append("cat %s/blat/* > %s/blat/all.m8"%(work_dir,work_dir))
    commands.append(command_default + "python %s/701_pick_blast_m8.py -i %s/blat/all.m8 -o %s/kegg.m8"%\
                    (bin_kegg_default_dir,work_dir,work_dir))
    commands.append(command_default + "perl %s/prokaryote.annotation.pl < %s/kegg.m8 > %s/kegg.anno.tsv"%\
                    (bin_kegg_default_dir,work_dir,work_dir))
    commands.append(command_default + "cut -f2 %s/kegg.m8|sort|uniq >%s/sort_uniq_m8.list"%\
                    (work_dir,work_dir))
    commands.append(command_default + "python %s/702_blast2ko_v2.py -i %s/kegg.m8 -o %s/gene_catalog.ko --subjectId %s/sort_uniq_m8.list"%\
                    (bin_kegg_default_dir,work_dir,work_dir,work_dir))
    commands.append(command_default + "perl /data_center_02/Database/KEGG/bin/07_keggMap_nodiff.pl -ko %s/gene_catalog.ko -outdir %s/gene_catalog.map"%\
                    (work_dir,work_dir))
    commands.append(command_default + "perl /data_center_02/Database/KEGG/bin/06_pathfind.pl -fg %s/gene_catalog.ko -output %s/gene_catalog.path -cutoff 0.0"%\
                    (work_dir,work_dir))
    # NOTE(review): the last argument is a bare "gene_catalog.path" without
    # work_dir, unlike every other path here - confirm this is intended.
    commands.append(command_default + "perl %s/10_KEGG_class.pl %s/gene_catalog.path gene_catalog.path"%\
                    (bin_kegg_default_dir,work_dir))

    # Disabled stubs for "group analysis" (core.pl) and "diff analysis"
    # headers used to sit here.

    # ---- KO profile ----
    commands.append(command_default + "python %s/04_get_profiling_ko.py -i %s/gene_catalog.ko --gene_profile %s/../06.gene_profile/gene.profile -o %s/ko.profile"%\
                    (bin_kegg_default_dir,work_dir,work_dir,work_dir))

    # ---- 701: KEGG function statistics ----
    work_dir_701 = "%s/01.kegg_class/"%work_dir
    mkdir(work_dir_701)
    commands.append(command_default + "Rscript %s/701_KEGG_class.R %s/gene_catalog.path.class %s/701_KEGG_class.pdf"\
                    %(bin_kegg_default_dir,work_dir,work_dir_701))
    commands.append("convert -density 300 %s/701_KEGG_class.pdf %s/701_KEGG_class.png"\
                    %(work_dir_701,work_dir_701))

    # ---- 712: level-1 / level-2 functional barplots over all samples ----
    commands.append("## 712.function_barplot")
    work_dir_712 = "%s/12.functional_barplot"%work_dir
    mkdir(work_dir_712)
    commands.append(command_default + "python %s/04_get_profiling_level1.py -i %s/gene_catalog.path -k %s/ko.profile -o %s"\
                    %(bin_kegg_default_dir,work_dir,work_dir,work_dir_712))
    commands.append("Rscript %s/710_level1_barplot.R %s/kegg_level1_profile.txt %s/level1_barplot.pdf 1"\
                    %(bin_kegg_default_dir,work_dir_712,work_dir_712))
    commands.append("convert -density 300 %s/level1_barplot.pdf %s/level1_barplot.png"\
                    %(work_dir_712,work_dir_712))
    commands.append("Rscript %s/710_level1_barplot.R %s/kegg_level2_profile.txt %s/level2_barplot.pdf 2"\
                    %(bin_kegg_default_dir,work_dir_712,work_dir_712))
    commands.append("convert -density 300 %s/level2_barplot.pdf %s/level2_barplot.png"\
                    %(work_dir_712,work_dir_712))

    # ---- per-group analyses ----
    config_gene = ConfigParser()
    config_gene.read(config)
    group = re.split("\s+|\t|,\s*|,\t+",config_gene.get("param","group"))
    for subgroup_name in group:
        subgroup = '%s/material/%s_group.list' % (os.path.dirname(config), subgroup_name)
        sample_num_in_groups,min_sample_num_in_groups,sample_num_total,group_num=parse_group(subgroup)
        commands.append("## ----------------------------------%s----------------------"%(subgroup_name))

        # heatmap & pca & pcoa: directories always exist, commands only when
        # there are at least 5 samples in total.
        work_dir_702 = "%s/group/%s/02.heatmap/"%(work_dir,subgroup_name)
        mkdir(work_dir_702)
        work_dir_703 = "%s/group/%s/03.pca/"%(work_dir,subgroup_name)
        mkdir(work_dir_703)
        work_dir_704 = "%s/group/%s/04.pcoa/"%(work_dir,subgroup_name)
        mkdir(work_dir_704)
        if sample_num_total>=5:
            commands.append("##heatmap")
            commands.append(command_default + "python %s/t06_heatmap.py -i %s/ko.profile -g %s -o %s"\
                            %(tool_default_dir,work_dir,subgroup,work_dir_702))
            commands.append("##pca")
            commands.append(command_default + "python %s/t01_pca.py -i %s/ko.profile -g %s -o %s --with_boxplot "\
                            %(tool_default_dir,work_dir,subgroup,work_dir_703))
            commands.append("##pcoa")
            commands.append(command_default + "python %s/t02_pcoa.py -i %s/ko.profile -g %s -o %s"\
                            %(tool_default_dir,work_dir,subgroup,work_dir_704))
        else:
            log = "The minimum sample size (5) is not met."
            samp_num_enough(work_dir_702,log)
            samp_num_enough(work_dir_703,log)
            samp_num_enough(work_dir_704,log)

        # nmds & anosim & adonis & mrpp: need at least 5 samples in the
        # smallest group.
        work_dir_705 = "%s/group/%s/05.nmds/"%(work_dir,subgroup_name)
        mkdir(work_dir_705)
        work_dir_706 = "%s/group/%s/06.anosim/"%(work_dir,subgroup_name)
        mkdir(work_dir_706)
        work_dir_706_1 = "%s/group/%s/07.adonis/"%(work_dir,subgroup_name)
        mkdir(work_dir_706_1)
        work_dir_707 = "%s/group/%s/08.mrpp/"%(work_dir,subgroup_name)
        mkdir(work_dir_707)
        if min_sample_num_in_groups>=5:
            commands.append("##nmds")
            commands.append(command_default + "python %s/t03_nmds.py -i %s/ko.profile -g %s -o %s"\
                            %(tool_default_dir,work_dir,subgroup,work_dir_705))
            commands.append("##anosim")
            commands.append(command_default + "python %s/t04_anosim.py -i %s/ko.profile -g %s -o %s"\
                            %(tool_default_dir,work_dir,subgroup,work_dir_706))
            commands.append("##adonis")
            commands.append(command_default + "python %s/t12_adonis_pca.py -i %s/ko.profile -g %s -o %s"\
                            %(tool_default_dir,work_dir,subgroup,work_dir_706_1))
            commands.append(command_default + "python %s/t12_adonis_pcoa.py -i %s/ko.profile -g %s -o %s"\
                            %(tool_default_dir,work_dir,subgroup,work_dir_706_1))
            commands.append("##mrpp")
            commands.append(command_default + "python %s/t05_mrpp.py -i %s/ko.profile -g %s -o %s"\
                            %(tool_default_dir,work_dir,subgroup,work_dir_707))
        else:
            log = "min_sample_num_in_groups >= 5"
            samp_num_enough(work_dir_705,log)
            samp_num_enough(work_dir_706,log)
            samp_num_enough(work_dir_706_1,log)
            samp_num_enough(work_dir_707,log)

        # flower | venn: flower plot for 6..29 groups, venn for 2..5 groups,
        # nothing otherwise.
        if group_num>=6 and group_num<30:
            work_dir_708_1 = "%s/group/%s/09.flower/"%(work_dir,subgroup_name)
            mkdir(work_dir_708_1)
            commands.append("##flower")
            commands.append(command_default + "perl %s/t07_flower.pl %s/ko.profile %s %s"\
                            %(tool_default_dir,work_dir,subgroup,work_dir_708_1))
        elif group_num>=2 and group_num<6:
            work_dir_708_2 = "%s/group/%s/09.venn/"%(work_dir,subgroup_name)
            mkdir(work_dir_708_2)
            commands.append("##venn")
            commands.append(command_default + "python %s/t07_venn_flower.py -i %s/ko.profile -o %s -g %s --with_group "%\
                            (tool_default_dir,work_dir,work_dir_708_2,subgroup))

        # ko_wilcoxon & ko_lefse: need at least 5 samples in the smallest
        # group.
        work_dir_709 = "%s/group/%s/10.ko_wilcoxon/"%(work_dir,subgroup_name)
        mkdir(work_dir_709)
        work_dir_710 = "%s/group/%s/11.ko_lefse"%(work_dir,subgroup_name)
        mkdir(work_dir_710)
        if min_sample_num_in_groups>=5:
            # The "09.x" labels below are emitted as written even though the
            # directory is now 10.ko_wilcoxon.
            commands.append("##09.0 diff")
            commands.append(command_default + "python %s/t08_diff.py -i %s/ko.profile -g %s -o %s"\
                            %(tool_default_dir,work_dir,subgroup,work_dir_709))
            commands.append(command_default + "python %s/ko_description.py -i %s/diff.marker.filter.tsv -o %s/diff.marker.filter.definition.tsv --ko_def /data_center_09/Project/lixr/00.DATA/KEGG_DB/ko_description.tab"%(bin_kegg_default_dir,work_dir_709,work_dir_709))
            commands.append("#09.1 diff boxplot")
            commands.append(command_default + "python %s/t09_diff_boxplot.py -i %s/diff.marker.filter.profile.tsv -p %s/diff.marker.filter.tsv -g %s -o %s/diff_boxplot/"\
                            %(tool_default_dir,work_dir_709,work_dir_709,subgroup,work_dir_709))
            commands.append("#09.2 diff heatmap")
            commands.append(command_default + "python %s/t06_heatmap.py -i %s/diff.marker.filter.profile.tsv -g %s -o %s/heatmap/"\
                            %(tool_default_dir,work_dir_709,subgroup,work_dir_709))
            commands.append("#09.3 diff pathway")
            mkdir("%s/pathway/"%work_dir_709)
            commands.append("Rscript %s/707_compare_pathway.R %s/diff.marker.filter.profile.tsv %s %s/pathway/707_compare.txt"\
                            %(bin_kegg_default_dir,work_dir_709,subgroup,work_dir_709))
            commands.append("#09.4 diff detail pathway")
            work_dir_709_2 = "%s/detail_pathway/"%work_dir_709
            mkdir(work_dir_709_2)
            commands.append("python %s/709_diff_map.py -i %s/diff.marker.filter.tsv -ko %s/gene_catalog.ko -g %s -o %s "\
                            %(bin_kegg_default_dir,work_dir_709,work_dir,subgroup,work_dir_709_2))
            commands.append("#09.5 diff barplot")
            # Collect the gene_catalog.path rows of every differential KO
            # (the sed skips the first line of the marker table), de-duplicate
            # them, then plot.
            commands.append("cut -f 1 %s/diff.marker.filter.tsv |sed -n '2,$p' |while read line;do grep \"$line\" %s/gene_catalog.path >>%s/diff_gene_catalog.path;done"\
                            %(work_dir_709,work_dir,work_dir_709))
            commands.append("sort %s/diff_gene_catalog.path |uniq > %s/diff_gene_catalog2.path"%(work_dir_709,work_dir_709))
            commands.append("rm %s/diff_gene_catalog.path"%work_dir_709)
            mkdir("%s/path_barplot/"%work_dir_709)
            commands.append("python %s/712_ko2path_bar.py -i %s/diff_gene_catalog2.path -g %s/diff.marker.filter.tsv -o %s/path_barplot/ -l 2"\
                            %(bin_kegg_default_dir,work_dir_709,work_dir_709,work_dir_709))

            ## ko_lefse
            commands.append("## lefse")
            commands.append(command_default + "python %s/../06.gene_profile/603_LEfSe.py -i %s/ko.profile -l /data_center_03/USER/huangy/soft/LEfSe_lzb -g %s -o %s --LDA 2"\
                            %(bin_kegg_default_dir,work_dir,subgroup,work_dir_710))
            commands.append("#lefse heatmap")
            mkdir("%s/heatmap/"%work_dir_710)
            commands.append(command_default + "python %s/t06_heatmap.py -i %s/diff.marker.filter.profile.tsv -g %s -o %s/heatmap/"\
                            %(tool_default_dir,work_dir_710,subgroup,work_dir_710))
            commands.append("#lefse pathway")
            mkdir("%s/pathway/"%work_dir_710)
            commands.append("Rscript %s/707_compare_pathway.R %s/diff.marker.filter.profile.tsv %s %s/pathway/707_compare.txt"\
                            %(bin_kegg_default_dir,work_dir_710,subgroup,work_dir_710))
            commands.append("#lefse barplot")
            # Unlike the wilcoxon variant above, the first line of the marker
            # table is not skipped here.
            commands.append("cut -f 1 %s/diff.marker.filter.tsv |while read line;do grep \"$line\" %s/gene_catalog.path >>%s/diff_gene_catalog.path;done"\
                            %(work_dir_710,work_dir,work_dir_710))
            commands.append("sort %s/diff_gene_catalog.path |uniq > %s/diff_gene_catalog2.path"%(work_dir_710,work_dir_710))
            commands.append("rm %s/diff_gene_catalog.path"%work_dir_710)
            mkdir("%s/path_barplot/"%work_dir_710)
            commands.append("python %s/712_ko2path_bar.py -i %s/diff_gene_catalog2.path -g %s/diff.marker.filter.tsv -o %s/path_barplot/ -l 2"\
                            %(bin_kegg_default_dir,work_dir_710,work_dir_710,work_dir_710))
            # lefse detail pathway: only emitted for exactly two groups.
            work_dir_710_2 = "%s/detail_pathway/"%work_dir_710
            mkdir(work_dir_710_2)
            if group_num==2:
                commands.append("#lefse detail pathway")
                commands.append("python %s/709_diff_map.py -i %s/diff.marker.filter.tsv -ko %s/gene_catalog.ko -g %s -o %s "\
                                %(bin_kegg_default_dir,work_dir_710,work_dir,subgroup,work_dir_710_2))
            else:
                log = "The number of groups must be 2."
                samp_num_enough(work_dir_710_2,log)
        else:
            log = "min_sample_num_in_groups >= 5"
            samp_num_enough(work_dir_709,log)
            samp_num_enough(work_dir_710,log)

        # ko_metastats (11.ko_metastats: 708_sample2profile.py and
        # 708_metastats.R with heatmap, pathway, detail-pathway and barplot
        # follow-ups, gated on group_num==2 and min_sample_num_in_groups>=5)
        # is disabled.

        # 712: per-group functional barplots, drawn from the profiles that
        # were written to the step-level 12.functional_barplot directory.
        work_dir_712_2 = "%s/group/%s/12.functional_barplot"%(work_dir,subgroup_name)
        mkdir(work_dir_712_2)
        commands.append("##712.function_barplot")
        commands.append("Rscript %s/702_level1_barplot_withgroup.R %s/kegg_level1_profile.txt %s/level1_barplot_withgroup.pdf 1 %s"\
                        %(bin_kegg_default_dir,work_dir_712,work_dir_712_2,subgroup))
        commands.append("convert -density 300 %s/level1_barplot_withgroup.pdf %s/level1_barplot_withgroup.png"\
                        %(work_dir_712_2,work_dir_712_2))
        commands.append("Rscript %s/702_level1_barplot_withgroup.R %s/kegg_level2_profile.txt %s/level2_barplot_withgroup.pdf 2 %s"\
                        %(bin_kegg_default_dir,work_dir_712,work_dir_712_2,subgroup))
        commands.append("convert -density 300 %s/level2_barplot_withgroup.pdf %s/level2_barplot_withgroup.png"\
                        %(work_dir_712_2,work_dir_712_2))

        # 713: sample clustering on the level-1 profile.
        work_dir_713 = "%s/group/%s/13.functional_clust"%(work_dir,subgroup_name)
        mkdir(work_dir_713)
        commands.append("##713 sample cluster")
        commands.append(command_default + "python %s/t10_sample_clustering.py -i %s/kegg_level1_profile.txt -g %s -o %s/ -t \"KEGG Level1 Abundance in Samples\" "\
                        %(tool_default_dir,work_dir_712,subgroup,work_dir_713))

        # 714: ROC, fed from the LEfSe (work_dir_710) marker tables; needs at
        # least 50 samples overall and 20 in the smallest group.
        work_dir_714 = "%s/group/%s/14.ROC"%(work_dir,subgroup_name)
        mkdir(work_dir_714)
        commands.append("##714 ROC")
        if sample_num_total >= 50 and min_sample_num_in_groups >=20:
            commands.append("cut -f1 %s/diff.marker.filter.tsv >%s/diff.list;Rscript %s/710_roc.R %s/diff.marker.filter.profile.tsv %s/diff.list %s %s/710_roc.pdf"\
                            %(work_dir_710,work_dir_710,bin_kegg_default_dir,work_dir_710,work_dir_710,subgroup,work_dir_714))
        else:
            log = "sample_num_total >= 50 and min_sample_num_in_groups >=20"
            samp_num_enough(work_dir_714,log)
    print gettime("end kegg")
    return commands
def cazy(config, name):
    """Assemble the shell commands of the ``12.cazy`` step.

    The commands are only collected as strings.  As a side effect the output
    directories of every emitted sub-step are created with ``mkdir``.

    The ``group`` option of the ``[param]`` section of ``config`` is split on
    whitespace and commas; each entry is handed to ``get_name`` (whose second
    return value names the per-group output directory) and to
    ``parse_group`` (whose counts gate the individual analyses).

    Two command-generation defects of the previous version are fixed:

    * the differential-abundance command was emitted without the ``python``
      interpreter and wrote to the group directory, although the boxplot and
      heatmap commands that follow read its result files from
      ``09.ko_wilcoxon/``; it now writes there;
    * the venn command wrote to the group directory instead of the
      ``08.venn/`` directory that is created for it.

    Args:
        config: Path of the pipeline configuration file.
        name: Name of the step directory below the configuration directory.

    Returns:
        The ordered list of shell command strings.
    """
    print(gettime("start 12.cazy"))
    work_dir = "%s/%s" % (os.path.dirname(config), name)

    # ---- whole-catalog annotation and class/type/enzyme profiles ----
    commands = [
        "## whole cazy analysis",
        "rm %s/blat/all.m8" % work_dir,
        "cat %s/blat/* > %s/blat/all.m8" % (work_dir, work_dir),
        const.command_default +
        "python %s/01.get_anno_info.py -i %s/blat/all.m8 -o %s"
        % (cazy_bin_dir, work_dir, work_dir),
    ]
    for column, level in ((5, "class"), (6, "type"), (9, "enzyme")):
        commands.append(
            const.command_default +
            "python %s/02.get_profile_and_count.py -a %s/cazy.anno.tsv -p %s/../06.gene_profile/gene.profile -l %s -c %s -o %s"
            % (cazy_bin_dir, work_dir, work_dir, column, level, work_dir))

    # ---- barplots over all samples, one per annotation level ----
    commands.append("## 1212.function_barplot")
    work_dir_12 = "%s/12.functional_barplot" % work_dir
    mkdir(work_dir_12)
    for level in ["class", "type", "enzyme"]:
        commands.append(
            const.command_default +
            "Rscript %s/710_level1_barplot.R %s/%s.profile %s/%s_barplot.pdf %s"
            % (cazy_bin_dir, work_dir, level, work_dir_12, level, level))
        commands.append(
            "convert -density 300 %s/%s_barplot.pdf %s/%s_barplot.png"
            % (work_dir_12, level, work_dir_12, level))

    # ---- per-group analyses on the "type" profile ----
    config_gene = ConfigParser()
    config_gene.read(config)
    group = re.split("\s+|\t|,", config_gene.get("param", "group"))
    for subgroup in group:
        dirname, subgroup_name, _ = get_name(subgroup)
        (sample_num_in_groups, min_sample_num_in_groups,
         sample_num_total, group_num) = parse_group(subgroup)
        sub_work_dir = "%s/group/%s" % (work_dir, subgroup_name)
        commands.append(
            "## ----------------------------------%s----------------------------------##"
            % (subgroup_name))

        # heatmap / pca / pcoa: at least 5 samples in total.
        if sample_num_total >= 5:
            for subdir, header, script in (
                    ("02.heatmap/", "##heatmap", "6_heatmap.py"),
                    ("03.pca/", "##pca", "1_pca.py"),
                    ("04.pcoa/", "##pcoa", "2_pcoa.py")):
                step_dir = "%s/%s" % (sub_work_dir, subdir)
                mkdir(step_dir)
                commands.append(header)
                commands.append(
                    const.command_default +
                    "python %s/%s -i %s/type.profile -g %s -o %s"
                    % (tools_dir, script, work_dir, subgroup, step_dir))

        # nmds / anosim / mrpp: at least 5 samples in the smallest group.
        if min_sample_num_in_groups >= 5:
            for subdir, header, script in (
                    ("05.nmds/", "##nmds", "3_nmds.py"),
                    ("06.anosim/", "##anosim", "4_anosim.py"),
                    ("07.mrpp/", "##mrpp", "5_mrpp.py")):
                step_dir = "%s/%s" % (sub_work_dir, subdir)
                mkdir(step_dir)
                commands.append(header)
                commands.append(
                    const.command_default +
                    "python %s/%s -i %s/type.profile -g %s -o %s"
                    % (tools_dir, script, work_dir, subgroup, step_dir))

        # flower plot for 6..29 groups, venn for 2..5 groups.
        if group_num >= 6 and group_num < 30:
            work_dir_1208 = "%s/08.flower/" % sub_work_dir
            mkdir(work_dir_1208)
            commands.append("##flower")
            commands.append(
                const.command_default +
                "perl %s/7_flower.pl %s/type.profile %s %s"
                % (tools_dir, work_dir, subgroup, work_dir_1208))
        elif group_num >= 2 and group_num < 6:
            work_dir_1208 = "%s/08.venn/" % sub_work_dir
            mkdir(work_dir_1208)
            commands.append("##venn")
            # Fixed: write into the 08.venn/ directory created above (the
            # group directory was passed before).
            commands.append(
                const.command_default +
                "python %s/7_venn_flower.py -i %s/type.profile -o %s -g %s --with_group"
                % (tools_dir, work_dir, work_dir_1208, subgroup))

        # differential analysis and LEfSe: at least 5 samples in the
        # smallest group.
        if min_sample_num_in_groups >= 5:
            work_dir_1209 = "%s/09.ko_wilcoxon/" % sub_work_dir
            mkdir(work_dir_1209)
            commands.append("##diff")
            # Fixed: run through python and write into 09.ko_wilcoxon/, which
            # is where the two commands below look for the marker tables.
            commands.append(
                const.command_default +
                "python %s/8_diff.py -i %s/type.profile -g %s -o %s"
                % (tools_dir, work_dir, subgroup, work_dir_1209))
            commands.append("# diff boxplot")
            commands.append(
                const.command_default +
                "python %s/9_diff_boxplot.py -i %s/diff.marker.filter.profile.tsv -p %s/diff.marker.filter.tsv -g %s -o %s/diff_boxplot/"
                % (tools_dir, work_dir_1209, work_dir_1209, subgroup,
                   work_dir_1209))
            commands.append("# diff heatmap")
            commands.append(
                const.command_default +
                "python %s/6_heatmap.py -i %s/diff.marker.filter.profile.tsv -g %s -o %s/heatmap/"
                % (tools_dir, work_dir_1209, subgroup, work_dir_1209))

            work_dir_1210 = "%s/10.lefse/" % sub_work_dir
            mkdir(work_dir_1210)
            commands.append("## lefse")
            commands.append(
                const.command_default +
                "python %s/603_LEfSe.py -i %s/type.profile -l /data_center_03/USER/huangy/soft/LEfSe_lzb -g %s -o %s --LDA 2"
                % (cazy_bin_dir, work_dir, subgroup, work_dir_1210))
            commands.append("#lefse heatmap")
            commands.append(
                const.command_default +
                "python %s/6_heatmap.py -i %s/diff.marker.filter.profile.tsv -g %s -o %s/heatmap/"
                % (tools_dir, work_dir_1210, subgroup, work_dir_1210))

        # The metastats (11.metastats) and per-group functional barplot
        # (12.functional_barplot) sub-steps that used to live in this branch
        # are disabled.
        # NOTE(review): sample clustering is the only statement left under
        # this condition, so it runs only for two-group comparisons with at
        # least 5 samples per group - confirm that this gating is intended.
        if group_num == 2 and min_sample_num_in_groups >= 5:
            work_dir_1213 = "%s/13.functional_clust/" % sub_work_dir
            mkdir(work_dir_1213)
            commands.append("##712 sample cluster")
            commands.append(
                const.command_default +
                "python %s/10_sample_clustering.py -i %s/type.profile -g %s -o %s -t \"Type Abundance in Samples\""
                % (tools_dir, work_dir, subgroup, work_dir_1213))

    print(gettime("end cazy"))
    return commands