def gene_predict(config, name):
    """Assemble the shell commands of the 04.gene_predict step.

    The step directory ``<dirname(config)>/<name>`` and its ``histogram/``
    sub-directory are created through ``mkdir``.

    :param config: path of the pipeline configuration file.
    :param name: name of the step directory.
    :return: list of shell command strings, in execution order.
    """
    print(gettime("start 04.gene_predict"))
    step_dir = '%s/%s' % (os.path.dirname(config), name)

    parser = ConfigParser()
    parser.read(config)
    # The value is not used below; the lookup is kept so the [param]
    # ins_list option is still read exactly as before.
    parser.get("param", "ins_list")

    mkdir(step_dir)
    commands = [
        "ls gene/*fna | perl %s/stat.pl > orf.stat.tsv" % bin_gene_predict_default_dir,
        "ls gff/*gff | sed 's/.gff//g' | while read a ; do gzip -c $a.gff > $a.gff.gz;done",
        "ls gene/*fna | sed 's/.fna//g' | while read a ; do perl %s/cds2pep.pl $a.fna $a.faa; gzip -c $a.fna > $a.fna.gz; gzip -c $a.faa > $a.faa.gz; done"
        % tool_default_dir,
        "## histogram",
    ]

    mkdir("%s/histogram/" % step_dir)
    commands.extend([
        "cut -f 1 gene.list | while read a; do /data_center_03/USER/zhongwd/bin/lengthfasta gene/$a.gene.fna > histogram/$a.gene.length; done",
        "cut -f 1 gene.list | while read a; do Rscript %s/gene.histogram.R histogram/$a.gene.length histogram/$a.gene.histogram.pdf; done"
        % bin_gene_predict_default_dir,
        "cut -f 1 gene.list | while read a; do convert -density 300 histogram/$a.gene.histogram.pdf histogram/$a.gene.histogram.png; done",
    ])

    print(gettime("end 04.gene_predict"))
    return commands
def mgs(config, name):
    """Prepare the 10.mgs step and return its shell commands.

    For every group named in ``[param] group`` a sub-directory is created
    under ``<dirname(config)>/<name>``.  Groups whose ``parse_group`` result
    has at least 5 samples in the smallest group, at least 20 samples in
    total and exactly 2 groups get their group list copied in and the MGS
    commands appended; other groups get a ``Sample_not_enough.log`` file.

    :param config: path of the pipeline configuration file.
    :param name: name of the step directory.
    :return: list of shell command strings.
    """
    print(gettime("start 10.mgs"))
    commands = []
    base_dir = os.path.dirname(config)
    work_dir = '%s/%s' % (base_dir, name)
    material_dir = '%s/material' % base_dir
    if not os.path.isdir(work_dir):
        mkdir(work_dir)

    config_group = ConfigParser()
    config_group.read(config)
    # The former pattern ended with an empty alternative ("...|"); on
    # Python >= 3.7 re.split also splits on empty matches, which would cut
    # the value into single characters.  Without it the split is the same
    # on Python 2.
    group = re.split(r'\s+|,\s*|\t+|,\t*', config_group.get('param', 'group'))

    for subgroup_name in group:
        subgroup_filename = subgroup_name
        mkdir("%s/%s" % (work_dir, subgroup_filename))
        (sample_num_in_groups, min_sample_num_in_groups,
         sample_num_total, group_num) = parse_group(
            "%s/%s_group.list" % (material_dir, subgroup_name))

        if min_sample_num_in_groups >= 5 and sample_num_total >= 20 and group_num == 2:
            os.system("cp %s/%s_group.list %s/%s/group.list"
                      % (material_dir, subgroup_name, work_dir, subgroup_filename))
            # NOTE(review): these commands iterate over every sub-directory
            # themselves, yet they are appended once per qualifying group --
            # confirm the repetition is intended.
            commands.append("## mgs start")
            commands.append('ls | while read a; do if [ -f "$a/group.list" ];then python %s/full_MGS_llf.py -p ../../06.gene_profile/gene.profile -g $a/group.list -d $a/; fi; done' % (bin_mgs_default_dir))
            commands.append('ls | while read a; do if [ -f "$a/group.list" ];then cd $a;sh work.sh;cd -; fi; done')
            commands.append('ls | while read a; do if [ -f "$a/group.list" ];then python %s/mgs_taxonomy.py -i $a/pathway/ -g ../05.gene_catalog/gene_catalog.fna -o $a/taxonomy/ --group $a/group.list; fi; done' % (bin_mgs_default_dir))
            commands.append('ls | while read a; do if [ -f "$a/group.list" ];then cd $a/taxonomy/;sh mgs_taxonomy.sh;cd -; fi; done')
        else:
            # The original wrote ``log.close`` without calling it, so the
            # handle was never closed explicitly; ``with`` closes it.
            log_path = "%s/%s/Sample_not_enough.log" % (work_dir, subgroup_filename)
            with open(log_path, "w+") as log:
                log.write("min_sample_num_in_groups >= 5 and sample_num_total >= 20 and group_num == 2")
    return commands
def use_old_version(config, name):
    """Build the taxon-profile commands for the "old version" method.

    Creates ``<dirname(config)>/<name>/preprocess_for_taxon_profile/old_version``
    and its ``profile`` sub-directory through ``mkdir``.

    :param config: path of the pipeline configuration file.
    :param name: name of the step directory.
    :return: tuple ``(work_dir, commands)``.
    """
    print(gettime('start create old version step script'))
    main_dir = os.path.dirname(config)
    work_dir = '%s/%s/preprocess_for_taxon_profile/old_version' % (main_dir, name)
    mkdir(work_dir, '%s/profile' % work_dir)

    abundance = [
        '## calculate abundance',
        'cp %s/01.clean_reads/clean_reads.list ./' % main_dir,
        '%s/speciesabundance.pl %s/01.clean_reads/clean_reads.list .' % (bin_dir, main_dir),
        'nohup /data_center_03/USER/zhongwd/bin/qsge --queue all.q --memery 15G --jobs 10 --prefix MA --lines 1 shell/match.sh &',
        'nohup /data_center_03/USER/zhongwd/bin/qsge --queue all.q --memery 15G --jobs 10 --prefix AB --lines 2 shell/abun.sh &',
    ]
    profile = [
        '## form species profile',
        'ls alignment/*/*root.abundance >abund.list',
        'python %s/02_taxonomy.py -i abund.list' % bin_dir,
        'rm abund.list',
        'for i in all phylum class order family genus species; do ls alignment/*/*$i.abundance |perl %s/201_profile - >profile/$i.profile; done' % bin_dir,
        'num=1;for i in phylum class order family genus species; do let num=num+1; python %s/201_profile_convert.py -i profile/$i.profile -o profile/otu_table_L$num.txt; done' % bin_dir,
        'ls profile/* | while read a; do cp $a ../../taxon_profile; done',
    ]
    # The use-rate commands are emitted commented out (leading '#').
    use_rate = [
        '## reads use rate',
        '#ls alignment/*/*.MATCH.logs >match_logs.list',
        '#python %s/201_use_rate.py -i match_logs.list -o use_rate.stat.tsv -clean %s/00.raw_reads/qc_stat.tsv' % (bin_dir, main_dir),
        '#rm match_logs.list',
    ]
    return work_dir, abundance + profile + use_rate
def gene_profile(config,sh_default_file,outpath,name):
    """Build the gene-profile commands plus the per-group analysis commands.

    ``sh_default_file``, ``outpath`` and ``name`` are accepted but not used
    in this function.  Directories under ``<dirname(config)>/group/`` are
    created through ``mkdir`` for every entry of ``[param] group``.

    :param config: path of the pipeline configuration file.
    :return: list of shell command strings.
    """
    base_dir = os.path.dirname(config)
    perl_dir = "/data_center_01/pipeline/huangy/metagenome/perlscript"
    commands = [
        "/data_center_01/pipeline/huangy/metagenome/perlscript/06_geneabundance clean_reads_list gene_catalog.list gene_catalog.length",
        "nohup /data_center_03/USER/zhongwd/bin/qsge --queue all.q --resource vf=10G --maxjob 10 --jobprefix MA --lines 1 --getmem shell_alignment/match.sh &",
        "nohup /data_center_03/USER/zhongwd/bin/qsge --queue all.q --resource vf=10G --maxjob 10 --jobprefix AB --lines 2 --getmem shell_alignment/abun.sh &",
        "ls alignment/*/*abundance |%s/02_profile - > gene.profile" % perl_dir,
        "%s/06_shannon gene.profile gene.alpha.div.tsv" % perl_dir,
        "head -4 gene.profile | sed '1s/^/Gene ID/g' > example.gene.profile.tsv",
        "Rscript /data_center_01/pipeline/huangy/metagenome/Rscript/06_geneset.R",
        # Section marker written into the script ("differential analysis").
        "#差异分析",
    ]

    parser = ConfigParser()
    parser.read(config)
    group_files = re.split("\s+|\t", parser.get("param", "group"))
    mkdir("%s/group/" % base_dir)

    for entry in group_files:
        group_file = os.path.basename(entry)
        group_id = os.path.splitext(group_file)[0]
        group_dir = "%s/group/%s/" % (base_dir, group_id)

        mkdir(group_dir)
        commands.append("python /data_center_01/pipeline/huangy/metagenome/pyscript/convert_abundance_group.py gene.profile ../group/%s group/%s/gene.profile genus" % (group_file, group_id))
        commands.append("/data_center_01/pipeline/16S_ITS_pipeline_v3.0/script/03_otu_pca.py -i group/%s/gene.profile -g ../group/%s -o group/%s/09.pca --with_boxplot" % (group_id, group_file, group_id))

        mkdir("%s11-14.beta_div/" % group_dir)
        mkdir("%s11-14.beta_div/gene/" % group_dir)
        commands.append("cd group/%s/11-14.beta_div/gene; perl /data_center_01/pipeline/huangy/metagenome/perlscript/02_Beta_diversity.pl -p ../../../../group/%s/gene.profile -g ../../../../../group/%s -m bray -r; cd -" % (group_id, group_id, group_file))

        # The LEfSe directory is still created although no LEfSe command is
        # emitted here.
        mkdir("%s15.LEfSe/" % group_dir)
        commands.append("/data_center_01/pipeline/16S_ITS_pipeline_v3.0/script/05_tax_diff.py -i group/%s/gene.profile -o group/%s/gene_diff/ -g ../group/%s -c 0.05" % (group_id, group_id, group_file))
        commands.append("/data_center_01/pipeline/16S_ITS_pipeline_v3.0/script/05_diff_pca.py -i group/%s/gene_diff/profile.for_plot.txt -o group/%s/gene_diff/pca -g ../group/%s" % (group_id, group_id, group_file))
        commands.append("/data_center_01/pipeline/16S_ITS_pipeline_v3.0/script/03_tax_heatmap.py -f group/%s/gene_diff/profile.for_plot.txt -o group/%s/gene_diff/heatmap -g ../group/%s -t 30" % (group_id, group_id, group_file))
        commands.append(" /data_center_01/pipeline/16S_ITS_pipeline_v3.0/script/05_diff_boxplot.py -i group/%s/gene_diff/profile.for_plot.txt -o group/%s/gene_diff/boxplot -g ../group/%s -t 20" % (group_id, group_id, group_file))
    return commands
def gene_profile_pre(config, name):
    """Assemble the shell commands of the 06.gene_profile_pre step.

    Creates ``<dirname(config)>/<name>`` and its ``database/`` sub-directory
    through ``mkdir``.

    :param config: path of the pipeline configuration file.
    :param name: name of the step directory.
    :return: list of shell command strings, in execution order.
    """
    print(gettime("start 06.gene_profile_pre"))
    step_dir = '%s/%s' % (os.path.dirname(config), name)
    qsge = "/data_center_03/USER/zhongwd/bin/qsge --queue all.q --memery 10G"

    mkdir(step_dir)
    commands = [
        "cp %s/../01.clean_reads/clean_reads.list %s/clean_reads.list" % (step_dir, step_dir),
        "## build index",
    ]

    mkdir("%s/database/" % step_dir)
    commands += [
        # The next two entries are emitted commented out (leading '#').
        "#ln -s %s/../05.gene_catalog/gene_catalog.fna %s/database/" % (step_dir, step_dir),
        "#2bwt-builder %s/database/gene_catalog.fna" % step_dir,
        "python %s/genebuild.py -d %s" % (bin_gene_profile_default_dir, step_dir),
        "%s --jobs 1 --prefix BI --lines 1 shell/2bwt_builder.sh" % qsge,
        "## calculate gene abundance",
        "perl %s/geneabundance.pl %s/clean_reads.list database/gene_catalog.fna %s/../05.gene_catalog/gene_catalog.length %s/"
        % (bin_gene_profile_default_dir, step_dir, step_dir, step_dir),
        "%s --jobs 50 --prefix MA --lines 1 shell/match.sh" % qsge,
        "%s --jobs 10 --prefix AB --lines 2 shell/abun.sh" % qsge,
    ]

    print(gettime("end 06.gene_profile_pre"))
    return commands
def use_other_method(config, name):
    """Placeholder assembly method: creates its directory, emits no commands.

    :param config: path of the pipeline configuration file.
    :param name: name of the step directory.
    :return: tuple ``(work_dir, commands)`` where ``commands`` is empty.
    """
    print(gettime('start create other step script'))
    target = '%s/%s/preprocess_for_assembly/other' % (os.path.dirname(config), name)
    mkdir(target)
    print('This method is not complete,please select other method!')
    return target, []
def use_kraken2_method(config, name):
    """Placeholder kraken2 method: creates its directory, emits no commands.

    :param config: path of the pipeline configuration file.
    :param name: name of the step directory.
    :return: tuple ``(work_dir, commands)`` where ``commands`` is empty.
    """
    print(gettime('start create kraken2 step script'))
    target = '%s/%s/preprocess_for_taxon_profile/kraken2' % (os.path.dirname(config), name)
    mkdir(target)
    print('This method is not complete,please select other method!')
    return target, []
def prepare():
    """Preliminary processing: parse arguments and resolve the raw directory.

    Reads ``work_dir`` and ``raw_dir_name`` from the ``[param]`` section of
    the file named by ``args['config']``, stores them as ``args['wdir']``
    and ``args['rdir']`` (``<work_dir>/<raw_dir_name>``), and creates
    ``<rdir>/temp`` through ``mkdir``.

    :return: the argument mapping produced by ``read_params``, extended
        with the ``wdir`` and ``rdir`` keys.
    """
    args = read_params(sys.argv)
    parser = ConfigParser()
    parser.read(args['config'])

    args['wdir'] = parser.get('param', 'work_dir')
    raw_name = parser.get('param', 'raw_dir_name')
    args['rdir'] = raw_name
    args['rdir'] = '%s/%s' % (args['wdir'], raw_name)

    mkdir('%s/temp' % args['rdir'])
    return args
def write_(out_d, sub_group_list):
    """Write one ``sub_group.list`` file per sub-group mapping.

    Each mapping goes into ``<out_d>/<keys joined by "VS">/sub_group.list``
    with one ``<member>\\t<key>`` line for every member of every key.

    :param out_d: output directory.
    :param sub_group_list: iterable of mappings from group name to an
        iterable of members.
    """
    for groups in sub_group_list:
        target_dir = "%s/%s" % (out_d, "VS".join(groups.keys()))
        mkdir(target_dir)
        with open("%s/sub_group.list" % target_dir, 'w+') as handle:
            handle.writelines(
                "%s\t%s\n" % (member, label)
                for label in groups
                for member in groups[label]
            )
def kegg_pre(config, name):
    """Assemble the shell commands of the 07.kegg_pre step (blat mapping).

    Creates ``<dirname(config)>/<name>`` through ``mkdir``.

    :param config: path of the pipeline configuration file.
    :param name: name of the step directory.
    :return: list of shell command strings.
    """
    print(gettime("start 07.kegg_pre"))
    step_dir = '%s/%s' % (os.path.dirname(config), name)
    mkdir(step_dir)
    commands = [
        "## blat mapping",
        "perl %s/blatprot.pl /data_center_01/home/NEOLINE/zwd/project/PMO/LiuLin-ascites-stool/07.kegg/db.list %s/../05.gene_catalog/gene_catalog.split.list %s/"
        % (tool_default_dir, step_dir, step_dir),
        "nohup /data_center_03/USER/zhongwd/bin/qsge --queue all.q --memery 6G --jobs 10 --prefix KEGG --lines 1 shell/blat.sh &",
    ]
    print(gettime("end 07.kegg_pre"))
    return commands
def prepare(dir, file, host):
    """Resolve working paths and the sample collection for a QC run.

    Creates ``<dir>/shell`` through ``mkdir``.  When ``file`` exists, the
    samples are the second whitespace-separated field of every record
    returned by ``read_file``; otherwise they are derived from the
    ``*.1.fq`` files listed under the ``03.qc`` directory.

    :param dir: working directory.
    :param file: path of a sample list file.
    :param host: host flag; ``'T'`` or ``'True'`` enables it.
    :return: tuple ``(cdir, rdir3, rdir5, sample, host)`` where ``sample``
        is a list (file case) or a set (listing case) and ``host`` is
        ``'yes'`` or ``''``.
    """
    cdir = os.path.abspath(dir)
    mkdir('%s/shell' % dir)
    rdir3 = '%s/../00.raw_reads/03.qc' % cdir
    rdir5 = '%s/../00.raw_reads/05.clean_reads' % cdir

    if not os.path.exists(file):
        listing = os.popen(
            'ls %s/*/*.1.fq | while read a; do b=${a##*/}; echo ${b%%.*}; done' % rdir3
        ).read()
        sample = set(listing.strip().split('\n'))
    else:
        sample = [record.split()[1] for record in read_file(file)]

    if host == 'T' or host == 'True':
        host = 'yes'
    else:
        host = ''
    return cdir, rdir3, rdir5, sample, host
def ardb_pre(config, name):
    """Assemble the shell commands of the 09.ardb_pre step (blat mapping).

    Creates ``<dirname(config)>/<name>`` through ``mkdir``.

    :param config: path of the pipeline configuration file.
    :param name: name of the step directory.
    :return: list of shell command strings.
    """
    print(gettime("start 09.ardb_pre"))
    step_dir = '%s/%s' % (os.path.dirname(config), name)
    mkdir(step_dir)
    commands = [
        "## blat mapping",
        "cp /data_center_03/Project/AS/16_ARDB/db.list ./",
        "perl %s/blatprot.pl db.list %s/../05.gene_catalog/gene_catalog.split.list %s/"
        % (tool_default_dir, step_dir, step_dir),
        "nohup /data_center_03/USER/zhongwd/bin/qsge --queue all.q --memery 5G --jobs 10 --prefix AR --lines 1 --getmem shell/blat.sh &",
    ]
    print(gettime("end 09.ardb_pre"))
    return commands
def clean_reads(config, name):
    """Assemble the shell commands of the clean-reads step.

    Creates ``<dirname(config)>/<name>`` through ``mkdir`` and returns the
    commands that merge the reads and collect the QC statistics.

    :param config: path of the pipeline configuration file.
    :param name: name of the step directory.
    :return: list of shell command strings.
    """
    # The banners used to read "raw_reads" (the label of the sibling
    # raw_reads step), which made the two steps indistinguishable in logs.
    print(gettime("start clean_reads"))
    commands = []
    main_dir = os.path.dirname(config)
    work_dir = '%s/%s' % (main_dir, name)
    mkdir(work_dir)
    commands.append('nohup python %s/merge.py -l %s/material/sample.list -c %s/ &'
                    % (bin_defdir, main_dir, work_dir))
    # Keeps the first seven tab-separated columns of every QC stat file.
    commands.append('awk -F "\\t" \'{print $1"\\t"$2"\\t"$3"\\t"$4"\\t"$5"\\t"$6"\\t"$7}\' %s/00.raw_reads/qc_*.stat.tsv > %s/qc_stat.tsv'
                    % (main_dir, work_dir))
    print(gettime("end clean_reads"))
    return commands
def eggnog_pre(config, name):
    """Assemble the shell commands of the 08.eggnog_pre step (blat mapping).

    Creates ``<dirname(config)>/<name>`` through ``mkdir``.

    :param config: path of the pipeline configuration file.
    :param name: name of the step directory.
    :return: list of shell command strings.
    """
    # The opening banner used to print "end 08.eggnog_pre", so the log
    # showed two "end" lines and no "start" line for this step.
    print(gettime("start 08.eggnog_pre"))
    commands = []
    work_dir = '%s/%s' % (os.path.dirname(config), name)
    mkdir(work_dir)
    commands.append("## blat mapping")
    commands.append(
        "perl %s/blatprot.pl /data_center_06/Project/pracrice/yehaocheng_20160120/08.eggnog/db.list %s/../05.gene_catalog/gene_catalog.split.list %s"
        % (tool_default_dir, work_dir, work_dir))
    commands.append(
        "nohup /data_center_03/USER/zhongwd/bin/qsge --queue all.q --memery 6G --jobs 10 --prefix NOG --lines 1 %s/shell/blat.sh &"
        % work_dir)
    print(gettime("end 08.eggnog_pre"))
    return commands
def gene_catalog_pre(config, name):
    """Assemble the shell commands of the 05.gene_catalog_pre step.

    Creates ``<dirname(config)>/<name>`` through ``mkdir``.

    :param config: path of the pipeline configuration file.
    :param name: name of the step directory.
    :return: list of shell command strings.
    """
    print(gettime("start 05.gene_catalog_pre"))
    step_dir = '%s/%s' % (os.path.dirname(config), name)
    mkdir(step_dir)
    commands = [
        "## build gene catalog",
        "cat %s/../04.gene_predict/gene/*.fna > %s/redundant.gene_catalog.fna"
        % (step_dir, step_dir),
        "perl %s/cd-hit.pl %s/redundant.gene_catalog.fna %s/gene_catalog.fna 20"
        % (bin_gene_catalog_default_dir, step_dir, step_dir),
    ]
    print(gettime("end 05.gene_catalog_pre"))
    return commands
def cazy_pre(config, name):
    """Assemble the shell commands of the 12.cazy_pre step (blat mapping).

    Creates ``<dirname(config)>/<name>`` through ``mkdir``.

    :param config: path of the pipeline configuration file.
    :param name: name of the step directory.
    :return: list of shell command strings.
    """
    print(gettime("start 12.cazy_pre"))
    step_dir = '%s/%s' % (os.path.dirname(config), name)
    mkdir(step_dir)
    commands = [
        "## blat mapping",
        # NOTE(review): this step reads ``tools_dir`` while the other blat
        # steps read ``tool_default_dir`` -- confirm both names exist.
        "perl %s/blatprot.pl /data_center_09/Project/lixr/00.DATA/CAZY_DB/db.list %s/../05.gene_catalog/gene_catalog.split.list %s"
        % (tools_dir, step_dir, step_dir),
        "nohup /data_center_03/USER/zhongwd/bin/qsge --queue all.q --memery 6G --jobs 10 --prefix CAZY --lines 1 shell/blat.sh &",
    ]
    print(gettime("end 12.cazy_pre"))
    return commands
def taxon_pre(config, name):
    """Yield ``(work_dir, commands)`` for each enabled taxon-profile method.

    Creates ``<dirname(config)>/<name>/taxon_profile`` through ``mkdir``.
    Only the first two entries of the method list (``snakemake`` and
    ``old_version``) are processed.

    :param config: path of the pipeline configuration file.
    :param name: name of the step directory.
    """
    root = os.path.dirname(config)
    mkdir('%s/%s/taxon_profile' % (root, name))

    def _build(method):
        # Builder names are looked up only when their branch is reached.
        if method == "snakemake":
            return use_snakemake_method(config, name)
        if method == "old_version":
            return use_old_version(config, name)
        if method == "metaphlan2":
            return use_metaphlan2_method(config, name)
        return use_kraken2_method(config, name)

    available = ["snakemake", "old_version", "metaphlan2", "kraken2"]
    for method in available[:2]:
        step_dir, commands = _build(method)
        yield step_dir, commands
def gene_predict_pre(config, name):
    """Assemble the shell commands of the 04.gene_predict_pre step.

    Creates ``<dirname(config)>/<name>`` through ``mkdir``.

    :param config: path of the pipeline configuration file.
    :param name: name of the step directory.
    :return: list of shell command strings.
    """
    print(gettime("start 04.gene_predict_pre"))
    step_dir = '%s/%s' % (os.path.dirname(config), name)

    parser = ConfigParser()
    parser.read(config)
    # The value is not used below; the lookup is kept so the [param]
    # ins_list option is still read exactly as before.
    parser.get("param", "ins_list")

    mkdir(step_dir)
    commands = [
        "## gene_predict",
        "perl %s/GenePredict.pl -s %s/../03.assembly/scaftigs.list -l 100 -d %s"
        % (bin_gene_predict_default_dir, step_dir, step_dir),
        "nohup /data_center_03/USER/zhongwd/bin/qsge --queue all.q --memery 1G --jobs 10 --prefix GP --lines 2 %s/shell/predict.sh &"
        % step_dir,
    ]
    print(gettime("end 04.gene_predict_pre"))
    return commands
def cag(config, name):
    """Prepare the 11.cag step: write one ``cag_pre.sh`` per eligible group.

    For every group named in ``[param] group`` a sub-directory is created
    under ``<dirname(config)>/<name>``.  Groups with fewer than 20 samples
    in total (per ``parse_group``) only get a ``Sample_not_enough.log``;
    the others get their group list copied in and a ``cag_pre.sh`` script.

    :param config: path of the pipeline configuration file.
    :param name: name of the step directory.
    :return: None; the output is written to disk.
    """
    print(gettime("start 11.cag"))
    base_dir = os.path.dirname(config)
    work_dir = '%s/%s' % (base_dir, name)
    material_dir = '%s/material' % base_dir
    if not os.path.isdir(work_dir):
        mkdir(work_dir)

    config_group = ConfigParser()
    config_group.read(config)
    # The former pattern ended with an empty alternative ("...|"); on
    # Python >= 3.7 re.split also splits on empty matches, which would cut
    # the value into single characters.  Without it the split is the same
    # on Python 2.
    group = re.split(r'\s+|,\s*|\t+|,\t*', config_group.get('param', 'group'))

    for subgroup_name in group:
        subgroup_filename = subgroup_name
        group_dir = "%s/%s" % (work_dir, subgroup_filename)
        mkdir(group_dir)
        (sample_num_in_groups, min_sample_num_in_groups,
         sample_num_total, group_num) = parse_group(
            "%s/%s_group.list" % (material_dir, subgroup_name))

        if sample_num_total < 20:
            # The original wrote ``log.close`` without calling it, so the
            # handle was never closed explicitly; ``with`` closes it.
            with open("%s/Sample_not_enough.log" % group_dir, "w+") as log:
                log.write("The minimum sample size (20) is not met.")
            continue

        os.system("cp %s/%s_group.list %s/%s/group.list"
                  % (material_dir, subgroup_name, work_dir, subgroup_filename))
        grp_sh = [
            "python %s/full_CAG.py -p %s/../06.gene_profile/gene.profile -d %s/%s -g %s/%s/group.list"
            % (bin_cag_default_dir, work_dir, work_dir, subgroup_filename,
               work_dir, subgroup_filename),
            "python %s/cag_taxonomy.py -i %s/%s/outfile/cag -g %s/../05.gene_catalog/gene_catalog.fna -o %s/%s/taxonomy/"
            % (bin_cag_default_dir, work_dir, subgroup_filename, work_dir,
               work_dir, subgroup_filename),
            "python %s/cag_exe_sequence.py -d %s/%s"
            % (bin_cag_default_dir, work_dir, subgroup_filename),
            "\n",
        ]
        with open('%s/%s/cag_pre.sh' % (work_dir, subgroup_filename), 'w') as outf:
            outf.write('\n'.join(grp_sh))
    print(gettime("end 11.cag"))
def raw_reads(config, name):
    """Assemble the shell commands of the raw-reads (QC) step.

    Creates ``<dirname(config)>/<name>`` through ``mkdir``.

    :param config: path of the pipeline configuration file; also passed to
        ``QC_main.py`` through ``-p``.
    :param name: name of the step directory.
    :return: list with the single QC command.
    """
    print(gettime("start raw_reads"))
    main_dir = os.path.dirname(config)
    mkdir('%s/%s' % (main_dir, name))
    qc_command = (
        'python %s/QC_main.py -b %s/material/batch.list -c %s/material/config.list -p %s'
        % (bin_default_dir, main_dir, main_dir, config)
    )
    print(gettime("end raw_reads"))
    return [qc_command]
def use_snakemake_method(config, name):
    """Prepare the snakemake taxon-profile run and return its commands.

    Creates ``<dirname(config)>/<name>/preprocess_for_taxon_profile/snakemake_method``,
    writes customised ``config.yaml`` and ``cluster.yaml`` files into it
    (from the templates ``const.config_yaml`` / ``const.cluster_yaml``),
    copies the Snakefile and creates the log and profile directories.

    :param config: path of the pipeline configuration file.
    :param name: name of the step directory.
    :return: tuple ``(work_dir, commands)``.
    """
    print(gettime('start create snakemake step script'))
    commands = []
    main_dir = os.path.dirname(config)
    work_dir = '%s/%s/preprocess_for_taxon_profile/snakemake_method' % (main_dir, name)
    mkdir(work_dir)

    # Update config.yaml.  yaml.safe_load replaces the bare yaml.load call,
    # which can construct arbitrary Python objects and needs an explicit
    # Loader argument on PyYAML >= 6.
    with open(const.config_yaml, 'r') as inf:
        data = yaml.safe_load(inf)
    data['clean_reads_dir'] = '%s/01.clean_reads' % main_dir
    data['clean_reads_list'] = '%s/clean_reads.list' % work_dir
    data['outdir'] = '%s/alignment' % work_dir
    with open('%s/config.yaml' % work_dir, 'w') as outf:
        yaml.dump(data, outf, default_flow_style=False)

    # Update cluster.yaml: point every qsub log at this run's log directory.
    with open(const.cluster_yaml, 'r') as inf:
        data = yaml.safe_load(inf)
    data['__default__']['qsublog'] = '%s/log/' % work_dir
    data['align']['qsublog'] = '%s/log/align/' % work_dir
    data['abund']['qsublog'] = '%s/log/abund/' % work_dir
    data['abund_profile']['qsublog'] = '%s/log/' % work_dir
    with open('%s/cluster.yaml' % work_dir, 'w') as outf:
        yaml.dump(data, outf, default_flow_style=False)

    # Prepare files and directories.
    os.system('cp %s %s/Snakefile' % (const.snakemake, work_dir))
    mkdir('%s/log/align/' % work_dir, '%s/log/abund/' % work_dir, '%s/profile' % work_dir)

    commands.append('cp %s/01.clean_reads/clean_reads.list .' % main_dir)
    commands.append('## calculate abundance')
    commands.append('source activate /data_center_03/USER/huangy/soft/MAIN/anaconda2/envs/gutbio')
    commands.append('snakemake --cluster-config cluster.yaml --cluster \'qsub -o {cluster.qsublog} -e {cluster.qsublog} -l vf={cluster.vf} -q {cluster.queue}\' -j 40 --nolock')
    commands.append('source deactivate')
    commands.append('## form species profile')
    commands.append('ls alignment/*/*root.abundance >abund.list')
    commands.append('python %s/02_taxonomy.py -i abund.list' % bin_dir)
    commands.append('rm abund.list')
    commands.append('for i in all phylum class order family genus species; do ls alignment/*/*$i.abundance |perl %s/201_profile - >profile/$i.profile; done' % bin_dir)
    commands.append('num=1;for i in phylum class order family genus species; do let num=num+1; python %s/201_profile_convert.py -i profile/$i.profile -o profile/otu_table_L$num.txt; done' % bin_dir)
    commands.append('ls profile/* | while read a; do cp $a ../../taxon_profile; done')
    # The use-rate commands are emitted commented out (leading '#').
    commands.append('## reads use rate')
    commands.append('#ls alignment/*/*.MATCH.logs >match_logs.list')
    commands.append('#python %s/201_use_rate.py -i match_logs.list -o use_rate.stat.tsv -clean %s/00.raw_reads/qc_stat.tsv' % (bin_dir, main_dir))
    commands.append('#rm match_logs.list')
    return work_dir, commands
def qc_prepare(batch, sample_list, dir, host, type):
    """Create the per-batch QC directory layout and copy the sample list.

    :param batch: batch name, used as the leaf directory name.
    :param sample_list: path of the sample list file.
    :param dir: two-token string; the second whitespace-separated token is
        the output directory.
    :param host: two-token string; the second token is the host value.
    :param type: the literal ``'--type 33'`` selects the ``-y`` flag.
    :return: ``None`` when ``sample_list`` is empty or does not exist,
        otherwise the tuple
        ``(sample_list, sub_dir, type, scr_dir, dir, type, host)``
        (``type`` appears twice, as in the original interface).
    """
    sub_dir = []
    dir = dir.split()[1]
    host = host.split()[1]
    # Was ``not sample_list and not os.path.exists(...)``: that let a
    # non-empty path to a missing file through, and raised on None.
    if not sample_list or not os.path.exists(sample_list):
        return
    scr_dir = os.path.dirname(os.path.abspath(__file__))
    # NOTE(review): ``host`` is a token from str.split() and therefore never
    # empty, so the shorter layout below looks unreachable -- confirm.
    if host:
        selects = ['00.raw_reads', '01.fastqc', '02.rmadaptor', '03.qc',
                   '04.rmhost', '05.clean_reads']
    else:
        selects = ['00.raw_reads', '01.fastqc', '02.rmadaptor', '03.qc']
    for name in selects:
        stage_dir = '%s/%s/%s' % (dir, name, batch)
        mkdir(stage_dir)
        sub_dir.append(stage_dir)
    mkdir('%s/shell' % dir)
    type = '-y' if type == '--type 33' else ''
    os.system('cp -f %s %s/%s_sample.list' % (sample_list, sub_dir[0], batch))
    return sample_list, sub_dir, type, scr_dir, dir, type, host
def assembly_soapdenove(config, name):
    """Build the post-assembly commands for the soapdenove method.

    Only ``<work_dir>/histogram/`` is created here (through ``mkdir``);
    ``work_dir`` is
    ``<dirname(config)>/<name>/preprocess_for_assembly/soapdenove``.

    :param config: path of the pipeline configuration file.
    :param name: name of the step directory.
    :return: tuple ``(work_dir, commands)``.
    """
    print(gettime("start 03.assembly soapdenove method"))
    main_dir = os.path.dirname(config)
    work_dir = '%s/%s/preprocess_for_assembly/soapdenove' % (main_dir, name)

    # Entries formatted with ``%`` have their ``%%`` collapsed to ``%``;
    # the unformatted ones keep ``%%``.  Both forms are kept as they were.
    commands = [
        "## best contigs",
        'ls assembly/*/*/*scafSeq |while read a; do perl %s/scaftigs.pl $a 500 ${a%%.*}.scaftigs.fna ${a%%.*}.scaftigs.stat; done'
        % bin_dir,
        "/data_center_03/USER/zhongwd/bin/list assembly/*/* >%s/list.txt" % work_dir,
        "python %s/best_scaftigs_selecter.py -i %s/list.txt -o %s/best_scaftigs"
        % (bin_dir, work_dir, work_dir),
        "rm %s/list.txt" % work_dir,
        "/data_center_03/USER/zhongwd/bin/list best_scaftigs/*stat | perl %s/stat.pl > %s/scaftigs.best.stat.tsv"
        % (bin_dir, work_dir),
        "## histogram",
    ]

    mkdir("%s/histogram/" % work_dir)
    commands += [
        "ls best_scaftigs/*.scaftigs.fna | sed 's#best_scaftigs/\(.*\).fna#\\1#g' | while read a; do lengthfasta best_scaftigs/$a.fna >histogram/$a.length; done",
        "ls histogram/*.scaftigs.length |while read a; do Rscript %s/scaftigs_length.R $a ${a%%.*}.histogram.pdf; done"
        % bin_dir,
        "ls histogram/*.pdf |while read a; do convert -density 300 $a ${a%%.*}.png; done",
        "## upload",
        "ls best_scaftigs/*fna |while read a ; do gzip -c $a >${a%%.*}.fna.gz; done",
        "md5sum best_scaftigs/*.gz > best_scaftigs/scaftigs.md5",
        'ls best_scaftigs/*scaftigs.fna | while read a;do b=${a##*/};echo -e "${b%%.*}\\t`pwd $a`/$a";done > ../../scaftigs.list',
    ]

    print(gettime("end 03.assembly"))
    return work_dir, commands
def use_megahit_version(config, name):
    """Build the assembly commands for the megahit method.

    Creates ``<dirname(config)>/<name>/preprocess_for_assembly/megahit``
    through ``mkdir``.

    :param config: path of the pipeline configuration file.
    :param name: name of the step directory.
    :return: tuple ``(work_dir, commands)``.
    """
    print(gettime('start create megahit step script'))
    main_dir = os.path.dirname(config)
    work_dir = '%s/%s/preprocess_for_assembly/megahit' % (main_dir, name)
    mkdir(work_dir)

    commands = [
        "## assembly",
        "cp %s/01.clean_reads/clean_reads.list %s" % (main_dir, work_dir),
        # Reference: /data_center_11/Project/wenpp/01.wujianrong_20180822/03.assembly/assembly_megahit
        "perl %s/megahit_shell_maker.pl -l clean_reads.list -d %s" % (bin_dir, work_dir),
        "nohup /data_center_03/USER/zhongwd/bin/qsge --queue neo.q --memery 30G --jobs 2 --lines 1 --prefix megahit shell/assembly.sh &",
    ]
    print(gettime("end assembly_pre"))
    return work_dir, commands
def use_soapdenove_method(config, name):
    """Build the assembly commands for the soapdenove method.

    Creates ``<dirname(config)>/<name>/preprocess_for_assembly/soapdenove``
    through ``mkdir`` and reads ``[param] ins_list`` from ``config`` for
    the shell-maker command.

    :param config: path of the pipeline configuration file.
    :param name: name of the step directory.
    :return: tuple ``(work_dir, commands)``.
    """
    print(gettime('start create soapdenove step script'))
    main_dir = os.path.dirname(config)
    work_dir = '%s/%s/preprocess_for_assembly/soapdenove' % (main_dir, name)
    mkdir(work_dir)

    parser = ConfigParser()
    parser.read(config)
    insert_sizes = parser.get("param", "ins_list")

    commands = [
        "## assembly",
        "cp %s/01.clean_reads/clean_reads.list %s/" % (main_dir, work_dir),
        "perl %s/soapdenovo_shell_maker.pl -l clean_reads.list -i %s -minkmer 51 -maxkmer 63 -b 4 -d %s/"
        % (bin_dir, insert_sizes, work_dir),
        "nohup /data_center_03/USER/zhongwd/bin/qsge --queue big.q:all.q:all.q:all.q --memery 100G:5G:10G:3G --jobs 2 --lines 4 --prefix AS shell/assembly.sh &",
    ]
    print(gettime("end assembly_pre"))
    return work_dir, commands
def eggnog(config, name):
    """Assemble the shell commands of the 08.eggnog annotation step.

    Creates ``<dirname(config)>/<name>`` plus, as needed, its ``samples``
    and ``group/<subgroup>/`` sub-directories through ``mkdir``.  Reads
    ``group`` and ``sample_name`` from the ``[param]`` section of
    ``config``.

    :param config: path of the pipeline configuration file.
    :param name: name of the step directory.
    :return: list of shell command strings.
    """
    # The opening banner used to print "end 08.eggnog", so the log showed
    # two "end" lines and no "start" line for this step.
    print(gettime("start 08.eggnog"))
    commands = []
    work_dir = '%s/%s' % (os.path.dirname(config), name)
    mkdir(work_dir)
    commands.append("rm %s/blat/all.m8" % work_dir)
    commands.append("cat %s/blat/* > %s/blat/all.m8" % (work_dir, work_dir))
    commands.append(command_default + "python %s/701_pick_blast_m8.py -i %s/blat/all.m8 -o %s/eggnog.m8"
                    % (bin_kegg_default_dir, work_dir, work_dir))
    commands.append(command_default + "perl %s/03_get_annot_info.pl %s/eggnog.m8 /data_center_02/Database/eggNOGv4.0/all.members.txt /data_center_02/Database/eggNOGv4.0/all.description.txt /data_center_02/Database/eggNOGv4.0/all.funccat.txt %s/eggnog.m8.tab"
                    % (bin_eggnog_default_dir, work_dir, work_dir))
    commands.append("perl %s/04_get_count.pl %s/eggnog.m8.tab /data_center_02/Database/eggNOGv4.0/eggnogv4.funccats.txt %s/eggnog.tab"
                    % (bin_eggnog_default_dir, work_dir, work_dir))
    commands.append(command_default + "perl /data_center_07/Project/RY2015K16A01-1/08.eggnog/bin/eggnog.annotation.pl < %s/eggnog.m8.tab > %s/eggnog.anno.tsv"
                    % (work_dir, work_dir))

    # Get the group names.
    config_gene = ConfigParser()
    config_gene.read(config)
    group = re.split("\s+|\t|,\s*|,\t+", config_gene.get("param", "group"))
    sample_names = config_gene.get("param", "sample_name")
    sample_num_in_groups, min_sample_num_in_groups, sample_num_total, group_num = parse_group(
        sample_names)

    # NOTE(review): the source arrived with its indentation flattened; the
    # per-sample commands are taken to belong to this ``if`` and the group
    # loop to run unconditionally -- confirm against the original file.
    if sample_num_total <= 10:
        mkdir("%s/samples" % work_dir)
        commands.append("cut -f 1 %s/../01.clean_reads/clean_reads.list | while read a ; do cut -f 1 %s/../06.gene_profile/alignment/$a/$a.gene.abundance > %s/samples/$a.gene.list; done"
                        % (work_dir, work_dir, work_dir))
        commands.append("ls %s/samples/*gene.list | sed 's/.gene.list//g'|while read a; do perl %s/04_get_countlist.pl %s/eggnog.m8.tab /data_center_02/Database/eggNOGv4.0/eggnogv4.funccats.txt $a.gene.list $a.eggnog.tab;done"
                        % (work_dir, bin_eggnog_default_dir, work_dir))
        commands.append("ls %s/samples/*.eggnog.tab | sed 's/.eggnog.tab//g' | while read a;do cut -f 3,4 $a.eggnog.tab > $a.eggnog.count.tab; done"
                        % (work_dir))
        commands.append("ls %s/samples/*.eggnog.count.tab | /data_center_03/USER/zhongwd/bin/profile - > %s/eggnog.count.tab"
                        % (work_dir, work_dir))
        commands.append("Rscript /data_center_04/Projects/pichongbingdu/pair_reads/08.eggnog/NOG.R %s/eggnog.count.tab"
                        % work_dir)

    for subgroup_name in group:
        subgroup = '%s/material/%s_group.list' % (os.path.dirname(config), subgroup_name)
        work_dir_01 = "%s/group/%s/" % (work_dir, subgroup_name)
        mkdir(work_dir_01)
        commands.append(
            "## ----------------------------------%s----------------------" % (subgroup_name))
        commands.append("cd %s; perl /data_center_06/Project/pracrice/yehaocheng_20160120/08.eggnog/bin/profile2list.pl %s %s/../06.gene_profile/gene.profile; cd -"
                        % (work_dir_01, subgroup, work_dir))
        commands.append("ls %s/*gene.list | sed 's/.gene.list//g'|while read a; do perl %s/04_get_countlist.pl %s/eggnog.m8.tab /data_center_02/Database/eggNOGv4.0/eggnogv4.funccats.txt $a.gene.list $a.eggnog.tab;done"
                        % (work_dir_01, bin_eggnog_default_dir, work_dir))
        commands.append("ls %s/*.eggnog.tab | sed 's/.eggnog.tab//g' | while read a;do cut -f 3,4 $a.eggnog.tab > $a.eggnog.count.tab; done"
                        % (work_dir_01))
        commands.append("ls %s/*.eggnog.count.tab | /data_center_03/USER/zhongwd/bin/profile - > %s/eggnog.count.tab"
                        % (work_dir_01, work_dir_01))
        commands.append("cd %s;Rscript /data_center_04/Projects/pichongbingdu/pair_reads/08.eggnog/NOG.R eggnog.count.tab;cd -"
                        % (work_dir_01))
        commands.append("convert -density 300 %s/NOG.pdf %s/NOG.png" % (work_dir_01, work_dir_01))
    print(gettime("end 08.eggnog"))
    return commands
def ardb(config, name):
    """Assemble the shell commands of the 09.ardb annotation step.

    Creates ``<dirname(config)>/<name>`` and, for every group named in
    ``[param] group``, the ``group/<subgroup>/01.class_diff/`` and
    ``group/<subgroup>/02.type_diff/`` directories through ``mkdir``.

    :param config: path of the pipeline configuration file.
    :param name: name of the step directory.
    :return: list of shell command strings.
    """
    print(gettime("start 09.ardb"))
    commands = []
    work_dir = '%s/%s' % (os.path.dirname(config), name)
    mkdir(work_dir)
    commands.append("## blat mapping")
    commands.append("cat blat/* > all.m8")
    commands.append("pick_blast_m8 all.m8 > ardb.m8")
    commands.append("cut -f 2 ardb.m8 | search - /data_center_03/Project/AS/16_ARDB/old/ardbAnno1.0_modify_db07/tabs/ardb.tab | paste ardb.m8 - | cut -f 1,13- > gene2ardb.tsv")
    commands.append("classprofile -i gene2ardb.tsv -p ../06.gene_profile/gene.profile -f 3 > ardb.type.profile")
    commands.append("classprofile -i gene2ardb.tsv -p ../06.gene_profile/gene.profile -f 4 > ardb.class.profile")
    commands.append("Rscript /data_center_07/Project/RY2015K16A01-1/09.ardb/bin/ardb.barplot.r\n")
    commands.append("(echo -e 'Gene ID\tProtein name\tType\tClass\tDescription'; cat gene2ardb.tsv) > ardb.anno.tsv")

    # Groups.
    config_gene = ConfigParser()
    config_gene.read(config)
    group = re.split("\s+|\t|,\s*|,\t+", config_gene.get("param", "group"))
    for subgroup_name in group:
        subgroup = '%s/material/%s_group.list' % (os.path.dirname(config), subgroup_name)
        sample_num_in_groups, min_sample_num_in_groups, sample_num_total, group_num = parse_group(subgroup)
        commands.append("## ----------------------------------%s----------------------" % (subgroup_name))

        # Differential analysis output directories.
        work_dir_901 = "%s/group/%s/01.class_diff/" % (work_dir, subgroup_name)
        mkdir(work_dir_901)
        work_dir_902 = "%s/group/%s/02.type_diff/" % (work_dir, subgroup_name)
        mkdir(work_dir_902)

        commands.append("#01 diff class")
        commands.append(command_default + "python %s/t08_diff.py -i %s/ardb.class.profile -g %s -o %s"
                        % (tool_default_dir, work_dir, subgroup, work_dir_901))
        commands.append(command_default + "python %s/t09_diff_boxplot.py -i %s/diff.marker.filter.profile.tsv -p %s/diff.marker.filter.tsv -g %s -o %s/diff_boxplot/"
                        % (tool_default_dir, work_dir_901, work_dir_901, subgroup, work_dir_901))

        commands.append("#02 diff type")
        # Fixed: this command used to read ardb.class.profile again, so the
        # 02.type_diff directory received a second class-level result.
        commands.append(command_default + "python %s/t08_diff.py -i %s/ardb.type.profile -g %s -o %s"
                        % (tool_default_dir, work_dir, subgroup, work_dir_902))
        commands.append(command_default + "python %s/t09_diff_boxplot.py -i %s/diff.marker.filter.profile.tsv -p %s/diff.marker.filter.tsv -g %s -o %s/diff_boxplot/"
                        % (tool_default_dir, work_dir_902, work_dir_902, subgroup, work_dir_902))

        commands.append("#03 function_barplot")
        commands.append(command_default + "Rscript %s/710_level1_barplot.R %s/ardb.class.profile %s/group/%s/ardb.class.pdf Class %s"
                        % (bin_ardb_default_dir, work_dir, work_dir, subgroup_name, subgroup))
        commands.append("convert -density 300 %s/group/%s/ardb.class.pdf %s/group/%s/ardb.class.png"
                        % (work_dir, subgroup_name, work_dir, subgroup_name))
        commands.append(command_default + "Rscript %s/710_level1_barplot.R %s/ardb.type.profile %s/group/%s/ardb.type.pdf Type %s"
                        % (bin_ardb_default_dir, work_dir, work_dir, subgroup_name, subgroup))
        commands.append("convert -density 300 %s/group/%s/ardb.type.pdf %s/group/%s/ardb.type.png"
                        % (work_dir, subgroup_name, work_dir, subgroup_name))

        # NOTE(review): the source arrived with its indentation flattened;
        # only the "#04" commands are taken to depend on group_num == 2 and
        # "#05" to run for every group -- confirm against the original file.
        if group_num == 2:
            commands.append("#04 dimond swarm")
            commands.append(command_default + "Rscript %s/dimond_swarm.R %s/ardb.type.profile %s %s/group/%s/dimond_swarm.pdf"
                            % (bin_ardb_default_dir, work_dir, subgroup, work_dir, subgroup_name))
            commands.append("convert -density 300 %s/group/%s/dimond_swarm.pdf %s/group/%s/dimond_swarm.png"
                            % (work_dir, subgroup_name, work_dir, subgroup_name))

        commands.append("#05 top ardb")
        commands.append(command_default + "Rscript %s/top_ardb.R %s/ardb.type.profile %s %s/group/%s/top_ardb.pdf"
                        % (bin_ardb_default_dir, work_dir, subgroup, work_dir, subgroup_name))
        commands.append("convert -density 300 %s/group/%s/top_ardb.pdf %s/group/%s/top_ardb.png"
                        % (work_dir, subgroup_name, work_dir, subgroup_name))
    print(gettime("end 009.ardb"))
    return commands
help="set the output dir") parser.add_argument('--with_boxplot', dest='with_boxplot', action='store_true', help="plot boxplot") parser.add_argument('--without_boxplot', dest='with_boxplot', action='store_false', help="unplot boxplot") parser.add_argument('--two_legend', dest='two_legend', action = 'store_true',default=False, help="two_legend") parser.set_defaults(with_boxplot=True) args = parser.parse_args() params = vars(args) return params if __name__ == '__main__': params = read_params(sys.argv) mkdir(params['out_dir']) pdf_file = params['out_dir'] + '/otu_pca.pdf' png_file = params['out_dir'] + '/otu_pca.png' vars = {'otu_profile': params['otu_table'], 'group_file': params['group'], 'pdf_file': pdf_file} r_job = Rparser() if params['two_legend']: if params['with_boxplot']: r_job.open(const.Rscript + '/02_tax_pca_two.R') else: r_job.open(const.Rscript + '/02_tax_pca_two.R') else: if params['with_boxplot']: r_job.open(const.Rscript + '/02_tax_pca_with_boxplot.R')
def taxon(config,sh_default_file,outpath,name):
    """Prepare the taxonomy work directory and return its shell commands.

    Python-side effects (done immediately): loads ``const.config_yaml``,
    overrides ``work_dir`` / ``clean_reads_list`` and writes the result to
    ``config.yaml`` next to *config*; copies the Snakefile and the cluster
    config there with ``cp``; creates the output directories with ``mkdir``.
    Everything else is only collected as shell command strings.

    Args:
        config: path of the project config file. Its directory is used as
            the work directory and its ``[param] group`` option is split on
            whitespace into a list of group files.
        sh_default_file: not used by this function (kept for the interface).
        outpath: not used by this function (kept for the interface).
        name: not used by this function (kept for the interface).

    Returns:
        list of str: shell command lines in the order they were generated.
    """
    print("start taxon :%s s"%time())
    commands = []
    work_dir = os.path.dirname(config)
    pyscript_dir = const.PYscript
    profile_tool = "/data_center_01/pipeline/huangy/metagenome/perlscript/02_profile"
    pie_tool = "/data_center_03/USER/zhongwd/rd/11_taxonomy_V2.0/test/pieplot/pie.pl"
    accum_tool = "/data_center_03/USER/zhongwd/rd/Finish/07_acumm_share_curve/Accumulated_Shared_Curve.pl"
    ternary_tool = "/data_center_01/pipeline/huangy/metagenome/Rscript/02_ternary.R"
    barplot_tool = "/data_center_01/pipeline/16S_ITS_pipeline_v3.0/script/02_bar_plot.py"
    beta_tool = "/data_center_01/pipeline/huangy/metagenome/perlscript/02_Beta_diversity.pl"

    # Update config.yaml. safe_load replaces the Loader-less yaml.load(),
    # which is unsafe on old PyYAML and rejected by PyYAML >= 6.
    with open(const.config_yaml) as template_handle:
        yl = yaml.safe_load(template_handle)
    yl["work_dir"] = "%s/../" % work_dir
    yl["clean_reads_list"] = "02.tax/clean_read.list"
    with open('%s/config.yaml' % work_dir, "w") as config_handle:
        yaml.dump(yl, config_handle, default_flow_style=False)
    os.system("cp %s %s/Snakefile" % (const.snakemake, work_dir))
    os.system("cp %s %s/cluster.yaml" % (const.cluster_yaml, work_dir))
    mkdir("%s/log/" % work_dir)
    mkdir("%s/log/align/" % work_dir)
    mkdir("%s/log/abund/" % work_dir)

    commands.append("## calculate abundance")
    commands.append("source activate gutbio")
    commands.append("snakemake --cluster-config cluster.yaml --cluster 'qsub -o {cluster.qsublog} -e {cluster.qsublog} -l vf={cluster.vf}' -j 10 --nolock --config clean_reads_list=\"02.taxon/clean_reads_list\"")
    commands.append("source deactivate")

    commands.append("## form species profile")
    commands.append("ls alignment/*/*species.abundance >list")
    commands.append("python /data_center_01/pipeline/huangy/metagenome/pyscript/02_taxnomy.py -i list")
    commands.append("rm list")
    mkdir("%s/profile/" % work_dir)
    for abundance, profile in (
            ("species.abundance", "species.profile"),
            ("species.abundance2", "species.profile2"),
            ("genus.abundance", "genus.profile"),
            ("class.abundance", "class.profile"),
            ("family.abundance", "family.profile"),
            ("order.abundance", "order.profile"),
            ("phylum.abundance", "phylum.profile"),
            ("all.abundance", "all.profile")):
        commands.append("ls alignment/*/*%s | %s - > profile/%s" % (abundance, profile_tool, profile))

    # otu_table_L<n>.txt = header line followed by the matching profile.
    otu_tables = (
        ("phylum.profile", 2),
        ("class.profile", 3),
        ("order.profile", 4),
        ("family.profile", 5),
        ("genus.profile", 6),
        ("species.profile2", 7),
    )
    for _, level in otu_tables:
        commands.append("echo '# Constructed from biom file' >profile/otu_table_L%d.txt" % level)
    for profile, level in otu_tables:
        commands.append("cat profile/%s >> profile/otu_table_L%d.txt" % (profile, level))

    # The use-rate steps are emitted as shell comments (disabled).
    commands.append("## use rate")
    commands.append("#mkdir use_rate")
    commands.append("#ls alignment/*/*MATCH |while read a; do echo \"perl /data_center_03/USER/zhongwd/rd/11_taxonomy_V2.0/bin/stat.pl < $a > $a.stat\" ;done > use_rate/stat.sh")
    commands.append("#nohup /data_center_03/USER/zhongwd/bin/qsge --queue all.q --resource vf=1G --maxjob 10 --jobprefix ST --lines 1 --getmem use_rate/stat.sh &")
    commands.append("#ls alignment/*/*MATCH.stat | perl /data_center_03/USER/zhongwd/rd/11_taxonomy_V2.0/bin/stat_tab.pl - ../00.raw_reads/qc_final.stat.tsv > use_rate/stat.tsv")

    config_gene = ConfigParser()
    config_gene.read(config)
    # Drop the empty strings re.split yields for leading/trailing whitespace.
    group = [entry for entry in re.split(r"\s+", config_gene.get("param", "group")) if entry]
    mkdir("%s/" % work_dir)
    mkdir("%s/group/" % work_dir)

    commands.append("## 00.piechart need finish")
    mkdir("%s/group/00.piechart" % work_dir)
    for level in ("species", "genus"):
        commands.append(
            r"ls alignment/*/*%(level)s.abundance | sed 's/alignment\/\(.*\)\/.*%(level)s.abundance/\1/g' | while read a ; do perl %(tool)s < alignment/$a/$a.%(level)s.abundance > group/00.piechart/$a.%(level)s.pie.svg;done"
            % {"level": level, "tool": pie_tool})

    commands.append("## 03.accum")
    mkdir("%s/03.accum_share" % work_dir)
    for level in ("genus", "species"):
        commands.append("ln -s ../profile/%s.profile 03.accum_share/" % level)
    for level in ("genus", "species"):
        commands.append("perl %s -p 03.accum_share/%s.profile -c %s -t 100" % (accum_tool, level, level))

    # The rarecurve steps are emitted as shell comments (disabled).
    commands.append("## 04.rarecurve")
    mkdir("%s/04.rarecurve" % work_dir)
    commands.append(r"#list alignment/*/*MATCH > 04.rarecurve/match.list; sed 's/.*alignment\/\(.*\)\/.*MATCH/\1/g' 04.rarecurve/match.list | paste - 04.rarecurve/match.list > 04.rarecurve/match.list.tmp; mv -f 04.rarecurve/match.list.tmp 04.rarecurve/match.list")
    commands.append("#nohup perl /data_center_03/USER/zhongwd/rd/05_rarecurve/RareCurve/RareCurve.pl -s clean_reads_list -m 04.rarecurve/match.list -d 04.rarecurve &")

    commands.append("## 06.ternaryplot")
    mkdir("%s/06.ternaryplot" % work_dir)
    for level in ("species", "genus"):
        commands.append("Rscript %s profile/%s.profile sample.list 06.ternaryplot/%s.ternary.pdf %s" % (ternary_tool, level, level, level))

    commands.append("## 07.treeplot")
    mkdir("%s/07.treeplot" % work_dir)
    commands.append("cut -f 1 clean_reads_list | while read a; do mkdir 07.treeplot/$a; perl /data_center_03/USER/zhongwd/temp/0106/tree/a.pl < alignment/$a/$a.species.abundance > 07.treeplot/$a/test.info 2> 07.treeplot/$a/test.tax; done")
    commands.append("cut -f 1 clean_reads_list | while read a; do cd 07.treeplot/$a; perl /data_center_03/USER/zhongwd/temp/0106/tree/zwd_newwick.pl < test.tax > test.tre; ~/anaconda_ete/bin/python /data_center_03/USER/zhongwd/temp/0106/tree/plottre.py; cd -; done")

    commands.append("## 08.cluster")
    mkdir("%s/08.cluster" % work_dir)
    commands.append("Rscript /data_center_03/USER/zhongwd/rd/11_taxonomy_V2.0/test/barplot/bartreeplot.r profile/species.profile sample.list 08.cluster/species.clust.pdf")

    for subgroup in group:
        _, subgroup_name, _ = get_name(subgroup)
        subgroup_dir = "%s/group/%s" % (work_dir, subgroup_name)
        mkdir(subgroup_dir)

        commands.append("## 01.barplot need finish")
        mkdir("%s/01.barplot" % subgroup_dir)
        # Adjacent literals instead of a backslash-continued string, so no
        # stray backslash or indentation ends up inside the shell command.
        commands.append(
            "%s -t %s/profile/ -o %s/01.barplot/ "
            "-g %s " % (barplot_tool, work_dir, subgroup_dir, subgroup))
        commands.append(
            "%s -t %s/profile/ -o %s/01.barplot/ "
            "-g %s --level 7 " % (barplot_tool, work_dir, subgroup_dir, subgroup))

        commands.append("## 02.core")
        mkdir("%s/02.core" % subgroup_dir)
        # Each level reads its own table: L7 is built from species.profile2
        # and L6 from genus.profile above. The genus run previously read L7.
        for level, table_level in (("species", 7), ("genus", 6)):
            commands.append("python %s/02_venn.py -i %s/profile/otu_table_L%d.txt -o %s/02.core/%s/ -g %s "
                            % (pyscript_dir, work_dir, table_level, subgroup_dir, level, subgroup))

        commands.append("## 05.top_boxplot")
        mkdir("%s/05.top_boxplot" % subgroup_dir)
        commands.append("python %s/02_top.py -i %s/profile/all.profile -g %s -o %s/05.top_boxplot/"
                        % (pyscript_dir, work_dir, subgroup, subgroup_dir))

        commands.append("## 09.pca")
        mkdir("%s/09.pca" % subgroup_dir)
        # NOTE(review): both levels are sent to the same output directory;
        # confirm 02_otu_pca.py does not overwrite the first run's files.
        for level in ("species", "genus"):
            commands.append("python %s/02_otu_pca.py -i %s/profile/%s.profile -g %s -o %s/09.pca --with_boxplot"
                            % (pyscript_dir, work_dir, level, subgroup, subgroup_dir))

        commands.append("## 11.anosim; 13.pcoa; 14.nmds")
        mkdir("%s/11-14.beta_div" % subgroup_dir)
        mkdir("%s/11-14.beta_div/species" % subgroup_dir)
        mkdir("%s/11-14.beta_div/genus" % subgroup_dir)
        for level in ("species", "genus"):
            commands.append("cd group/%s/11-14.beta_div/%s; perl %s -p ../../../../profile/%s.profile -g %s -m bray -r; cd -"
                            % (subgroup_name, level, beta_tool, level, subgroup))

        mkdir("%s/15.LEfSe" % subgroup_dir)
        commands.append(
            "python /data_center_01/pipeline/16S_ITS_pipeline_v3.0/script/05_filter_abundance.py -i %s/profile/species.profile "
            "-g %s -o %s/15.LEfSe/ --cut_off 1e-5" % (work_dir, subgroup, subgroup_dir))
        commands.append("python %s/02_LEfSe.py -i %s/15.LEfSe/species.profile -l /data_center_03/USER/huangy/soft/LEfSe_lzb -g %s -o %s/15.LEfSe/ --LDA 2"
                        % (pyscript_dir, subgroup_dir, subgroup, subgroup_dir))
    print("end taxon :%s s"%time())
    return commands
def group(config, name):
    """Create the per-group analysis directories and write a work.sh in each.

    For every entry of the ``[param] group`` option (split on whitespace
    and/or commas, empty entries dropped) this function:

    * creates ``<config dir>/<name>/group_analysis/<subgroup>/`` and one
      sub-directory per analysis step via ``mkdir``;
    * copies ``<config dir>/material/<subgroup>_group.list`` to
      ``group.list`` in that directory;
    * writes the shell commands of all steps to ``work.sh`` there. The
      commands use paths relative to the subgroup directory and read the
      profiles from ``<config dir>/<name>/taxon_profile``.

    The number of groups reported by ``parse_group`` for the group list
    decides which venn/flower and ternary-plot commands are emitted.

    Args:
        config: path of the project config file.
        name: name of the analysis directory below the config directory.

    Returns:
        None. The result is the files written to disk.
    """
    main_dir = os.path.dirname(config)
    work_dir = '%s/%s/group_analysis' % (main_dir, name)
    pro_dir = '%s/%s/taxon_profile' % (main_dir, name)
    mkdir(work_dir)

    config_gene = ConfigParser()
    config_gene.read(config)
    subgroups = [entry for entry
                 in re.split(r'\s+|\t|,\s*|,\t+', config_gene.get('param', 'group'))
                 if entry]

    step_dirs = (
        '01.barplot', '02.venn_flower', '03.top_boxplot', '04.ternaryplot',
        '05.top_barplot', '06.pca', '07.pcoa', '08.nmds', '09.anosim',
        '10.adonis', '11.mrpp', '12.diff', '13.lefse/',
    )

    for subgroup in subgroups:
        grp_sh = []
        subgroup_dir = '%s/%s' % (work_dir, subgroup)
        group_list = '%s/material/%s_group.list' % (main_dir, subgroup)
        mkdir(subgroup_dir)
        os.system('cp %s %s/group.list' % (group_list, subgroup_dir))
        _, _, _, group_num = parse_group(group_list)
        for step in step_dirs:
            mkdir('%s/%s' % (subgroup_dir, step))

        grp_sh.append('## 01.barplot Need finish')
        grp_sh.append('%s/g01_barplot.py -t %s/ -o 01.barplot/ -g group.list' % (bin_dir, pro_dir))
        grp_sh.append('%s/g01_barplot.py -t %s/ -o 01.barplot/ -g group.list --level 7' % (bin_dir, pro_dir))

        grp_sh.append('## 02.venn_flower')
        # The 2-5 and 6-30 group cases run the same flower script.
        if 2 <= group_num <= 30:
            grp_sh.append('for i in phylum genus species; do perl %s/g02_flower.pl %s/$i.profile group.list 02.venn_flower/$i/; done' % (bin_dir, pro_dir))
        else:
            grp_sh.append('## sample too much')

        grp_sh.append('## 03.top_boxplot')
        grp_sh.append('python %s/g03_top_boxplot.py -i %s/all.profile -g group.list -o 03.top_boxplot/' % (bin_dir, pro_dir))

        grp_sh.append('## 04.ternaryplot')
        if group_num == 2:
            grp_sh.append('for i in phylum genus species; do Rscript %s/g04_ternary.R %s/$i.profile group.list 04.ternaryplot/$i.ternary.pdf $i; done' % (bin_dir, pro_dir))
        elif group_num == 3:
            # t13_ternary.py is a Python script (the >3 branch below runs it
            # with python too); it used to be launched with Rscript here.
            grp_sh.append('for i in phylum genus species; do python %s/t13_ternary.py -i %s/$i.profile -g group.list -o 04.ternaryplot/$i -c $i; done' % (tool_dir, pro_dir))
        elif 3 < group_num < 6:
            grp_sh.append('python %s/g04_ternary_sub_group.py -g group.list -o 04.ternaryplot/' % bin_dir)
            grp_sh.append('for i in `ls 04.ternaryplot/`;do for j in phylum genus species;do if [ -d 04.ternaryplot/$i ] ;then python %s/t13_ternary.py -i %s/$j.profile -g 04.ternaryplot/$i/sub_group.list -o 04.ternaryplot/$i/$j -c $j;fi;done;done' % (tool_dir, pro_dir))

        grp_sh.append('## 05.top_barplot')
        grp_sh.append('for i in phylum genus species; do python %s/t10_sample_clustering.py -i %s/$i.profile -g group.list -o 05.top_barplot/$i/ -t $i; done' % (tool_dir, pro_dir))

        grp_sh.append('## 06.pca')
        grp_sh.append('for i in phylum genus species; do python %s/t01_pca.py -i %s/$i.profile -g group.list -o 06.pca/$i/ --with_boxplot; done' % (tool_dir, pro_dir))

        grp_sh.append('## 07.pcoa')
        grp_sh.append('for i in phylum genus species; do python %s/t02_pcoa.py -i %s/$i.profile -g group.list -o 07.pcoa/$i/ --with_boxplot; done' % (tool_dir, pro_dir))

        grp_sh.append('## 08.nmds')
        grp_sh.append('for i in phylum genus species; do python %s/t03_nmds.py -i %s/$i.profile -g group.list -o 08.nmds/$i/; done' % (tool_dir, pro_dir))

        grp_sh.append('## 09.anosim')
        grp_sh.append('for i in phylum genus species; do python %s/t04_anosim.py -i %s/$i.profile -g group.list -o 09.anosim/$i/; done' % (tool_dir, pro_dir))

        grp_sh.append('## 10.adonis')
        grp_sh.append('for i in phylum genus species; do python %s/t12_adonis_pca.py -i %s/$i.profile -g group.list -o 10.adonis/$i/; done' % (tool_dir, pro_dir))
        grp_sh.append('for i in phylum genus species; do python %s/t12_adonis_pcoa.py -i %s/$i.profile -g group.list -o 10.adonis/$i/; done' % (tool_dir, pro_dir))

        grp_sh.append('## 11.mrpp')
        grp_sh.append('for i in phylum genus species; do python %s/t05_mrpp.py -i %s/$i.profile -g group.list -o 11.mrpp/$i/; done' % (tool_dir, pro_dir))

        grp_sh.append('## 12.diff')
        grp_sh.append('for i in phylum genus species; do python %s/t08_diff.py -i %s/$i.profile -g group.list -o 12.diff/$i; done' % (tool_dir, pro_dir))
        grp_sh.append('for i in phylum genus species; do Rscript %s/t08_new_diff.R 12.diff/$i/diff.marker.filter.profile.tsv 12.diff/$i/diff.marker.filter.tsv group.list 12.diff/$i/${i}_diff.pdf n add;convert -density 300 12.diff/$i/${i}_diff.pdf 12.diff/$i/${i}_diff.png; done' % tool_dir)

        grp_sh.append('## 13.lefse')
        grp_sh.append('for i in phylum genus species; do /data_center_03/USER/huangy/soft/MAIN/anaconda2/bin/python2.7 %s/t11_lefse.py -i %s/$i.profile -l /data_center_03/USER/huangy/soft/LEfSe_lzb -g group.list -o 13.lefse/$i --LDA 2; done' % (tool_dir, pro_dir))

        with open('%s/work.sh' % subgroup_dir, 'w') as outf:
            outf.write('\n'.join(grp_sh))
def samples(config, name):
    """Create the all-samples analysis directory and write its work.sh.

    Directories are created immediately via ``mkdir`` below
    ``<config dir>/<name>/samples_nalysis``; the analysis steps themselves
    are only written, one shell command per line, to ``work.sh`` in that
    directory.

    The ``[param]`` options ``sample_name`` and ``alpha_group`` are read
    from *config*. ``parse_group(sample_name)`` supplies the total sample
    count, which decides whether the venn/flower command is emitted.

    Args:
        config: path of the project config file.
        name: name of the analysis directory below the config directory.

    Returns:
        None. The result is the files written to disk.
    """
    commands = []
    main_dir = os.path.dirname(config)
    # NOTE(review): "samples_nalysis" looks like a typo for
    # "samples_analysis"; left unchanged because other stages may rely on
    # this path.
    work_dir = '%s/%s/samples_nalysis' % (main_dir, name)
    pro_dir = '%s/%s/taxon_profile' % (main_dir, name)
    mkdir(work_dir)
    # alignment directory
    snakemake_method_dir = '%s/%s/preprocess_for_taxon_profile/snakemake_method' % (main_dir, name)

    # get group summary
    config_gene = ConfigParser()
    config_gene.read(config)
    sample_names = config_gene.get('param', 'sample_name')
    alpha_group = config_gene.get('param', 'alpha_group')
    _, _, sample_num_total, _ = parse_group(sample_names)

    commands.append('## 00.reads use rate ')
    commands.append('ls ../preprocess_for_taxon_profile/*/alignment/*/*.MATCH.logs > %s/list' % work_dir)
    commands.append('python %s/201_use_rate_new.py -i %s/list -o %s/use_rate.stat.tsv -clean %s/../../00.raw_reads/qc_rawdata.stat.tsv' % (bin_dir, work_dir, work_dir, work_dir))
    commands.append('rm %s/list' % work_dir)

    commands.append('## 01.piechart Need finish')
    mkdir('%s/01.piechart' % work_dir)
    # The -name pattern is quoted so the shell hands it to find unexpanded.
    # "%%" is a literal "%" once the Python format is applied.
    commands.append('for i in phylum genus species; do find .. -name "*$i.abundance" | while read a; do b=${a##*/}; perl %s/s01_pie.pl <$a >01.piechart/${b%%.*}.pie.svg; done; done' % bin_dir)

    commands.append('## 02.venn_flower')
    venn_dir = '02.venn_flower'
    mkdir('%s/02.venn_flower' % work_dir)
    # The 2-5 and 6-30 sample cases run the same flower script.
    if 2 <= sample_num_total <= 30:
        commands.append('for i in phylum genus species; do perl %s/g02_flower.pl ../taxon_profile/$i.profile %s %s/$i/; done' % (bin_dir, sample_names, venn_dir))
    else:
        commands.append('# sample too much')

    commands.append('## 03.accumulate')
    accum_dir = '03.accum_share'
    mkdir('%s/03.accum_share' % work_dir)
    commands.append('for i in phylum genus species; do ln %s/$i.profile %s/; done' % (pro_dir, accum_dir))
    commands.append('for i in phylum genus species; do perl %s/Accumulated_Shared_Curve.pl -p %s/$i.profile -c $i -t 100; done' % (bin_dir, accum_dir))

    commands.append('## 04.rarecurve')
    rare_dir = '04.rarecurve'
    mkdir('%s/04.rarecurve' % work_dir)
    commands.append('ls %s/alignment |while read a; do echo "python %s/204_rarefaction.py -i %s/alignment/$a/$a.MATCH -o %s/alignment/$a/ -m 1820252 -b 200 -n $a"; done >%s/04.rarecurve/rarefaction.sh' % (snakemake_method_dir, bin_dir, snakemake_method_dir, snakemake_method_dir, work_dir))
    commands.append('sh %s/04.rarecurve/rarefaction.sh' % work_dir)
    commands.append('rm %s/04.rarecurve/rarefaction.sh' % work_dir)
    commands.append('paste %s/alignment/*/rarefaction.tsv >%s/all.rarefaction.tsv' % (snakemake_method_dir, work_dir))
    commands.append('Rscript %s/204_chao1.R %s/all.rarefaction.tsv %s %s/chao1.pdf %s/chao1.png' % (bin_dir, work_dir, alpha_group, rare_dir, rare_dir))
    # Emitted as shell comments (disabled steps). The format argument used
    # to sit inside the string literal, leaving a literal "%s" in the output.
    commands.append(r"#list %s/alignment/*/*MATCH > 04.rarecurve/match.list; sed 's/.*alignment\/\(.*\)\/.*MATCH/\1/g' 04.rarecurve/match.list | paste - 04.rarecurve/match.list >04.rarecurve/match.list.tmp; mv -f 04.rarecurve/match.list.tmp 04.rarecurve/match.list" % snakemake_method_dir)
    commands.append('#nohup perl /data_center_03/USER/zhongwd/rd/05_rarecurve/RareCurve/RareCurve.pl -s clean_reads_list -m 04.rarecurve/match.list -d 04.rarecurve &')

    commands.append('## 05.treeplot')
    tree_dir = '05.treeplot'
    mkdir('%s/05.treeplot' % work_dir)
    mkdir('%s/05.treeplot-shell' % work_dir)
    commands.append('cut -f1 ../../01.clean_reads/clean_reads.list | while read a; do mkdir -p %s/$a; echo "python %s/treeplot.tax.py -i %s/alignment/$a/$a.species.abundance -o %s/$a/"; done > %s/05.treeplot-shell/work.sh' % (tree_dir, bin_dir, snakemake_method_dir, tree_dir, work_dir))
    commands.append('/data_center_03/USER/zhongwd/bin/qsge --queue all.q --memery 5G --jobs 13 --prefix py --lines 1 %s/05.treeplot-shell/work.sh' % work_dir)
    commands.append('cut -f1 ../../01.clean_reads/clean_reads.list | while read a; do cd %s/$a; perl %s/treeplot.newwick_new.pl test.info test.tax new_test.info>test.tre; python %s/treeplot.plot.py test.tre new_test.info tree.pdf; cd -; done' % (tree_dir, bin_dir, bin_dir))

    with open('%s/work.sh' % work_dir, 'w') as outf:
        outf.write('\n'.join(commands))
print "sample %s no in group" % samples[ind-1] return otu_in_group def write(otu_in_group, outfile): with open(outfile, 'w') as fp: for group, otus in otu_in_group.iteritems(): otus = sorted(list(otus), cmp=lambda a, b: cmp(int(a), int(b))) fp.write('%s\t%s\n' % (group, ' '.join(otus))) if __name__ == '__main__': params = read_params(sys.argv) dirname=params['outputdir'] mkdir(dirname) for_plot = dirname + '/for_plot.txt' tiff_file = dirname + '/venn.tiff' png_file = dirname + '/venn.png' vars = {'for_plot': for_plot, 'tiff_file': tiff_file, 'group_file': params['group_dir']} otu_in_group = read(params['microbial_profile'], params['group'], vars) write(otu_in_group, for_plot) r_job = Rparser() r_job.open(const.Rscript + '02_venn.R') r_job.format(vars) r_job.write(dirname+ '/otu_venn.R') r_job.run()
default="bray", help= "please set method Dissimilarity index, partial match to manhattan euclidean \ canberra bray kulczynski jaccard gower altGower morisita horn\ mountford raup binomial chao cao mahalanobis") args = parser.parse_args() params = vars(args) return params if __name__ == '__main__': reload(sys) sys.setdefaultencoding('utf8') params = read_params(sys.argv) tool_default_dir = const.tool_default_dir mkdir(params['out_dir']) use_method = params["method"] pdf_file = params['out_dir'] + "/" + use_method + '_adonis_pcoa.pdf' png_file = params['out_dir'] + "/" + use_method + '_adonis_pcoa.png' env = Environment(loader=FileSystemLoader(tool_default_dir), autoescape=False) template = env.get_template("t12_adonis_pcoa.R") Rtxt = template.render(tool_default_dir = tool_default_dir,\ profile_table =params['profile_table'],\ group_file =params['group_file'],\ pdf_file = pdf_file,\ method = use_method) with open("%s/adonis_pcoa.R" % params["out_dir"], "w") as fqout: fqout.write(Rtxt) Rrun("%s/adonis_pcoa.R" % params["out_dir"]) image_trans(pdf_file, png_file)
if __name__ == '__main__':
    # Script entry point: render the corr_heatmap.R template with the
    # command-line parameters, run the rendered script with Rscript and
    # convert the resulting PDF to PNG.
    import subprocess  # local import: only this entry point needs it

    # Python 2 idiom that makes utf8 the default string encoding.
    reload(sys)
    sys.setdefaultencoding('utf8')

    params = read_params(sys.argv)
    profile_table = params["profile_table"]
    profile_kegg = params["profile_kegg"]
    group_file = params["group"]
    out_dir = params["out_dir"]
    cutoff_p = params["cutoff"]
    cutoff_estimate = params["estimate"]
    level = params["level"]
    # Not named "filter", so the builtin stays usable at module level.
    filter_opt = params["filter"]
    default_dir = const.pipeline_dir
    pdf_out = "%s/corr_heatmap.pdf" % out_dir
    png_out = "%s/corr_heatmap.png" % out_dir
    r_script = "%s/corr_heatmap.R" % out_dir
    mkdir(out_dir)

    env = Environment(loader=FileSystemLoader("%s/../alternative/" % const.bin_default_dir),
                      autoescape=False)
    template = env.get_template("corr_heatmap.R")
    Rtxt = template.render(groupfile=group_file,
                           taxfile=profile_table,
                           keggfile=profile_kegg,
                           cutoff_p=cutoff_p,
                           cutoff_estimate=cutoff_estimate,
                           default_dir=default_dir,
                           pdf_out=pdf_out,
                           level=level,
                           filter=filter_opt)
    with open(r_script, "w") as fqout:
        fqout.write(Rtxt)

    # Argument lists instead of shell strings, so an output directory that
    # contains spaces or shell metacharacters is passed through intact.
    # As before, the exit status of either tool is not checked.
    subprocess.call(["Rscript", r_script])
    subprocess.call(["convert", "-density", "300", pdf_out, png_out])
params = vars(args) params['group'] = parse_group_file(params['group']) return params if __name__ == '__main__': params = read_params(sys.argv) outputfile = params['outputfile'] dirname,filename,suffix =get_name(outputfile) inputfile = params['inputfile'] top = params['top'] title = params['title'] data = pd.DataFrame.from_csv(file=inputfile,sep="\t") data["sum"] = data.sum(axis=1) data = data.sort_values(by="sum",ascending=False) del data["sum"] data = data.ix[:top] data.to_csv("%s/for_plot.csv"%dirname,sep="\t") mkdir(os.path.split(outputfile)[0]) RscriptDir = const.Rscript r_job = Rparser() r_job.open("%s/02_barplot.R"%RscriptDir) vars = {"top":top, "infile": "%s/for_plot.csv"%dirname, "pdf_file": outputfile, "title": title} r_job.format(vars) r_job.write("%s/bar_plot.R"%dirname) r_job.run() image_trans(300,"%s/%s.pdf"%(dirname,filename),"%s/%s.png"%(dirname,filename))