コード例 #1
0
ファイル: run_kegg.py プロジェクト: zhimenggan/RNAseq-4
 def run_kegg_pathview(self):
     """Run pathview and its checker for every edgeR result table.

     Each entry of ``self.diff_dir`` is treated as one comparison.  For
     every ``*.edgeR.DE_results.txt`` file inside it, a pathview command
     and a pathview-check command are built, the pathway output directory
     is created, and both commands are executed in that order.

     Returns:
         list: every command string that was executed, in execution order.
     """
     executed_cmds = []
     log_dir = path.join(self.out_dir, 'kegg_pathway_logs')
     python_tools.circ_mkdir_unix(log_dir)
     for compare_name in listdir(self.diff_dir):
         compare_diff_dir = path.join(self.diff_dir, compare_name)
         compare_out_dir = path.join(self.out_dir, compare_name)
         de_tables = glob(
             '{}/*.edgeR.DE_results.txt'.format(compare_diff_dir))
         for de_table in de_tables:
             prefix = path.basename(de_table).split('.edgeR.DE_results')[0]
             # Tables whose name carries no 'UP' tag are labelled '.ALL'.
             if 'UP' not in prefix:
                 prefix = '{}.ALL'.format(prefix)
             kegg_table = path.join(
                 compare_out_dir, '%s.kegg.enrichment.txt' % prefix)
             pathway_dir = path.join(compare_out_dir, '%s.pathway' % prefix)
             check_log = path.join(log_dir, '%s.log' % prefix)
             draw_cmd = 'python %s --kegg_table %s --blast_out %s --species %s --diff_out %s --out_dir %s' % (
                 PATHVIEW, kegg_table, self.all_blast_out, self.species,
                 de_table, pathway_dir)
             check_cmd = 'python %s --kegg_table %s --pathway_dir %s --log_file %s' % (
                 PATHVIEW_CK, kegg_table, pathway_dir, check_log)
             python_tools.circ_mkdir_unix(pathway_dir)
             for cmd in (draw_cmd, check_cmd):
                 python_tools.circ_call_process(cmd)
             executed_cmds += [draw_cmd, check_cmd]
     return executed_cmds
コード例 #2
0
ファイル: mRNA_pipe_v1.py プロジェクト: zhimenggan/RNAseq-4
 def run_result(self):
     """Rebuild the project result directory from every work directory.

     Any existing ``<project_dir>/<project_name>`` is removed first, the
     directory is recreated, and each entry of ``self.work_dir`` is copied
     into a same-named sub-directory via ``cp_dir_with_ignore``.
     """
     import os
     import shutil

     add_pipe_message(self.monitor_dir, 'generate_result_start')
     result_dir = path.join(self.project_dir, self.project_name)
     if path.exists(result_dir):
         # Remove without going through a shell: an unquoted `rm -rf <path>`
         # misbehaves on paths containing spaces or shell metacharacters
         # and hides deletion failures.
         if path.isdir(result_dir) and not path.islink(result_dir):
             shutil.rmtree(result_dir)
         else:
             # Plain file or symlink: drop the entry itself, as `rm` would.
             os.remove(result_dir)
     circ_mkdir_unix(result_dir)
     for each_dir in self.work_dir:
         each_dir_result = path.join(result_dir, path.basename(each_dir))
         cp_dir_with_ignore(each_dir, each_dir_result)
コード例 #3
0
ファイル: RNAseq_lib.py プロジェクト: zhimenggan/RNAseq-4
def get_diff_splicing_table(rmats_output, out_dir, pvalue=0.05):
    """Write the rows of a splicing table whose FDR passes a cutoff.

    A copy of ``rmats_output`` with every double quote stripped is written
    into ``out_dir`` under the same file name.  That copy is read as a
    tab-separated table, and the rows with ``FDR <= pvalue`` are written to
    ``out_dir/diff.<file name>`` (tab-separated, no index, missing values
    rendered as ``NA``).

    Args:
        rmats_output: path of the input table; it must have an ``FDR``
            column.
        out_dir: directory that receives both output files.
        pvalue: inclusive upper bound on ``FDR`` for a row to be kept.
    """
    circ_mkdir_unix(out_dir)
    rmats_output_name = path.basename(rmats_output)
    rmats_output_treat = path.join(out_dir, rmats_output_name)
    # Strip double quotes so the table parses as plain tab-separated text.
    system("sed -re 's/\"//g' {0} > {1}".format(rmats_output,
                                                rmats_output_treat))
    diff_rmats_out = path.join(out_dir, 'diff.{}'.format(rmats_output_name))
    rmats_output_df = pd.read_table(rmats_output_treat, sep='\t')
    # Honour the caller-supplied cutoff; it used to be hard-coded to 0.05,
    # which silently ignored the ``pvalue`` argument.
    diff_rmats_output_df = rmats_output_df[rmats_output_df.FDR <= pvalue]
    diff_rmats_output_df.to_csv(diff_rmats_out,
                                sep='\t',
                                index=False,
                                na_rep='NA')
コード例 #4
0
ファイル: prepare_pipe.py プロジェクト: zhimenggan/RNAseq-4
    def run(self):
        """Generate a STAR index and store the command output as the log.

        The index directory is ``<annotation_dir>/star_index``.  The
        reference FASTA, GTF, annotation directory and thread count are read
        from module-level names (``ref_fa``, ``ref_gtf``, ``annotation_dir``,
        ``STAR_THREAD``).  Whatever ``run_cmd`` returns is written to this
        task's output target.
        """
        index_dir = path.join(annotation_dir, 'star_index')
        circ_mkdir_unix(index_dir)

        # Assemble the argv as option/value pairs.
        build_cmd = ['STAR']
        build_cmd += ['--runThreadN', STAR_THREAD]
        build_cmd += ['--runMode', 'genomeGenerate']
        build_cmd += ['--sjdbOverhang', '149']
        build_cmd += ['--genomeFastaFiles', ref_fa]
        build_cmd += ['--sjdbGTFfile', ref_gtf]
        build_cmd += ['--genomeDir', index_dir]

        log_text = run_cmd(build_cmd)

        with self.output().open('w') as log_handle:
            log_handle.write(log_text)
コード例 #5
0
ファイル: prepare_pipe.py プロジェクト: zhimenggan/RNAseq-4
    def requires(self):
        """Publish the task parameters as module globals and list sub-tasks.

        Copies ``self.ref_fa``, ``self.ref_gtf`` and ``self.species_latin``
        into module-level globals, derives the remaining globals from them
        (species ids, genome/annotation/log directories, and the two
        directories read from the ``config.getrc()`` mapping), creates the
        log directory and prints ``ko_pep_dir``.

        Returns:
            list: one instance each of ``fa_index``, ``star_index``,
            ``go_annotation`` and ``ko_annotation``, in that order.
        """
        global ref_fa, genome_dir, ref_gtf, annotation_dir, species_latin, species_ensembl, species_kegg, log_dir, ko_pep_dir, ko_db_dir
        ref_fa = self.ref_fa
        ref_gtf = self.ref_gtf
        species_latin = self.species_latin
        species_kegg, species_ensembl = get_kegg_biomart_id(species_latin)

        genome_dir = path.dirname(ref_fa)
        annotation_dir = path.dirname(ref_gtf)
        log_dir = path.join(annotation_dir, 'logs')

        kobas_settings = config.getrc()
        ko_pep_dir = kobas_settings['blastdb']
        ko_db_dir = kobas_settings['kobasdb']

        circ_mkdir_unix(log_dir)
        # Single-argument form prints the same text as the print statement.
        print(ko_pep_dir)

        sub_tasks = (fa_index, star_index, go_annotation, ko_annotation)
        return [task() for task in sub_tasks]
コード例 #6
0
 def run(self):
     """Copy the files listed in ``.report_files`` and log the outcome.

     When ``<from_dir>/.report_files`` exists, the destination
     ``<to_dir>/<basename of from_dir>`` is created and ``rsync -av`` is run
     with that file as its ``--files-from`` list.  Otherwise a short
     "nothing for report" message is used instead.  Either way the resulting
     text is written to this task's output target.
     """
     src_name = path.basename(self.from_dir)
     dest_dir = path.join(self.to_dir, src_name)
     file_list = path.join(self.from_dir, '.report_files')
     if path.exists(file_list):
         circ_mkdir_unix(dest_dir)
         log_text = run_cmd([
             'rsync', '-av', '--files-from={}'.format(file_list),
             self.from_dir, dest_dir
         ])
     else:
         log_text = 'nothing for report in {}'.format(src_name)
     with self.output().open('w') as log_handle:
         log_handle.write(log_text)
コード例 #7
0
    def run(self):
        """Symlink the clean reads and mapped BAMs into the analysis-data dir.

        Creates ``<proj_dir>/<proj_name>_analysis_data`` and hands two
        ``ln -s`` commands to ``run_cmd``: ``clean_dir`` -> ``fq`` and
        ``<proj_dir>/mapping/bam_dir`` -> ``bam``.  ``proj_name`` and
        ``clean_dir`` are module-level names.  The value returned by
        ``run_cmd`` is written to this task's output target.
        """
        data_root = path.join(self.proj_dir,
                              '{}_analysis_data'.format(proj_name))
        bam_source = path.join(self.proj_dir, 'mapping', 'bam_dir')

        circ_mkdir_unix(data_root)

        # Both link commands are passed to run_cmd together, as one list.
        link_cmds = [
            ['ln', '-s', clean_dir, path.join(data_root, 'fq')],
            ['ln', '-s', bam_source, path.join(data_root, 'bam')],
        ]
        log_text = run_cmd(link_cmds)

        with self.output().open('w') as log_handle:
            log_handle.write(log_text)
コード例 #8
0
ファイル: run_kegg.py プロジェクト: zhimenggan/RNAseq-4
 def run_KEGG_enrich(self):
     """Run KEGG enrichment for every differential gene list.

     Each entry of ``self.diff_dir`` is treated as one comparison.  For
     every ``*.diffgenes.txt`` file inside it, the matching rows of
     ``self.all_blast_out`` are extracted into ``blast_out/<prefix>.blasttab``
     and, if that file was produced, the command built by
     ``self.generate_kobas`` is run.  When the enrichment table exists
     afterwards it is post-processed with ``self.treat_KEGG_table`` and
     ``txt_to_excel``.

     Returns:
         list: the executed command strings in order, interleaved with
         ``"## <path> not exists!"`` markers for every expected file that
         was not produced.
     """
     executed = []
     blast_out_dir = path.join(self.out_dir, 'blast_out')
     python_tools.circ_mkdir_unix(blast_out_dir)
     for compare_name in listdir(self.diff_dir):
         compare_diff_dir = path.join(self.diff_dir, compare_name)
         gene_lists = glob('{}/*.diffgenes.txt'.format(compare_diff_dir))
         compare_out_dir = path.join(self.out_dir, compare_name)
         python_tools.circ_mkdir_unix(compare_out_dir)
         for gene_list in gene_lists:
             prefix = path.basename(gene_list).split('.edgeR.DE_results')[0]
             kegg_table = path.join(
                 compare_out_dir, '%s.kegg.enrichment.txt' % prefix)
             blast_subset = path.join(blast_out_dir, '%s.blasttab' % prefix)
             extract_cmd = 'python %s --id %s --table %s --output %s' % (
                 EXTRACT_INF_BY_ID, gene_list, self.all_blast_out,
                 blast_subset)
             kegg_cmd = self.generate_kobas(blast_subset, kegg_table)

             python_tools.circ_call_process(extract_cmd)
             executed.append(extract_cmd)
             if not path.exists(blast_subset):
                 executed.append("## {} not exists!".format(blast_subset))
                 continue

             python_tools.circ_call_process(kegg_cmd)
             executed.append(kegg_cmd)
             if not path.exists(kegg_table):
                 executed.append("## {} not exists!".format(kegg_table))
                 continue

             # NOTE: a per-table pathview step used to be wired in after
             # these two calls; it is disabled and not run from here.
             self.treat_KEGG_table(kegg_table)
             txt_to_excel(kegg_table)
     return executed
コード例 #9
0
import sys
import os
import python_tools

# Usage check: exactly three positional arguments are required.
if len(sys.argv) != 4:
    # Single-argument print(...) emits the same text as the print statement.
    print('python ' + sys.argv[0] + ' sample_list qc_dir out_dir')
    # A usage error is a failure, so report it with a non-zero status.
    sys.exit(1)

sample_list_file = sys.argv[1]
qc_dir = sys.argv[2]
out_dir = sys.argv[3]

# One sample ID per line. Blank lines are skipped so they do not turn into
# empty sample IDs, and the file is closed as soon as it has been read.
with open(sample_list_file) as sample_list_inf:
    sample_list = [each.strip() for each in sample_list_inf if each.strip()]
sample_info_dict = {}
reads_quality_dir = os.path.join(out_dir, 'reads_quality')
python_tools.circ_mkdir_unix(reads_quality_dir)
# Merged table for all samples; the handle stays open for the code below.
merged_quality_file = os.path.join(reads_quality_dir, 'all.reads_quality.txt')
merged_quality_file_inf = open(merged_quality_file, 'w')
merged_quality_file_inf.write('Sample_ID\tQuality\tCount\tPercent\n')

for each_sample in sample_list:
    sample_info_dict[each_sample] = [0, 0, 0, []]
    reads_quality_dict = {}
    each_reads_quality_file = os.path.join(
        reads_quality_dir, '{}_reads_quality.txt'.format(each_sample))
    for n in (1, 2):
        each_qc_dir = os.path.join(qc_dir,
                                   '%s_%s.clean.fq_fastqc' % (each_sample, n))
        each_qc_file = os.path.join(each_qc_dir, 'fastqc_data.txt')
        with open(each_qc_file) as each_qc_file_info:
            q30_flag = 0
コード例 #10
0
out_dir = sys.argv[4]

compare_list = [each.strip() for each in open(compare_file)]

def run_cmd(cmd):
    """Run *cmd* without a shell and return everything it wrote to stdout.

    Args:
        cmd: the command as an argument list (``shell=False``).

    Returns:
        str: the captured standard output, decoded as text.
    """
    p = subprocess.Popen(cmd, shell=False, universal_newlines=True, stdout=subprocess.PIPE)
    # communicate() drains the pipe while waiting for the child to exit.
    # Calling wait() first deadlocks once the child fills the OS pipe
    # buffer, because nothing is reading its stdout.
    output = p.communicate()[0]
    return output

## cp diff table
for each in compare_list:
    each_de_results = os.path.join(diff_dir, 'genes.counts.matrix.{0}.edgeR.DE_results'.format(each))
    each_de_results_dir = os.path.join(out_dir, each)
    if not os.path.exists(each_de_results_dir):
        python_tools.circ_mkdir_unix(each_de_results_dir)
    each_de_results_out_tmp = os.path.join(each_de_results_dir, 'tmp.{0}.edgeR.DE_results.txt'.format(each))
    each_de_results_out = os.path.join(each_de_results_dir, '{0}.edgeR.DE_results.txt'.format(each))
    ## add table header
    cmd_list  = []
    cmd_list.append(['cp', each_de_results, each_de_results_out_tmp])
    cmd_list.append(['python', '/home/lxgui/scripts/diff_table_add_header.py', '--table', each_de_results_out_tmp, '--add_info', 'Gene_ID'])
    ## vocalno plot
    cmd_list.append(['Rscript', '/home/lxgui/scripts/Volcano_Plot_20160406.R', each_de_results_out_tmp, each, each_de_results_dir, '0.001', '2'])
    cmd_list.append(['python', '/home/lxgui/scripts/add_gene_anno_v2.py', each_de_results_out_tmp, gene_anno, each_de_results_out])
    cmd_list.append(['rm', each_de_results_out_tmp])
    each_sub_reg_list = each.split('_vs_')
    for each_sub in each_sub_reg_list:
        name = '%s-UP' % each_sub
        each_sub_diff_result = glob.glob(r'{0}/genes.counts.matrix.{1}.edgeR.DE_results.*.{2}.subset'.format(diff_dir, each, name))[0]
        each_sub_diff_out = os.path.join(each_de_results_dir, '{0}.{1}.subset.txt'.format(each, name))
コード例 #11
0
                            sample_dict[sample_id] = RNAseq_tools.rawdata()
                            sample_dict[sample_id].name = sample_id
                        else:
                            pass
                        if each_fq_path.endswith(
                                'R1.fastq.gz'
                        ) and each_fq_path not in sample_dict[sample_id].read1:
                            sample_dict[sample_id].read1.append(each_fq_path)
                        elif each_fq_path.endswith(
                                'R2.fastq.gz'
                        ) and each_fq_path not in sample_dict[sample_id].read2:
                            sample_dict[sample_id].read2.append(each_fq_path)
                        else:
                            pass

python_tools.circ_mkdir_unix(args.out_dir)
time_info = time.localtime()
output_time = '%s-%s-%s-%s:%s:%s' % (time_info.tm_year, time_info.tm_mon,
                                     time_info.tm_mday, time_info.tm_hour,
                                     time_info.tm_min, time_info.tm_sec)

md5_file = os.path.join(args.out_dir, 'fq_md5.txt')
md5_list = []
for each in sample_dict:
    if args.nocheck:
        logging.info('get md5 of %s start' % each)
        log_list = sample_dict[each].get_dna_md5()
        logging.info('get md5 of %s finished' % each)
    else:
        logging.info('check md5 of %s start' % each)
        if args.type == 'dna':
コード例 #12
0
import time
import json
import re

# Script setup: parse the command line, create the output directory and
# derive the timestamped file names used by the rest of the script.
cwd = os.getcwd()
parser = argparse.ArgumentParser()
parser.add_argument('--sample_map',
                    help='id map wgc id to sample id',
                    required=True)
parser.add_argument('--out_dir', help='output directory', required=True)
parser.add_argument('--rawdata_list',
                    help='directory list file',
                    required=True)
args = parser.parse_args()

python_tools.circ_mkdir_unix(args.out_dir)
# Timestamp built from the local time; the fields are not zero-padded and
# the time part is ':'-separated. It is embedded in the file names below.
time_info = time.localtime()
output_time = '%s-%s-%s-%s:%s:%s' % (time_info.tm_year, time_info.tm_mon,
                                     time_info.tm_mday, time_info.tm_hour,
                                     time_info.tm_min, time_info.tm_sec)

data_dir_list_file = args.rawdata_list
data_dir_list = python_tools.file_to_list(data_dir_list_file)

# Two lookups built from the same tab-separated map file with the column
# arguments swapped. Presumably (1, 2) maps column 1 to column 2 and (2, 1)
# the reverse; confirm against python_tools.table_to_dict.
wgc_to_sample_dict = python_tools.table_to_dict(args.sample_map, 1, 2, False,
                                                '\t')
sample_to_wgc_dict = python_tools.table_to_dict(args.sample_map, 2, 1, False,
                                                '\t')

# Both files live in the current working directory, not in --out_dir.
cp_cmd = os.path.join(cwd, '%s_cp.sh' % output_time)
cp_data_info_file = os.path.join(cwd, '%s_rawdata.info' % output_time)
コード例 #13
0
import sys
import os
import argparse
import re
import python_tools

# Script setup: parse the command line, make sure the output directory
# exists and open the list file that is written while parsing the input.
parser = argparse.ArgumentParser()
parser.add_argument('--maf_fasta', help='maf file.', required=True)
#parser.add_argument('--species', help = 'maf species.', required = True)
parser.add_argument('--out_dir', help='output directory', required=True)
args = parser.parse_args()

# Work with an absolute path so the output file paths built from it are
# absolute as well.
out_dir = os.path.abspath(args.out_dir)

if not os.path.exists(out_dir):
    python_tools.circ_mkdir_unix(out_dir)

# IDs already encountered; filled in by the parsing loop that follows.
id_dict = {}

# The handle is opened here and left open for the code that follows.
outfile_list = os.path.join(out_dir, 'maf_fasta.list')
outfile_list_info = open(outfile_list, 'w')

with open(args.maf_fasta) as maf_fasta_info:
    for eachline in maf_fasta_info:
        eachline = eachline.strip()
        if '>' in eachline:
            header = re.sub('.TU', '|TU', eachline)
            tr_id = header.split('|')[1]
            if tr_id not in id_dict:
                id_dict[tr_id] = 1
                each_tr_out_file = os.path.join(out_dir, '%s.fa' % tr_id)
コード例 #14
0
ファイル: sRNA.qc.py プロジェクト: zhimenggan/pipe_v1.1
from __future__ import division
import sys
import os
import python_tools

# Usage check: exactly three positional arguments are required.
if len(sys.argv) != 4:
    # The message previously read "print <script> sRNA.analsys.dir ...";
    # it now names the interpreter and spells the placeholder correctly.
    print('    python ' + sys.argv[0] +
          ' sRNA.analysis.dir sRNA.qc.summary sRNA.length.dir')
    sys.exit(1)

sRNA_analysis_dir = sys.argv[1]
out_qc_summary = sys.argv[2]
sRNA_length_dir = sys.argv[3]

python_tools.circ_mkdir_unix(sRNA_length_dir)
sRNA_ori_summary_file = os.path.join(sRNA_analysis_dir, 'SampleSummary.xls')
# sample ID -> [total reads, mapped reads]
sRNA_data_dict = {}

with open(sRNA_ori_summary_file) as sRNA_ori_summary_file_inf:
    for n, eachline in enumerate(sRNA_ori_summary_file_inf):
        # Skip the header row, and blank lines, which would otherwise raise
        # IndexError on the column lookups below.
        if n == 0 or not eachline.strip():
            continue
        eachline_inf = eachline.strip().split('\t')
        sample_id = eachline_inf[0]
        # The 5th and 6th tab-separated fields are read as integer counts.
        total_reads = int(eachline_inf[4])
        mapped_reads = int(eachline_inf[5])
        # mapping_rate = 100*round(mapped_reads/mapped_reads, 2)
        # mapping_rate_rep = '%s%%' mapping_rate
        sRNA_data_dict[sample_id] = [total_reads, mapped_reads]

# The handle is opened here and left open for the code that follows.
sample_length_merged_file = os.path.join(sRNA_length_dir, 'Sample.length.txt')
sample_length_merged_file_inf = open(sample_length_merged_file, 'w')
コード例 #15
0
                    required=True)
parser.add_argument('--seq_dir_name',
                    help='sequencing data names, sep with ","',
                    required=True)
parser.add_argument('--analysis_data_dir',
                    help='directory store analysis data',
                    required=True)
parser.add_argument('--sample_map',
                    help='file map sample id to wgc id',
                    required=True)
args = parser.parse_args()

sample_map_dict = python_tools.table_to_dict(args.sample_map, 1, 2, False,
                                             '\t')

python_tools.circ_mkdir_unix(args.analysis_data_dir)

sample_data_dict = {}
seq_dir_name_list = args.seq_dir_name.split(',')
for each_name in seq_dir_name_list:
    each_dir = os.path.join(args.seq_data_dir, each_name)
    each_dir_files = os.listdir(each_dir)
    for each_file in each_dir_files:
        each_file_path = os.path.join(each_dir, each_file)
        if os.path.isdir(each_file_path):
            wgc_id = each_file
            sample_id = sample_map_dict[wgc_id]
            seq_files = os.listdir(each_file_path)
            if sample_id not in sample_data_dict:
                sample_data_dict[sample_id] = RNAseq_tools.rawdata()
                sample_data_dict[sample_id].name = sample_id
コード例 #16
0
ファイル: go_annotate.py プロジェクト: zhimenggan/pipe_v1.1
    print '    python ' + sys.argv[
        0] + ' compare_list_file enrich_dir go_file out_dir'
    sys.exit(0)

compare_list_file = sys.argv[1]
enrich_dir = sys.argv[2]
go_file = sys.argv[3]
out_dir = sys.argv[4]

compare_list = [each.strip() for each in open(compare_list_file)]

for each in compare_list:
    each_enrich = os.path.join(enrich_dir, each)
    each_enrich_out = os.path.join(out_dir, each)
    if not os.path.exists(each_enrich_out):
        python_tools.circ_mkdir_unix(each_enrich_out)
    reg_list = ['ALL']
    each_sub_reg_list = each.split('_vs_')
    reg_list.extend(each_sub_reg_list)
    for each_sub in reg_list:
        name = each_sub
        if each_sub != 'ALL':
            name = '%s-UP' % each_sub
        each_diff_file1 = os.path.join(each_enrich, '%s-target.list' % name)
        each_diff_file2 = os.path.join(each_enrich, '%s.list' % name)
        if os.path.exists(each_diff_file1):
            each_diff_file = each_diff_file1
        else:
            each_diff_file = each_diff_file2
        each_go_file1 = os.path.join(each_enrich,
                                     '%s.%s.GO.enrich.xls' % (each, name))