Example #1
0
def genSpec(baseDir, server='smc1', genome='hg19'):
	"""Return the step specifications of the phylogenetic tree pipeline.

	Two steps are described, in order: 'merge counts' and 'build tree'.
	Each step is a dict carrying the function object ('fun'), its argument
	tuple ('paramL') and keyword dict ('paramH'), the postfix of its log
	file, a completion predicate ('logExistsFn'), the postfixes of its
	output files and the 'clean'/'rerun' flags.

	baseDir -- directory handed to both steps as their first two arguments
	server, genome -- accepted but not used by this spec
	"""
	mybasic.add_module_path(['NGS/phylotree'])
	import merge_count, make_phylotree

	return [
		{
		'name': 'merge counts',
		'desc': 'merge mutation loci and allele counts',
		'fun': merge_count.merge_count,
		'paramL': (baseDir, baseDir, 5, 0.05, 5),
#		'paramL': (baseDir, baseDir, 20, 0.2, 5),
		'paramH': {},
		'logPostFix': '.merge_count.log',
		# x is indexed like a sequence of log lines (presumably the log file
		# content); the length guard makes an empty log count as "not done"
		# instead of raising IndexError on x[-1].
		'logExistsFn': lambda x: len(x)>0 and 'done' in x[-1],
		'outFilePostFix': ['.mutations','.filtered'],
		'clean': False,
		'rerun': False
		},

		{
		'name': 'build tree',
		'desc': 'make phylogenetic tree',
		'fun': make_phylotree.main,
		'paramL': (baseDir, baseDir),
		'paramH': {},
		'logPostFix': '.make_phylotree.log',
		# same empty-log guard as above
		'logExistsFn': lambda x: len(x)>0 and 'done' in x[-1],
		'outFilePostFix': ['.infile', '.outfile','.tree','.pars_tree.pdf','.outfile_report.txt'],
		'clean': False,
		'rerun': False
		},
	]
Example #2
0
def genSpec(baseDir, server='smc1', genome='hg19'):
    """Return the step specifications of the paired somatic mutation pipeline.

    Two steps are described, in order: 'Run MuTect' and
    'somaticindeldetector'.  Each step is a dict carrying the function
    object ('fun'), its argument tuple ('paramL') and keyword dict
    ('paramH'), the postfix of its log file, a completion predicate
    ('logExistsFn'), the postfixes of its output files and the
    'clean'/'rerun' flags.

    baseDir -- directory handed to both steps as their first two arguments
    server, genome -- forwarded to both steps inside 'paramL'
    """
    mybasic.add_module_path(['NGS/mutation'])
    import mutect_batch, somaticindeldetector_batch

    return [  ## PARAMETERS
        {
            'name': 'Run MuTect',
            'desc': '.recal.bam -> .mutect, mutect.vcf',
            'fun': mutect_batch.mutect_pair,
            'paramL': (baseDir, baseDir, genome, server, False),
            'paramH': {},
            'logPostFix': '.mutect_pair.log',
            # 'done' is looked for on the 9th line from the end; a log
            # shorter than that counts as "not done" instead of raising
            # IndexError.
            'logExistsFn': lambda x: len(x) >= 9 and 'done' in x[-9],
            'outFilePostFix': ['.mutect', '.mutect_pair.vcf'],
            'clean': False,
            'rerun': False
        },
        {
            'name': 'somaticindeldetector',
            'desc': '.recal.bam -> indels_filter.vcf',
            'fun': somaticindeldetector_batch.paired_mode,
            'paramL': (baseDir, baseDir, 'SS', genome, server, False),
            'paramH': {},
            'logPostFix': '.somaticindeldetector_pair.log',
            # True when 'chrX' occurs in one of the last two entries; the
            # slice also copes with logs holding fewer than two entries.
            'logExistsFn': lambda x: any('chrX' in line for line in x[-2:]),
            'outFilePostFix':
            ['indels_pair_filter.vcf', 'indels_pair_filter.out'],
            'clean': False,
            'rerun': False
        },
    ]
Example #3
0
def genSpec(baseDir, server='smc1', genome='hg19'):
	"""Build the step list of the corrected copy number pipeline.

	The returned list holds four step dicts, in order: copy number
	correction, segmentation, per-gene copy number and plotting.  Every
	dict has the keys name, desc, fun, paramL, paramH, logPostFix,
	logExistsFn, outFilePostFix, clean and rerun.
	"""
	mybasic.add_module_path(['NGS/copynumber'])
	import cn_corr_batch
	import corrcgh2seg_batch
	import drawCNATraj_batch
	import corrseg2gene_batch

	correctStep = dict(
		name='copy number correction',
		desc='ngCGH -> corr.ngCGH',
		fun=cn_corr_batch.main,
		paramL=(baseDir, baseDir, False, server),
		paramH={},
		logPostFix='.cn_corr.qlog',
		logExistsFn=lambda logL: len(logL) == 0,
		outFilePostFix=['corr.ngCGH'],
		clean=False,
		rerun=False,
	)

	segmentStep = dict(
		name='Segmenation',
		desc='corr.ngCGH -> corr.ngCGH.seg',
		fun=corrcgh2seg_batch.cgh2seg,
		paramL=(baseDir, baseDir, False),
		paramH={},
		logPostFix='.corr.seg.qlog',
		logExistsFn=lambda logL: len(logL) > 0 and 'Centrality parameter' in logL[-1],
		outFilePostFix=['corr.ngCGH.seg'],
		clean=False,
		rerun=False,
	)

	geneStep = dict(
		name='Calculate gene copy number from segments',
		desc='corr.seg -> corr.cn_gene.dat',
		fun=corrseg2gene_batch.main,
		paramL=(baseDir, baseDir, mysetting.refFlatH[server][genome], [], False),
		paramH={},
		logPostFix='.corr.cn_gene.log',
		logExistsFn=lambda logL: len(logL) > 0 and 'ZZZ3' in logL[-1],
		outFilePostFix=['corr.cn_gene.dat'],
		clean=False,
		rerun=False,
	)

	# Earlier variant of the plot step, kept for reference:
	#	'fun': drawCNATraj.main,
	#	'paramL': (baseDir, baseDir),
	plotStep = dict(
		name='Plot corrected segmentation',
		desc='Plot segmentations for corrected copy number profile',
		fun=drawCNATraj_batch.draw_single,
		paramL=(baseDir, baseDir, genome),
		paramH={},
		logPostFix='',
		logExistsFn=lambda logL: True,
		outFilePostFix=[],
		clean=False,
		rerun=False,
	)

	return [correctStep, segmentStep, geneStep, plotStep]
Example #4
0
def genSpec_single(baseDir, server="smc1", genome="hg19"):
    """Return the step specifications of the single-sample mutation pipeline.

    Two steps are described, in order: 'Run MuTect (single)' and
    'somaticindeldetector'.  Each step is a dict carrying the function
    object ("fun"), its argument tuple ("paramL") and keyword dict
    ("paramH"), the postfix of its log file, a completion predicate
    ("logExistsFn"), the postfixes of its output files and the
    "clean"/"rerun" flags.

    baseDir -- directory handed to the steps through "paramL"
    server, genome -- forwarded to both steps inside "paramL"
    """
    mybasic.add_module_path(["NGS/mutation"])
    import mutect_batch, somaticindeldetector_batch

    return [  ## PARAMETERS
        {
            "name": "Run MuTect (single)",
            "desc": ".recal.bam -> .mutect, mutect_single_filter.vcf",
            "fun": mutect_batch.mutect_PON,
            "paramL": (baseDir, genome, server, False),
            "paramH": {},
            "logPostFix": ".mutect_single.log",
            # "done" is looked for on the 9th line from the end; a log
            # shorter than that counts as "not done" instead of raising
            # IndexError.
            "logExistsFn": lambda x: len(x) >= 9 and "done" in x[-9],
            "outFilePostFix": ["mutect_single_filter.vcf"],
            "clean": False,
            "rerun": False,
        },
        {
            "name": "somaticindeldetector",
            "desc": ".recal.bam -> indels_single_filter.vcf",
            "fun": somaticindeldetector_batch.single_mode,
            "paramL": (baseDir, baseDir, "SS", genome, server, False),
            "paramH": {},
            "logPostFix": ".somaticindeldetector_single.log",
            # True when "chrX" occurs in one of the last two entries; the
            # slice also copes with logs holding fewer than two entries.
            "logExistsFn": lambda x: any("chrX" in line for line in x[-2:]),
            "outFilePostFix": ["indels_single_filter.vcf", "indels_single_filter.out"],
            "clean": False,
            "rerun": False,
        },
    ]
Example #5
0
def genSpec(baseDir, server='smc1', genome='hg19'):
	"""Build the step list of the copy number pipeline.

	The returned list holds four step dicts, in order: ngCGH, segmentation,
	per-gene copy number and plotting.  Every dict has the keys name, desc,
	fun, paramL, paramH, logPostFix, logExistsFn, outFilePostFix, clean and
	rerun.
	"""
	mybasic.add_module_path(['NGS/copynumber'])
	import ngCGH_batch
	import cgh2seg_batch
	import seg2gene_batch
	import drawCNATraj_batch

	ngcghStep = dict(
		name='run ngCGH for pairs of bam',
		desc='bam -> .ngCGH',
		fun=ngCGH_batch.main,
		paramL=(baseDir, baseDir, 1000, False),
		paramH={},
		logPostFix='.cn_ngCGH.log',
		logExistsFn=lambda logL: len(logL) > 0 and 'finalizers' in logL[-1],
		outFilePostFix=['ngCGH'],
		clean=False,
		rerun=False,
	)

	segmentStep = dict(
		name='Segmenation',
		desc='ngCGH -> seg',
		fun=cgh2seg_batch.cgh2seg,
		paramL=(baseDir, baseDir, False),
		paramH={},
		logPostFix='.seg.qlog',
		logExistsFn=lambda logL: len(logL) > 0 and 'Centrality parameter' in logL[-1],
		outFilePostFix=['ngCGH.seg'],
		clean=False,
		rerun=False,
	)

	geneStep = dict(
		name='Calculate gene copy number from segments',
		desc='seg -> cn_gene.dat',
		fun=seg2gene_batch.main,
		paramL=(baseDir, baseDir, mysetting.refFlatH[server][genome], [], False),
		paramH={},
		logPostFix='.cn_gene.log',
		logExistsFn=lambda logL: len(logL) > 0 and 'ZZZ3' in logL[-1],
		outFilePostFix=['cn_gene.dat'],
		clean=False,
		rerun=False,
	)

	# Alternative entry point, kept for reference:
	#	'fun' : drawCNATraj_batch.batch,
	plotStep = dict(
		name='Draw Plot',
		desc='seg->plot',
		fun=drawCNATraj_batch.draw_single,
		paramL=(baseDir, '/EQL1/NSL/WXS/results/CNA', genome),
		paramH={},
		logPostFix='',
		logExistsFn=lambda logL: True,
		outFilePostFix=[],
		clean=False,
		rerun=False,
	)

	return [ngcghStep, segmentStep, geneStep, plotStep]
Example #6
0
def genSpec(baseDir, server='smc1', genome='hg19'):
	"""Describe the eiJunc pipeline as a list of step dicts.

	Only the 'Filter eiJunc' step is active; the 'bam to fastq' and 'Align'
	specs are kept below as comments.  server and genome are accepted but
	not used by this spec.
	"""
	mybasic.add_module_path(['NGS/fastq','NGS/align','NGS/splice_gsnap/ei_junc'])
	import bam2fastq_batch2
	import gsnap_splice_batch
	import ei_junc_batch

	# Disabled steps, kept for reference:
#		{
#		'name': 'bam to fastq',
#		'desc': 'bam -> fastq',
#		'fun': bam2fastq_batch2.bam2fastq_batch2,
#		'paramL':(baseDir, baseDir, 'UNCID_[0-9]{7}\.(.*)\.sorted_.*'),
#		'paramH': {},
#		'logPostFix': 'fastq.log',
#		'logExistsFn': lambda x: len(x)>0 and 'Samples' in x[-1],
#		'outFilePostFix': ['fastq'],
#		'clean': False,
#		'rerun': False
#		},
#
#		{
#		'name': 'Align',
#		'desc': 'fastq -> splice.gsnap',
#		'fun': gsnap_splice_batch.align,
#		'paramL':(baseDir, baseDir, 6, False, False),
#		'paramH': {},
#		'logPostFix': 'gsnap.qlog',
#		'logExistsFn': lambda x: len(x)>0 and 'Processed' in x[-1],
#		'outFilePostFix': ['splice.gsnap'],
#		'clean': False,
#		'rerun': False
#		},

	filterStep = dict(
		name='Filter eiJunc',
		desc='splice.gsnap.gz -> ei.dat',
		fun=ei_junc_batch.main,
		paramL=(baseDir, baseDir, False),
		paramH={},
		logPostFix='.ei.qlog',
		logExistsFn=lambda logL: len(logL) > 0 and 'Finished' in logL[-1],
		outFilePostFix=['ei.dat'],
		clean=False,
		rerun=False,
	)

	return [filterStep]
Example #7
0
def genSpec(baseDir, server='smc1', genome='hg19'):
    """Describe the eiJunc pipeline as a list of step dicts.

    Only the 'Filter eiJunc' step is active; the 'bam to fastq' and 'Align'
    specs are kept below as comments.  server and genome are accepted but
    not used by this spec.
    """
    modulePathL = ['NGS/fastq', 'NGS/align', 'NGS/splice_gsnap/ei_junc']
    mybasic.add_module_path(modulePathL)
    import bam2fastq_batch2
    import gsnap_splice_batch
    import ei_junc_batch

    stepL = []

    # Disabled steps, kept for reference:
    #		{
    #		'name': 'bam to fastq',
    #		'desc': 'bam -> fastq',
    #		'fun': bam2fastq_batch2.bam2fastq_batch2,
    #		'paramL':(baseDir, baseDir, 'UNCID_[0-9]{7}\.(.*)\.sorted_.*'),
    #		'paramH': {},
    #		'logPostFix': 'fastq.log',
    #		'logExistsFn': lambda x: len(x)>0 and 'Samples' in x[-1],
    #		'outFilePostFix': ['fastq'],
    #		'clean': False,
    #		'rerun': False
    #		},
    #
    #		{
    #		'name': 'Align',
    #		'desc': 'fastq -> splice.gsnap',
    #		'fun': gsnap_splice_batch.align,
    #		'paramL':(baseDir, baseDir, 6, False, False),
    #		'paramH': {},
    #		'logPostFix': 'gsnap.qlog',
    #		'logExistsFn': lambda x: len(x)>0 and 'Processed' in x[-1],
    #		'outFilePostFix': ['splice.gsnap'],
    #		'clean': False,
    #		'rerun': False
    #		},

    stepL.append(dict(
        name='Filter eiJunc',
        desc='splice.gsnap.gz -> ei.dat',
        fun=ei_junc_batch.main,
        paramL=(baseDir, baseDir, False),
        paramH={},
        logPostFix='.ei.qlog',
        logExistsFn=lambda logL: len(logL) > 0 and 'Finished' in logL[-1],
        outFilePostFix=['ei.dat'],
        clean=False,
        rerun=False,
    ))

    return stepL
Example #8
0
def genSpec(baseDir, server='smc1', genome='hg19'):
	"""Describe the mutation clonality pipeline as a one-element step list.

	The single step runs mut_clonality_batch.main with baseDir (twice),
	mysetting.cnaBaseDir, False and server.  genome is accepted but not
	used by this spec.
	"""
	mybasic.add_module_path(['NGS/mutation'])
	import mut_clonality_batch

	clonalityStep = dict(
		name='determine mutation clonality',
		desc='mutect -> mutect_cl.dat',
		fun=mut_clonality_batch.main,
		paramL=(baseDir, baseDir, mysetting.cnaBaseDir, False, server),
		paramH={},
		logPostFix='.mutect_cl.log',
		logExistsFn=lambda logL: len(logL) == 0,
		outFilePostFix=['mutect_cl.dat'],
		clean=False,
		rerun=False,
	)

	return [clonalityStep]
Example #9
0
def main(datFileN, server='smc1', dbN='CancerSCAN'):
    """Annotate the CancerSCAN mutation calls and reload them into MySQL.

    Runs vep_batch.main over every '*CS' entry under mysetting.CSmutDir,
    pipes the '*filter_vep.dat' files through
    prepDB_mutation_cancerscan.py into datFileN, resets the 'mutation_cs'
    table from that file and then rewrites the 'XSeq_CS' row of sample_tag
    for each sample directory.

    datFileN -- path of the data file written by the shell pipeline
    server -- key into mysetting.mysqlH selecting the DB credentials
    dbN -- database name
    """
    mybasic.add_module_path(['NGS/mutation', 'Integration'])

    import vep_batch
    import makeDB_mutation_rxsq

    csDirPattern = mysetting.CSmutDir + '/*CS'
    print(csDirPattern)
    vcfPostfixL = [
        '.mutect_filter.vcf', '.mutect_single_filter.vcf',
        '.indels_filter.vcf', '.indels_single_filter.vcf'
    ]
    vep_batch.main(glob(csDirPattern), postfixL=vcfPostfixL, fork=True)

    prepCmd = (
        'cat %s/*CS/*filter_vep.dat | /usr/bin/python %s/Integration/prepDB_mutation_cancerscan.py > %s'
        % (mysetting.CSmutDir, mysetting.SRC_HOME, datFileN))
    os.system(prepCmd)

    # credentials of the selected server, shared by both DB calls below
    dbH = mysetting.mysqlH[server]
    mymysql.reset_table(tableN='mutation_cs',
                        dataFileN=datFileN,
                        user=dbH['user'],
                        passwd=dbH['passwd'],
                        db=dbN,
                        host=dbH['host'])
    (con, cursor) = mymysql.connectDB(user=dbH['user'],
                                      passwd=dbH['passwd'],
                                      db=dbN,
                                      host=dbH['host'])

    sampNL = [
        entryN for entryN in os.listdir(mysetting.CSmutDir)
        if os.path.isdir(mysetting.CSmutDir + '/' + entryN)
    ]
    deleteSql = '''DELETE FROM sample_tag WHERE samp_id="%s" AND tag="XSeq_CS"'''
    insertSql = '''INSERT INTO sample_tag SET samp_id="%s",tag="XSeq_CS"'''
    for sampN in sampNL:
        tokenL = sampN.split('_')
        sampId = '_'.join(tokenL[:-2])
        postfix = tokenL[-2]
        if postfix == 'B':
            # 'B' samples get no tag
            continue
        if postfix != 'T':
            # anything but a plain 'T' keeps its postfix in the sample id
            sampId = '%s_%s' % (sampId, postfix)
        cursor.execute(deleteSql % sampId)
        cursor.execute(insertSql % sampId)
def genSpec(baseDir, server='smc1', genome='hg19'):
    """Return the step specifications of the phylogenetic tree pipeline.

    Two steps are described, in order: 'merge counts' and 'build tree'.
    Each step is a dict carrying the function object ('fun'), its argument
    tuple ('paramL') and keyword dict ('paramH'), the postfix of its log
    file, a completion predicate ('logExistsFn'), the postfixes of its
    output files and the 'clean'/'rerun' flags.

    baseDir -- directory handed to both steps as their first two arguments
    server, genome -- accepted but not used by this spec
    """
    mybasic.add_module_path(['NGS/phylotree'])
    import merge_count, make_phylotree

    return [
        {
            'name': 'merge counts',
            'desc': 'merge mutation loci and allele counts',
            'fun': merge_count.merge_count,
            'paramL': (baseDir, baseDir, 5, 0.05, 5),
            #		'paramL': (baseDir, baseDir, 20, 0.2, 5),
            'paramH': {},
            'logPostFix': '.merge_count.log',
            # x is indexed like a sequence of log lines (presumably the log
            # file content); the length guard makes an empty log count as
            # "not done" instead of raising IndexError on x[-1].
            'logExistsFn': lambda x: len(x) > 0 and 'done' in x[-1],
            'outFilePostFix': ['.mutations', '.filtered'],
            'clean': False,
            'rerun': False
        },
        {
            'name': 'build tree',
            'desc': 'make phylogenetic tree',
            'fun': make_phylotree.main,
            'paramL': (baseDir, baseDir),
            'paramH': {},
            'logPostFix': '.make_phylotree.log',
            # same empty-log guard as above
            'logExistsFn': lambda x: len(x) > 0 and 'done' in x[-1],
            'outFilePostFix': [
                '.infile', '.outfile', '.tree', '.pars_tree.pdf',
                '.outfile_report.txt'
            ],
            'clean': False,
            'rerun': False
        },
    ]
Example #11
0
def main(datFileN, server='smc1', dbN='CancerSCAN'):
	"""Annotate the CancerSCAN mutation calls and reload them into MySQL.

	Runs vep_batch.main over every '*CS' entry under mysetting.CSmutDir,
	pipes the '*filter_vep.dat' files through
	prepDB_mutation_cancerscan.py into datFileN, resets the 'mutation_cs'
	table from that file and then rewrites the 'XSeq_CS' row of sample_tag
	for each sample directory.

	datFileN -- path of the data file written by the shell pipeline
	server -- key into mysetting.mysqlH selecting the DB credentials
	dbN -- database name
	"""
	mybasic.add_module_path(['NGS/mutation','Integration'])

	import vep_batch
	import makeDB_mutation_rxsq

	csDirPattern = mysetting.CSmutDir+'/*CS'
	print(csDirPattern)
	vcfPostfixL = ['.mutect_filter.vcf','.mutect_single_filter.vcf','.indels_filter.vcf','.indels_single_filter.vcf']
	vep_batch.main(glob(csDirPattern), postfixL=vcfPostfixL, fork=True)

	prepCmd = 'cat %s/*CS/*filter_vep.dat | /usr/bin/python %s/Integration/prepDB_mutation_cancerscan.py > %s' % (mysetting.CSmutDir, mysetting.SRC_HOME, datFileN)
	os.system(prepCmd)

	# credentials of the selected server, shared by both DB calls below
	dbH = mysetting.mysqlH[server]
	mymysql.reset_table(tableN='mutation_cs', dataFileN=datFileN, user=dbH['user'], passwd=dbH['passwd'], db=dbN, host=dbH['host'])
	(con, cursor) = mymysql.connectDB(user=dbH['user'], passwd=dbH['passwd'], db=dbN, host=dbH['host'])

	sampNL = [entryN for entryN in os.listdir(mysetting.CSmutDir) if os.path.isdir(mysetting.CSmutDir+'/'+entryN)]
	for sampN in sampNL:
		tokenL = sampN.split('_')
		sampId = '_'.join(tokenL[:-2])
		postfix = tokenL[-2]
		if postfix == 'B':
			# 'B' samples get no tag
			continue
		if postfix != 'T':
			# anything but a plain 'T' keeps its postfix in the sample id
			sampId = '%s_%s' % (sampId, postfix)
		for sqlTemplate in (
			'''DELETE FROM sample_tag WHERE samp_id="%s" AND tag="XSeq_CS"''',
			'''INSERT INTO sample_tag SET samp_id="%s",tag="XSeq_CS"''',
		):
			cursor.execute(sqlTemplate % sampId)
Example #12
0
def genSpec(baseDir, server='smc1', genome='hg19'):
	"""Build the step list of the LOH / purity pipeline.

	Eight step dicts are returned, in order: MutScan, delta BAF
	calculation, delta BAF segmentation, plotting, CNLOH/LOH determination,
	gene LOH, normal contamination and tumor fraction.  Every dict has the
	keys name, desc, fun, paramL, paramH, logPostFix, logExistsFn,
	outFilePostFix, clean and rerun.
	"""
	mybasic.add_module_path(['NGS/mutation','NGS/loh','NGS/purity'])
	import mutScan_loh_batch
	import delta_baf_mutscan_batch
	import delta_baf_seg_batch
	import calcCN_LOH_batch
	import loh2gene_batch
	import calcNormalF_loh_batch
	import peakFrac_batch
	import dbaf_cn_plot_batch

	def logIsEmpty(logL):
		# predicate for steps whose check is an empty log
		return len(logL) == 0

	def lastLineHas(keyword):
		# predicate factory: non-empty log whose last entry contains keyword
		return lambda logL: len(logL) > 0 and keyword in logL[-1]

	def step(name, desc, fun, paramL, logPostFix, logExistsFn, outFilePostFix):
		# assemble one step dict; paramH, clean and rerun are the same for all steps
		return {
			'name': name,
			'desc': desc,
			'fun': fun,
			'paramL': paramL,
			'paramH': {},
			'logPostFix': logPostFix,
			'logExistsFn': logExistsFn,
			'outFilePostFix': outFilePostFix,
			'clean': False,
			'rerun': False,
		}

	return [
		step('MutScan for the tumor sample',
			'pileup_proc -> loh.mutscan',
			mutScan_loh_batch.main,
			(baseDir, baseDir, False, 10, 0, 0),
			'.loh.mutscan.log', lastLineHas('Success'), ['loh.mutscan']),

		step('delta B-allele frequencies calculation',
			'calculate tumor delta BAF for all positions genotyped as heterozygous in the normal sample',
			delta_baf_mutscan_batch.main,
			(baseDir, baseDir, False),
			'.dbaf.log', logIsEmpty, ['dbaf.txt']),

		step('delta BAF segmentation',
			'segment delta BAF',
			delta_baf_seg_batch.main,
			(baseDir, baseDir, False),
			'.dbaf.seg.log', lastLineHas('Analyzing'), ['seg']),

		step('Plotting',
			'Generate deltaBAF/CN trajectory plot',
			dbaf_cn_plot_batch.main,
			(baseDir, baseDir, baseDir, False),
			'.traj_plot.log', lastLineHas('Done'), ['dBAF_CNA_traj.pdf']),

		step('CNLOH/LOH determination',
			'calculate average copy number of LOH segments to determine CNLOH/LOH',
			calcCN_LOH_batch.main,
			(baseDir, baseDir, baseDir, False),
			'.loh_cn.log', lastLineHas('Setting'), ['loh_cn.txt']),

		step('gene LOH',
			'loh_cn.txt -> loh_gene.dat',
			loh2gene_batch.main,
			(baseDir, baseDir, False, mysetting.refFlatH[server][genome]),
			'.loh_gene.log', logIsEmpty, ['loh_gene.dat']),

		step('Normal contamiation calculation',
			'calculate normal contamination levels at heterozygous germline SNPs in LOH regions',
			calcNormalF_loh_batch.main,
			(baseDir, baseDir, baseDir, False),
			'.nfrac_all.log', logIsEmpty, ['nFrac_all.txt']),

		step('Tumor fraction estimation',
			'estimate tumor fraction',
			peakFrac_batch.main,
			(baseDir, baseDir, False),
			'.tfrac.log', lastLineHas('Done'), ['tumor_frac.txt']),
	]
Example #13
0
#!/usr/bin/python
## postprocessing for RNA-Seq pipelines : rsq2skip, rsq2fusion, rsq2eiJunc
## handles 3 pipeline at the same time: no need to run 3 times after each pipeline

from glob import glob
import sys, os
import mymysql, mypipe, mybasic
from mysetting import mysqlH
from datetime import datetime
from warnings import filterwarnings
from warnings import resetwarnings

mybasic.add_module_path(['NGS/splice_gsnap/skipping','NGS/splice_gsnap/fusion','NGS/splice_gsnap/ei_junc','Integration'])
import makeDB_splice_AF
import prepDB_splice_normal, exonSkip_summarize, prepDB_splice_skip
import fusion_summarize, prepDB_splice_fusion
import ei_junc_filter, prepDB_splice_eiJunc

# Hard-coded results directory (presumably the root of the RNA-Seq pipeline
# output -- confirm against its users).
BASE='/EQL1/NSL/RNASeq/results'
# Pair of a pattern with one capture group before '_RSq' and an empty string.
# NOTE(review): looks like a (regex, postfix) pair for sample names -- confirm
# against its users, which are not visible here.
RSQPattern=('(.*)_RSq','')

def post_rsq2skip(dirN, server='smc1', dbN='ihlee_test', sampL=[]):
	"""Prepare the splice_normal tables and walk the sample directories of dirN.

	Connects to database dbN with the credentials stored under
	mysqlH[server], sets the samp_id column of splice_normal,
	splice_normal_loc1 and splice_normal_loc2 to char(63), creates the
	temporary table splice_normal_tmp and then iterates over every
	sub-directory of dirN.

	NOTE(review): only the beginning of the per-sample loop is visible in
	this excerpt; sampL is not used in the visible part.
	NOTE(review): sampL=[] is a mutable default argument -- safe only as
	long as the function never modifies it.
	"""
	(con, cursor) = mymysql.connectDB(user=mysqlH[server]['user'],passwd=mysqlH[server]['passwd'],db=dbN,host=mysqlH[server]['host'])
	cursor.execute('ALTER TABLE splice_normal CHANGE COLUMN samp_id samp_id char(63)')
	cursor.execute('ALTER TABLE splice_normal_loc1 CHANGE COLUMN samp_id samp_id char(63)')
	cursor.execute('ALTER TABLE splice_normal_loc2 CHANGE COLUMN samp_id samp_id char(63)')
	cursor.execute('CREATE TEMPORARY TABLE splice_normal_tmp LIKE splice_normal')
	# keep only the directory entries of dirN (one per sample)
	sampNL = filter(lambda x: os.path.isdir(dirN + '/' + x), os.listdir(dirN))
	for sampN in sampNL:
		baseDir = dirN + '/' + sampN
		# sample id: directory name without its last 4 characters, with '.' and '-' replaced by '_'
		sid = sampN[:-4].replace('.','_').replace('-','_') ## RNASeq sample has '***_RSq'
Example #14
0
#!/usr/bin/python

import sys, os, glob, getopt
import mybasic, mysetting

# Register 'utils' with mybasic.add_module_path before importing
# link_fqgz_hj (presumably this extends the module search path -- confirm).
mybasic.add_module_path(['utils'])

import link_fqgz_hj

# linking: executed at import time with two hard-coded directories and a
# file-name pattern; see link_fqgz_hj.link for what exactly gets linked
link_fqgz_hj.link('/EQL1/NSL/WXS/fastq','/EQL1/NSL/WXS/exome_20130529', '.*([0-9]{3})[ITN].*')


# listing directories: every entry directly under exome_20130529,
# evaluated once at import time
dir_list = glob.glob('/EQL1/NSL/WXS/exome_20130529/*')

#for dir_name in dir_list

def main(pbs=False):
	"""Create the project log directory and iterate over the module-level dir_list.

	NOTE(review): only the beginning of the loop body is visible in this
	excerpt; pbs is not used in the visible part.
	"""

	print dir_list, len(dir_list)
	projectName = 'heejin_20'
	# shell mkdir without -p: the command fails if the directory already
	# exists, and the exit status of os.system is not checked
	os.system('mkdir /var/www/html/pipeline_logs/%s' % projectName)

	for single_dir in dir_list:

		# sample name = last path component of the directory
		sampN = single_dir.split('/')[-1]

#		if sampN not in ['S012_T_SS']:
#			continue
Example #15
0
	annotH = {}
	for line in inFile:
		colL = line.rstrip().split('\t')
		rm = re.match('(chr[^:]*):([0-9]*)~([0-9]*)', colL[idxH['locus']])
		(chr,chrSta,chrEnd) = rm.groups()
		ref = colL[idxH['ref']]
		alt = colL[idxH['alt']]
		if (chr,chrSta,chrEnd,ref,alt) not in annotH:
			annotH[(chr,chrSta,chrEnd,ref,alt)] = {}
			for col in ['gene_symL','ch_dna','ch_aa','ch_type','cosmic','mutsig']:
				annotH[(chr,chrSta,chrEnd,ref,alt)][col] = colL[idxH[col]]
	return annotH

### until it is merged into pipeline
import mybasic
mybasic.add_module_path(['NGS/pipeline'])
import mypipe
#bamDirL = mysetting.wxsBamDirL
#trioH = mypipe.read_trio(bamDirL=bamDirL)
#pairH = {}
#for tid in trioH:
#	if tid not in ['37']:
#		continue
#	if trioH[tid]['recur_id'] != []:
#		print tid, trioH[tid]['prim_id']
#		print tid, trioH[tid]['recur_id']
#		pid = re.match('(.*)_T.{,2}_[TS]{2}', trioH[tid]['prim_id'][0]).group(1)
#		pairH[pid] = map(lambda x: re.match('(.*)_T.{,2}_[TS]{2}',x).group(1), trioH[tid]['recur_id'])

# Hard-coded locations.
# NOTE(review): presumably inDir is where the MuTect results are read from and
# outDir where the phylogeny output is written -- confirm against their users.
inDir = '/EQL3/pipeline/somatic_mutect/'
outDir = '/EQL1/PrimRecur/phylogeny'
Example #16
0
#!/usr/bin/python

import sys, os, re, getopt, glob
import mybasic

mybasic.add_module_path(["NGS/align", "NGS/mutation"])

import bwa_batch, markDuplicates_batch, realign_batch, pileup_batch


def wxs_seq(baseDir, projectName):
    """Start the HTML log file of one sample and make it world-readable.

    baseDir -- sample directory; its path must contain '/S', because the
        sample name is rebuilt as 'S' plus the text after the first '/S'
    projectName -- sub-directory of /var/www/html/pipeline_logs/ that
        receives the log file

    NOTE(review): only the beginning of this function is visible in this
    excerpt; the rest presumably runs the pipeline steps and appends to the
    log.
    """

    current_files_list = []
    compared_files_list = []
    # snapshot of the entries currently found directly under baseDir
    current_files_list = glob.glob(baseDir + "/*")

    # compose log string
    # NOTE(review): the declaration reads '<DOCTYPE', not '<!DOCTYPE'
    html_head_string = '<DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"><html><head></head><body>'

    # prep html file
    html_path = "/var/www/html/pipeline_logs/" + projectName + "/"
    # split()[1] is the text between the first and the second '/S'
    file_name_split = baseDir.split("/S")
    sample_name = "S" + file_name_split[1]
    file_name = "pipeline1_log_" + sample_name + ".html"
    # create .html file
    with open(os.path.join(html_path, file_name), "wb") as log_file:
        log_file.write(html_head_string)
    # redundant: the with-statement above has already closed the file
    log_file.close()

    # change mode and open log_file again
    os.system("chmod 755 %s%s" % (html_path, file_name))
Example #17
0
def genSpec_CS(baseDir, server='smc1', genome='hg19'):
	"""Build the step list of the CS copy number pipeline.

	Five step dicts are returned, in order: format conversion and sorting,
	RPKMgen, log2 rpkm ratio, per-gene copy number and plotting.  Every
	dict has the keys name, desc, fun, paramL, paramH, logPostFix,
	logExistsFn, outFilePostFix, clean and rerun.
	"""
	mybasic.add_module_path(['NGS/coverage','NGS/expression','NGS/copynumber'])
	import bam2sortedBed_batch
	import degSeq_batch
	import rpkm2cn_batch
	import exon2gene_batch
	import drawCNATraj_batch

	def step(name, desc, fun, paramL, logPostFix, logExistsFn, outFilePostFix):
		# assemble one step dict; paramH, clean and rerun are the same for all steps
		return {
			'name': name,
			'desc': desc,
			'fun': fun,
			'paramL': paramL,
			'paramH': {},
			'logPostFix': logPostFix,
			'logExistsFn': logExistsFn,
			'outFilePostFix': outFilePostFix,
			'clean': False,
			'rerun': False,
		}

	return [ ## PARAMETERS
		step('Format Conversion and sorting',
			'bam -> sort -> sorted.bed',
			bam2sortedBed_batch.sam2bed_batch,
			(baseDir, baseDir, 'recal', False),
			'.sorted.bed.qlog',
			lambda logL: len(logL) == 0,
			['sorted.bed']),

		step('RPKMgen',
			'sorted.bed -> rpkm',
			degSeq_batch.main,
			(baseDir, baseDir, '/data1/Sequence/ucsc_hg19/annot/refFlat_exon.txt', False),
			'.degSeq.qlog',
			lambda logL: len(logL) > 0 and 'omitted' in logL[-1],
			['rpkm']),

		step('Calculate a log2 rpkm ratio for all exons',
			'log2(tumor rpkm/normal rpkm',
			rpkm2cn_batch.main_pool,
			(baseDir, baseDir, 10, mysetting.poolB_CS_rpkm, False),
			'.cn.log',
			lambda logL: len(logL) == 0,
			['copynumber']),

		step('Calculate gene copy number from log2 rpkm ratios',
			'copynumber -> cn_gene.dat',
			exon2gene_batch.main,
			(baseDir, baseDir, mysetting.refFlatH[server][genome], mysetting.cs_gene, False),
			'.cn_gene.log',
			lambda logL: len(logL) > 0 and 'VHL' in logL[-1],
			['cn_gene.dat']),

		# Alternative entry point, kept for reference:
		#	'fun' : drawCNATraj_batch.batch,
		step('Draw Plot',
			'seg->plot',
			drawCNATraj_batch.draw_single,
			(baseDir, '/EQL1/NSL/WXS/results/CNA', genome),
			'',
			lambda logL: True,
			[]),
	]
Example #18
0
def genSpec(baseDir, server='smc1', genome='hg19'):
    """Return the step specifications of the fastq-to-mutscan pipeline.

    Six steps are active, in order: FastQC, BWA, MarkDuplicate/ReadGroup,
    Realign, Pileup_proc and MutScan; further steps are kept as
    commented-out specs.  Each step is a dict carrying the function object
    ('fun'), its argument tuple ('paramL') and keyword dict ('paramH'), the
    postfix of its log file, a completion predicate ('logExistsFn'), the
    postfixes of its output files and the 'clean'/'rerun' flags; the FastQC
    step additionally carries 'outLinkPostFix'.

    baseDir -- directory handed to every step through 'paramL'
    server, genome -- select the entries of mysetting.bwaIndexH,
        mysetting.ucscRefH and mysetting.dbsnpH passed to the steps
    """

    mybasic.add_module_path(['NGS/fastq', 'NGS/align', 'NGS/mutation'])
    import bwa_batch, markDuplicates_batch, realign_batch, pileup_batch, procPileup_split_batch, mutScan_batch, mutscan_snp_cosmic_batch  ## MODULES
    import fastqc_batch, annotate_mutscan_batch, annotate_join_cosmic_batch, vep_mutscan_batch, mutect_batch, somaticindeldetector_batch

    return [  ## PARAMETERS
        {
            'name': 'FastQC',
            'desc': 'QC for fastq',
            'fun': fastqc_batch.fastqc_batch,
            # raw string: same value as before, without the invalid '\.'
            # escape sequences of a plain string literal
            'paramL': (baseDir, r'(.*)\.[12]\.fq\.gz', baseDir, baseDir),
            'paramH': {},
            'logPostFix': '.fastqc.qlog',
            'logExistsFn': lambda x: len(x) > 0 and 'Analysis complete' in x[-1],
            'outFilePostFix': ['_fastqc.zip'],
            'outLinkPostFix': ['_fastqc/fastqc_report.html'],
            'clean': False,
            'rerun': False
        },
        {
            'name': 'BWA',
            'desc': 'fq -> sam -> bam -> sorted.bam',
            'fun': bwa_batch.align,
            # raw string: same value as before (note the unescaped '.' in 'fq.gz')
            'paramL': (baseDir, baseDir, r'(.*)\.[12]\.fq.gz', 10, 5000000000,
                       False, mysetting.bwaIndexH[server][genome], True),
            'paramH': {},
            'logPostFix': '.bwa.qlog',
            'logExistsFn': lambda x: len(x) > 0 and 'bam_sort_core' in x[-1],
            'outFilePostFix': ['sorted.bam'],
            'clean': False,
            'rerun': False
        },
        {
            'name': 'MarkDuplicate/ReadGroup',
            'desc': 'sorted.bam -> dedup.bam -> RG.bam',
            'fun': markDuplicates_batch.main,
            'paramL': (baseDir, baseDir, False),
            'paramH': {},
            'logPostFix': '.dedup.qlog',
            'logExistsFn': lambda x: len(x) > 0 and 'totalMemory()' in x[-1],
            'outFilePostFix': ['RG.bam'],
            'clean': False,
            'rerun': False
        },
        {
            'name': 'Realign',
            'desc': 'RG.bam -> realign.bam -> recal.bam',
            'fun': realign_batch.main,
            'paramL':
            (baseDir, baseDir, False, mysetting.ucscRefH[server][genome],
             mysetting.dbsnpH[server][genome]),
            'paramH': {},
            'logPostFix': '.realign.qlog',
            'logExistsFn': lambda x: len(x) > 0 and 'Uploaded run' in x[-1],
            'outFilePostFix': ['recal.bam'],
            'clean': False,
            'rerun': False
        },

        #		{
        #		'name': 'Pileup',
        #		'desc': 'recal.bam -> pileup',
        #		'fun': pileup_batch.main,
        #		'paramL': (baseDir, baseDir, False, mysetting.ucscRefH[server][genome]),
        #		'paramH': {},
        #		'logPostFix': '.pileup.log',
        #		'logExistsFn': lambda x: len(x)>0 and 'Set max' in x[-1],
        #		'outFilePostFix': ['pileup'],
        #		'clean': False,
        #		'rerun': False
        #		},
        {
            'name': 'Pileup_proc',
            'desc': 'recal.bam -> pileup -> pileup_proc',
            'fun': procPileup_split_batch.main,
            'paramL':
            (baseDir, baseDir, mysetting.ucscRefH[server][genome], False),
            'paramH': {},
            'logPostFix': '.pileup_proc.log',
            'logExistsFn': lambda x: len(x) > 0 and 'Success' in x[-1],
            'outFilePostFix': ['pileup_proc', 'pileup.gz'],
            'clean': False,
            'rerun': False
        },
        {
            'name': 'MutScan',
            'desc': 'pileup_proc -> mutscan',
            'fun': mutScan_batch.main,
            'paramL': (baseDir, baseDir, False),
            'paramH': {},
            'logPostFix': '.mutscan.log',
            'logExistsFn': lambda x: len(x) > 0 and 'Success' in x[-1],
            'outFilePostFix': ['mutscan'],
            'clean': False,
            'rerun': False
        },

        # temporarily off
        #		{
        #		'name': 'MuTect',
        #		'desc': 'recal.bam -> .vcf',
        #		'fun': mutect_batch.mutect_PON,
        #		'paramL': (baseDir, genome, server, False),
        #		'paramH': {},
        #		'logPostFix': '.mutect_single.log',
        #		'logExistsFn': lambda x: 'done' in x[-9],
        #		'outFilePostFix': ['.mutect.vcf','.mutect'],
        #		'clean': False,
        #		'rerun': False
        #		},
        #
        #		{
        #		'name': 'SomaticIndelDetector',
        #		'desc': 'recal.bam -> indels.vcf -> indels_filter.vcf',
        #		'fun': somaticindeldetector_batch.single_mode,
        #		'paramL': (baseDir, baseDir, 'SS', genome, server, False),
        #		'paramH': {},
        #		'logPostFix': '.somaticindeldetector.log',
        #		'logExistsFn': lambda x: ('chrX' in x[-1] or 'chrX' in x[-2]),
        #		'outFilePostFix': ['indels_filter.vcf','indels_filter.out'],
        #		'clean': False,
        #		'rerun': False
        #		},

        #		{## old cosmic join
        #		'name': 'mutscan_snp_cosmic',
        #		'desc': 'mutscan -> cosmic.dat',
        #		'fun': mutscan_snp_cosmic_batch.main,
        #		'paramL': (baseDir, server),
        #		'paramH': {},
        #		'logPostFix': '.cosmic.log',
        #		'logExistsFn': lambda x: len(x) == 0,
        #		'outFilePostFix': ['cosmic.dat'],
        #		'clean': False,
        #		'rerun': False
        #		},
        #
        #		{
        #		'name': 'VEP annotation',
        #		'desc': 'Annotate mutscan output',
        #		'fun': vep_mutscan_batch.main,
        #		'paramL': ([baseDir]),
        #		'paramH': {},
        #		'logPostFix': '.mutscan_vep.log',
        #		'logExistsFn': lambda x: len(x)>0 and 'Finished!' in x[-1],
        #		'outFilePostFix': ['mutscan_vep_out.vcf'],
        #		'clean': False,
        #		'rerun': False
        #		},

        ## join cosmic
        #		{
        #		'name': 'Join Cosmic',
        #		'desc': 'Join annotated mutscan output with COSMIC',
        #		'fun': annotate_join_cosmic_batch.main,
        #		'paramL': (baseDir, '(.*)\.vep$', baseDir),
        #		'paramH': {},
        #		'logPostFix': '_splice.mutscan.cosmic.log',
        #		'logExistsFn': lambda x: len(x)==0,
        #		'outFilePostFix': ['_cosmic.dat'],
        #		'clean': False,
        #		'rerun': False
        #		},

        #		{
        #		'name': 'Cleanup',
        #		'desc': 'remove all, but logs and designated result file',
        #		'fun': cleanup.main,
        #		'paramL': (baseDir,),
        #		'paramH': {},
        #		'logPostFix': 'cleanup.qlog',
        #		'logExistsFn': lambda x: False,
        #		'outFilePostFix': ['pileup']
        #		},
    ]
Example #19
0
#!/usr/bin/python
## integration into DB (per sample)

import sys, os
import mymysql, mypipe, mybasic
from mysetting import mysqlH
mybasic.add_module_path(['NGS/expression','Integration'])
import rpkm_process, prepDB_rpkm_gene_expr, boxplot_expr_cs_gene

def post_s_rsq2expr(baseDir, server='smc1', dbN='ihlee_test'):
	"""Load the RPKM gene expression of one sample into table rpkm_gene_expr.

	Builds a .gct file from the '*.rpkm' files in baseDir, converts it to a
	.dat file, replaces the sample's rows in rpkm_gene_expr with that file
	and recreates the view rpkm_gene_expr_lg2.

	baseDir -- sample directory; its last path component is the sample name
	server -- key into mysqlH selecting the DB credentials
	dbN -- database name; for 'ihlee_test' and 'ircr1' the .gct/.dat files
		are written to the shared expression directory, otherwise into
		baseDir

	NOTE(review): the excerpt ends after the SELECT on sample_tag; the
	handling of its result is not visible here.
	"""
	sampN = baseDir.split('/')[-1]
	# sample id: sample name without its last 4 characters, with '-' and '.' replaced by '_'
	sid = sampN[:-4].replace('-','_').replace('.','_') ##drop '_RSq'

	if dbN in ['ihlee_test','ircr1']:
		gctFileN = '/EQL1/NSL/RNASeq/results/expression/%s.gct' % sampN
		datFileN = '/EQL1/NSL/RNASeq/results/expression/%s.dat' % sampN
	else:
		gctFileN = '%s/%s.gct' % (baseDir, sampN)
		datFileN = '%s/%s.dat' % (baseDir, sampN)
	print sampN, gctFileN
	rpkm_process.rpkm_process(inputDirN=baseDir, filePattern='*.rpkm', sampRegex='(.*)_RSq\.rpkm', outputFileN=gctFileN)
	## prep
	prepDB_rpkm_gene_expr.main(inGctFileName=gctFileN, geneList=[], samplePrefix='', outDatFileName=datFileN)
	## import
	(con, cursor) = mymysql.connectDB(user=mysqlH[server]['user'],passwd=mysqlH[server]['passwd'],db=dbN,host=mysqlH[server]['host'])
	# replace the sample's existing rows with the freshly generated data file
	cursor.execute('DELETE FROM rpkm_gene_expr WHERE samp_id="%s"' % sid)
	cursor.execute('LOAD DATA LOCAL INFILE "%s" INTO TABLE rpkm_gene_expr' % datFileN)
	# the log2 view is dropped and recreated on every call
	cursor.execute('DROP VIEW IF EXISTS rpkm_gene_expr_lg2')
	cursor.execute('CREATE VIEW rpkm_gene_expr_lg2 AS SELECT samp_id,gene_sym,log2(rpkm+1) AS lg2_rpkm FROM rpkm_gene_expr')
	## make sure to update sample_tag that this sample has RNA-Seq
	cursor.execute('SELECT * FROM sample_tag WHERE samp_id="%s" AND tag="RNA-Seq"' % sid)
Example #20
0
def genSpec(baseDir, server='smc1', genome='hg19'):
	"""Return the ordered step specifications of the gsnap fusion pipeline.

	Every step is a dict with the keys 'name', 'desc', 'fun', 'paramL',
	'paramH', 'logPostFix', 'logExistsFn', 'outFilePostFix', 'clean' and
	'rerun'.  'logExistsFn' is applied to the log content as a sequence (it
	is measured with len() and indexed from the end) -- presumably the list
	of log lines; confirm against the pipeline runner.  server and genome
	are not referenced by any enabled step.
	"""
	mybasic.add_module_path(['NGS/align','NGS/splice_gsnap/fusion'])
	## MODULES
	import gsnap_splice_batch
	import fusion_filter_transloc_batch
	import fusion_filter_annot1_batch
	import fusion_proc_sort_batch
	import fusion_proc_annot_batch

	def _step(name, desc, fun, paramL, logPostFix, logExistsFn, outFilePostFix):
		# one step record; 'paramH', 'clean' and 'rerun' are identical for all steps here
		return {
			'name': name,
			'desc': desc,
			'fun': fun,
			'paramL': paramL,
			'paramH': {},
			'logPostFix': logPostFix,
			'logExistsFn': logExistsFn,
			'outFilePostFix': outFilePostFix,
			'clean': False,
			'rerun': False,
		}

	def _lastLineHas(token, minLen=1):
		# log check: at least minLen entries and token found in the last one
		return lambda logL: len(logL) >= minLen and token in logL[-1]

	def _logIsEmpty(logL):
		# log check for steps whose log has no entries
		return len(logL) == 0

	return [ ## PARAMETERS
		# Disabled step, kept for reference:
		# _step('Align', 'fastq -> splice.gsnap', gsnap_splice_batch.align,
		#	(baseDir, baseDir, 6, False), '.gsnap.qlog',
		#	_lastLineHas('Processed'), ['splice.gsnap']),

		_step('Filter transloc', 'splice.gsnap.gz -> splice_transloc.gsnap',
			fusion_filter_transloc_batch.fusion_filter_batch,
			(baseDir, baseDir, False), '.ft_tloc.qlog',
			_lastLineHas('Results'), ['splice_transloc.gsnap']),

		# this check requires more than one log entry
		_step('annotate', 'splice_transloc.gsnap -> splice_transloc_annot1.gsnap',
			fusion_filter_annot1_batch.main,
			(baseDir, baseDir, False), '.annot.qlog',
			_lastLineHas('Results', minLen=2), ['splice_transloc_annot1.gsnap']),

		_step('sort', 'splice_transloc_annot1.gsnap -> splice_transloc_annot1.sorted.gsnap and gnerate report.txt',
			fusion_proc_sort_batch.main,
			(baseDir, baseDir, False), '.sort.qlog',
			_logIsEmpty, ['splice_transloc_annot1.sorted.gsnap','splice_transloc_annot1.report.txt']),

		_step('annotate report', 'report.txt -> report_annot.txt',
			fusion_proc_annot_batch.fusion_proc_annot_batch,
			(baseDir, baseDir, None, False), '.report_annot.qlog',
			_logIsEmpty, ['splice_transloc_annot1.report_annot.txt']),

		# Placeholder for a future 'Summarize' step ('fun' was never filled in):
		#	paramL (baseDir, baseDir, False), logPostFix 'realign.qlog',
		#	log check: len(x)>0 and 'Uploaded run' in x[-1],
		#	outFilePostFix ['realign.bam', 'recal.bam']
		]
Example #21
0
def genSpec(baseDir, server='smc1', genome='hg19'):
    """Return the ordered step specifications of the single-sample WXS pipeline.

    Steps: FastQC, BWA, MarkDuplicate/ReadGroup, Realign, MuTect and
    SomaticIndelDetector.  Every step is a dict with the keys 'name',
    'desc', 'fun', 'paramL', 'paramH', 'logPostFix', 'logExistsFn',
    'outFilePostFix', 'clean' and 'rerun' (FastQC also has
    'outLinkPostFix').  Reference paths are read from mysetting using
    server and genome.

    'logExistsFn' is applied to the log content as a sequence (presumably
    the list of log lines).  All checks return False for logs that are too
    short instead of raising IndexError.
    """
    mybasic.add_module_path(['NGS/fastq', 'NGS/align', 'NGS/mutation'])
    ## MODULES
    import bwa_batch, markDuplicates_batch, realign_batch, procPileup_split_batch, mutScan_batch
    import fastqc_batch, vep_mutect_batch, mutect_batch, somaticindeldetector_batch

    def _step(name, desc, fun, paramL, logPostFix, logExistsFn, outFilePostFix,
              outLinkPostFix=None, clean=False):
        # assemble one step record; 'outLinkPostFix' is only present when given
        spec = {
            'name': name,
            'desc': desc,
            'fun': fun,
            'paramL': paramL,
            'paramH': {},
            'logPostFix': logPostFix,
            'logExistsFn': logExistsFn,
            'outFilePostFix': outFilePostFix,
        }
        if outLinkPostFix is not None:
            spec['outLinkPostFix'] = outLinkPostFix
        spec['clean'] = clean
        spec['rerun'] = False
        return spec

    def _lastLineHas(token):
        # non-empty log whose last entry contains token
        return lambda logL: len(logL) > 0 and token in logL[-1]

    def _mutectDone(logL):
        # 'done' is expected on the 9th entry from the end; shorter logs are not done
        return len(logL) >= 9 and 'done' in logL[-9]

    def _indelDone(logL):
        # 'chrX' in either of the last two entries; safe for logs with 0 or 1 entries
        return any('chrX' in line for line in logL[-2:])

    return [  ## PARAMETERS
        _step('FastQC', 'QC for fastq', fastqc_batch.fastqc_batch,
              (baseDir, r'(.*)\.[12]\.fq\.gz', baseDir, baseDir),
              '.fastqc.qlog', _lastLineHas('Analysis complete'),
              ['_fastqc.zip'],
              outLinkPostFix=['_fastqc/fastqc_report.html']),
        _step('BWA', 'fq -> sam -> bam -> sorted.bam', bwa_batch.align,
              (baseDir, baseDir, r'(.*)\.[12]\.fq.gz', 10, 5000000000,
               False, mysetting.bwaIndexH[server][genome], True),
              '.bwa.qlog', _lastLineHas('bam_sort_core'),
              ['sorted.bam'],
              clean=True),
        _step('MarkDuplicate/ReadGroup', 'sorted.bam -> dedup.bam',
              markDuplicates_batch.main,
              (baseDir, baseDir, False),
              '.dedup.qlog', _lastLineHas('totalMemory()'),
              ['dedup.bam']),
        _step('Realign', 'dedup.bam -> realign.bam -> recal.bam',
              realign_batch.main,
              (baseDir, baseDir, False, mysetting.ucscRefH[server][genome],
               mysetting.dbsnpH[server][genome]),
              '.realign.qlog', _lastLineHas('Uploaded run'),
              ['recal.bam']),
        _step('MuTect', 'recal.bam -> .vcf', mutect_batch.mutect_PON,
              (baseDir, genome, server, False),
              '.mutect_single.log', _mutectDone,
              ['.mutect.vcf', '.mutect_filter.vcf', '.mutect']),
        _step('SomaticIndelDetector',
              'recal.bam -> indels.vcf -> indels_filter.vcf',
              somaticindeldetector_batch.single_mode,
              (baseDir, baseDir, 'CS', genome, server, False),
              '.somaticindeldetector.log', _indelDone,
              ['indels_filter.vcf', 'indels_filter.out']),

        # Disabled: VEP keeps dying while trying to fork (when using PBS, even
        # with --fork 2). It's better to annotate in a single batch (take it
        # to post-pipeline?)
        # _step('VEP', '.vcf -> .dat', vep_mutect_batch.main,
        #       ([baseDir], False),
        #       'mutect_vep.log', _lastLineHas('Finished!'),
        #       ['_vep.dat']),

        # Disabled: 'Cleanup' (remove all, but logs and designated result file)
        #   fun cleanup.main, paramL (baseDir,), logPostFix 'cleanup.qlog',
        #   log check lambda x: False, outFilePostFix ['pileup']
    ]
Example #22
0
#!/usr/bin/python

import sys, os, glob, getopt
import mybasic, mysetting

mybasic.add_module_path(['utils'])

import link_fqgz_hj

# linking
# Runs at import time: calls link_fqgz_hj.link() with the fastq directory, the
# exome_20130529 directory and a file-name regex (presumably source, target
# and sample-id pattern -- confirm against link_fqgz_hj).
link_fqgz_hj.link('/EQL1/NSL/WXS/fastq', '/EQL1/NSL/WXS/exome_20130529',
                  '.*([0-9]{3})[ITN].*')

# listing directories
# dir_list is a module-level global holding every entry directly under the
# exome_20130529 directory; main() iterates over it.
dir_list = glob.glob('/EQL1/NSL/WXS/exome_20130529/*')

#for dir_name in dir_list


def main(pbs=False):
    """Create the project's web log directory and walk the sample directories.

    Reads the module-level dir_list.  pbs is not used in the visible part
    of the function.
    """
    print('%s %s' % (dir_list, len(dir_list)))

    projectName = 'heejin_20'
    os.system('mkdir /var/www/html/pipeline_logs/%s' % projectName)

    for single_dir in dir_list:
        # sample name is the last path component of the sample directory
        sampN = single_dir.rsplit('/', 1)[-1]

        # optional single-sample filter (disabled):
        #		if sampN not in ['S012_T_SS']:
        #			continue
Example #23
0
#!/usr/bin/python

from glob import glob
import sys, os, re
import mysetting, mymysql, mypipe, mybasic
mybasic.add_module_path(['Integration', 'NGS/mutation'])
import prepDB_mutation_normal, makeDB_mutation_rxsq, vep_batch


def prep_single(outFileN, server='smc1', dbN='ircr1'):
    """Gather cosmic.dat files and the XSeq-tagged sample ids.

    Collects every '*cosmic.dat' file found directly in, or one directory
    below, each entry of mysetting.wxsMutscanDirL, skipping paths that
    contain '_B_'.  Then reads the distinct samp_id values whose tag matches
    'XSeq_%%' and splits each cosmic file name into sample id, postfix and
    platform.  outFileN is not used in the visible part of the function.
    """
    dbConf = mysetting.mysqlH[server]
    (con, cursor) = mymysql.connectDB(user=dbConf['user'],
                                      passwd=dbConf['passwd'],
                                      db=dbN,
                                      host=dbConf['host'])

    cosmicL = []
    for mutscanDir in mysetting.wxsMutscanDirL:
        candidateL = glob('%s/*/*cosmic.dat' % mutscanDir) + glob('%s/*cosmic.dat' % mutscanDir)
        cosmicL.extend(path for path in candidateL if '_B_' not in path)

    cursor.execute(
        'SELECT DISTINCT samp_id FROM sample_tag WHERE tag LIKE "XSeq_%%"')
    sidL = [row[0] for row in cursor.fetchall()]

    # file names look like <sid>_<postfix>_<platform>_cosmic.dat; a name that
    # does not match raises AttributeError on .groups()
    namePat = re.compile('(.*)_([XT].{,2})_([STKN]{2})_cosmic.dat')
    for cosmic in cosmicL:
        (sid, postfix, platform) = namePat.match(os.path.basename(cosmic)).groups()
Example #24
0
def genSpec(baseDir, server="smc1", genome="hg19"):
    """Return the ordered step specifications of the exon-skipping pipeline.

    Every step is a dict with the keys "name", "desc", "fun", "paramL",
    "paramH", "logPostFix", "logExistsFn", "outFilePostFix", "clean" and
    "rerun".  "logExistsFn" is applied to the log content as a sequence
    (presumably the list of log lines -- confirm against the runner).
    server and genome are not referenced by any enabled step.
    """
    mybasic.add_module_path(["NGS/align", "NGS/splice_gsnap/skipping"])
    ## MODULES
    import gsnap_splice_batch
    import exonSkip_filter_batch
    import exonSkip_filter_normal_batch
    import exonSkip_sort_batch
    import exonSkip_normal_sort_batch
    import exonSkip_proc_annot_batch

    def results_logged(logL):
        # non-empty log whose last entry contains "Results"
        return len(logL) > 0 and "Results" in logL[-1]

    def log_is_empty(logL):
        # log with no entries at all
        return len(logL) == 0

    def spec(name, desc, fun, paramL, logPostFix, logExistsFn, outFilePostFix):
        # build one step record; flags and "paramH" are the same for every step
        return {
            "name": name,
            "desc": desc,
            "fun": fun,
            "paramL": paramL,
            "paramH": {},
            "logPostFix": logPostFix,
            "logExistsFn": logExistsFn,
            "outFilePostFix": outFilePostFix,
            "clean": False,
            "rerun": False,
        }

    return [  ## PARAMETERS
        # Disabled step, kept for reference:
        # spec(name='Align', desc='fastq -> splice.gsnap',
        #      fun=gsnap_splice_batch.align, paramL=(baseDir, baseDir, 6, False),
        #      logPostFix='gsnap.qlog',
        #      logExistsFn=lambda x: len(x)>0 and 'Processed' in x[-1],
        #      outFilePostFix=['splice.gsnap']),
        spec(
            name="Filter exonskip",
            desc="splice.gsnap.gz -> splice_exonSkip.gsnap",
            fun=exonSkip_filter_batch.exonSkip_filter_batch,
            paramL=(baseDir, baseDir, False),
            logPostFix=".exonSkip.qlog",
            logExistsFn=results_logged,
            outFilePostFix=["splice_exonSkip.gsnap"],
        ),
        spec(
            name="Filter normal exonskip",
            desc="splice.gsnap -> splice_exonSkip_normal.gsnap.gz",
            fun=exonSkip_filter_normal_batch.exonSkip_filter_batch,
            paramL=(baseDir, baseDir, False),
            logPostFix=".exonSkip_normal.qlog",
            logExistsFn=results_logged,
            outFilePostFix=["splice_exonSkip_normal.gsnap.gz"],
        ),
        spec(
            name="sort",
            desc="splice_exonSkip.gsnap -> splice_exonSkip_report.txt",
            fun=exonSkip_sort_batch.main,
            paramL=(baseDir, baseDir, False),
            logPostFix=".sort.qlog",
            logExistsFn=log_is_empty,
            outFilePostFix=["splice_exonSkip_report.txt"],
        ),
        spec(
            name="sort-normal",
            desc="splice_exonSkip_normal.gsnap.gz -> splice_exonSkip_normal_report.txt",
            fun=exonSkip_normal_sort_batch.main,
            paramL=(baseDir, baseDir, False),
            logPostFix=".sort_normal.qlog",
            logExistsFn=log_is_empty,
            outFilePostFix=["splice_exonSkip_normal_report.txt"],
        ),
        spec(
            name="annotate report",
            desc="report.txt -> report_annot.txt",
            fun=exonSkip_proc_annot_batch.exonSkip_proc_annot_batch,
            paramL=(baseDir, baseDir, None, False),
            logPostFix=".skip_annot.qlog",
            logExistsFn=log_is_empty,
            outFilePostFix=["splice_exonSkip_report_annot.txt"],
        ),
        # Disabled steps, kept for reference (their modules are not imported above):
        # spec(name='link', desc='put all report_annot.txt files in a directory',
        #      fun=exonSkip_link.link,
        #      paramL=(baseDir, '/EQL1/NSL/RNASeq/results/exonSkip'),
        #      logPostFix='link.qlog', logExistsFn=lambda x: len(x)==0,
        #      outFilePostFix=['splice_exonSkip_report_annot.txt']),
        # spec(name='link-normal', desc='put all report_normal.txt files in a directory',
        #      fun=exonSkip_link_normal.link,
        #      paramL=(baseDir, '/EQL1/NSL/RNASeq/results/exonSkip_normal'),
        #      logPostFix='link_normal.qlog', logExistsFn=lambda x: len(x)==0,
        #      outFilePostFix=['splice_exonSkip_normal_report.txt']),
    ]
Example #25
0
def genSpec(baseDir, server='smc1', genome='hg19'):
	"""Return the ordered step specifications of the WXS mutscan pipeline.

	Enabled steps: FastQC, BWA, MarkDuplicate/ReadGroup, Realign,
	Pileup_proc and MutScan.  Every step is a dict with the keys 'name',
	'desc', 'fun', 'paramL', 'paramH', 'logPostFix', 'logExistsFn',
	'outFilePostFix', 'clean' and 'rerun' (FastQC also has
	'outLinkPostFix').  Reference paths are read from mysetting using
	server and genome.  'logExistsFn' is applied to the log content as a
	sequence (presumably the list of log lines).
	"""
	mybasic.add_module_path(['NGS/fastq','NGS/align','NGS/mutation'])
	## MODULES
	import bwa_batch, markDuplicates_batch, realign_batch, pileup_batch
	import procPileup_split_batch, mutScan_batch, mutscan_snp_cosmic_batch
	import fastqc_batch, annotate_mutscan_batch, annotate_join_cosmic_batch
	import vep_mutscan_batch, mutect_batch, somaticindeldetector_batch

	def _step(name, desc, fun, paramL, logPostFix, token, outFilePostFix, outLinkPostFix=None):
		# one step record; the log check is "non-empty and token in the last entry"
		spec = {
			'name': name,
			'desc': desc,
			'fun': fun,
			'paramL': paramL,
			'paramH': {},
			'logPostFix': logPostFix,
			'logExistsFn': lambda logL: len(logL)>0 and token in logL[-1],
			'outFilePostFix': outFilePostFix,
		}
		if outLinkPostFix is not None:
			spec['outLinkPostFix'] = outLinkPostFix
		spec['clean'] = False
		spec['rerun'] = False
		return spec

	return [ ## PARAMETERS
		_step('FastQC', 'QC for fastq', fastqc_batch.fastqc_batch,
			(baseDir, r'(.*)\.[12]\.fq\.gz', baseDir, baseDir),
			'.fastqc.qlog', 'Analysis complete', ['_fastqc.zip'],
			outLinkPostFix=['_fastqc/fastqc_report.html']),

		_step('BWA', 'fq -> sam -> bam -> sorted.bam', bwa_batch.align,
			(baseDir, baseDir, r'(.*)\.[12]\.fq.gz', 10, 5000000000, False, mysetting.bwaIndexH[server][genome], True),
			'.bwa.qlog', 'bam_sort_core', ['sorted.bam']),

		_step('MarkDuplicate/ReadGroup', 'sorted.bam -> dedup.bam -> RG.bam', markDuplicates_batch.main,
			(baseDir, baseDir, False),
			'.dedup.qlog', 'totalMemory()', ['RG.bam']),

		_step('Realign', 'RG.bam -> realign.bam -> recal.bam', realign_batch.main,
			(baseDir, baseDir, False, mysetting.ucscRefH[server][genome], mysetting.dbsnpH[server][genome]),
			'.realign.qlog', 'Uploaded run', ['recal.bam']),

		# Disabled step, kept for reference:
		# _step('Pileup', 'recal.bam -> pileup', pileup_batch.main,
		#	(baseDir, baseDir, False, mysetting.ucscRefH[server][genome]),
		#	'.pileup.log', 'Set max', ['pileup']),

		_step('Pileup_proc', 'recal.bam -> pileup -> pileup_proc', procPileup_split_batch.main,
			(baseDir, baseDir, mysetting.ucscRefH[server][genome], False),
			'.pileup_proc.log', 'Success', ['pileup_proc','pileup.gz']),

		_step('MutScan', 'pileup_proc -> mutscan', mutScan_batch.main,
			(baseDir, baseDir, False),
			'.mutscan.log', 'Success', ['mutscan']),

		# temporarily off
		#	'MuTect': 'recal.bam -> .vcf', fun mutect_batch.mutect_PON,
		#		paramL (baseDir, genome, server, False), logPostFix '.mutect_single.log',
		#		log check lambda x: 'done' in x[-9],
		#		outFilePostFix ['.mutect.vcf','.mutect']
		#	'SomaticIndelDetector': 'recal.bam -> indels.vcf -> indels_filter.vcf',
		#		fun somaticindeldetector_batch.single_mode,
		#		paramL (baseDir, baseDir, 'SS', genome, server, False),
		#		logPostFix '.somaticindeldetector.log',
		#		log check lambda x: ('chrX' in x[-1] or 'chrX' in x[-2]),
		#		outFilePostFix ['indels_filter.vcf','indels_filter.out']

		# old cosmic join (disabled)
		#	'mutscan_snp_cosmic': 'mutscan -> cosmic.dat', fun mutscan_snp_cosmic_batch.main,
		#		paramL (baseDir, server), logPostFix '.cosmic.log',
		#		log check lambda x: len(x) == 0, outFilePostFix ['cosmic.dat']
		# _step('VEP annotation', 'Annotate mutscan output', vep_mutscan_batch.main,
		#	([baseDir]),
		#	'.mutscan_vep.log', 'Finished!', ['mutscan_vep_out.vcf']),

		# join cosmic (disabled)
		#	'Join Cosmic': 'Join annotated mutscan output with COSMIC',
		#		fun annotate_join_cosmic_batch.main,
		#		paramL (baseDir, '(.*)\.vep$', baseDir),
		#		logPostFix '_splice.mutscan.cosmic.log',
		#		log check lambda x: len(x)==0, outFilePostFix ['_cosmic.dat']

		# 'Cleanup' (disabled): remove all, but logs and designated result file
		#		fun cleanup.main, paramL (baseDir,), logPostFix 'cleanup.qlog',
		#		log check lambda x: False, outFilePostFix ['pileup']
		]
Example #26
0
#!/usr/bin/python

import sys, os, re
import mysetting, mybasic

mybasic.add_module_path(['NGS/pipeline','NGS/mutation'])
import mutect_batch, somaticindeldetector_batch

import mypipe

# Module-level script: read the trio table and walk the trios.
bamDirL = mysetting.wxsBamDirL
# trioH is keyed by trio id; each entry is indexed below with 'norm_id'
# (a list).  Other keys ('prim_id', 'recur_id', role names) appear only in
# the commented-out debug code.
trioH = mypipe.read_trio('/EQL1/NSL/clinical/trio_info.txt', bamDirL)

# Debug dump of selected trios (disabled):
#for tid in sorted(trioH.keys()):
#	if tid not in ['59','60','61']:
#		continue
#	print tid, trioH[tid]['prim_id'], trioH[tid]['recur_id']
#	for role in ['Normal','Primary','Recurrent']:
#		print role,trioH[tid][role]
#sys.exit(1)

outDir='/EQL3/pipeline/somatic_mutect'

## assume 1 primary & normal per trio
for tid in trioH:
	# skip trios that have no normal sample
	if trioH[tid]['norm_id'] == []:
		continue
	# hard-coded filter: only trio '63' is processed
	if tid not in ['63']:
		continue

	# only the first normal id is used (see the assumption above)
	norm = trioH[tid]['norm_id'][0]
Example #27
0
#!/usr/bin/python

import sys, os
import mymysql, mypipe, mybasic
from mysetting import mysqlH

mybasic.add_module_path(["Integration"])
import prepDB_mutscan, makeDB_mutation_rxsq


def post_s_rsq2mut(baseDir, server="smc1", dbN="ihlee_test"):
    """Load a sample's RNA-Seq mutation calls into the mutation_rsq table.

    The sample name is the last '/'-separated component of baseDir; the
    sample id is that name without its last four characters and with '.'
    and '-' replaced by '_'.  Nothing is prepared or loaded unless
    <baseDir>/<sample>_splice_cosmic.dat exists.  For the "ihlee_test" and
    "ircr1" databases the .dat file is written under
    /EQL1/NSL/RNASeq/results/mutation, otherwise into baseDir.
    """
    sampN = baseDir.split("/")[-1]
    sid = sampN[:-4].replace(".", "_").replace("-", "_")
    print("%s %s" % (sampN, sid))

    cosmicDatFileN = "%s/%s_splice_cosmic.dat" % (baseDir, sampN)
    datFileN = (
        "/EQL1/NSL/RNASeq/results/mutation/%s.dat" % sampN
        if dbN in ["ihlee_test", "ircr1"]
        else "%s/%s.dat" % (baseDir, sampN)
    )

    if os.path.isfile(cosmicDatFileN):
        # convert the cosmic-joined file into the .dat file that is bulk-loaded below
        prepDB_mutscan.main(
            sampNamePat=("(.*)_(RSq)", ""),
            geneList=[],
            inFileN=cosmicDatFileN,
            outFileN=datFileN,
        )

        ## import: replace this sample's rows, then query its RNA-Seq tag
        dbConf = mysqlH[server]
        (con, cursor) = mymysql.connectDB(
            user=dbConf["user"], passwd=dbConf["passwd"], db=dbN, host=dbConf["host"]
        )
        sqlL = (
            'DELETE FROM mutation_rsq WHERE samp_id="%s"' % sid,
            'LOAD DATA LOCAL INFILE "%s" INTO TABLE mutation_rsq' % datFileN,
            # the result of this lookup is not consumed in the visible part of the function
            'SELECT * FROM sample_tag WHERE samp_id="%s" AND tag="RNA-Seq"' % sid,
        )
        for sql in sqlL:
            cursor.execute(sql)
Example #28
0
def genSpec(baseDir, server='smc1', genome='hg19'):
	"""Return the ordered step specifications of the single-sample WXS pipeline.

	Steps: FastQC, BWA, MarkDuplicate/ReadGroup, Realign, MuTect and
	SomaticIndelDetector.  Every step is a dict with the keys 'name',
	'desc', 'fun', 'paramL', 'paramH', 'logPostFix', 'logExistsFn',
	'outFilePostFix', 'clean' and 'rerun' (FastQC also has
	'outLinkPostFix').  Reference paths are read from mysetting using
	server and genome.

	'logExistsFn' is applied to the log content as a sequence (presumably
	the list of log lines).  All checks return False for logs that are too
	short instead of raising IndexError.
	"""
	mybasic.add_module_path(['NGS/fastq','NGS/align','NGS/mutation'])
	## MODULES
	import bwa_batch, markDuplicates_batch, realign_batch, procPileup_split_batch, mutScan_batch
	import fastqc_batch, vep_mutect_batch, mutect_batch, somaticindeldetector_batch

	def _lastLineHas(token):
		# non-empty log whose last entry contains token
		return lambda logL: len(logL)>0 and token in logL[-1]

	def _nthFromEndHas(token, n):
		# token on the n-th entry from the end; logs shorter than n entries fail the check
		return lambda logL: len(logL)>=n and token in logL[-n]

	def _tailHas(token, n):
		# token in any of the last n entries; slicing is safe for short or empty logs
		return lambda logL: any(token in line for line in logL[-n:])

	def _step(name, desc, fun, paramL, logPostFix, logExistsFn, outFilePostFix, clean=False, extraH=None):
		# assemble one step record; extraH carries optional additional keys
		spec = {
			'name': name,
			'desc': desc,
			'fun': fun,
			'paramL': paramL,
			'paramH': {},
			'logPostFix': logPostFix,
			'logExistsFn': logExistsFn,
			'outFilePostFix': outFilePostFix,
			'clean': clean,
			'rerun': False,
		}
		if extraH:
			spec.update(extraH)
		return spec

	return [ ## PARAMETERS
		_step('FastQC', 'QC for fastq', fastqc_batch.fastqc_batch,
			(baseDir, r'(.*)\.[12]\.fq\.gz', baseDir, baseDir),
			'.fastqc.qlog', _lastLineHas('Analysis complete'), ['_fastqc.zip'],
			extraH={'outLinkPostFix': ['_fastqc/fastqc_report.html']}),

		_step('BWA', 'fq -> sam -> bam -> sorted.bam', bwa_batch.align,
			(baseDir, baseDir, r'(.*)\.[12]\.fq.gz', 10, 5000000000, False, mysetting.bwaIndexH[server][genome], True),
			'.bwa.qlog', _lastLineHas('bam_sort_core'), ['sorted.bam'],
			clean=True),

		_step('MarkDuplicate/ReadGroup', 'sorted.bam -> dedup.bam', markDuplicates_batch.main,
			(baseDir, baseDir, False),
			'.dedup.qlog', _lastLineHas('totalMemory()'), ['dedup.bam']),

		_step('Realign', 'dedup.bam -> realign.bam -> recal.bam', realign_batch.main,
			(baseDir, baseDir, False, mysetting.ucscRefH[server][genome], mysetting.dbsnpH[server][genome]),
			'.realign.qlog', _lastLineHas('Uploaded run'), ['recal.bam']),

		_step('MuTect', 'recal.bam -> .vcf', mutect_batch.mutect_PON,
			(baseDir, genome, server, False),
			'.mutect_single.log', _nthFromEndHas('done', 9), ['.mutect.vcf','.mutect_filter.vcf','.mutect']),

		_step('SomaticIndelDetector', 'recal.bam -> indels.vcf -> indels_filter.vcf', somaticindeldetector_batch.single_mode,
			(baseDir, baseDir, 'CS', genome, server, False),
			'.somaticindeldetector.log', _tailHas('chrX', 2), ['indels_filter.vcf','indels_filter.out']),

		# Disabled: VEP keeps dying while trying to fork (when using PBS, even
		# with --fork 2). It's better to annotate in a single batch (take it to
		# post-pipeline?)
		# _step('VEP', '.vcf -> .dat', vep_mutect_batch.main,
		#	([baseDir], False),
		#	'mutect_vep.log', _lastLineHas('Finished!'), ['_vep.dat']),

		# Disabled: 'Cleanup' (remove all, but logs and designated result file)
		#	fun cleanup.main, paramL (baseDir,), logPostFix 'cleanup.qlog',
		#	log check lambda x: False, outFilePostFix ['pileup']
		]
Example #29
0
#!/usr/bin/python
## integration into DB (per sample)

import sys, os
import mymysql, mypipe, mybasic
from mysetting import mysqlH
mybasic.add_module_path(['NGS/expression', 'Integration'])
import rpkm_process, prepDB_rpkm_gene_expr, boxplot_expr_cs_gene


def post_s_rsq2expr(baseDir, server='smc1', dbN='ihlee_test'):
    """Collect a sample's RPKM files into a .gct file and a DB-ready .dat file.

    The sample name is the last '/'-separated component of baseDir; the
    sample id is that name without its last four characters ('_RSq') and
    with '-' and '.' replaced by '_'.  For the 'ihlee_test' and 'ircr1'
    databases the files are written under
    /EQL1/NSL/RNASeq/results/expression, otherwise into baseDir.
    server is not used in the visible part of the function.
    """
    sampN = baseDir.split('/')[-1]
    # drop '_RSq' and normalise separators
    sid = sampN[:-4].replace('-', '_').replace('.', '_')

    if dbN not in ['ihlee_test', 'ircr1']:
        gctFileN = '%s/%s.gct' % (baseDir, sampN)
        datFileN = '%s/%s.dat' % (baseDir, sampN)
    else:
        gctFileN = '/EQL1/NSL/RNASeq/results/expression/%s.gct' % sampN
        datFileN = '/EQL1/NSL/RNASeq/results/expression/%s.dat' % sampN
    print('%s %s' % (sampN, gctFileN))

    # merge the per-sample *.rpkm files into one .gct file
    rpkmArgH = {
        'inputDirN': baseDir,
        'filePattern': '*.rpkm',
        'sampRegex': r'(.*)_RSq\.rpkm',
        'outputFileN': gctFileN,
    }
    rpkm_process.rpkm_process(**rpkmArgH)

    ## prep: convert the .gct file into the .dat file
    prepArgH = {
        'inGctFileName': gctFileN,
        'geneList': [],
        'samplePrefix': '',
        'outDatFileName': datFileN,
    }
    prepDB_rpkm_gene_expr.main(**prepArgH)
    ## import
Example #30
0
	annotH = {}
	for line in inFile:
		colL = line.rstrip().split('\t')
		rm = re.match('(chr[^:]*):([0-9]*)~([0-9]*)', colL[idxH['locus']])
		(chr,chrSta,chrEnd) = rm.groups()
		ref = colL[idxH['ref']]
		alt = colL[idxH['alt']]
		if (chr,chrSta,chrEnd,ref,alt) not in annotH:
			annotH[(chr,chrSta,chrEnd,ref,alt)] = {}
			for col in ['gene_symL','ch_dna','ch_aa','ch_type','cosmic','mutsig']:
				annotH[(chr,chrSta,chrEnd,ref,alt)][col] = colL[idxH[col]]
	return annotH

### until it is merged into pipeline
# Temporary module-level bootstrap: make NGS/pipeline importable and build
# pairH from the trio table returned by mypipe.read_trio().
import mybasic
mybasic.add_module_path(['NGS/pipeline'])
import mypipe
trioH = mypipe.read_trio(bamDirL=mysetting.wxsBamDirL)
# pairH maps a primary id to the list of its recurrent ids; both have their
# last five characters removed.
pairH = {}
for tid in trioH:
	# only trios with at least one recurrent id contribute an entry
	if trioH[tid]['recur_id'] != []:
		# the first primary id of the trio is used as the key
		pid = trioH[tid]['prim_id'][0][:-5]
		pairH[pid] = map(lambda x: x[:-5], trioH[tid]['recur_id'])
####
#(con,cursor) = mymysql.connectDB(db='ircr1')
#tag = 'pair_R:%'
#cursor.execute('select distinct samp_id from sample_tag where tag like "%s"' % tag)
#sIdL_p = [x for (x,) in cursor.fetchall()]
#
#tag = 'XSeq%%,N'
#cursor.execute('select distinct samp_id from sample_tag where tag like "%s"' % tag)
Example #31
0
def genSpec(baseDir, server='smc1', genome='hg19'):
    """Return the ordered step specifications of the LOH / purity pipeline.

    Every step is a dict with the keys 'name', 'desc', 'fun', 'paramL',
    'paramH', 'logPostFix', 'logExistsFn', 'outFilePostFix', 'clean' and
    'rerun'.  'logExistsFn' is applied to the log content as a sequence
    (presumably the list of log lines -- confirm against the runner).
    server and genome are only used to look up mysetting.refFlatH for the
    'gene LOH' step.
    """
    mybasic.add_module_path(['NGS/mutation', 'NGS/loh', 'NGS/purity'])
    ## MODULES
    import mutScan_loh_batch, delta_baf_mutscan_batch, delta_baf_seg_batch
    import calcCN_LOH_batch, loh2gene_batch, calcNormalF_loh_batch
    import peakFrac_batch, dbaf_cn_plot_batch

    def ends_with(token):
        # non-empty log whose last entry contains token
        return lambda logL: len(logL) > 0 and token in logL[-1]

    def is_empty(logL):
        # log with no entries at all
        return len(logL) == 0

    ## PARAMETERS
    # one row per step:
    # (name, desc, fun, paramL, logPostFix, logExistsFn, outFilePostFix)
    stepTable = [
        ('MutScan for the tumor sample',
         'pileup_proc -> loh.mutscan',
         mutScan_loh_batch.main,
         (baseDir, baseDir, False, 10, 0, 0),
         '.loh.mutscan.log', ends_with('Success'), ['loh.mutscan']),
        ('delta B-allele frequencies calculation',
         'calculate tumor delta BAF for all positions genotyped as heterozygous in the normal sample',
         delta_baf_mutscan_batch.main,
         (baseDir, baseDir, False),
         '.dbaf.log', is_empty, ['dbaf.txt']),
        ('delta BAF segmentation',
         'segment delta BAF',
         delta_baf_seg_batch.main,
         (baseDir, baseDir, False),
         '.dbaf.seg.log', ends_with('Analyzing'), ['seg']),
        ('Plotting',
         'Generate deltaBAF/CN trajectory plot',
         dbaf_cn_plot_batch.main,
         (baseDir, baseDir, baseDir, False),
         '.traj_plot.log', ends_with('Done'), ['dBAF_CNA_traj.pdf']),
        ('CNLOH/LOH determination',
         'calculate average copy number of LOH segments to determine CNLOH/LOH',
         calcCN_LOH_batch.main,
         (baseDir, baseDir, baseDir, False),
         '.loh_cn.log', ends_with('Setting'), ['loh_cn.txt']),
        ('gene LOH',
         'loh_cn.txt -> loh_gene.dat',
         loh2gene_batch.main,
         (baseDir, baseDir, False, mysetting.refFlatH[server][genome]),
         '.loh_gene.log', is_empty, ['loh_gene.dat']),
        ('Normal contamiation calculation',
         'calculate normal contamination levels at heterozygous germline SNPs in LOH regions',
         calcNormalF_loh_batch.main,
         (baseDir, baseDir, baseDir, False),
         '.nfrac_all.log', is_empty, ['nFrac_all.txt']),
        ('Tumor fraction estimation',
         'estimate tumor fraction',
         peakFrac_batch.main,
         (baseDir, baseDir, False),
         '.tfrac.log', ends_with('Done'), ['tumor_frac.txt']),
    ]

    specL = []
    for (name, desc, fun, paramL, logPostFix, logExistsFn, outFilePostFix) in stepTable:
        specL.append({
            'name': name,
            'desc': desc,
            'fun': fun,
            'paramL': paramL,
            'paramH': {},
            'logPostFix': logPostFix,
            'logExistsFn': logExistsFn,
            'outFilePostFix': outFilePostFix,
            'clean': False,
            'rerun': False,
        })
    return specL
Example #32
0
#!/usr/bin/python

import sys, os, re, getopt, glob

import mybasic
mybasic.add_module_path(['NGS/align','NGS/mutation'])

import mybasic, procPileup_split_batch, mutScan_batch, mutscan_snp_cosmic_batch

def main(baseDir, projectName):
	"""Start the HTML log page of one sample for the given project.

	baseDir is the sample directory; the sample name is 'S' plus whatever
	follows the first '/S' in baseDir (an IndexError is raised when baseDir
	contains no '/S').  The page is created as
	/var/www/html/pipeline_logs/<projectName>/pipeline2_log_<sample>.html
	and initially holds only the document header.
	"""
	# compared_files_list and outDir are not used in the visible part of the
	# function; they are kept because the rest of the function may rely on them
	compared_files_list = []
	current_files_list = glob.glob(baseDir+'/*')

	outDir = baseDir + '/pileup_proc'

	# compose log string ('<!DOCTYPE', not '<DOCTYPE', is the valid declaration)
	html_head_string = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"><html><head></head><body>'

	# prep html file
	html_path = '/var/www/html/pipeline_logs/' + projectName + '/'
	file_name_split = baseDir.split('/S')
	sample_name = 'S' + file_name_split[1]
	file_name = 'pipeline2_log_' + sample_name + '.html'
	# create .html file; the with-block closes it, no explicit close() needed
	with open(os.path.join(html_path, file_name), 'wb') as log_file:
		log_file.write(html_head_string)

	# change mod and open log_file again
Example #33
0
def genSpec(baseDir, server='smc1', genome='hg19'):
    """Return the ordered step specs: FastQC, Trim, Mapping, bed, TDF, RPKM.

    Registers the step module directories through
    ``mybasic.add_module_path``, imports the step modules and returns one
    dict per step with the keys 'name', 'desc', 'fun', 'paramL', 'paramH',
    'logPostFix', 'logExistsFn', 'outFilePostFix', 'clean' and 'rerun'
    (the FastQC step additionally carries 'outLinkPostFix').

    Args:
        baseDir: directory passed to every step as input and output location.
        server: key used for the ``mysetting`` lookups.
        genome: genome build name, used for ``mysetting`` lookups and for the
            ``'<genome>_nh'`` argument of the mapping step.

    Returns:
        list of step dicts in execution order.
    """
    mybasic.add_module_path(['NGS/align','NGS/fastq','NGS/coverage','NGS/expression'])
    import trim_batch
    import gsnap_sam_batch
    import bam2sortedBed_batch
    import sortedBed2tdf_batch
    import degSeq_batch
    import fastqc_batch

    def _step(name, desc, fun, paramL, logPostFix, logExistsFn, outFilePostFix, **extra):
        """Build one step dict; paramH is a fresh {} and clean/rerun are False."""
        spec = {
            'name': name,
            'desc': desc,
            'fun': fun,
            'paramL': paramL,
            'paramH': {},
            'logPostFix': logPostFix,
            'logExistsFn': logExistsFn,
            'outFilePostFix': outFilePostFix,
            'clean': False,
            'rerun': False,
        }
        spec.update(extra)
        return spec

    # Raw string: same value as before, without relying on unknown escapes.
    fastqPat = r'(.*)\.[12]\.fq\.gz'

    # The logExistsFn predicates receive an object they treat as a sequence
    # (len() and negative indexing) -- presumably the lines of the step's log
    # file; confirm against the caller.
    return [
        _step(
            name='FastQC',
            desc='QC for fastq',
            fun=fastqc_batch.fastqc_batch,
            paramL=(baseDir, fastqPat, baseDir, baseDir),
            logPostFix='.fastqc.qlog',
            logExistsFn=lambda x: len(x) > 0 and 'Analysis complete' in x[-1],
            outFilePostFix=['_fastqc.zip'],
            outLinkPostFix=['_fastqc/fastqc_report.html'],
        ),
        _step(
            name='Trim',
            desc='fq.gz -> trim -> fq',
            fun=trim_batch.trim_batch,
            paramL=(baseDir, fastqPat, baseDir, 30),
            logPostFix='.trim.log',
            logExistsFn=lambda x: len(x) == 0,
            outFilePostFix=['t1.fq.gz', 't2.fq.gz'],
        ),
        _step(
            name='Mapping',
            desc='fq -> bam',
            fun=gsnap_sam_batch.align,
            paramL=(baseDir, baseDir, False, 'sanger', '%s_nh' % genome),
            logPostFix='.gsnap.qlog',
            logExistsFn=lambda x: len(x) > 0 and 'Processed' in x[-1],
            outFilePostFix=['bam'],
        ),
        _step(
            # NOTE(review): 'Formet' looks like a typo for 'Format'; left
            # unchanged because the name is a runtime value.
            name='Formet Conversion and sorting',
            desc='bam -> sort -> sorted.bed',
            fun=bam2sortedBed_batch.sam2bed_batch,
            paramL=(baseDir, baseDir, '', False),
            logPostFix='.sorted.bed.qlog',
            logExistsFn=lambda x: len(x) == 0,
            outFilePostFix=['sorted.bed'],
        ),
        _step(
            name='TDFgen',
            desc='sorted.bed -> bedgraph -> tdf',
            fun=sortedBed2tdf_batch.main,
            paramL=(baseDir, baseDir, False,
                    '%s/chromsizes_%s.txt' % (mysetting.ucscSeqDir[server][genome], genome),
                    genome),
            logPostFix='.tdf.qlog',
            # x[-9] needs at least nine entries; the former guard len(x) > 0
            # let shorter logs raise IndexError instead of reporting False.
            logExistsFn=lambda x: len(x) >= 9 and 'Done' in x[-9],
            outFilePostFix=['bedgraph', 'tdf'],
        ),
        _step(
            name='RPKMgen',
            desc='sorted.bed -> rpkm',
            fun=degSeq_batch.main,
            paramL=(baseDir, baseDir, mysetting.refFlatH[server][genome], False),
            logPostFix='.degSeq.qlog',
            logExistsFn=lambda x: len(x) > 0 and 'ZZZ3' in x[-1],
            outFilePostFix=['rpkm'],
        ),
        # A 'Cleanup' step (cleanup.main, 'remove all, but logs and designated
        # result file') was commented out in the original list and is not part
        # of the returned spec.
    ]
Example #34
0
def genSpec(baseDir, server='smc1', genome='hg19'):
    """Return the ordered step specs: FastQC, Trim, Mapping, bed, TDF, RPKM.

    Registers the step module directories through
    ``mybasic.add_module_path``, imports the step modules and returns one
    dict per step with the keys 'name', 'desc', 'fun', 'paramL', 'paramH',
    'logPostFix', 'logExistsFn', 'outFilePostFix', 'clean' and 'rerun'
    (the FastQC step additionally carries 'outLinkPostFix').

    Args:
        baseDir: directory passed to every step as input and output location.
        server: key used for the ``mysetting`` lookups.
        genome: genome build name, used for ``mysetting`` lookups and for the
            ``'<genome>_nh'`` argument of the mapping step.

    Returns:
        list of step dicts in execution order.
    """
    mybasic.add_module_path(
        ['NGS/align', 'NGS/fastq', 'NGS/coverage', 'NGS/expression'])
    import trim_batch
    import gsnap_sam_batch
    import bam2sortedBed_batch
    import sortedBed2tdf_batch
    import degSeq_batch
    import fastqc_batch

    def _step(name, desc, fun, paramL, logPostFix, logExistsFn, outFilePostFix, **extra):
        """Build one step dict; paramH is a fresh {} and clean/rerun are False."""
        spec = {
            'name': name,
            'desc': desc,
            'fun': fun,
            'paramL': paramL,
            'paramH': {},
            'logPostFix': logPostFix,
            'logExistsFn': logExistsFn,
            'outFilePostFix': outFilePostFix,
            'clean': False,
            'rerun': False,
        }
        spec.update(extra)
        return spec

    # Raw string: same value as before, without relying on unknown escapes.
    fastqPat = r'(.*)\.[12]\.fq\.gz'
    chromSizes = '%s/chromsizes_%s.txt' % (
        mysetting.ucscSeqDir[server][genome], genome)
    refFlat = mysetting.refFlatH[server][genome]

    # The logExistsFn predicates receive an object they treat as a sequence
    # (len() and negative indexing) -- presumably the lines of the step's log
    # file; confirm against the caller.
    return [
        _step(
            name='FastQC',
            desc='QC for fastq',
            fun=fastqc_batch.fastqc_batch,
            paramL=(baseDir, fastqPat, baseDir, baseDir),
            logPostFix='.fastqc.qlog',
            logExistsFn=lambda x: len(x) > 0 and 'Analysis complete' in x[-1],
            outFilePostFix=['_fastqc.zip'],
            outLinkPostFix=['_fastqc/fastqc_report.html'],
        ),
        _step(
            name='Trim',
            desc='fq.gz -> trim -> fq',
            fun=trim_batch.trim_batch,
            paramL=(baseDir, fastqPat, baseDir, 30),
            logPostFix='.trim.log',
            logExistsFn=lambda x: len(x) == 0,
            outFilePostFix=['t1.fq.gz', 't2.fq.gz'],
        ),
        _step(
            name='Mapping',
            desc='fq -> bam',
            fun=gsnap_sam_batch.align,
            paramL=(baseDir, baseDir, False, 'sanger', '%s_nh' % genome),
            logPostFix='.gsnap.qlog',
            logExistsFn=lambda x: len(x) > 0 and 'Processed' in x[-1],
            outFilePostFix=['bam'],
        ),
        _step(
            # NOTE(review): 'Formet' looks like a typo for 'Format'; left
            # unchanged because the name is a runtime value.
            name='Formet Conversion and sorting',
            desc='bam -> sort -> sorted.bed',
            fun=bam2sortedBed_batch.sam2bed_batch,
            paramL=(baseDir, baseDir, '', False),
            logPostFix='.sorted.bed.qlog',
            logExistsFn=lambda x: len(x) == 0,
            outFilePostFix=['sorted.bed'],
        ),
        _step(
            name='TDFgen',
            desc='sorted.bed -> bedgraph -> tdf',
            fun=sortedBed2tdf_batch.main,
            paramL=(baseDir, baseDir, False, chromSizes, genome),
            logPostFix='.tdf.qlog',
            # x[-9] needs at least nine entries; the former guard len(x) > 0
            # let shorter logs raise IndexError instead of reporting False.
            logExistsFn=lambda x: len(x) >= 9 and 'Done' in x[-9],
            outFilePostFix=['bedgraph', 'tdf'],
        ),
        _step(
            name='RPKMgen',
            desc='sorted.bed -> rpkm',
            fun=degSeq_batch.main,
            paramL=(baseDir, baseDir, refFlat, False),
            logPostFix='.degSeq.qlog',
            logExistsFn=lambda x: len(x) > 0 and 'ZZZ3' in x[-1],
            outFilePostFix=['rpkm'],
        ),
        # A 'Cleanup' step (cleanup.main, 'remove all, but logs and designated
        # result file') was commented out in the original list and is not part
        # of the returned spec.
    ]
Example #35
0
#!/usr/bin/python

import sys, os
import mymysql, mypipe, mybasic
from mysetting import mysqlH
mybasic.add_module_path(['Integration'])
import prepDB_mutscan, makeDB_mutation_rxsq


def post_s_rsq2mut(baseDir, server='smc1', dbN='ihlee_test'):
    """Convert a sample's splice COSMIC table and delete its old DB rows.

    The sample name is the last '/'-separated component of ``baseDir``; the
    sample id is that name without its last four characters, with '.' and
    '-' replaced by '_'. Both are printed. Nothing else happens unless
    ``<baseDir>/<sample>_splice_cosmic.dat`` exists; if it does, the file is
    handed to ``prepDB_mutscan.main`` and the rows of ``mutation_rsq`` with
    that sample id are deleted.

    Args:
        baseDir: sample directory path.
        server: key into ``mysqlH`` selecting the connection settings.
        dbN: database name; for 'ihlee_test' and 'ircr1' the converted file
            goes to a fixed results directory, otherwise into ``baseDir``.
    """
    sampN = baseDir.rsplit('/', 1)[-1]
    sid = sampN[:-4]
    for sep in ('.', '-'):
        sid = sid.replace(sep, '_')
    print('%s %s' % (sampN, sid))

    cosmicDatFileN = '%s/%s_splice_cosmic.dat' % (baseDir, sampN)
    datFileN = ('/EQL1/NSL/RNASeq/results/mutation/%s.dat' % sampN
                if dbN in ('ihlee_test', 'ircr1')
                else '%s/%s.dat' % (baseDir, sampN))

    # Without the COSMIC-joined input there is nothing to convert or import.
    if not os.path.isfile(cosmicDatFileN):
        return

    prepDB_mutscan.main(
        sampNamePat=('(.*)_(RSq)', ''),
        geneList=[],
        inFileN=cosmicDatFileN,
        outFileN=datFileN)

    ## import
    account = mysqlH[server]
    (con, cursor) = mymysql.connectDB(
        user=account['user'],
        passwd=account['passwd'],
        db=dbN,
        host=account['host'])
    # Remove the sample's previous rows.
    # NOTE(review): no re-insert, commit or close is visible in this function.
    cursor.execute('DELETE FROM mutation_rsq WHERE samp_id="%s"' % sid)
Example #36
0
def genSpec(baseDir, server='smc1', genome='hg19'):
    """Return the ordered step specs from spliced alignment to genotyping.

    Registers the step module directories through
    ``mybasic.add_module_path``, imports the step modules and returns one
    dict per step (Align, Sort, MarkDuplicate/ReadGroup, RealignTarget,
    Realign/Recalibrate, UnifiedGenotype) with the keys 'name', 'desc',
    'fun', 'paramL', 'paramH', 'logPostFix', 'logExistsFn',
    'outFilePostFix', 'clean' and 'rerun'.

    Args:
        baseDir: directory passed to every step as input and output location.
        server: key for the ``mysetting`` lookups; also passed to the
            UnifiedGenotype step.
        genome: genome build name for the ``mysetting`` lookups and the
            Align/UnifiedGenotype steps.

    Returns:
        list of step dicts in execution order.
    """
    mybasic.add_module_path(['NGS/fastq','NGS/align','NGS/mutation'])
    # Some of these modules are referenced only by disabled steps; they are
    # still imported, exactly as before.
    import fastqc_batch
    import gsnap_splice_bam_batch
    import gsnap_splice_bam_sort_batch
    import markDuplicates_batch
    import realignTargetFilter_batch
    import realignWithFtTarget_batch
    import unifiedGeno_batch
    import vcf2mutScan_batch
    import mutscan_snp_cosmic_batch
    import annotate_mutscan_batch
    import annotate_join_cosmic_batch

    def _step(name, desc, fun, paramL, logPostFix, logExistsFn, outFilePostFix):
        """Build one step dict; paramH is a fresh {} and clean/rerun are False."""
        return {
            'name': name,
            'desc': desc,
            'fun': fun,
            'paramL': paramL,
            'paramH': {},
            'logPostFix': logPostFix,
            'logExistsFn': logExistsFn,
            'outFilePostFix': outFilePostFix,
            'clean': False,
            'rerun': False,
        }

    # Reference and dbSNP settings shared by the two realignment steps.
    refSeq = mysetting.ucscRefH[server][genome]
    dbsnp = mysetting.dbsnpH[server][genome]

    # The logExistsFn predicates receive an object they treat as a sequence
    # (len(), slicing, negative indexing) -- presumably the lines of the
    # step's log file; confirm against the caller.
    specL = [
        _step(
            name='Align',
            desc='.fq.gz -> .bam',
            fun=gsnap_splice_bam_batch.align,
            paramL=(baseDir, baseDir, False, genome),
            logPostFix='.gsnap.qlog',
            logExistsFn=lambda x: len(x) > 0 and 'Processed' in x[-1],
            outFilePostFix=['splice.bam'],
        ),
        _step(
            name='Sort',
            desc='bam -> sorted.bam',
            fun=gsnap_splice_bam_sort_batch.main,
            paramL=(baseDir, baseDir, 10000000000),
            logPostFix='_splice.sort.qlog',
            # An empty sequence counts as success for this step.
            logExistsFn=lambda x: len(x) < 1 or 'merging' in x[-1],
            outFilePostFix=['sorted.bam'],
        ),
        _step(
            name='MarkDuplicate/ReadGroup',
            desc='sorted.bam -> dedup.bam',
            fun=markDuplicates_batch.main,
            paramL=(baseDir, baseDir, False),
            logPostFix='_splice.dedup.qlog',
            logExistsFn=lambda x: len(x) > 0 and 'totalMemory()' in x[-1],
            outFilePostFix=['dedup.bam'],
        ),
        _step(
            name='RealignTarget',
            desc='dedup.bam -> realigner.intervals -> realigner_ft.intervals',
            fun=realignTargetFilter_batch.main,
            paramL=(baseDir, baseDir, False, refSeq, dbsnp),
            logPostFix='_splice.interval.qlog',
            logExistsFn=lambda x: len(x) > 0 and 'Uploaded run' in x[-1],
            outFilePostFix=['realigner.intervals', 'realigner_ft.intervals'],
        ),
        _step(
            name='Realign/Recalibrate',
            desc='dedup.bam -> realign.bam -> recal.bam',
            fun=realignWithFtTarget_batch.main,
            paramL=(baseDir, baseDir, False, refSeq, dbsnp),
            logPostFix='_splice.realign.qlog',
            logExistsFn=lambda x: len(x) > 0 and 'Uploaded run' in x[-1],
            outFilePostFix=['realign.bam', 'recal.bam'],
        ),
        _step(
            name='UnifiedGenotype',
            desc='recal.bam -> vcf',
            fun=unifiedGeno_batch.main,
            paramL=(baseDir, baseDir, server, genome, False),
            logPostFix='_splice.gatk.log',
            # 'Total runtime' may appear anywhere in the last ten entries.
            logExistsFn=lambda x: len(x) > 0 and any('Total runtime' in s for s in x[-10:]),
            outFilePostFix=['vcf'],
        ),
    ]

    # Disabled in the original (commented out) and therefore not returned:
    # MutScan (vcf2mutScan_batch.main), VEP annotation
    # (annotate_mutscan_batch.annotate_mutscan_batch), Join Cosmic
    # (annotate_join_cosmic_batch.main), the old JoinCosmic
    # (mutscan_snp_cosmic_batch.main), Cleanup (cleanup.main), and a
    # server == 'smc2' variant that returned only specL[-1].
    return specL
Example #37
0
def genSpec_CS(baseDir, server='smc1', genome='hg19'):
    """Return the ordered step specs of the RPKM-based copy-number pipeline.

    Registers the step module directories through
    ``mybasic.add_module_path``, imports the step modules and returns one
    dict per step (bed conversion, RPKM, log2 ratio, gene copy number, plot)
    with the keys 'name', 'desc', 'fun', 'paramL', 'paramH', 'logPostFix',
    'logExistsFn', 'outFilePostFix', 'clean' and 'rerun'.

    Args:
        baseDir: directory passed to every step.
        server: key for the ``mysetting.refFlatH`` lookup.
        genome: genome build name for that lookup and the plotting step.

    Returns:
        list of step dicts in execution order.
    """
    mybasic.add_module_path(
        ['NGS/coverage', 'NGS/expression', 'NGS/copynumber'])
    import bam2sortedBed_batch
    import degSeq_batch
    import rpkm2cn_batch
    import exon2gene_batch
    import drawCNATraj_batch

    def _step(name, desc, fun, paramL, logPostFix, logExistsFn, outFilePostFix):
        """Build one step dict; paramH is a fresh {} and clean/rerun are False."""
        return {
            'name': name,
            'desc': desc,
            'fun': fun,
            'paramL': paramL,
            'paramH': {},
            'logPostFix': logPostFix,
            'logExistsFn': logExistsFn,
            'outFilePostFix': outFilePostFix,
            'clean': False,
            'rerun': False,
        }

    # The logExistsFn predicates receive an object they treat as a sequence
    # (len() and negative indexing) -- presumably the lines of the step's log
    # file; confirm against the caller.
    bedStep = _step(
        name='Format Conversion and sorting',
        desc='bam -> sort -> sorted.bed',
        fun=bam2sortedBed_batch.sam2bed_batch,
        paramL=(baseDir, baseDir, 'recal', False),
        logPostFix='.sorted.bed.qlog',
        logExistsFn=lambda x: len(x) == 0,
        outFilePostFix=['sorted.bed'],
    )
    rpkmStep = _step(
        name='RPKMgen',
        desc='sorted.bed -> rpkm',
        fun=degSeq_batch.main,
        # NOTE(review): this annotation path is fixed and does not follow the
        # server/genome arguments -- confirm that is intended.
        paramL=(baseDir, baseDir,
                '/data1/Sequence/ucsc_hg19/annot/refFlat_exon.txt', False),
        logPostFix='.degSeq.qlog',
        logExistsFn=lambda x: len(x) > 0 and 'omitted' in x[-1],
        outFilePostFix=['rpkm'],
    )
    ratioStep = _step(
        name='Calculate a log2 rpkm ratio for all exons',
        desc='log2(tumor rpkm/normal rpkm',
        fun=rpkm2cn_batch.main_pool,
        paramL=(baseDir, baseDir, 10, mysetting.poolB_CS_rpkm, False),
        logPostFix='.cn.log',
        logExistsFn=lambda x: len(x) == 0,
        outFilePostFix=['copynumber'],
    )
    geneStep = _step(
        name='Calculate gene copy number from log2 rpkm ratios',
        desc='copynumber -> cn_gene.dat',
        fun=exon2gene_batch.main,
        paramL=(baseDir, baseDir, mysetting.refFlatH[server][genome],
                mysetting.cs_gene, False),
        logPostFix='.cn_gene.log',
        logExistsFn=lambda x: len(x) > 0 and 'VHL' in x[-1],
        outFilePostFix=['cn_gene.dat'],
    )
    plotStep = _step(
        name='Draw Plot',
        desc='seg->plot',
        # drawCNATraj_batch.batch was the commented-out alternative here.
        fun=drawCNATraj_batch.draw_single,
        paramL=(baseDir, '/EQL1/NSL/WXS/results/CNA', genome),
        logPostFix='',
        # No log postfix and no output files are declared; the predicate is
        # unconditionally true.
        logExistsFn=lambda x: True,
        outFilePostFix=[],
    )

    return [bedStep, rpkmStep, ratioStep, geneStep, plotStep]
Example #38
0
def genSpec(baseDir, server='smc1', genome='hg19'):
    """Return the ordered step specs of the fusion (translocation) pipeline.

    Registers the step module directories through
    ``mybasic.add_module_path``, imports the step modules and returns one
    dict per step (filter, annotate, sort, annotate report) with the keys
    'name', 'desc', 'fun', 'paramL', 'paramH', 'logPostFix', 'logExistsFn',
    'outFilePostFix', 'clean' and 'rerun'.

    Args:
        baseDir: directory passed to every step as input and output location.
        server: accepted but not used by this spec.
        genome: accepted but not used by this spec.

    Returns:
        list of step dicts in execution order.
    """
    mybasic.add_module_path(['NGS/align', 'NGS/splice_gsnap/fusion'])
    # gsnap_splice_batch is referenced only by the disabled 'Align' step but
    # is still imported, exactly as before.
    import gsnap_splice_batch
    import fusion_filter_transloc_batch
    import fusion_filter_annot1_batch
    import fusion_proc_sort_batch
    import fusion_proc_annot_batch

    def _step(name, desc, fun, paramL, logPostFix, logExistsFn, outFilePostFix):
        """Build one step dict; paramH is a fresh {} and clean/rerun are False."""
        return {
            'name': name,
            'desc': desc,
            'fun': fun,
            'paramL': paramL,
            'paramH': {},
            'logPostFix': logPostFix,
            'logExistsFn': logExistsFn,
            'outFilePostFix': outFilePostFix,
            'clean': False,
            'rerun': False,
        }

    # Argument tuple shared by the first three steps.
    inOutArgs = (baseDir, baseDir, False)

    # The logExistsFn predicates receive an object they treat as a sequence
    # (len() and negative indexing) -- presumably the lines of the step's log
    # file; confirm against the caller.
    filterStep = _step(
        name='Filter transloc',
        desc='splice.gsnap.gz -> splice_transloc.gsnap',
        fun=fusion_filter_transloc_batch.fusion_filter_batch,
        paramL=inOutArgs,
        logPostFix='.ft_tloc.qlog',
        logExistsFn=lambda x: len(x) > 0 and 'Results' in x[-1],
        outFilePostFix=['splice_transloc.gsnap'],
    )
    annotStep = _step(
        name='annotate',
        desc='splice_transloc.gsnap -> splice_transloc_annot1.gsnap',
        fun=fusion_filter_annot1_batch.main,
        paramL=inOutArgs,
        logPostFix='.annot.qlog',
        # NOTE(review): requires more than one entry, unlike the filter step
        # above (> 0) -- confirm the difference is intended.
        logExistsFn=lambda x: len(x) > 1 and 'Results' in x[-1],
        outFilePostFix=['splice_transloc_annot1.gsnap'],
    )
    sortStep = _step(
        name='sort',
        desc='splice_transloc_annot1.gsnap -> splice_transloc_annot1.sorted.gsnap and gnerate report.txt',
        fun=fusion_proc_sort_batch.main,
        paramL=inOutArgs,
        logPostFix='.sort.qlog',
        logExistsFn=lambda x: len(x) == 0,
        outFilePostFix=[
            'splice_transloc_annot1.sorted.gsnap',
            'splice_transloc_annot1.report.txt',
        ],
    )
    reportStep = _step(
        name='annotate report',
        desc='report.txt -> report_annot.txt',
        fun=fusion_proc_annot_batch.fusion_proc_annot_batch,
        paramL=(baseDir, baseDir, None, False),
        logPostFix='.report_annot.qlog',
        logExistsFn=lambda x: len(x) == 0,
        outFilePostFix=['splice_transloc_annot1.report_annot.txt'],
    )

    # Disabled in the original (commented out) and therefore not returned:
    # a leading 'Align' step (gsnap_splice_batch.align, 'fastq ->
    # splice.gsnap') and a trailing 'Summarize' placeholder that had no
    # function assigned.
    return [filterStep, annotStep, sortStep, reportStep]
Example #39
0
def genSpec(baseDir, server='smc1', genome='hg19'):
    """Return the ordered step specs of the ngCGH copy-number pipeline.

    Registers the step module directory through ``mybasic.add_module_path``,
    imports the step modules and returns one dict per step (ngCGH,
    segmentation, gene copy number, plot) with the keys 'name', 'desc',
    'fun', 'paramL', 'paramH', 'logPostFix', 'logExistsFn',
    'outFilePostFix', 'clean' and 'rerun'.

    Args:
        baseDir: directory passed to every step.
        server: key for the ``mysetting.refFlatH`` lookup.
        genome: genome build name for that lookup and the plotting step.

    Returns:
        list of step dicts in execution order.
    """
    mybasic.add_module_path(['NGS/copynumber'])
    import ngCGH_batch
    import cgh2seg_batch
    import seg2gene_batch
    import drawCNATraj_batch

    def _step(name, desc, fun, paramL, logPostFix, logExistsFn, outFilePostFix):
        """Build one step dict; paramH is a fresh {} and clean/rerun are False."""
        return {
            'name': name,
            'desc': desc,
            'fun': fun,
            'paramL': paramL,
            'paramH': {},
            'logPostFix': logPostFix,
            'logExistsFn': logExistsFn,
            'outFilePostFix': outFilePostFix,
            'clean': False,
            'rerun': False,
        }

    # The logExistsFn predicates receive an object they treat as a sequence
    # (len() and negative indexing) -- presumably the lines of the step's log
    # file; confirm against the caller.
    cghStep = _step(
        name='run ngCGH for pairs of bam',
        desc='bam -> .ngCGH',
        fun=ngCGH_batch.main,
        paramL=(baseDir, baseDir, 1000, False),
        logPostFix='.cn_ngCGH.log',
        logExistsFn=lambda x: len(x) > 0 and 'finalizers' in x[-1],
        outFilePostFix=['ngCGH'],
    )
    segStep = _step(
        name='Segmenation',
        desc='ngCGH -> seg',
        fun=cgh2seg_batch.cgh2seg,
        paramL=(baseDir, baseDir, False),
        logPostFix='.seg.qlog',
        logExistsFn=lambda x: len(x) > 0 and 'Centrality parameter' in x[-1],
        outFilePostFix=['ngCGH.seg'],
    )
    geneStep = _step(
        name='Calculate gene copy number from segments',
        desc='seg -> cn_gene.dat',
        fun=seg2gene_batch.main,
        paramL=(baseDir, baseDir, mysetting.refFlatH[server][genome], [], False),
        logPostFix='.cn_gene.log',
        logExistsFn=lambda x: len(x) > 0 and 'ZZZ3' in x[-1],
        outFilePostFix=['cn_gene.dat'],
    )
    plotStep = _step(
        name='Draw Plot',
        desc='seg->plot',
        # drawCNATraj_batch.batch was the commented-out alternative here.
        fun=drawCNATraj_batch.draw_single,
        paramL=(baseDir, '/EQL1/NSL/WXS/results/CNA', genome),
        logPostFix='',
        # No log postfix and no output files are declared; the predicate is
        # unconditionally true.
        logExistsFn=lambda x: True,
        outFilePostFix=[],
    )

    return [cghStep, segStep, geneStep, plotStep]
Example #40
0
#!/usr/bin/python

import sys, os, re, getopt, glob
import mybasic

mybasic.add_module_path(['NGS/align', 'NGS/mutation'])

import bwa_batch, markDuplicates_batch, realign_batch, pileup_batch


def wxs_seq(baseDir, projectName):
    """Create the HTML log file for one sample and make it world-readable.

    The file is written to
    ``/var/www/html/pipeline_logs/<projectName>/pipeline1_log_<sample>.html``
    where ``<sample>`` is ``'S'`` followed by the text found after the first
    ``'/S'`` in ``baseDir`` (up to the next ``'/S'``, if there is one). Only
    the opening HTML markup is written; the file mode is then set to 755.

    Args:
        baseDir: sample directory path; it must contain ``'/S'``.
        projectName: name of the sub-directory below the log root.

    Raises:
        IndexError: if ``baseDir`` contains no ``'/S'``.
        IOError/OSError: if the log file cannot be created or its mode
            cannot be changed.
    """
    # Directory snapshot; neither list is consumed by the code visible in
    # this function.
    current_files_list = glob.glob(baseDir + '/*')
    compared_files_list = []

    # Opening markup of the log page. A document type declaration starts with
    # '<!DOCTYPE'; without the '!' it is parsed as an unknown tag.
    html_head_string = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"><html><head></head><body>'

    # Derive the log file location from the project and sample names.
    html_path = '/var/www/html/pipeline_logs/' + projectName + '/'
    sample_name = 'S' + baseDir.split('/S')[1]
    file_name = 'pipeline1_log_' + sample_name + '.html'
    log_path = os.path.join(html_path, file_name)

    # The with-statement closes the file on exit, so no explicit close()
    # call is needed afterwards.
    with open(log_path, 'wb') as log_file:
        log_file.write(html_head_string)

    # Set the mode directly instead of building a shell command: names with
    # spaces or shell metacharacters no longer break (or inject into) the
    # call. The file was just created by this process, so this is expected
    # to succeed.
    os.chmod(log_path, 0o755)
Example #41
0
#!/usr/bin/python

from glob import glob
import sys,os,re
import mysetting, mymysql, mypipe, mybasic
mybasic.add_module_path(['Integration','NGS/mutation'])
import prepDB_mutation_normal, makeDB_mutation_rxsq, vep_batch

def prep_single(outFileN, server='smc1', dbN='ircr1'):
    """Tag unregistered samples and build one merged mutation table.

    Collects every ``*cosmic.dat`` file found directly in, or one level
    below, the directories of ``mysetting.wxsMutscanDirL`` (paths containing
    '_B_' are skipped). For each file whose sample id is not yet listed in
    ``sample_tag`` with an ``XSeq_`` tag, the id and path are printed and an
    ``XSeq_<platform>`` tag row is inserted. Finally all collected files are
    concatenated and piped through ``Integration/prepDB_mutscan.py`` into
    ``outFileN``.

    Args:
        outFileN: path of the merged output file.
        server: key into ``mysetting.mysqlH`` selecting connection settings.
        dbN: database name.
    """
    account = mysetting.mysqlH[server]
    (con, cursor) = mymysql.connectDB(
        user=account['user'],
        passwd=account['passwd'],
        db=dbN,
        host=account['host'])

    # Gather the input tables, nested matches before top-level ones per
    # directory, leaving out every path that contains '_B_'.
    cosmicL = []
    for mutscanDir in mysetting.wxsMutscanDirL:
        found = glob('%s/*/*cosmic.dat' % mutscanDir) + glob('%s/*cosmic.dat' % mutscanDir)
        cosmicL.extend(path for path in found if '_B_' not in path)

    # Sample ids that already carry an XSeq_ tag.
    cursor.execute('SELECT DISTINCT samp_id FROM sample_tag WHERE tag LIKE "XSeq_%%"')
    knownSids = set(row[0] for row in cursor.fetchall())

    # File names look like <sid>_<postfix>_<platform>_cosmic.dat.
    # NOTE(review): a name that does not match makes .match() return None and
    # the .groups() call raise AttributeError.
    namePat = re.compile('(.*)_([XT].{,2})_([STKN]{2})_cosmic.dat')
    for cosmic in cosmicL:
        (sid, postfix, platform) = namePat.match(os.path.basename(cosmic)).groups()
        if postfix not in ('T', 'RSq'):
            sid = '%s_%s' % (sid, postfix)
        if sid in knownSids:
            continue
        print('%s %s' % (sid, cosmic))
        tag = 'XSeq_%s' % platform
        cursor.execute('INSERT INTO sample_tag SET samp_id="%s", tag="%s"' % (sid, tag))

    # Merge all tables through the prepDB_mutscan filter script.
    cmd = 'cat %s | /usr/bin/python %s/Integration/prepDB_mutscan.py > %s' % (' '.join(cosmicL), mysetting.SRC_HOME, outFileN)
    os.system(cmd)
Example #42
0
def genSpec(baseDir, server='smc1', genome='hg19'):
    """Return the ordered step specs from spliced alignment to genotyping.

    Registers the step module directories through
    ``mybasic.add_module_path``, imports the step modules and returns one
    dict per step (Align, Sort, MarkDuplicate/ReadGroup, RealignTarget,
    Realign/Recalibrate, UnifiedGenotype) with the keys 'name', 'desc',
    'fun', 'paramL', 'paramH', 'logPostFix', 'logExistsFn',
    'outFilePostFix', 'clean' and 'rerun'.

    Args:
        baseDir: directory passed to every step as input and output location.
        server: key for the ``mysetting`` lookups; also passed to the
            UnifiedGenotype step.
        genome: genome build name for the ``mysetting`` lookups and the
            Align/UnifiedGenotype steps.

    Returns:
        list of step dicts in execution order.
    """
    mybasic.add_module_path(['NGS/fastq', 'NGS/align', 'NGS/mutation'])
    # Some of these modules are referenced only by disabled steps; they are
    # still imported, exactly as before.
    import fastqc_batch
    import gsnap_splice_bam_batch
    import gsnap_splice_bam_sort_batch
    import markDuplicates_batch
    import realignTargetFilter_batch
    import realignWithFtTarget_batch
    import unifiedGeno_batch
    import vcf2mutScan_batch
    import mutscan_snp_cosmic_batch
    import annotate_mutscan_batch
    import annotate_join_cosmic_batch

    def _step(name, desc, fun, paramL, logPostFix, logExistsFn, outFilePostFix):
        """Build one step dict; paramH is a fresh {} and clean/rerun are False."""
        return {
            'name': name,
            'desc': desc,
            'fun': fun,
            'paramL': paramL,
            'paramH': {},
            'logPostFix': logPostFix,
            'logExistsFn': logExistsFn,
            'outFilePostFix': outFilePostFix,
            'clean': False,
            'rerun': False,
        }

    # Reference and dbSNP settings shared by the two realignment steps.
    refSeq = mysetting.ucscRefH[server][genome]
    dbsnp = mysetting.dbsnpH[server][genome]

    # The logExistsFn predicates receive an object they treat as a sequence
    # (len(), slicing, negative indexing) -- presumably the lines of the
    # step's log file; confirm against the caller.
    specL = [
        _step(
            name='Align',
            desc='.fq.gz -> .bam',
            fun=gsnap_splice_bam_batch.align,
            paramL=(baseDir, baseDir, False, genome),
            logPostFix='.gsnap.qlog',
            logExistsFn=lambda x: len(x) > 0 and 'Processed' in x[-1],
            outFilePostFix=['splice.bam'],
        ),
        _step(
            name='Sort',
            desc='bam -> sorted.bam',
            fun=gsnap_splice_bam_sort_batch.main,
            paramL=(baseDir, baseDir, 10000000000),
            logPostFix='_splice.sort.qlog',
            # An empty sequence counts as success for this step.
            logExistsFn=lambda x: len(x) < 1 or 'merging' in x[-1],
            outFilePostFix=['sorted.bam'],
        ),
        _step(
            name='MarkDuplicate/ReadGroup',
            desc='sorted.bam -> dedup.bam',
            fun=markDuplicates_batch.main,
            paramL=(baseDir, baseDir, False),
            logPostFix='_splice.dedup.qlog',
            logExistsFn=lambda x: len(x) > 0 and 'totalMemory()' in x[-1],
            outFilePostFix=['dedup.bam'],
        ),
        _step(
            name='RealignTarget',
            desc='dedup.bam -> realigner.intervals -> realigner_ft.intervals',
            fun=realignTargetFilter_batch.main,
            paramL=(baseDir, baseDir, False, refSeq, dbsnp),
            logPostFix='_splice.interval.qlog',
            logExistsFn=lambda x: len(x) > 0 and 'Uploaded run' in x[-1],
            outFilePostFix=['realigner.intervals', 'realigner_ft.intervals'],
        ),
        _step(
            name='Realign/Recalibrate',
            desc='dedup.bam -> realign.bam -> recal.bam',
            fun=realignWithFtTarget_batch.main,
            paramL=(baseDir, baseDir, False, refSeq, dbsnp),
            logPostFix='_splice.realign.qlog',
            logExistsFn=lambda x: len(x) > 0 and 'Uploaded run' in x[-1],
            outFilePostFix=['realign.bam', 'recal.bam'],
        ),
        _step(
            name='UnifiedGenotype',
            desc='recal.bam -> vcf',
            fun=unifiedGeno_batch.main,
            paramL=(baseDir, baseDir, server, genome, False),
            logPostFix='_splice.gatk.log',
            # 'Total runtime' may appear anywhere in the last ten entries.
            logExistsFn=lambda x: len(x) > 0 and any('Total runtime' in s for s in x[-10:]),
            outFilePostFix=['vcf'],
        ),
    ]

    # Disabled in the original (commented out) and therefore not returned:
    # MutScan (vcf2mutScan_batch.main), VEP annotation
    # (annotate_mutscan_batch.annotate_mutscan_batch), Join Cosmic
    # (annotate_join_cosmic_batch.main), the old JoinCosmic
    # (mutscan_snp_cosmic_batch.main), Cleanup (cleanup.main), and a
    # server == 'smc2' variant that returned only specL[-1].
    return specL
Example #43
0
def genSpec(baseDir, server='smc1', genome='hg19'):
    """Return the ordered list of step specifications for exon-skip analysis.

    Every step is a dict with the keys 'name', 'desc', 'fun', 'paramL',
    'paramH', 'logPostFix', 'logExistsFn', 'outFilePostFix', 'clean' and
    'rerun'.  'paramH' is always an empty dict and 'clean' / 'rerun' are
    always False for the steps defined here.

    baseDir -- passed as the first two positional parameters of every step
    server  -- accepted but not used by this spec
    genome  -- accepted but not used by this spec
    """
    # NOTE(review): presumably this makes the batch modules below importable;
    # it has to run before the imports -- confirm against mybasic.
    mybasic.add_module_path(['NGS/align', 'NGS/splice_gsnap/skipping'])

    ## MODULES
    import gsnap_splice_batch  # only referenced by the disabled 'Align' step
    import exonSkip_filter_batch
    import exonSkip_filter_normal_batch
    import exonSkip_sort_batch
    import exonSkip_normal_sort_batch
    import exonSkip_proc_annot_batch

    # Predicates stored under 'logExistsFn'.  The argument is indexed with
    # [-1] and measured with len(); presumably it is the list of lines read
    # from the step's log file -- TODO confirm against the pipeline runner.
    def lastEntryHasResults(logL):
        return len(logL) > 0 and 'Results' in logL[-1]

    def nothingLogged(logL):
        return len(logL) == 0

    def makeStep(name, desc, fun, paramL, logPostFix, logExistsFn,
                 outFilePostFix):
        # Assemble one step dict; key order follows the layout used for
        # every step, and each step receives its own fresh 'paramH' dict.
        return {
            'name': name,
            'desc': desc,
            'fun': fun,
            'paramL': paramL,
            'paramH': {},
            'logPostFix': logPostFix,
            'logExistsFn': logExistsFn,
            'outFilePostFix': outFilePostFix,
            'clean': False,
            'rerun': False
        }

    # Positional parameters shared by the filter and sort steps.
    inOutFlagL = (baseDir, baseDir, False)

    ## PARAMETERS
    stepL = []

    # Disabled step, kept for reference:
    #   makeStep('Align', 'fastq -> splice.gsnap',
    #            gsnap_splice_batch.align,
    #            (baseDir, baseDir, 6, False),
    #            'gsnap.qlog',
    #            lambda x: len(x)>0 and 'Processed' in x[-1],
    #            ['splice.gsnap'])

    stepL.append(makeStep(
        'Filter exonskip',
        'splice.gsnap.gz -> splice_exonSkip.gsnap',
        exonSkip_filter_batch.exonSkip_filter_batch,
        inOutFlagL,
        '.exonSkip.qlog',
        lastEntryHasResults,
        ['splice_exonSkip.gsnap']))

    stepL.append(makeStep(
        'Filter normal exonskip',
        'splice.gsnap -> splice_exonSkip_normal.gsnap.gz',
        exonSkip_filter_normal_batch.exonSkip_filter_batch,
        inOutFlagL,
        '.exonSkip_normal.qlog',
        lastEntryHasResults,
        ['splice_exonSkip_normal.gsnap.gz']))

    stepL.append(makeStep(
        'sort',
        'splice_exonSkip.gsnap -> splice_exonSkip_report.txt',
        exonSkip_sort_batch.main,
        inOutFlagL,
        '.sort.qlog',
        nothingLogged,
        ['splice_exonSkip_report.txt']))

    stepL.append(makeStep(
        'sort-normal',
        'splice_exonSkip_normal.gsnap.gz -> splice_exonSkip_normal_report.txt',
        exonSkip_normal_sort_batch.main,
        inOutFlagL,
        '.sort_normal.qlog',
        nothingLogged,
        ['splice_exonSkip_normal_report.txt']))

    stepL.append(makeStep(
        'annotate report',
        'report.txt -> report_annot.txt',
        exonSkip_proc_annot_batch.exonSkip_proc_annot_batch,
        (baseDir, baseDir, None, False),
        '.skip_annot.qlog',
        nothingLogged,
        ['splice_exonSkip_report_annot.txt']))

    # Disabled steps, kept for reference.  Their modules (exonSkip_link,
    # exonSkip_link_normal) are not imported above, so re-enabling them
    # requires adding those imports as well.
    #   makeStep('link', 'put all report_annot.txt files in a directory',
    #            exonSkip_link.link,
    #            (baseDir, '/EQL1/NSL/RNASeq/results/exonSkip'),
    #            'link.qlog',
    #            lambda x: len(x)==0,
    #            ['splice_exonSkip_report_annot.txt'])
    #
    #   makeStep('link-normal', 'put all report_normal.txt files in a directory',
    #            exonSkip_link_normal.link,
    #            (baseDir, '/EQL1/NSL/RNASeq/results/exonSkip_normal'),
    #            'link_normal.qlog',
    #            lambda x: len(x)==0,
    #            ['splice_exonSkip_normal_report.txt'])

    return stepL