def buildSelectStatementfromPed(filter_type, pedfile, template):
    '''Build a select statement from a template and a pedigree file.

    The pedigree file is read as header-less, tab-separated text with
    the columns family, sample, father, mother, sex and status. Rows
    with status '2' are treated as affected and rows with status '1'
    as unaffected.

    *filter_type* decides which placeholders of *template* are filled:

    * "denovo": ``father``, ``mother`` and ``proband`` are replaced by
      the values of the last affected row in the file.
    * "dominant": ``affecteds_exp`` and ``unaffecteds_exp``.
    * "recessive": ``affecteds_exp``, ``unaffecteds_exp`` and
      ``parents_exp``. Parents of affected samples are never listed
      as unaffected, irrespective of the row order in the file.

    Returns the filled-in template, or None for any other
    *filter_type*.

    Raises ValueError if *filter_type* is "denovo" and the pedigree
    contains no affected sample.
    '''
    import re

    def _fill(substitutions):
        # All placeholders are substituted in ONE pass, longest name
        # first. Chained str.replace() calls are wrong here because
        # "affecteds_exp" is a substring of "unaffecteds_exp" (the
        # first replace would destroy the second placeholder) and
        # because an inserted sample name could itself contain a
        # placeholder name.
        names = sorted(substitutions, key=len, reverse=True)
        pattern = re.compile("|".join(re.escape(name) for name in names))
        return pattern.sub(lambda m: substitutions[m.group(0)], template)

    handle = IOTools.openFile(pedfile)
    try:
        rows = list(csv.DictReader(
            handle,
            delimiter='\t',
            fieldnames=['family', 'sample', 'father',
                        'mother', 'sex', 'status']))
    finally:
        handle.close()

    affected_rows = [row for row in rows if row['status'] == '2']
    unaffected_samples = [row['sample'] for row in rows
                          if row['status'] == '1']

    if filter_type == "denovo":
        if not affected_rows:
            raise ValueError(
                "no affected sample (status 2) found in pedigree file %s"
                % pedfile)
        # as before, the last affected row defines the trio
        trio = affected_rows[-1]
        return _fill({"father": trio['father'],
                      "mother": trio['mother'],
                      "proband": trio['sample']})

    affecteds = [row['sample'] for row in affected_rows]

    if filter_type == "dominant":
        affecteds_exp = '").getPL().1==0&&vc.getGenotype("'.join(affecteds)
        if unaffected_samples:
            unaffecteds_exp = '&&vc.getGenotype("' + \
                ('").isHomRef()&&vc.getGenotype("'.join(
                    unaffected_samples)) + \
                '").isHomRef()'
        else:
            unaffecteds_exp = ''
        return _fill({"affecteds_exp": affecteds_exp,
                      "unaffecteds_exp": unaffecteds_exp})

    if filter_type == "recessive":
        # collect all parents first so that the exclusion below does
        # not depend on whether a parent is listed before or after
        # the affected child
        parents = []
        for row in affected_rows:
            parents += [row['father'], row['mother']]
        unaffecteds = [sample for sample in unaffected_samples
                       if sample not in parents]

        affecteds_exp = '").getPL().2==0&&vc.getGenotype("'.join(affecteds)
        unaffecteds_exp = '").getPL().2!=0&&vc.getGenotype("'.join(
            unaffecteds)
        if parents:
            parents_exp = '&&vc.getGenotype("' + \
                ('").getPL().1==0&&vc.getGenotype("'.join(parents)) + \
                '").getPL().1==0'
        else:
            parents_exp = ''
        return _fill({"affecteds_exp": affecteds_exp,
                      "unaffecteds_exp": unaffecteds_exp,
                      "parents_exp": parents_exp})

    # unknown filter type: nothing to build
    return None
def iterateMacs2Peaks(infile):
    '''iterate over peaks.xls file and return parsed data.

    *infile* is read as a tab-separated table with a header line. One
    Macs2Peak is yielded per row.

    pvalues and fdr are converted to values between 0 and 1
    from their -log10 values.

    Raises KeyError (message extended by the offending row) if an
    expected column is missing.
    '''
    for row in CSV.DictReader(infile, dialect='excel-tab'):
        # Only the parsing is guarded; the yield sits outside the try
        # block so that the handler is concerned with missing columns
        # only.
        try:
            peak = Macs2Peak._make(
                (row['chr'],
                 # these are 1-based coordinates
                 # macs can have negative start coordinates
                 max(int(row['start']) - 1, 0),
                 int(row['end']),
                 int(row['length']),
                 float(row['pileup']),
                 math.pow(10, -float(row['-log10(pvalue)'])),
                 float(row['fold_enrichment']),
                 math.pow(10, -float(row['-log10(qvalue)'])),
                 row['name']))
        except KeyError as msg:
            raise KeyError("%s: %s" % (msg, row))
        yield peak
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Only the "convert" toolset is acted upon here: a tab-separated
    table is read from ``options.stdin``, rows whose ``edgeR.p.value``
    is "NA" are skipped, and the remaining rows are written to
    ``options.stdout`` through Expression.writeExpressionResults.

    Raises ValueError if a numeric field of a row can not be parsed.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id",
        usage=globals()["__doc__"])

    parser.add_option("-u", "--ucsc-genome", dest="ucsc_genome",
                      type="string",
                      help="UCSC genome identifier [default=%default].")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    # dest matches the ``extend`` default registered in set_defaults()
    # below; it used to be "extension", which left the option without
    # a default and detached from the ``extend`` setting.
    parser.add_option("--extend", dest="extend", type="int",
                      help="extend tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--shift-size", dest="shift", type="int",
                      help="shift tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--window-size", dest="window_size", type="int",
                      help="window size to be used in the analysis"
                      "[default=%default].")

    parser.add_option("--saturation-iterations",
                      dest="saturation_iterations", type="int",
                      help="iterations for saturation analysis "
                      "[default=%default].")

    parser.add_option("-t", "--toolset", dest="toolset", type="choice",
                      action="append",
                      choices=("saturation", "coverage", "enrichment",
                               "dmr", "rms", "rpm", "all", "convert"),
                      help="actions to perform [default=%default].")

    parser.add_option("-w", "--bigwig-file", dest="bigwig",
                      action="store_true",
                      help="store wig files as bigwig files - requires a "
                      "genome file [default=%default]")

    parser.add_option("--treatment", dest="treatment_files", type="string",
                      action="append",
                      help="BAM files for treatment. At least one is required "
                      "[%default]")

    parser.add_option("--control", dest="control_files", type="string",
                      action="append",
                      help="BAM files for control for differential "
                      "methylation analysis. Optional [%default].")

    parser.add_option("--input", dest="input_files", type="string",
                      action="append",
                      help="BAM files for input correction. "
                      "Optional [%default].")

    parser.add_option("--is-not-medip",
                      dest="is_medip", action="store_false",
                      help="data is not MeDIP data and is not expected "
                      "to fit the calibration model. No CpG "
                      "density normalized rms data is computed"
                      "[default=%default].")

    parser.add_option("--output-rdata", dest="output_rdata",
                      action="store_true",
                      help="in dmr analysis, write R session to file. "
                      "The file name "
                      "is given by --output-filename-pattern [%default].")

    parser.add_option("--rdata-file", dest="input_rdata",
                      type="string",
                      help="in dmr analysis, read saved R session from "
                      "file. This can be used to apply different "
                      "filters [%default]")

    parser.add_option("--fdr-threshold", dest="fdr_threshold", type="float",
                      help="FDR threshold to apply for selecting DMR "
                      "[default=%default].")

    parser.add_option("--fdr-method", dest="fdr_method", type="choice",
                      choices=("bonferroni", "BH", "holm", "hochberg",
                               "hommel", "BY", "fdr", "none"),
                      help="FDR method to apply for selecting DMR "
                      "[default=%default].")

    parser.add_option("--bwa", dest="bwa", action="store_true",
                      help="alignment generated with bwa"
                      "[default=%default].")

    # adjacent literals instead of a backslash continuation inside the
    # string, which embedded the next line's indentation in the text
    parser.add_option("--unique", dest="unique", type="float",
                      help="Threshold p-value to determine which read pile"
                      "ups are the result of PCR overamplification"
                      "[default=%default].")

    parser.add_option("--chroms", dest="chroms", type="str",
                      help="Comma delimited list of chromosomes to include"
                      "[default=%default].")

    parser.set_defaults(
        input_format="bam",
        ucsc_genome="Hsapiens.UCSC.hg19",
        genome_file=None,
        extend=0,
        shift=0,
        window_size=300,
        saturation_iterations=10,
        toolset=[],
        bigwig=False,
        treatment_files=[],
        control_files=[],
        input_files=[],
        output_rdata=False,
        input_rdata=None,
        is_medip=True,
        fdr_threshold=0.1,
        fdr_method="BH",
        bwa=False,
        unique=0.001,
        chroms=None
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    if "convert" in options.toolset:

        results = []
        for line in CSV.DictReader(options.stdin, dialect="excel-tab"):

            if line['edgeR.p.value'] == "NA":
                continue

            # assumes only a single treatment/control
            treatment_name = options.treatment_files[0]
            control_name = options.control_files[0]
            status = "OK"
            try:
                adjusted_pvalue = float(line['edgeR.adj.p.value'])
                log_fold = float(line['edgeR.logFC'])
                if adjusted_pvalue < options.fdr_threshold:
                    significant = "1"
                else:
                    significant = "0"
                results.append(
                    Expression.GeneExpressionResult._make((
                        "%s:%i-%i" % (line['chr'],
                                      int(line['start']),
                                      int(line['stop'])),
                        treatment_name,
                        float(line['MSets1.rpkm.mean']),
                        0,
                        control_name,
                        float(line['MSets2.rpkm.mean']),
                        0,
                        float(line['edgeR.p.value']),
                        adjusted_pvalue,
                        log_fold,
                        math.pow(2.0, log_fold),
                        log_fold,  # no transform
                        significant,
                        status)))
            except ValueError as msg:
                raise ValueError("parsing error %s in line: %s" % (msg, line))

        Expression.writeExpressionResults(options.stdout, results)
        return
def buildUTRExtension(infile, outfile):
    '''build new utrs by building and fitting an HMM
    to reads upstream and downstream of known genes.

    Works on output of buildGeneLevelReadExtension.

    Known problems

    * the size of the extension is limited by the window size

    * introns within UTRs are ignored.

    * UTR extension might be underestimated for highly expressed genes
      as relative read counts drop off quickly, even though there is
      a good amount of reads still present in the UTR.

    The model

    The model is a three-state model::

        UTR --|--> notUTR --|--> otherTranscript --|
          ^---|      ^------|              ^-------|
                     ^-----------------------------|

    The chain starts in UTR and ends in notUTR or otherTranscript.

    The otherTranscript state models peaks within the upstream/
    downstream region of a gene. These peaks might correspond to
    additional exons or unknown transcripts. Without this state,
    the UTR might be artificially extended to include these peaks.

    Emissions are modelled with beta distributions. These
    distributions permit both bimodal (UTR) and unimodal (notUTR)
    distribution of counts.

    Parameter estimation

    Parameters are derived from known UTRs within full length
    territories.

    Transitions and emissions for the otherTranscript state
    are set heuristically:

    * low probability for remaining in state "otherTranscript".
        * these transcripts should be short.

    * emissions biased towards high counts - only strong signals
      will be considered.

    * these could be estimated from known UTRs, but I am worried
      UTR extensions then will be diluted.

    Alternatives

    The method could be improved.

    * base level resolution?
        * longer chains result in more data and longer running times.
        * the averaging in windows smoothes the data, which might have
          a beneficial effect.

    * raw counts instead of scaled counts?
        * better model, as highly expressed genes should give more
          confident predictions.

    NOTE(review): within this block neither *outfile* nor the collected
    gene coordinates, gene set, counters and predicted UTRs are written
    anywhere - presumably the output stage follows elsewhere; confirm.
    '''

    # the bin size , see gtf2table - can be cleaned from column names
    # or better set as options in .ini file
    binsize = 100
    territory_size = 15000

    # read gene coordinates
    geneinfos = {}
    coordinates = IOTools.openFile(infile)
    try:
        for record in CSV.DictReader(coordinates, dialect='excel-tab'):
            geneinfos[record['gene_id']] = (record['contig'],
                                            record['strand'],
                                            int(record['start']),
                                            int(record['end']))
    finally:
        coordinates.close()

    infiles = [infile + ".readextension_upstream_sense.tsv.gz",
               infile + ".readextension_downstream_sense.tsv.gz"]

    outdir = os.path.join(PARAMS["exportdir"], "utr_extension")

    R('''suppressMessages(library(RColorBrewer))''')
    R('''suppressMessages(library(MASS))''')
    R('''suppressMessages(library(HiddenMarkov))''')

    # for upstream, downstream
    upstream_utrs, downstream_utrs = {}, {}

    all_genes = set()

    for filename, new_utrs in zip(infiles, (upstream_utrs, downstream_utrs)):

        E.info("processing %s" % filename)

        parts = os.path.basename(filename).split(".")

        data = R(
            '''data = read.table( gzfile( "%(filename)s"), header=TRUE, fill=TRUE, row.names=1)''' % locals())

        ##########################################
        ##########################################
        ##########################################
        # estimation
        ##########################################
        # take only those with a 'complete' territory
        # Rows are selected with a logical mask. The former
        # data[-which(cond),] idiom returns zero rows when no row
        # satisfies cond, because -integer(0) selects nothing.
        R('''d = data[!apply( data,1,function(x)any(is.na(x))),]''')
        # save UTR
        R('''utrs = d$utr''')
        # remove length and utr column
        R('''d = d[-c(1,2)]''')
        # remove those which are completely empty, logtransform or scale data
        # and export
        R('''lraw = log10( d[!apply(d,1,function(x)all(x==0)),] + 1 )''')

        utrs = R('''utrs = utrs[!apply(d,1,function(x)all(x==0))]''')
        scaled = R(
            '''lscaled = t(scale(t(lraw), center=FALSE, scale=apply(lraw,1,max) ))''')
        exons = R('''lraw[,1]''')

        #######################################################
        #######################################################
        #######################################################
        # do the estimation:
        E.debug("estimation: utrs=%i, exons=%i, vals=%i, dim=%s" %
                (len(utrs), len(exons), len(scaled), R.dim(scaled)))
        # counts within and outside UTRs
        within_utr, outside_utr, otherTranscript = [], [], []
        # number of transitions between utrs
        # (builtin int: the numpy.int alias no longer exists in NumPy)
        transitions = numpy.zeros((3, 3), int)

        for row in range(len(utrs)):
            utr, exon = utrs[row], exons[row]

            # only consider genes with expression coverage
            # note: expression level is logscaled here, 10^1 = 10
            if exon < 0.1:
                continue

            # rx() uses R's 1-based indexing, hence row + 1
            values = list(scaled.rx(row + 1, True))

            utr_bins = utr // binsize
            nonutr_bins = (territory_size - utr) // binsize

            # build transition matrix
            transitions[0][0] += utr_bins
            transitions[0][1] += 1
            transitions[1][1] += nonutr_bins

            # the comprehension variable is named differently from the
            # loop index so that the index is not overwritten on
            # Python 2, where comprehension variables leak
            outside_utr.extend([v for v in values[utr_bins:] if v <= 0.5])

            # ignore exon and zero counts
            within_utr.extend([v for v in values[1:utr_bins] if v > 0.1])

            # add only high counts to otherTranscript emissions
            otherTranscript.extend(
                [v for v in values[utr_bins:] if v > 0.5])

        # estimation for
        # 5% chance of transiting to otherTranscript
        transitions[1][2] = transitions[1][1] * 0.05
        # 10% chance of remaining in otherTranscript
        transitions[2][1] = 900
        transitions[2][2] = 100

        E.info("counting: (n,mean): within utr=%i,%f, outside utr=%i,%f, otherTranscript=%i,%f" %
               (len(within_utr), numpy.mean(within_utr),
                len(outside_utr), numpy.mean(outside_utr),
                len(otherTranscript), numpy.mean(otherTranscript)))

        # NOTE(review): R fills matrices column by column - confirm that
        # the numpy array arrives in R in the intended orientation.
        ro.globalenv['transitions'] = R.matrix(transitions, nrow=3, ncol=3)
        R('''transitions = transitions / rowSums( transitions )''')
        ro.globalenv['within_utr'] = ro.FloatVector(within_utr[:10000])
        ro.globalenv['outside_utr'] = ro.FloatVector(outside_utr[:10000])
        ro.globalenv['otherTranscript'] = ro.FloatVector(
            otherTranscript[:10000])

        # estimate beta distribution parameters
        R('''doFit = function( data ) {
                   data[data == 0] = data[data == 0] + 0.001
                   data[data == 1] = data[data == 1] - 0.001
                   f = fitdistr( data, dbeta, list( shape1=0.5, shape2=0.5 ) )
                   return (f) }''')

        fit_within_utr = R(
            '''fit_within_utr = suppressMessages(doFit( within_utr))''')
        fit_outside_utr = R(
            '''fit_outside_utr = suppressMessages(doFit( outside_utr))''')
        fit_other = R(
            '''fit_otherTranscript = suppressMessages(doFit( otherTranscript))''')

        within_a, within_b = list(fit_within_utr.rx("estimate"))[0]
        outside_a, outside_b = list(fit_outside_utr.rx("estimate"))[0]
        other_a, other_b = list(fit_other.rx("estimate"))[0]

        E.info("beta estimates: within_utr=%f,%f outside=%f,%f, other=%f,%f" %
               (within_a, within_b, outside_a, outside_b, other_a, other_b))

        # diagnostic plot: histogram of the observations overlaid with
        # the fitted beta density, one panel per state
        fn = ".".join((parts[0], parts[4], "fit", "png"))
        outfilename = os.path.join(outdir, fn)
        R.png(outfilename, height=1000, width=1000)

        R('''par(mfrow=c(3,1))''')
        R('''x=seq(0,1,0.02)''')
        R('''hist( within_utr, 50, col=rgb( 0,0,1,0.2) )''')
        R('''par(new=TRUE)''')
        R('''plot( x, dbeta( x, fit_within_utr$estimate['shape1'], fit_within_utr$estimate['shape2']), type='l', col='blue')''')

        R('''hist( outside_utr, 50, col=rgb( 1,0,0,0.2 ) )''')
        R('''par(new=TRUE)''')
        R('''plot( x, dbeta( x, fit_outside_utr$estimate['shape1'], fit_outside_utr$estimate['shape2']), type='l', col='red')''')

        R('''hist( otherTranscript, 50, col=rgb( 0,1,0,0.2 ) )''')
        R('''par(new=TRUE)''')
        R('''plot( x, dbeta( x, fit_otherTranscript$estimate['shape1'], fit_otherTranscript$estimate['shape2']), type='l', col='green')''')
        R['dev.off']()

        #####################################################
        #####################################################
        #####################################################
        # build hmm
        # state 1 = UTR
        # state 2 = notUTR
        # state 3 = other transcript
        p = R('''betaparams = list( shape1=c(fit_within_utr$estimate['shape1'],
                                         fit_outside_utr$estimate['shape1'],
                                         fit_otherTranscript$estimate['shape1']),
                                shape2=c(fit_within_utr$estimate['shape2'],
                                         fit_outside_utr$estimate['shape2'],
                                         fit_otherTranscript$estimate['shape2'])) ''')
        R('''hmm = dthmm(NULL, transitions, c(1,0,0), "beta", betaparams )''')

        E.info("fitting starts")
        #####################################################
        #####################################################
        #####################################################
        # fit to every sequence
        genes = R('''rownames(data)''')
        all_genes.update(set(genes))
        utrs = R('''data$utr''')
        exons = R('''data$exon''')
        nseqs = len(utrs)

        counter = E.Counter()

        for idx in range(len(utrs)):

            gene_id = genes[idx]

            old_utr = utrs[idx]

            if idx % 100 == 0:
                E.debug("processing gene %i/%i" % (idx, len(utrs)))

            counter.input += 1

            # do not predict if terminal exon not expressed
            if exons[idx] < 1:
                counter.skipped_notexpressed += 1
                new_utrs[gene_id] = Utr._make(
                    (old_utr, None, None, "notexpressed"))
                continue

            R('''obs = data[%i,][-c(1,2)]''' % (idx + 1))
            # remove na
            obs = R('''obs = obs[!is.na(obs)]''')
            if len(obs) <= 1 or max(obs) == 0:
                new_utrs[gene_id] = Utr._make(
                    (old_utr, None, None, "no observations"))
                continue

            # normalize
            R('''obs = obs / max(obs)''')
            # add small epsilon to 0 and 1 values
            R('''obs[obs==0] = obs[obs==0] + 0.001 ''')
            R('''obs[obs==1] = obs[obs==1] - 0.001 ''')
            R('''hmm$x = obs''')

            states = None
            try:
                states = list(R('''states = Viterbi( hmm )'''))
            except ri.RRuntimeError:
                counter.skipped_error += 1
                new_utrs[gene_id] = Utr._make((old_utr, None, None, "fail"))
                continue

            max_utr = binsize * (len(states) - 1)

            # subtract 1 for last exon
            try:
                new_utr = binsize * (states.index(2) - 1)
                new_utrs[gene_id] = Utr._make(
                    (old_utr, new_utr, max_utr, "ok"))
                counter.success += 1
            except ValueError:
                # the chain never left the UTR state
                new_utrs[gene_id] = Utr._make(
                    (old_utr, max_utr, max_utr, "max"))
                counter.maxutr += 1
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Depending on the toolsets selected with ``--toolset`` (no
    selection is treated as "all"), the function

    * "convert": converts a table read from stdin into expression
      results written to stdout and returns immediately,
    * "saturation", "coverage", "enrichment": runs the corresponding
      MEDIPS analysis for every treatment and control file,
    * "correlation", "dmr": builds MEDIPS sets (or loads a saved R
      session given by ``--rdata-file``), computes the correlation
      matrix and/or the differential methylation test and writes the
      results to files obtained from E.getOutputFile.

    Raises ValueError if no treatment file is given, if bigwig output
    is requested without a genome file, or if a row can not be parsed
    during conversion.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id",
                            usage=globals()["__doc__"])

    parser.add_option("-u", "--ucsc-genome", dest="ucsc_genome",
                      type="string",
                      help="UCSC genome identifier [default=%default].")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    # dest must be "extend": that is the attribute read further down
    # and the name of the registered default. With the former
    # dest="extension" the value given on the command line was
    # silently ignored.
    parser.add_option("--extend", dest="extend", type="int",
                      help="extend tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--shift-size", dest="shift", type="int",
                      help="shift tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--window-size", dest="window_size", type="int",
                      help="window size to be used in the analysis"
                      "[default=%default].")

    parser.add_option("--saturation-iterations",
                      dest="saturation_iterations", type="int",
                      help="iterations for saturation analysis "
                      "[default=%default].")

    # "correlation" is tested for below, so it has to be selectable
    parser.add_option("-t", "--toolset", dest="toolset", type="choice",
                      action="append",
                      choices=("saturation", "coverage", "enrichment",
                               "dmr", "rms", "rpm", "all", "convert",
                               "correlation"),
                      help="actions to perform [default=%default].")

    parser.add_option("-w", "--bigwig-file", dest="bigwig",
                      action="store_true",
                      help="store wig files as bigwig files - requires a "
                      "genome file [default=%default]")

    parser.add_option("--treatment", dest="treatment_files", type="string",
                      action="append",
                      help="BAM files for treatment. At least one is required "
                      "[%default]")

    parser.add_option("--control", dest="control_files", type="string",
                      action="append",
                      help="BAM files for control for differential "
                      "methylation analysis. Optional [%default].")

    parser.add_option("--input", dest="input_files", type="string",
                      action="append",
                      help="BAM files for input correction. "
                      "Optional [%default].")

    parser.add_option("--is-not-medip",
                      dest="is_medip", action="store_false",
                      help="data is not MeDIP data and is not expected "
                      "to fit the calibration model. No CpG "
                      "density normalized rms data is computed"
                      "[default=%default].")

    parser.add_option("--output-rdata", dest="output_rdata",
                      action="store_true",
                      help="in dmr analysis, write R session to file. "
                      "The file name "
                      "is given by --output-filename-pattern [%default].")

    parser.add_option("--rdata-file", dest="input_rdata",
                      type="string",
                      help="in dmr analysis, read saved R session from "
                      "file. This can be used to apply different "
                      "filters [%default]")

    parser.add_option("--fdr-threshold", dest="fdr_threshold", type="float",
                      help="FDR threshold to apply for selecting DMR "
                      "[default=%default].")

    parser.add_option("--fdr-method", dest="fdr_method", type="choice",
                      choices=("bonferroni", "BH", "holm", "hochberg",
                               "hommel", "BY", "fdr", "none"),
                      help="FDR method to apply for selecting DMR "
                      "[default=%default].")

    parser.add_option("--bwa", dest="bwa", action="store_true",
                      help="alignment generated with bwa"
                      "[default=%default].")

    # adjacent literals instead of a backslash continuation inside the
    # string, which embedded the next line's indentation in the text
    parser.add_option("--unique", dest="unique", type="float",
                      help="Threshold p-value to determine which read pile"
                      "ups are the result of PCR overamplification"
                      "[default=%default].")

    parser.add_option("--chroms", dest="chroms", type="str",
                      help="Comma delimited list of chromosomes to include"
                      "[default=%default].")

    parser.set_defaults(input_format="bam",
                        ucsc_genome="Hsapiens.UCSC.hg19",
                        genome_file=None,
                        extend=0,
                        shift=0,
                        window_size=300,
                        saturation_iterations=10,
                        toolset=[],
                        bigwig=False,
                        treatment_files=[],
                        control_files=[],
                        input_files=[],
                        output_rdata=False,
                        input_rdata=None,
                        is_medip=True,
                        fdr_threshold=0.1,
                        fdr_method="BH",
                        bwa=False,
                        unique=0.001,
                        chroms=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    if "convert" in options.toolset:

        results = []
        for line in CSV.DictReader(options.stdin, dialect="excel-tab"):

            if line['edgeR.p.value'] == "NA":
                continue

            # assumes only a single treatment/control
            treatment_name = options.treatment_files[0]
            control_name = options.control_files[0]
            status = "OK"
            try:
                adjusted_pvalue = float(line['edgeR.adj.p.value'])
                log_fold = float(line['edgeR.logFC'])
                if adjusted_pvalue < options.fdr_threshold:
                    significant = "1"
                else:
                    significant = "0"
                results.append(
                    Expression.GeneExpressionResult._make((
                        "%s:%i-%i" % (line['chr'],
                                      int(line['start']),
                                      int(line['stop'])),
                        treatment_name,
                        float(line['MSets1.rpkm.mean']),
                        0,
                        control_name,
                        float(line['MSets2.rpkm.mean']),
                        0,
                        float(line['edgeR.p.value']),
                        adjusted_pvalue,
                        log_fold,
                        math.pow(2.0, log_fold),
                        log_fold,  # no transform
                        significant,
                        status)))
            except ValueError as msg:
                raise ValueError("parsing error %s in line: %s" % (msg, line))

        Expression.writeExpressionResults(options.stdout, results)
        return

    if len(options.treatment_files) < 1:
        raise ValueError("please specify a filename with sample data")

    if options.bigwig and not options.genome_file:
        raise ValueError("please provide a genome file when outputting bigwig")

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contig_sizes = fasta.getContigSizes()

    if len(options.toolset) == 0:
        options.toolset = ["all"]

    do_all = "all" in options.toolset

    if options.chroms is None:
        chrstring = ""
    else:
        chroms = options.chroms.split(",")
        chrstring = ' chr.select=c(\"%s\"), ' % '\",\"'.join(chroms)

    # load MEDIPS
    R.library('MEDIPS')
    genome_file = 'BSgenome.%s' % options.ucsc_genome
    R.library(genome_file)

    # NOTE: the R statements below are filled in with "% locals()",
    # so the names of the following local variables are part of the
    # templates and must not be changed.
    window_size = options.window_size
    extend = options.extend
    shift = options.shift
    saturation_iterations = options.saturation_iterations

    uniq = float(options.unique)

    if options.bwa is True:
        BWA = "TRUE"
    else:
        BWA = "FALSE"

    if "saturation" in options.toolset or do_all:
        E.info("saturation analysis")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''sr = MEDIPS.saturation(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            shift=%(shift)i,
            extend=%(extend)i,
            window_size=%(window_size)i,
            uniq=%(uniq)s,
            nit = %(saturation_iterations)i,
            paired = %(paired)s,
            bwa = %(BWA)s,
            %(chrstring)s
            nrit = 1)''' % locals())

            R.png(E.getOutputFile("%s_saturation.png" % fn))
            R('''MEDIPS.plotSaturation(sr)''')
            R('''dev.off()''')
            R('''write.table(sr$estimation, file ='%s', sep='\t')''' %
              E.getOutputFile("%s_saturation_estimation.tsv" % fn))

            outfile = IOTools.openFile(
                E.getOutputFile("%s_saturation.tsv" % fn), "w")
            outfile.write("category\tvalues\n")
            outfile.write("estimated_correlation\t%s\n" %
                          ",".join(["%f" % x for x in R('''sr$maxEstCor''')]))
            outfile.write("true_correlation\t%s\n" %
                          ",".join(["%f" % x for x in R('''sr$maxTruCor''')]))
            outfile.write("nreads\t%s\n" %
                          ",".join(["%i" % x for x in R('''sr$numberReads''')]))
            outfile.close()

    if "coverage" in options.toolset or do_all:
        E.info("CpG coverage analysis")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''cr = MEDIPS.seqCoverage(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            pattern='CG',
            shift=%(shift)i,
            extend=%(extend)i,
            paired=%(paired)s,
            bwa=%(BWA)s,
            %(chrstring)s
            uniq=%(uniq)s)''' % locals())

            R.png(E.getOutputFile("%s_cpg_coverage_pie.png" % fn))
            R('''MEDIPS.plotSeqCoverage(seqCoverageObj=cr,
            type = "pie", cov.level = c(0, 1, 2, 3, 4, 5))''')
            R('''dev.off()''')

            R.png(E.getOutputFile("%s_cpg_coverage_hist.png" % fn))
            R('''MEDIPS.plotSeqCoverage(seqCoverageObj=cr,
            type = "hist", t=15)''')
            R('''dev.off()''')

            # note: this file is large
            R('''write.table(cr$cov.res, file=gzfile('%s','w'),
            sep='\t')''' %
              E.getOutputFile("%s_saturation_coveredpos.tsv.gz" % fn))

    if 'enrichment' in options.toolset or do_all:
        E.info("CpG enrichment analysis")
        outfile = IOTools.openFile(E.getOutputFile("enrichment.tsv.gz"), "w")
        # (slot in the R result, output column name, output format)
        # NOTE(review): several slots are written with "%i"; if they
        # hold fractional values these are truncated - confirm the
        # formats against the MEDIPS.CpGenrich return value.
        slotnames = (("regions.CG", "regions_CG", "%i"),
                     ("regions.C", "regions_C", "%s"),
                     ("regions.G", "regions_G", "%f"),
                     ("regions.relH", "regions_relH", "%i"),
                     ("regions.GoGe", "regions_GoGe", "%i"),
                     ("genome.CG", "genome_CG", "%s"),
                     ("genome.C", "genome_C", "%s"),
                     ("genome.G", "genome_G", "%i"),
                     ("genome.relH", "genome_relH", "%i"),
                     ("enrichment.score.relH", "enrichment_relH", "%s"),
                     ("enrichment.score.GoGe", "enrichment_GoGe", "%s"))

        outfile.write("\t".join(['sample'] +
                                [x[1] for x in slotnames]) + "\n")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''ce = MEDIPS.CpGenrich(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            shift=%(shift)i,
            extend=%(extend)i,
            paired=%(paired)s,
            bwa=%(BWA)s,
            %(chrstring)s
            uniq=%(uniq)s)''' % locals())

            outfile.write("%s" % fn)
            for slotname, label, pattern in slotnames:
                value = tuple(R('''ce$%s''' % slotname))
                if len(value) == 0:
                    # missing slot: write an empty field. Indexing an
                    # empty replacement value raised an IndexError.
                    outfile.write("\t")
                else:
                    outfile.write("\t%s" % pattern % value[0])
            outfile.write("\n")
        outfile.close()

    if options.input_rdata:
        E.info("reading R session info from '%s'" % options.input_rdata)
        R('''load('%s')''' % options.input_rdata)

    else:
        if "dmr" in options.toolset or "correlation" in options.toolset \
           or do_all:
            # build four sets
            for x, fn in enumerate(options.treatment_files):
                paired = isPaired(fn)
                E.info("loading '%s'" % fn)
                R('''treatment_R%(x)i = MEDIPS.createSet(
                file='%(fn)s',
                BSgenome='%(genome_file)s',
                shift=%(shift)i,
                extend=%(extend)i,
                window_size=%(window_size)i,
                paired=%(paired)s,
                bwa=%(BWA)s,
                %(chrstring)s
                uniq=%(uniq)s)''' % locals())
            R('''treatment_set = c(%s)''' %
              ",".join(["treatment_R%i" % x
                        for x in range(len(options.treatment_files))]))

            if options.control_files:
                for x, fn in enumerate(options.control_files):
                    paired = isPaired(fn)
                    E.info("loading '%s'" % fn)
                    R('''control_R%(x)i = MEDIPS.createSet(
                    file='%(fn)s',
                    BSgenome='%(genome_file)s',
                    shift=%(shift)i,
                    extend=%(extend)i,
                    window_size=%(window_size)i,
                    paired=%(paired)s,
                    bwa=%(BWA)s,
                    %(chrstring)s
                    uniq=%(uniq)s)''' % locals())
                R('''control_set = c(%s)''' %
                  ",".join(["control_R%i" % x
                            for x in range(len(options.control_files))]))

            # build coupling vector
            R('''CS = MEDIPS.couplingVector(pattern="CG",
            refObj = treatment_set[[1]])''')

            if "correlation" in options.toolset or do_all:
                R('''cor.matrix = MEDIPS.correlation(
                c(treatment_set, control_set))''')

                R('''write.table(cor.matrix,
                file='%s',
                sep="\t")''' % E.getOutputFile("correlation"))

            if "dmr" in options.toolset or do_all:
                # Data that does not fit the model causes
                # "Error in 1:max_signal_index : argument of length 0"
                # The advice is to set MeDIP=FALSE
                # See: http://comments.gmane.org/
                # gmane.science.biology.informatics.conductor/52319

                if options.is_medip:
                    medip = "TRUE"
                else:
                    medip = "FALSE"
                fdr_method = options.fdr_method

                E.info("applying test for differential methylation")
                R('''meth = MEDIPS.meth(
                MSet1 = treatment_set,
                MSet2 = control_set,
                CSet = CS,
                ISet1 = NULL,
                ISet2 = NULL,
                p.adj = "%(fdr_method)s",
                diff.method = "edgeR",
                MeDIP = %(medip)s,
                CNV = F,
                minRowSum = 1)''' % locals())

                # Note: several Gb in size
                # Output full methylation data table
                R('''write.table(meth,
                file=gzfile('%s', 'w'),
                sep="\t",
                row.names=F,
                quote=F)''' % E.getOutputFile("data.tsv.gz"))

            # save R session
            if options.output_rdata:
                R('''save.image(file='%s', safe=FALSE)''' %
                  E.getOutputFile("session.RData"))

    # DMR analysis - test for windows and output
    if "dmr" in options.toolset:

        E.info("selecting differentially methylated windows")

        # test windows for differential methylation
        fdr_threshold = options.fdr_threshold
        R('''tested = MEDIPS.selectSig(meth,
        adj=T,
        ratio=NULL,
        p.value=%(fdr_threshold)f,
        bg.counts=NULL,
        CNV=F)''' % locals())

        R('''write.table(tested,
        file=gzfile('%s', 'w'),
        sep="\t",
        quote=F)''' % E.getOutputFile("significant_windows.gz"))

        # select gain and merge adjacent windows
        try:
            R('''gain = tested[which(tested[, grep("logFC", colnames(tested))] > 0),];
            gain_merged = MEDIPS.mergeFrames(frames=gain, distance=1)''')
            E.info('gain output: %s, merged: %s' %
                   (str(R('''dim(gain)''')),
                    str(R('''dim(gain_merged)'''))))
            R('''of=gzfile('%s', 'w');
            write.table(gain_merged,
            file=of,
            sep="\t",
            quote=F,
            row.names=FALSE,
            col.names=FALSE);
            close(of)''' % E.getOutputFile("gain.bed.gz"))
        except rpy2.rinterface.RRuntimeError as msg:
            E.warn("could not compute gain windows: msg=%s" % msg)

        # select loss and merge adjacent windows
        try:
            R('''loss = tested[which(tested[, grep("logFC", colnames(tested))] < 0),];
            loss_merged = MEDIPS.mergeFrames(frames=loss, distance=1)''')
            E.info('loss output: %s, merged: %s' %
                   (str(R('''dim(loss)''')),
                    str(R('''dim(loss_merged)'''))))
            R('''of=gzfile('%s', 'w');
            write.table(loss_merged,
            file=of,
            sep="\t",
            quote=F,
            row.names=F,
            col.names=F);
            close(of)''' % E.getOutputFile("loss.bed.gz"))
        except rpy2.rinterface.RRuntimeError as msg:
            E.warn("could not compute loss windows: msg=%s" % msg)

    # NOTE: the "rpm" and "rms" toolsets are accepted on the command
    # line, but their export is disabled:
    # if "rpm" in options.toolset or do_all:
    #     outputfile = E.getOutputFile("rpm.wig")
    #     R('''MEDIPS.exportWIG(file = '%(outputfile)s',
    #     data = CONTROL.SET, raw = T, descr = "rpm")''' %
    #       locals())
    #     if options.bigwig:
    #         bigwig(outputfile, contig_sizes)
    #     else:
    #         compress(outputfile)

    # if "rms" in options.toolset or do_all:
    #     outputfile = E.getOutputFile("rms.wig")
    #     R('''MEDIPS.exportWIG(file = '%(outputfile)s',
    #     data = CONTROL.SET, raw = F, descr = "rms")''' %
    #       locals())
    #     if options.bigwig:
    #         bigwig(outputfile, contig_sizes)
    #     else:
    #         compress(outputfile)

    # write footer and output benchmark information.
    E.Stop()
cc.close() quick_import_statement = \ "sqlite3 -header -csv -separator '\t' %s '.import %%s %s'" % \ (options.database, options.tablename) if options.header is not None: options.header = [x.strip() for x in options.header.split(",")] if options.utf: reader = CSV.UnicodeDictReader(infile, dialect=options.dialect, fieldnames=options.header) else: reader = CSV.DictReader(infile, dialect=options.dialect, fieldnames=options.header) if options.replace_header: reader.next() E.info("reading %i columns to guess column types" % options.guess_size) rows = [] for row in reader: if None in row: raise ValueError("undefined columns in input file at row: %s" % row) try: rows.append(CSV.ConvertDictionary(row, map=options.map))
('none', 'all' ), ('kappa', 'all' ), ('omega', 'all' ), ('ds', 'all'), ) map_model2params = { 'none' : 8, 'ds' : 7, 'omega' : 6, 'kappa' : 6, 'omega-ds' : 5, 'kappa-ds' : 5, 'all' : 4 } reader = CSV.DictReader( sys.stdin, dialect=options.csv_dialect ) stats = {} options.stdout.write( "id" ) for a, b in tests: options.stdout.write( "\t%s:%s\tp%s:%s" % (a, b, a, b)) stats[(a,b)] = 0 options.stdout.write( "\n" ) ninput, noutput, nskipped, nerrors, ntests = 0, 0, 0, 0, 0 for row in reader: ninput += 1 if int(row['N:len']) <= options.min_length or int(row['C:len']) <= options.min_length :