def runBioProspector(infiles, outfile, dbhandle):
    '''run bioprospector for motif discovery.

    Bioprospector is run on only the top 10% of peaks.
    '''
    # bioprospector currently not working on the nodes
    to_cluster = False

    # only use new nodes, as /bin/csh is not installed
    # on the old ones.
    # job_options = "-l mem_free=8000M"

    tmpfasta = P.get_temp_filename(".")
    track = outfile[:-len(".bioprospector")]
    nseq = writeSequencesForIntervals(
        track,
        tmpfasta,
        dbhandle,
        full=True,
        masker="dust",
        proportion=P.get_params()["bioprospector_proportion"])

    if nseq == 0:
        E.warn("%s: no sequences - bioprospector skipped" % track)
        IOTools.touch_file(outfile)
    else:
        statement = '''
        BioProspector -i %(tmpfasta)s %(bioprospector_options)s
        -o %(outfile)s > %(outfile)s.log
        '''
        P.run(statement)

    os.unlink(tmpfasta)
def _output(section, subsection, valuef, dtype):
    # nested helper: relies on results, options, test_ontology,
    # col_headers and go2info from the enclosing scope

    # fold change matrix
    matrix, row_headers = buildMatrix(results,
                                      valuef=valuef,
                                      dtype=dtype)

    outfile = getFileName(options,
                          go=test_ontology,
                          section=section,
                          set='%s_all' % subsection)

    IOTools.writeMatrix(outfile, matrix, row_headers, col_headers,
                        row_header="category")

    outfile = getFileName(options,
                          go=test_ontology,
                          section=section,
                          set='%s_alldesc' % subsection)

    IOTools.writeMatrix(
        outfile, matrix,
        ["%s:%s" % (x, go2info[x].mDescription) for x in row_headers],
        col_headers, row_header="category")
def buildPicardInsertSizeStats(infile, outfile, genome_file):
    '''run picard:CollectInsertSizeMetrics

    Collect insert size statistics.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    genome_file : string
        Filename with genomic sequence.
    '''
    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        IOTools.touch_file(outfile)
        return

    statement = '''picard %(picard_opts)s CollectInsertSizeMetrics
    INPUT=%(infile)s
    REFERENCE_SEQUENCE=%(genome_file)s
    ASSUME_SORTED=true
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=SILENT
    >& %(outfile)s'''

    P.run(statement)
def buildPicardRnaSeqMetrics(infiles, strand, outfile):
    '''run picard:CollectRnaSeqMetrics

    Arguments
    ---------
    infiles : string
        Input filename in :term:`BAM` format.
        Genome file in refflat format
        (http://genome.ucsc.edu/goldenPath/gbdDescriptionsOld.html#RefFlat)
    outfile : string
        Output filename with picard output.
    '''
    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3
    infile, genome = infiles

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        IOTools.touch_file(outfile)
        return

    statement = '''picard %(picard_opts)s CollectRnaSeqMetrics
    REF_FLAT=%(genome)s
    INPUT=%(infile)s
    ASSUME_SORTED=true
    OUTPUT=%(outfile)s
    STRAND=%(strand)s
    VALIDATION_STRINGENCY=SILENT
    '''
    P.run(statement)
def chunk_iterator_lines(infile, args, prefix, use_header=False):
    """split by lines."""

    chunk_size = args[0]
    n = 0
    filename = "%s/%010i.in" % (prefix, n)
    outfile = IOTools.open_file(filename, "w")
    header = None

    for line in infile:
        if line[0] == "#":
            continue

        if not header and n == 0 and use_header:
            header = line
            outfile.write(header)
            continue

        n += 1

        if n % chunk_size == 0:
            outfile.close()
            yield filename
            filename = "%s/%010i.in" % (prefix, n)
            outfile = IOTools.open_file(filename, "w")
            if header:
                outfile.write(header)

        outfile.write(line)

    outfile.close()
    yield filename
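# A minimal usage sketch for chunk_iterator_lines (hypothetical file names;
# the args tuple carries the chunk size as its first element). It assumes
# the output directory passed as prefix already exists.
def _example_chunk_iterator_lines():
    with IOTools.open_file("input.tsv") as inf:
        # write chunks of 1000 data lines each into chunks.dir/,
        # repeating the header line at the top of every chunk
        for chunk_filename in chunk_iterator_lines(
                inf, args=(1000,), prefix="chunks.dir", use_header=True):
            print("wrote", chunk_filename)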
def buildPicardCoverageStats(infile, outfile, baits, regions):
    '''run picard:CollectHsMetrics

    Generate coverage statistics for regions of interest from a bed
    file using Picard.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    baits : :term:`bed` formatted file of bait regions
    regions : :term:`bed` formatted file of target regions
    '''
    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        IOTools.touch_file(outfile)
        return

    statement = '''picard %(picard_opts)s CollectHsMetrics
    BAIT_INTERVALS=%(baits)s
    TARGET_INTERVALS=%(regions)s
    INPUT=%(infile)s
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=LENIENT''' % locals()
    P.run(statement)
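# A minimal call sketch for buildPicardCoverageStats with hypothetical paths.
# Note that Picard's CollectHsMetrics expects BAIT_INTERVALS and
# TARGET_INTERVALS in interval_list format, so the bed files mentioned in the
# docstring are assumed to have been converted upstream.
def _example_buildPicardCoverageStats():
    buildPicardCoverageStats(
        infile="sample1.bam",
        outfile="sample1.cov.stats",
        baits="baits.interval_list",
        regions="targets.interval_list")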
def chunk_iterator_regex_split(infile, args, prefix, use_header=False):
    """split where regular expression is true.
    """
    rex = args[0]
    chunk_size = args[2]
    max_lines = args[3]

    nlines = 0
    n = 0
    filename = "%s/%010i.in" % (prefix, n)
    outfile = IOTools.open_file(filename, "w")

    for line in infile:
        if line[0] == "#":
            continue

        if rex.search(line[:-1]):
            if n > 0 and (n % chunk_size == 0 or
                          (max_lines and nlines > max_lines)):
                outfile.close()
                yield filename
                filename = "%s/%010i.in" % (prefix, n)
                outfile = IOTools.open_file(filename, "w")
                nlines = 0

            n += 1

        outfile.write(line)
        nlines += 1

    outfile.close()
    yield filename
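# A minimal usage sketch for chunk_iterator_regex_split (hypothetical input;
# assumes re is imported at module level). Per the code above, the args
# layout is: args[0] a compiled regex marking the start of a new record,
# args[2] the number of records per chunk, args[3] an optional maximum
# number of lines per chunk; args[1] is unused.
def _example_chunk_iterator_regex_split():
    record_start = re.compile("^>")  # e.g. FASTA header lines
    with IOTools.open_file("input.fasta") as inf:
        for chunk_filename in chunk_iterator_regex_split(
                inf, args=(record_start, None, 100, 100000),
                prefix="chunks.dir"):
            print("wrote", chunk_filename)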
def annotate(infile, outfile, geneset):
    '''
    annotate NOGs into functional categories
    '''
    annotation = {}
    E.info("loading geneset")
    anno = IOTools.openFile(geneset)
    for line in anno.readlines():
        data = line[:-1].split("\t")
        nog, funccat = data[1], data[3]
        annotation[nog] = funccat
    E.info("finished loading gene set")

    E.info("annotating infile")
    inf = IOTools.openFile(infile)
    header = inf.readline()
    outf = IOTools.openFile(outfile, "w")
    outf.write(header[:-1] + "\ttaxa\n")
    for line in inf.readlines():
        data = line[:-1].split("\t")
        nog = data[0]
        try:
            pathway = annotation[nog]
        except KeyError:
            pathway = "Function unknown"
        outf.write(line[:-1] + "\t" + pathway + "\n")
    outf.close()
def CleanVariantTables(genes, variants, cols, outfile):
    variants = pd.read_csv(variants, sep="\t")
    variants = variants.drop(0)
    vp1 = copy.copy(
        variants[['CHROM', 'POS', 'QUAL', 'ID', 'REF1', 'ALT', 'GT']])
    alleles = vp1['REF1'].str.cat(vp1['ALT'].str.strip(),
                                  sep=",").str.split(",")
    # replace missing genotype calls "." with "0" (literal match, not regex)
    vp1['GT'] = vp1['GT'].str.replace(".", "0", regex=False)
    inds1 = vp1['GT'].str.get(0).astype(int).values
    inds2 = vp1['GT'].str.get(-1).astype(int).values

    x = 0
    a1s = []
    a2s = []
    gts = []
    homhet = []
    for allele in alleles:
        i1 = int(inds1[x])
        i2 = int(inds2[x])
        a1 = allele[i1]
        a2 = allele[i2]
        a1s.append(a1)
        a2s.append(a2)
        if a1 == a2:
            homhet.append("HOM")
        else:
            homhet.append("HET")
        gts.append("%s%s" % (a1, a2))
        x += 1

    vp1['HOMHET'] = homhet
    vp1['Allele1'] = a1s
    vp1['Allele2'] = a2s
    vp1['Genotype'] = gts
    vp1 = vp1.drop(['REF1', 'ALT', 'GT'], axis=1)
    vp1[cols] = copy.copy(variants[cols])

    Ls = []
    for gene in [line.strip()
                 for line in IOTools.open_file(genes[0]).readlines()]:
        cp = []
        with IOTools.open_file(genes[1]) as infile:
            for line in infile:
                r = re.search(gene, line)
                if r:
                    line = line.strip().split("\t")
                    chrom = line[0]
                    pos = line[1]
                    cp.append("%s_%s" % (chrom, pos))
        cp = set(cp)
        for c in cp:
            Ls.append((gene, c.split("_")))

    df = pd.DataFrame(Ls)
    df['CHROM'] = df[1].str.get(0)
    df['POS'] = df[1].str.get(1)
    df = df.drop(1, axis=1)
    df.columns = ['gene', 'CHROM', 'POS']
    variants = vp1.merge(df, 'left')
    variants.to_csv(outfile, sep="\t")
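# A minimal call sketch for CleanVariantTables with hypothetical paths and
# column names: judging from the code above, 'genes' is a tuple of (file with
# one gene symbol per line, annotated variant table to search for those
# symbols), 'variants' is the flattened variant table and 'cols' lists extra
# annotation columns to carry across into the output.
def _example_CleanVariantTables():
    CleanVariantTables(
        genes=("gene_list.txt", "variants_annotated.tsv"),
        variants="variants.tsv",
        cols=["CONSEQUENCE", "IMPACT"],
        outfile="variants_clean.tsv")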
def loadGO(infile, outfile, tablename):
    """import GO results into individual tables.

    This method concatenates all the results from
    a GO analysis and uploads into a single table.
    """
    indir = infile + ".dir"

    if not os.path.exists(indir):
        IOTools.touch_file(outfile)
        return

    load_statement = P.build_load_statement(
        tablename=tablename,
        options="--allow-empty-file "
        "--add-index=category "
        "--add-index=goid ")

    statement = '''
    python %(toolsdir)s/cat_tables.py %(indir)s/*.overall
    | %(load_statement)s
    > %(outfile)s
    '''
    P.run(statement)
def convertGo2Goslim(options):
    """read gene list with GO assignments and convert to GO slim
    categories."""

    E.info("reading GO assignments from stdin")
    gene2gos, go2infos = ReadGene2GOFromFile(options.stdin)
    input_genes, input_goids = countGOs(gene2gos)

    #############################################################
    # read GO ontology from file
    assert options.filename_ontology, "please supply a GO ontology"
    E.info("reading ontology from %s" % (options.filename_ontology))

    infile = IOTools.openFile(options.filename_ontology)
    ontology = readOntology(infile)
    infile.close()

    go2infos = collections.defaultdict(dict)
    # substitute go2infos
    for go in list(ontology.values()):
        go2infos[go.mNameSpace][go.mId] = GOInfo(go.mId,
                                                 go_type=go.mNameSpace,
                                                 description=go.mName)

    E.info("reading GO slims from %s" % options.filename_slims)
    go_slims = GetGOSlims(IOTools.openFile(options.filename_slims, "r"))

    if options.loglevel >= 1:
        v = set()
        for x in list(go_slims.values()):
            for xx in x:
                v.add(xx)
        E.info("read go slims from %s: go=%i, slim=%i" %
               (options.filename_slims, len(go_slims), len(v)))

    output_goids, output_genes = set(), set()
    noutput = 0
    options.stdout.write(
        "\t".join(("go_type", "gene_id", "go_id",
                   "description", "evidence")) + "\n")

    for category, gene2go in sorted(gene2gos.items()):
        gene2go = MapGO2Slims(gene2go, go_slims, ontology)
        for gene_id, values in sorted(gene2go.items()):
            output_genes.add(gene_id)
            for go in sorted(values, key=lambda x: x.mGOId):
                output_goids.add(go.mGOId)
                options.stdout.write("%s\t%s\t%s\t%s\t%s\n" % (
                    go.mGOType,
                    gene_id,
                    go.mGOId,
                    go.mDescription,
                    "NA",
                ))
                noutput += 1

    E.info(("ninput_genes=%i, ninput_goids=%i, noutput_gene=%i, "
            "noutput_goids=%i, noutput=%i") %
           (len(input_genes), len(input_goids),
            len(output_genes), len(output_goids), noutput))
def FilterFreqCols(infile, thresh, fcols):
    '''
    Returns a dictionary mapping each column in fcols to the allele
    frequencies of the two called alleles on each line, plus the set of
    line indices where both called alleles have a frequency of at least
    thresh in at least one of the columns specified in fcols.
    No information - assigned allele frequency of -1.
    '''
    fcols = fcols.split(",")
    # read the column headings from the variant table
    cols = IOTools.open_file(infile).readline().strip().split("\t")

    # store allele frequency columns
    AFdict = dict()
    # store indices of lines at or above the frequency threshold
    nD = dict()

    for col in fcols:
        ind = cols.index(col)
        GT_i = cols.index('GT')
        n = 0
        nlist = set()
        AFS = []
        with IOTools.open_file(infile) as input:
            for line in input:
                if n > 1:
                    line = line.strip().split("\t")
                    GT = line[GT_i].replace(".", "0").split("/")
                    af = line[ind].split(",")
                    AF = []
                    # where the allele frequency is not numeric
                    # ("." or "NA") use -1 to indicate no data
                    for a in af:
                        try:
                            AF.append(float(a))
                        except ValueError:
                            AF.append(float(-1))
                    AF2 = [l if l > 0 else 0 for l in AF]
                    AF = np.array(AF)
                    AF = np.insert(AF, 0, 1 - sum(AF2))
                    GT[0] = int(GT[0])
                    GT[1] = int(GT[1])
                    # If the variant is not in the database the column shows
                    # "." but the site may still have been called as multi
                    # allelic - use -1 for all frequencies in this case
                    if max(GT[0], GT[1]) > (len(AF) - 1):
                        AF = [float(-1)] * (max(GT[0], GT[1]) + 1)
                    AF1 = AF[GT[0]]
                    AF2 = AF[GT[1]]
                    if AF1 >= thresh and AF2 >= thresh:
                        nlist.add(n)
                        AFS.append((AF1, AF2))
                    else:
                        AFS.append(('NA', 'NA'))
                n += 1
        AFdict[col] = AFS
        nD[col] = nlist

    ns = set.union(*list(nD.values()))
    return AFdict, ns
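# A minimal usage sketch for FilterFreqCols (hypothetical file and frequency
# column names): flag lines where both called alleles have a population
# frequency of at least 0.01 in any of the listed frequency columns.
def _example_FilterFreqCols():
    AFdict, common_lines = FilterFreqCols(
        "variants_annotated.tsv", thresh=0.01,
        fcols="1000g_AF,ExAC_AF")
    # AFdict maps each column name to per-line (allele1, allele2) frequencies;
    # common_lines is the union of line indices at or above the threshold.
    return AFdict, common_lines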
def config_to_dictionary(config):
    """convert the contents of a :py:class:`ConfigParser.ConfigParser`
    object to a dictionary

    This method works by iterating over all configuration values in a
    :py:class:`ConfigParser.ConfigParser` object and inserting values
    into a dictionary. Section names are prefixed using an underscore.
    Thus::

        [sample]
        name=12

    is entered as ``sample_name=12`` into the dictionary. The sections
    ``general`` and ``DEFAULT`` are treated specially in that both the
    prefixed and the unprefixed values are inserted: ::

        [general]
        genome=hg19

    will be added as ``general_genome=hg19`` and ``genome=hg19``.

    Numbers will be automatically recognized as such and converted into
    integers or floats.

    Returns
    -------
    config : dict
        A dictionary of configuration values
    """
    p = defaultdict(lambda: defaultdict(TriggeredDefaultFactory()))
    for section in config.sections():
        for key, value in config.items(section):
            try:
                v = IOTools.str2val(value)
            except TypeError:
                E.error("error converting key %s, value %s" % (key, value))
                E.error("Possible multiple concurrent attempts to "
                        "read configuration")
                raise

            p["%s_%s" % (section, key)] = v

            # IMS: new hierarchical format
            try:
                p[section][key] = v
            except TypeError:
                # fails with things like genome_dir=abc
                # if [genome] does not exist.
                continue

            if section in ("general", "DEFAULT"):
                p["%s" % (key)] = v

    for key, value in config.defaults().items():
        p["%s" % (key)] = IOTools.str2val(value)

    return p
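# A short illustration of the flattening described in the docstring above,
# using a throwaway ConfigParser object with hypothetical values.
def _example_config_to_dictionary():
    import configparser
    config = configparser.ConfigParser()
    config.read_string("[general]\ngenome=hg19\n[sample]\nname=12\n")
    params = config_to_dictionary(config)
    # params["sample_name"] == 12 (converted to int),
    # params["general_genome"] == "hg19" and, because [general] is treated
    # specially, params["genome"] == "hg19" as well
    return params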
def runGOFromDatabase(outfile, outdir,
                      statement_fg,
                      statement_bg,
                      go_file,
                      ontology_file=None,
                      samples=1000):
    """check for GO enrichment.

    Gene lists are extracted from a database.
    This method is a wrapper for `runGO.py`.

    Arguments
    ---------
    outfile : string
        Output filename
    outdir : string
        Output directory for auxiliary files
    statement_fg : string
        SQL statement to select genes of foreground set.
    statement_bg : string
        SQL statement to select genes in background set.
    go_file : string
        Filename with Gene-to-GO assignments
    ontology_file : string
        Filename with ontology information.
    samples : int
        Number of samples for empirical FDR. If not given, use BH FDR.
    """
    dbhandle = sqlite3.connect(PARAMS["database_name"])

    cc = dbhandle.cursor()
    fg = set([x[0] for x in cc.execute(statement_fg).fetchall()])
    bg = set([x[0] for x in cc.execute(statement_bg).fetchall()])

    if len(fg) == 0:
        IOTools.touch_file(outfile)
        return

    fg_file = os.path.join(outdir, "foreground")
    bg_file = os.path.join(outdir, "background")
    outf = open(fg_file, "w")
    outf.write("\n".join(map(str, fg)) + "\n")
    outf.close()
    outf = open(bg_file, "w")
    outf.write("\n".join(map(str, bg)) + "\n")
    outf.close()

    runGOFromFiles(outfile, outdir,
                   fg_file, bg_file,
                   go_file,
                   ontology_file=ontology_file,
                   samples=samples)
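# A minimal call sketch for runGOFromDatabase with hypothetical SQL and file
# names; the foreground/background statements just need to return one gene
# identifier per row (the table and column names below are assumptions).
def _example_runGOFromDatabase():
    runGOFromDatabase(
        outfile="go.results",
        outdir="go.dir",
        statement_fg="SELECT gene_id FROM diff_expression WHERE significant = 1",
        statement_bg="SELECT gene_id FROM diff_expression",
        go_file="gene2go.tsv",
        ontology_file="go_ontology.obo",
        samples=1000)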
def realignMatchedSample(infile, outfile):
    '''repeat realignments with a merged bam of control and tumor;
    this should help avoid problems with sample-specific realignments'''
    genome = "%s/%s.fa" % (PARAMS["bwa_index_dir"], PARAMS["genome"])
    PipelineExome.GATKIndelRealign(infile, outfile, genome)
    IOTools.zap_file(infile)
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-a", "--first-fastq-file", dest="fastq1", type="string", help="supply read1 fastq file") parser.add_option("-b", "--second-fastq-file", dest="fastq2", type="string", help="supply read2 fastq file") # add common options (-h/--help, ...) and parse command line (options, args) = E.start(parser, argv=argv) if args and len(args) == 2: options.fastq1, options.fastq2 = args fastq1 = IOTools.open_file(options.fastq1) fastq2 = IOTools.open_file(options.fastq2) E.info("iterating over fastq files") f1_count = 0 for f1, f2 in zip_longest(Fastq.iterate(fastq1), Fastq.iterate(fastq2)): if not (f1 and f2) or (not f2 and f1): try: raise PairedReadError( "unpaired reads detected. Are files sorted? are " "files of equal length?") except PairedReadError as e: raise PairedReadError(e).with_traceback(sys.exc_info()[2]) else: assert f1.identifier.endswith("/1") and \ f2.identifier.endswith("/2"), \ "Reads in file 1 must end with /1 and reads in file 2 with /2" options.stdout.write( ">%s\n%s\n>%s\n%s\n" % (f1.identifier, f1.seq, f2.identifier, f2.seq)) f1_count += 1 E.info("output: %i pairs" % f1_count) # write footer and output benchmark information. E.stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser( version="%prog version: $Id: bed2graph.py 2861 2010-02-23 17:36:32Z andreas $", usage=globals()["__doc__"]) parser.add_option("-o", "--output-section", dest="output", type="choice", choices=("full", "name"), help="output either ``full`` overlapping entries, only the ``name``s. [default=%default].") parser.set_defaults( output="full", ) # add common options (-h/--help, ...) and parse command line (options, args) = E.start(parser, argv=argv) if len(args) != 2: raise ValueError("two arguments required") if args[0] == "-": infile1 = options.stdin else: infile1 = IOTools.open_file(args[0], "r") infile2 = IOTools.open_file(args[1], "r") idx = Bed.readAndIndex(infile2, with_values=True) output = options.output outfile = options.stdout if output == "name": outfile.write("name1\tname2\n") outf = lambda x: x.fields[0] else: outf = str for bed in Bed.iterator(infile1): try: overlaps = idx[bed.contig].find(bed.start, bed.end) except (KeyError, IndexError): # ignore missing contig and zero length intervals continue for o in overlaps: outfile.write("\t".join((outf(bed), outf(o[2]))) + "\n") E.stop()
def makeCpgIslandsBed(outfile):
    infile = PARAMS["methylation_summary_cpgislands"]
    out = IOTools.openFile(outfile, "w")
    with IOTools.openFile(infile, "r") as f:
        for line in f.readlines():
            # this assumes location of req. values
            contig, start, end = line.split()[1:4]
            if contig != "chrom":
                out.write("%s\t%s\t%s\n" % (contig, start, end))
    out.close()
def __call__(self, track, slice=None):
    c_transcript = []
    c_gene = []
    for transcript in GTF.transcript_iterator(
            GTF.iterator(IOTools.openFile(self.getFilename(track)))):
        c_transcript.append(len(transcript))
    for gene in GTF.flat_gene_iterator(
            GTF.iterator(IOTools.openFile(self.getFilename(track)))):
        c_gene.append(len(gene))

    return odict((("transcript", np.mean(c_transcript)),
                  ("gene", np.mean(c_gene))))
def test_touch_file_updates_existing_file(self):
    with IOTools.open_file(self.filename, "w") as outf:
        outf.write("some data\n")
    created = os.stat(self.filename).st_mtime
    time.sleep(1)
    IOTools.touch_file(self.filename)
    modified = os.stat(self.filename).st_mtime
    self.assertGreater(modified, created)
    with IOTools.open_file(self.filename) as inf:
        data = inf.read()
    self.assertEqual(data, "some data\n")
def test_touch_file_creates_empty_file(self):
    self.assertFalse(os.path.exists(self.filename))
    IOTools.touch_file(self.filename)
    self.assertTrue(os.path.exists(self.filename))
    if self.filename.endswith(".gz"):
        self.assertFalse(IOTools.is_empty(self.filename))
    else:
        self.assertTrue(IOTools.is_empty(self.filename))
    with IOTools.open_file(self.filename) as inf:
        data = inf.read()
    self.assertEqual(len(data), 0)
def mergeSampleBams(infile, outfile):
    '''merge control and tumor bams'''
    # Note: need to change readgroup headers for merge and subsequent
    # splitting of bam files
    to_cluster = USECLUSTER
    job_memory = PARAMS["gatk_memory"]

    tmpdir_gatk = P.get_temp_dir(shared=True)

    outfile_tumor = outfile.replace(PARAMS["sample_control"],
                                    PARAMS["sample_tumour"])
    infile_tumor = infile.replace(PARAMS["sample_control"],
                                  PARAMS["sample_tumour"])

    infile_base = os.path.basename(infile)
    infile_tumor_base = infile_base.replace(PARAMS["sample_control"],
                                            PARAMS["sample_tumour"])

    track = P.snip(os.path.basename(infile), ".bam")
    track_tumor = track.replace(PARAMS["sample_control"],
                                PARAMS["sample_tumour"])

    library = PARAMS["readgroup_library"]
    platform = PARAMS["readgroup_platform"]
    platform_unit = PARAMS["readgroup_platform_unit"]

    control_id = "Control.bam"
    tumor_id = control_id.replace(PARAMS["sample_control"],
                                  PARAMS["sample_tumour"])

    statement = '''picard AddOrReplaceReadGroups
                   INPUT=%(infile)s
                   OUTPUT=%(tmpdir_gatk)s/%(infile_base)s
                   RGLB=%(library)s RGPL=%(platform)s
                   RGPU=%(platform_unit)s RGSM=%(track)s
                   ID=%(track)s
                   VALIDATION_STRINGENCY=SILENT ;'''
    statement += '''picard AddOrReplaceReadGroups
                    INPUT=%(infile_tumor)s
                    OUTPUT=%(tmpdir_gatk)s/%(infile_tumor_base)s
                    RGLB=%(library)s RGPL=%(platform)s
                    RGPU=%(platform_unit)s RGSM=%(track_tumor)s
                    ID=%(track_tumor)s
                    VALIDATION_STRINGENCY=SILENT ;'''
    statement += '''samtools merge -rf
                    %(outfile)s
                    %(tmpdir_gatk)s/%(infile_base)s
                    %(tmpdir_gatk)s/%(infile_tumor_base)s;'''
    statement += "samtools index %(outfile)s; "
    statement += "rm -rf %(tmpdir_gatk)s ;"
    P.run(statement)
    IOTools.zap_file(infile)
    IOTools.zap_file(infile_tumor)
def summarizeFastqScreen(infiles, outfiles):
    all_files = []
    for infile in infiles:
        all_files.extend(glob.glob(
            IOTools.snip(infile, "screen") + "*_screen.txt"))
    if len(all_files) == 0:
        E.warn("no fastq_screen results to concatenate")
        for x in outfiles:
            IOTools.touch_file(x)
        return
    df_summary, df_details = PipelineReadqc.read_fastq_screen(
        all_files)
    df_summary.to_csv(outfiles[0], sep="\t", index=True)
    df_details.to_csv(outfiles[1], sep="\t", index=True)
def filterDamage(infile, damagestr, outfiles):
    '''
    Filter out variants which have not been assessed as damaging by any
    of the specified tools.
    Tools and thresholds can be specified in the pipeline.yml.

    Does not account for multiple alt alleles - if any ALT allele has
    been assessed as damaging with any tool the variant is kept,
    regardless of whether this is the allele called in the sample.
    '''
    damaging = damagestr.split(",")
    cols = IOTools.open_file(infile).readline().strip().split("\t")
    D = dict()

    # parses the "damage string" from the pipeline.yml
    # this should be formatted as COLUMN|result1-result2-...,COLUMN|result1...
    # where variants with any of these results in this column will
    # be retained
    for d in damaging:
        d = d.split("|")
        col = d[0]
        res = d[1].split("-")
        i = cols.index(col)
        D[col] = ((res, i))

    x = 0
    out = IOTools.open_file(outfiles[0], "w")
    out2 = IOTools.open_file(outfiles[1], "w")
    with IOTools.open_file(infile) as input:
        for line in input:
            if x > 1:
                # search for the specified strings within this column of
                # this line of the input file
                line = line.strip().split("\t")
                isdamaging = 0
                for key in D:
                    res, i = D[key]
                    current = line[i]
                    for r in res:
                        if re.search(r, current):
                            isdamaging = 1
                if isdamaging == 1:
                    out.write("%s\n" % "\t".join(line))
                else:
                    out2.write("%s\n" % "\t".join(line))
            else:
                out.write(line)
            x += 1
    out.close()
    out2.close()
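# A minimal usage sketch for filterDamage showing the damage string layout
# described above (the column names, result strings and file paths are
# hypothetical): keep variants that SIFT calls "deleterious" or that PolyPhen
# calls "probably_damaging" or "possibly_damaging".
def _example_filterDamage():
    filterDamage(
        infile="variants_annotated.tsv",
        damagestr="SIFT_pred|deleterious,"
                  "PolyPhen_pred|probably_damaging-possibly_damaging",
        outfiles=("variants_damaging.tsv", "variants_benign.tsv"))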
def loadManualAnnotations(infile, outfile):
    tmp = P.get_temp_filename(".")

    annotation = P.snip(infile, "_annotations.tsv")

    with IOTools.open_file(tmp, "w") as outf:
        outf.write("%s\tgene_id\n" % annotation)
        with IOTools.open_file(infile, "r") as inf:
            for line in inf:
                outf.write("%s\t%s" % (annotation, line))

    P.load(tmp, outfile, options="--add-index=gene_id")
    os.unlink(tmp)
def make1basedCpgIslands(infile, outfile):

    # outfile, loadfile = outfiles

    out = IOTools.openFile(outfile, "w")
    out.write("%s\t%s\t%s\n" % ("contig", "position", "cpgi"))

    with IOTools.openFile(infile, "r") as f:
        lines = f.readlines()
        for line in lines:
            contig, start, stop = line.split()
            for position in [x for x in range(int(start), int(stop) + 2)]:
                out.write("%s\t%s\t%s\n" % (contig, position, "CpGIsland"))
    out.close()
def runRegexMotifSearch(infiles, outfile):
    '''run a regular expression search on sequences.
    compute counts.
    '''
    motif = "[AG]G[GT]T[CG]A"
    reverse_motif = "T[GC]A[CA]C[TC]"

    controlfile, dbfile = infiles
    if not os.path.exists(controlfile):
        raise ValueError("control file %s for %s does not exist" %
                         (controlfile, dbfile))

    # build motif pairs with 0-14 bp spacers: DR = direct repeats,
    # ER = motif followed by its reverse complement
    motifs = []
    for x in range(0, 15):
        motifs.append(
            ("DR%i" % x, re.compile(motif + "." * x + motif,
                                    re.IGNORECASE)))
    for x in range(0, 15):
        motifs.append(
            ("ER%i" % x, re.compile(motif + "." * x + reverse_motif,
                                    re.IGNORECASE)))

    db_positions = Motifs.countMotifs(IOTools.open_file(dbfile, "r"), motifs)
    control_positions = Motifs.countMotifs(
        IOTools.open_file(controlfile, "r"), motifs)

    db_counts, control_counts = Motifs.getCounts(
        db_positions), Motifs.getCounts(control_positions)
    db_seqcounts, control_seqcounts = Motifs.getOccurances(
        db_positions), Motifs.getOccurances(control_positions)

    ndb, ncontrol = len(db_positions), len(control_positions)
    outf = IOTools.open_file(outfile, "w")
    outf.write("motif\tmotifs_db\tmotifs_control\tseq_db\tseq_db_percent\t"
               "seq_control\tseq_control_percent\tfold\n")
    for motif, pattern in motifs:
        try:
            fold = float(db_seqcounts[motif]) * \
                ncontrol / (ndb * control_seqcounts[motif])
        except ZeroDivisionError:
            fold = 0

        outf.write("%s\t%i\t%i\t%i\t%s\t%i\t%s\t%5.2f\n" %
                   (motif,
                    db_counts[motif], control_counts[motif],
                    db_seqcounts[motif],
                    IOTools.pretty_percent(db_seqcounts[motif], ndb),
                    control_seqcounts[motif],
                    IOTools.pretty_percent(control_seqcounts[motif],
                                           ncontrol),
                    fold))
def GenotypeSNPs(infile, snplist, outfile):
    '''
    Fetches the genotype from the variant tables for all samples
    for SNPs in the hapmap sample from makeRandomSNPSet.

    Complex sites are ignored (as simple SNPs are sufficient for these
    calculations).
    These are:
        Sites which failed QC (column 3 in the variant table is not PASS)
        Sites with more than 2 alleles defined (column 6 in the variant
        table contains more than one alternative allele)
        SNPs with more than one ID
        Indels
    '''
    out = IOTools.open_file(outfile, "w")
    with IOTools.open_file(infile) as inf:
        for line in inf:
            line = line.strip().split()
            # if the variant passed QC
            if line[4] == "PASS":
                genotype = line[7]
                # if the genotype looks normal e.g. 1/1
                if len(genotype) == 3:
                    # get the actual genotype (rather than the index)
                    if genotype[0] != ".":
                        ind1 = int(genotype[0])
                    else:
                        ind1 = 0
                    if genotype[2] != ".":
                        ind2 = int(genotype[2])
                    else:
                        ind2 = 0
                    A1 = line[5]
                    A2 = line[6].split(",")
                    AS = [A1] + A2

                    if len(AS) <= 2:
                        GT = "%s%s" % (AS[ind1], AS[ind2])
                        refGT = "%s%s" % (A1, A1)

                        if len(GT) == 2:
                            if line[3][0:2] == "rs" and len(
                                    line[3].split(";")) == 1:
                                snpid = line[3]
                                chrom = line[0]
                                pos = line[1]
                                if snpid in snplist:
                                    out.write("%s\t%s\t%s\t%s\t%s\n" %
                                              (snpid, chrom, pos,
                                               GT, refGT))
    out.close()
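# A minimal call sketch for GenotypeSNPs (hypothetical paths): snplist is a
# collection of rs identifiers produced upstream (e.g. by makeRandomSNPSet);
# the output table holds snpid, chrom, pos, called genotype and homozygous
# reference genotype.
def _example_GenotypeSNPs():
    snplist = set(line.strip()
                  for line in IOTools.open_file("hapmap_snps.txt"))
    GenotypeSNPs("sample1.variant_table.tsv", snplist,
                 "sample1.genotypes.tsv")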
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-n", "--dry-run", dest="dry_run", action="store_true", help="dry run, do not delete any files [%default]") parser.set_defaults(dry_run=False) # add common options (-h/--help, ...) and parse command line (options, args) = E.Start(parser, argv=argv) filenames = args c = E.Counter() for filename in filenames: c.checked += 1 if os.path.exists(filename + ".log"): if IOTools.isComplete(filename + ".log"): c.complete += 1 continue if IOTools.isComplete(filename): c.complete += 1 continue c.incomplete += 1 E.info('deleting %s' % filename) if options.dry_run: continue os.unlink(filename) c.deleted += 1 E.info(c) # write footer and output benchmark information. E.Stop()
def runMEME(track, outfile, dbhandle):
    '''run MEME to find motifs.

    In order to increase the signal/noise ratio, MEME is not run on
    all intervals but only the top 10% of intervals (peakval) are
    used. Also, only the segment of 200 bp around the peak is used
    and not the complete interval.

    * Softmasked sequence is converted to hardmasked sequence to avoid
      the detection of spurious motifs.

    * Sequence is run through dustmasker

    This method is deprecated - use runMEMEOnSequences instead.
    '''
    # job_options = "-l mem_free=8000M"

    target_path = os.path.join(os.path.abspath(P.get_params()["exportdir"]),
                               "meme", outfile)

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(P.get_params()["genome_dir"], P.get_params()["genome"]))

    tmpdir = P.get_temp_dir(".")
    tmpfasta = os.path.join(tmpdir, "in.fa")

    nseq = writeSequencesForIntervals(
        track, tmpfasta, dbhandle,
        full=False,
        masker=P.as_list(P.get_params()['motifs_masker']),
        halfwidth=int(P.get_params()["meme_halfwidth"]),
        maxsize=int(P.get_params()["meme_max_size"]),
        proportion=P.get_params()["meme_proportion"],
        min_sequences=P.get_params()["meme_min_sequences"])

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        IOTools.touch_file(outfile)
    else:
        statement = '''
        meme %(tmpfasta)s -dna -revcomp
        -mod %(meme_model)s
        -nmotifs %(meme_nmotifs)s
        -oc %(tmpdir)s
        -maxsize %(meme_max_size)s
        %(meme_options)s
        > %(outfile)s.log
        '''
        P.run(statement)

        collectMEMEResults(tmpdir, target_path, outfile)