def test_touch_file_creates_empty_file(self):
    self.assertFalse(os.path.exists(self.filename))
    iotools.touch_file(self.filename)
    self.assertTrue(os.path.exists(self.filename))
    if self.filename.endswith(".gz"):
        # a touched gzip file is non-empty on disk (gzip header/trailer)
        self.assertFalse(iotools.is_empty(self.filename))
    else:
        self.assertTrue(iotools.is_empty(self.filename))
    with iotools.open_file(self.filename) as inf:
        data = inf.read()
    self.assertEqual(len(data), 0)
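# Hedged sketch (not the cgatcore implementation) of why the test above
# treats ".gz" names differently: touching a gzip file plausibly writes a
# valid-but-empty gzip container, so the file is non-empty on disk even
# though its decompressed payload reads back as zero bytes.
import gzip
import os


def touch_file_sketch(filename):
    """Create an empty file; for .gz names, write an empty gzip archive."""
    if filename.endswith(".gz"):
        with gzip.open(filename, "wb"):
            pass  # the gzip header/trailer alone makes the file non-empty
    else:
        with open(filename, "wb"):
            pass


def is_empty_sketch(filename):
    """True if the file occupies zero bytes on disk."""
    return os.stat(filename).st_size == 0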
def runTomTom(infile, outfile):
    '''compare ab-initio motifs against known motif databases using tomtom.'''

    tmpdir = P.get_temp_dir(".")
    databases = " ".join(P.as_list(P.get_params()["tomtom_databases"]))

    target_path = os.path.join(
        os.path.abspath(P.get_params()["exportdir"]), "tomtom", outfile)

    if iotools.is_empty(infile):
        E.warn("input is empty - no computation performed")
        iotools.touch_file(outfile)
        return

    statement = '''
    tomtom %(tomtom_options)s -oc %(tmpdir)s %(infile)s %(databases)s
    > %(outfile)s.log
    '''
    P.run(statement)

    # copy over results
    try:
        os.makedirs(os.path.dirname(target_path))
    except OSError:
        # ignore "file exists" exception
        pass

    if os.path.exists(target_path):
        shutil.rmtree(target_path)

    shutil.move(tmpdir, target_path)
    shutil.copyfile(os.path.join(target_path, "tomtom.txt"), outfile)
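# Illustrative sketch of the %-interpolation that P.run() applies to
# statements such as the one in runTomTom() above: placeholders like
# %(tmpdir)s and %(infile)s resolve against the caller's local variables,
# and names like %(tomtom_options)s against the pipeline configuration.
# The values below are invented for the example.
statement = "tomtom %(tomtom_options)s -oc %(tmpdir)s %(infile)s"
local_context = {"tomtom_options": "-min-overlap 5",
                 "tmpdir": "/tmp/tomtom_work",
                 "infile": "motifs.meme"}
print(statement % local_context)
# tomtom -min-overlap 5 -oc /tmp/tomtom_work motifs.meme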
def test_logging_can_be_configured_from_file(self):
    log_config = os.path.join(self.work_dir, "logging.yml")
    with open(log_config, "w") as outf:
        outf.write("""
version: 1
formatters:
  default:
    '()': cgatcore.experiment.MultiLineFormatter
    format: '# %(asctime)s %(levelname)s %(module)s - %(message)s'
  with_app:
    '()': cgatcore.experiment.MultiLineFormatter
    format: '%(asctime)s %(levelname)s %(app_name)s %(module)s - %(message)s'
filters:
  name_filter:
    '()': cgatcore.pipeline.control.LoggingFilterpipelineName
    name: mypipeline_name
handlers:
  console:
    class: logging.StreamHandler
    formatter: default
    stream: ext://sys.stdout
    level: INFO
  second_stream:
    class: logging.FileHandler
    formatter: with_app
    filename: "extra.log"
    level: DEBUG
root:
  handlers: [console]
  level: INFO
loggers:
  cgatcore.pipeline:
    handlers: [second_stream]
    filters: [name_filter]
    level: DEBUG
""")

    retval, stdout, stderr = self.run_command(
        "python {}/template_pipeline.py make all --log-config-filename={}".format(
            ROOT, log_config))

    self.check_files(
        present=self.expected_output_files + ["extra.log"],
        absent=["pipeline.log", "shell.log"])

    self.assertFalse(
        iotools.is_empty(os.path.join(self.work_dir, "extra.log")))

    with open(os.path.join(self.work_dir, "extra.log")) as inf:
        self.assertTrue("DEBUG" in inf.read())

    self.assertTrue("DEBUG" not in stdout)
def loadMotifInformation(infiles, outfile):
    '''load information about motifs into database.'''

    outf = P.get_temp_file(".")
    outf.write("motif\n")
    for infile in infiles:
        if iotools.is_empty(infile):
            continue
        motif = P.snip(infile, ".motif")
        outf.write("%s\n" % motif)
    outf.close()

    P.load(outf.name, outfile, "--allow-empty-file")
    os.unlink(outf.name)
def loadMemeSummary(infiles, outfile):
    '''load information about motifs into database.'''

    outf = P.get_temp_file(".")
    outf.write("track\n")
    for infile in infiles:
        if iotools.is_empty(infile):
            continue
        motif = P.snip(infile, ".meme")
        outf.write("%s\n" % motif)
    outf.close()

    P.load(outf.name, outfile)
    os.unlink(outf.name)
def loadMemeSummary(infiles, outfile):
    '''load information about motifs into database.'''

    outf = P.get_temp_file(".")
    outf.write("method\ttrack\n")
    for infile in infiles:
        if IOTools.is_empty(infile):
            continue
        # the method name is everything before ".dir/" (note the escaped dot)
        method = re.match(r"(.+)\.dir/", infile).groups()[0]
        track = os.path.basename(".".join(infile.split(".")[:-1]))
        outf.write("%s\t%s\n" % (method, track))
    outf.close()

    P.load(outf.name, outfile)
    os.unlink(outf.name)
def loadMemeChipSummary(infiles, outfile):
    '''load information about motifs into database.'''

    outf = P.get_temp_file(".")
    outf.write("track\tnpeaks\twidth\tmasking\tpath\n")
    for infile in infiles:
        if iotools.is_empty(infile):
            continue
        fn = P.snip(os.path.basename(infile), ".memechip")
        track, npeaks, width, masking = fn.split(".")
        outf.write("\t".join(map(str, (track, npeaks, width, masking, fn))) + "\n")
    outf.close()

    P.load(outf.name, outfile)
    os.unlink(outf.name)
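# Hedged illustration of the temp-table pattern shared by the load* tasks
# above: write a header line, then one tab-separated row per non-empty
# input, and hand the file to P.load() for upload. The file name and rows
# here are invented for the example.
rows = [("meme", "run1"), ("meme", "run2")]
with open("summary.tsv", "w") as outf:
    outf.write("method\ttrack\n")
    for method, track in rows:
        outf.write("%s\t%s\n" % (method, track))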
def save_metric_data(meta_data, table_cache, schema, instance_id: int, session):

    logger = P.get_logger()
    metric_table_filter = None
    if "metric_no_upload" in meta_data:
        if meta_data["metric_no_upload"] == "*":
            logger.warning("upload turned off for metric {}".format(
                meta_data["metric_name"]))
            return
        else:
            metric_table_filter = re.compile(meta_data["metric_no_upload"])

    # multiple tablenames for multiple metric output
    #
    # Tables are added into schemas to avoid cluttering
    # the public namespace.
    # (if only blobs, no metric output file)
    if "metric_output_files" in meta_data:
        assert len(meta_data["metric_output_files"]) == \
            len(meta_data["metric_tablenames"])

        for output_file, tablename in zip(
                meta_data["metric_output_files"],
                meta_data["metric_tablenames"]):

            if metric_table_filter and metric_table_filter.search(tablename):
                logger.warning("upload for table {} turned off".format(
                    tablename))
                continue

            if not os.path.exists(output_file):
                logger.warning("output file {} does not exist - ignored".format(
                    output_file))
                continue

            if IOTools.is_empty(output_file):
                logger.warning("output file {} is empty - ignored".format(
                    output_file))
                continue

            try:
                table = pandas.read_csv(output_file,
                                        sep="\t",
                                        comment="#",
                                        skip_blank_lines=True)
            except ValueError as e:
                logger.warning("table {} can not be read: {}".format(
                    output_file, str(e)))
                continue
            except pandas.errors.ParserError as e:
                logger.warning("malformatted table {} can not be read: {}".format(
                    output_file, str(e)))
                continue

            if table.empty:
                logger.warning("table {} is empty - ignored".format(
                    output_file))
                continue

            tablename, table, dtypes = transform_table_before_upload(
                tablename, table, instance_id, meta_data, table_cache)

            if schema is None:
                tn = tablename
            else:
                tn = "{}.{}".format(schema, tablename)

            # add foreign key
            table["instance_id"] = instance_id
            logger.debug(f"saving data {table.shape} from {output_file} "
                         f"to table {tn} under {instance_id}")
            table_cache.add_table(table, tablename, dtypes)

    if "metric_blob_globs" in meta_data:
        metric_dir = meta_data["metric_outdir"]
        files = [glob.glob(os.path.join(metric_dir, x))
                 for x in meta_data["metric_blob_globs"]]
        files = IOTools.flatten(files)
        logger.debug(
            "uploading binary data in {} files from {} to "
            "table binary_data".format(len(files), metric_dir))
        for fn in files:
            with IOTools.open_file(fn, "rb", encoding=None) as inf:
                data_row = BenchmarkBinaryData(
                    instance_id=instance_id,
                    filename=os.path.basename(fn),
                    path=fn,
                    data=inf.read())
                session.add(data_row)
            session.commit()

    if meta_data.get("metric_tableindices", None):
        table_cache.add_indices(meta_data["metric_tableindices"])
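# Illustration of the "metric_no_upload" contract handled at the top of
# save_metric_data(): the literal "*" disables upload for the whole metric,
# while any other value is compiled as a regular expression and matched
# against individual table names. The names below are invented.
import re

meta_data = {"metric_no_upload": "raw_.*"}
metric_table_filter = re.compile(meta_data["metric_no_upload"])
for tablename in ("raw_counts", "summary"):
    action = "skipped" if metric_table_filter.search(tablename) else "uploaded"
    print(tablename, action)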
def runMAST(infiles, outfile):
    '''run mast on all intervals and motifs.

    Collect all results for an E-value up to 10000 so that all
    sequences are output and MAST curves can be computed.

    10000 is a heuristic.
    '''
    # job_options = "-l mem_free=8000M"

    controlfile, dbfile, motiffiles = infiles

    if iotools.is_empty(dbfile) or len(motiffiles) == 0:
        iotools.touch_file(outfile)
        return

    if not os.path.exists(controlfile):
        raise ValueError("control file %s for %s does not exist" %
                         (controlfile, dbfile))

    # remove previous results
    if os.path.exists(outfile):
        os.remove(outfile)

    tmpdir = P.get_temp_dir(".")
    tmpfile = P.get_temp_filename(".")

    for motiffile in motiffiles:
        if iotools.is_empty(motiffile):
            L.info("skipping empty motif file %s" % motiffile)
            continue

        of = iotools.open_file(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - foreground ::\n" % motif)
        of.close()

        # mast bails if the number of nucleotides gets larger than
        # 2186800982?
        # To avoid this, run db and control file separately.
        statement = '''
        cat %(dbfile)s
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s -ev %(mast_evalue)f
          %(mast_options)s >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run(statement)

        of = iotools.open_file(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - background ::\n" % motif)
        of.close()

        statement = '''
        cat %(controlfile)s
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s -ev %(mast_evalue)f
          %(mast_options)s >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run(statement)

    P.run("gzip < %(tmpfile)s > %(outfile)s")

    shutil.rmtree(tmpdir)
    os.unlink(tmpfile)
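# Sketch of how a downstream parser might split the merged MAST output on
# the ":: motif = ... ::" markers that runMAST() writes between the
# concatenated mast.txt chunks; the marker format is taken from the code
# above, the parser itself is an assumption.
import re

marker = re.compile(r":: motif = (\S+) - (foreground|background) ::")
match = marker.match(":: motif = m1 - foreground ::")
print(match.groups())  # ('m1', 'foreground')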
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--methods", dest="methods", type="choice",
                      action="append",
                      choices=("filter",
                               "keep-first-base",
                               "set-nh",
                               "set-sequence",
                               "strip-sequence",
                               "strip-quality",
                               "unstrip",
                               "unset-unmapped-mapq",
                               "downsample-single",
                               "downsample-paired",
                               "add-sequence-error"),
                      help="methods to apply [%default]")

    parser.add_option("--strip-method", dest="strip_method", type="choice",
                      choices=("all", "match"),
                      help="define which sequences/qualities to strip. "
                      "match means that stripping only applies to entries "
                      "without mismatches (requires NM tag to be present). "
                      "[%default]")

    parser.add_option("--filter-method", dest="filter_methods",
                      action="append", type="choice",
                      choices=('NM', 'CM',
                               "mapped", "unique", "non-unique",
                               "remove-list", "keep-list",
                               "error-rate", "min-read-length",
                               "min-average-base-quality"),
                      help="filter method to apply to remove alignments "
                      "from a bam file. Multiple methods can be supplied "
                      "[%default]")

    parser.add_option("--reference-bam-file", dest="reference_bam",
                      type="string",
                      help="bam-file to filter with [%default]")

    parser.add_option("--force-output", dest="force", action="store_true",
                      help="force processing. Some methods such "
                      "as strip/unstrip will stop processing if "
                      "they think it not necessary "
                      "[%default]")

    parser.add_option("--output-sam", dest="output_sam", action="store_true",
                      help="output in sam format [%default]")

    parser.add_option("--first-fastq-file", "-1", dest="fastq_pair1",
                      type="string",
                      help="fastq file with read information for first "
                      "in pair or unpaired. Used for unstripping sequence "
                      "and quality scores [%default]")

    parser.add_option("--second-fastq-file", "-2", dest="fastq_pair2",
                      type="string",
                      help="fastq file with read information for second "
                      "in pair. Used for unstripping sequence "
                      "and quality scores [%default]")

    parser.add_option("--downsample", dest="downsample", type="int",
                      help="Number of reads to downsample to")

    parser.add_option("--filename-read-list", dest="filename_read_list",
                      type="string",
                      help="Filename with list of reads to filter if "
                      "'keep-list' or 'remove-list' filter method is "
                      "chosen [%default]")

    parser.add_option("--error-rate", dest="error_rate", type="float",
                      help="error rate to use as filter. Reads with an error "
                      "rate higher than the threshold will be removed "
                      "[%default]")

    parser.add_option("--minimum-read-length", dest="minimum_read_length",
                      type="int",
                      help="minimum read length when filtering [%default]")

    parser.add_option("--minimum-average-base-quality",
                      dest="minimum_average_base_quality", type="float",
                      help="minimum average base quality when filtering "
                      "[%default]")

    parser.set_defaults(
        methods=[],
        output_sam=False,
        reference_bam=None,
        filter_methods=[],
        strip_method="all",
        force=False,
        fastq_pair1=None,
        fastq_pair2=None,
        downsample=None,
        random_seed=None,
        filename_read_list=None,
        error_rate=None,
        minimum_read_length=0,
        minimum_average_base_quality=0,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if options.stdin != sys.stdin:
        bamfile = options.stdin.name
    elif args:
        bamfile = args[0]
        if len(args) > 1:
            raise ValueError("multiple bam files provided in arguments")
    else:
        bamfile = "-"

    if "remove-list" in options.filter_methods or \
            "keep-list" in options.filter_methods:

        if "remove-list" in options.filter_methods and \
                "keep-list" in options.filter_methods:
            raise ValueError(
                "it is not possible to specify remove-list and keep-list")

        with iotools.open_file(options.filename_read_list) as inf:
            filter_query_names = set(
                [x.strip() for x in inf.readlines() if not x.startswith("#")])
        E.info("read query_sequence filter list with {} read names".format(
            len(filter_query_names)))

    if "error-rate" in options.filter_methods and not options.error_rate:
        raise ValueError(
            "filtering by error-rate requires --error-rate to be set")

    if "add-sequence-error" in options.methods and not options.error_rate:
        raise ValueError(
            "the add-sequence-error method requires --error-rate to be set")

    E.info('processing %s' % bamfile)
    if bamfile != "-" and iotools.is_empty(bamfile):
        E.warn('ignoring empty file %s' % bamfile)
        E.stop()
        return

    if options.stdout != sys.stdout:
        output_bamfile = options.stdout.name
    else:
        output_bamfile = "-"

    if options.stdlog == sys.stdout:
        raise ValueError(
            "redirect log-stream to file (--log) if outputting to stdout")

    if options.output_sam:
        output_mode = "wh"
    else:
        output_mode = "wb"

    # reading bam from stdin does not work with only the "r" tag
    with pysam.AlignmentFile(bamfile, "rb") as pysam_in:
        with pysam.AlignmentFile(output_bamfile,
                                 output_mode,
                                 template=pysam_in) as pysam_out:
            process_bam(pysam_in, pysam_out, options)

    # write footer and output benchmark information.
    E.stop()
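# Hedged example of the read-list format consumed via --filename-read-list:
# one query name per line, lines starting with "#" are comments. This
# mirrors the parsing in main() above; the file content is invented.
from io import StringIO

inf = StringIO("# header comment\nread1\nread2\n")
filter_query_names = set(
    x.strip() for x in inf if not x.startswith("#"))
print(filter_query_names)  # {'read1', 'read2'}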
def BedFileVenn(infiles, outfile):
    '''merge :term:`bed` formatted *infiles* by intersection
    and write to *outfile*.

    Only intervals that overlap in all files are retained.
    Interval coordinates are given by the first file in *infiles*.

    Bed files are normalized (overlapping intervals within a
    file are merged) before intersection.

    Intervals are renumbered starting from 1.
    '''
    liver, testes = infiles
    liver_name = P.snip(os.path.basename(liver), ".replicated.bed")
    testes_name = P.snip(os.path.basename(testes), ".replicated.bed")
    to_cluster = True

    # NB: this statement computes full venn counts for a liver/testes pair,
    # but it is superseded by the branch-specific statements below and never
    # run as written (the unconditional unpacking above also assumes exactly
    # two inputs).
    statement = '''
    cat %(liver)s %(testes)s
    | mergeBed -i stdin
    | awk 'OFS="\\t" {print $1,$2,$3,"CAPseq"NR}'
    > replicated_intervals/liver.testes.merge.bed;
    echo "Total merged intervals" > %(outfile)s;
    cat replicated_intervals/liver.testes.merge.bed | wc -l >> %(outfile)s;
    echo "Liver & testes" >> %(outfile)s;
    intersectBed -a replicated_intervals/liver.testes.merge.bed -b %(liver)s -u
    | intersectBed -a stdin -b %(testes)s -u
    > replicated_intervals/liver.testes.shared.bed;
    cat replicated_intervals/liver.testes.shared.bed | wc -l >> %(outfile)s;
    echo "Testes only" >> %(outfile)s;
    intersectBed -a replicated_intervals/liver.testes.merge.bed -b %(liver)s -v
    > replicated_intervals/%(testes_name)s.liver.testes.unique.bed;
    cat replicated_intervals/%(testes_name)s.liver.testes.unique.bed | wc -l >> %(outfile)s;
    echo "Liver only" >> %(outfile)s;
    intersectBed -a replicated_intervals/liver.testes.merge.bed -b %(testes)s -v
    > replicated_intervals/%(liver_name)s.liver.testes.unique.bed;
    cat replicated_intervals/%(liver_name)s.liver.testes.unique.bed | wc -l >> %(outfile)s;
    sed -i '{N;s/\\n/\\t/g}' %(outfile)s;
    '''

    if len(infiles) == 1:
        shutil.copyfile(infiles[0], outfile)

    elif len(infiles) == 2:
        if iotools.is_empty(infiles[0]) or iotools.is_empty(infiles[1]):
            iotools.touch_file(outfile)
        else:
            statement = '''
            intersectBed -u -a %s -b %s
            | cut -f 1,2,3,4,5
            | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
            > %%(outfile)s
            ''' % (infiles[0], infiles[1])
            P.run(statement)

    else:
        tmpfile = P.get_temp_filename(".")

        # need to merge incrementally
        fn = infiles[0]
        if iotools.is_empty(fn):
            iotools.touch_file(outfile)
            return

        statement = '''mergeBed -i %(fn)s > %(tmpfile)s'''
        P.run(statement)

        for fn in infiles[1:]:
            if iotools.is_empty(fn):
                iotools.touch_file(outfile)
                os.unlink(tmpfile)
                return

            statement = '''mergeBed -i %(fn)s
            | intersectBed -u -a %(tmpfile)s -b stdin > %(tmpfile)s.tmp;
            mv %(tmpfile)s.tmp %(tmpfile)s'''
            P.run(statement)

        statement = '''cat %(tmpfile)s
        | cut -f 1,2,3,4,5
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        > %(outfile)s'''
        P.run(statement)

        os.unlink(tmpfile)
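# Hedged Python analogue of the incremental strategy in BedFileVenn():
# rather than intersecting all inputs at once, fold a pairwise intersection
# across the inputs, which is what the mergeBed/intersectBed loop above
# does on the shell side. The sets here are invented for the example.
from functools import reduce

interval_sets = [{1, 2, 3}, {2, 3, 4}, {3, 4, 5}]
print(reduce(set.intersection, interval_sets))  # {3}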