Example #1
    def test_touch_file_creates_empty_file(self):
        self.assertFalse(os.path.exists(self.filename))
        iotools.touch_file(self.filename)
        self.assertTrue(os.path.exists(self.filename))
        if self.filename.endswith(".gz"):
            self.assertFalse(iotools.is_empty(self.filename))
        else:
            self.assertTrue(iotools.is_empty(self.filename))

        with iotools.open_file(self.filename) as inf:
            data = inf.read()
        self.assertEqual(len(data), 0)
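
A note on the ".gz" branch above: iotools.touch_file writes a valid gzip header when the filename ends in ".gz", so the file occupies a few bytes on disk even though it decompresses to nothing. That is why is_empty is expected to return False for the compressed case. A minimal sketch of the size-based check the test appears to rely on (a hypothetical re-implementation for illustration, not the cgatcore source):

import os

def is_empty(filename):
    # sketch: treat a file as empty when it occupies zero bytes on disk;
    # an "empty" gzip file still carries its header, so it is non-empty here
    return os.stat(filename).st_size == 0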
Example #2
def runTomTom(infile, outfile):
    '''compare ab-initio motifs against motif databases using tomtom.'''

    tmpdir = P.get_temp_dir(".")
    databases = " ".join(P.as_list(P.get_params()["tomtom_databases"]))

    target_path = os.path.join(os.path.abspath(P.get_params()["exportdir"]),
                               "tomtom", outfile)

    if iotools.is_empty(infile):
        E.warn("input is empty - no computation performed")
        iotools.touch_file(outfile)
        return

    statement = '''
    tomtom %(tomtom_options)s -oc %(tmpdir)s %(infile)s %(databases)s > %(outfile)s.log
    '''

    P.run(statement)

    # copy over results
    # create the target directory; it may already exist
    os.makedirs(os.path.dirname(target_path), exist_ok=True)

    if os.path.exists(target_path):
        shutil.rmtree(target_path)
    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "tomtom.txt"), outfile)
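
The guard at the top of runTomTom (warn, touch the output, return) is a recurring pattern in these pipelines: creating the output sentinel keeps downstream tasks satisfied even when there is nothing to compute. A hedged sketch of the same pattern factored into a decorator (skip_if_empty is a hypothetical helper, not part of cgatcore):

import functools

from cgatcore import experiment as E
from cgatcore import iotools


def skip_if_empty(func):
    # hypothetical helper: short-circuit a task whose input is empty,
    # touching the output so the pipeline still sees an up-to-date file
    @functools.wraps(func)
    def wrapper(infile, outfile, *args, **kwargs):
        if iotools.is_empty(infile):
            E.warn("input %s is empty - no computation performed" % infile)
            iotools.touch_file(outfile)
            return
        return func(infile, outfile, *args, **kwargs)
    return wrapper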
Example #3
    def test_logging_can_be_configured_from_file(self):

        log_config = os.path.join(self.work_dir, "logging.yml")

        with open(log_config, "w") as outf:
            outf.write("""
version: 1
formatters:
  default:
    '()': cgatcore.experiment.MultiLineFormatter
    format: '# %(asctime)s %(levelname)s %(module)s - %(message)s'
  with_app:
    '()': cgatcore.experiment.MultiLineFormatter
    format: '%(asctime)s %(levelname)s %(app_name)s %(module)s - %(message)s'
filters:
  name_filter:
    '()': cgatcore.pipeline.control.LoggingFilterpipelineName
    name: mypipeline_name
handlers:
  console:
    class: logging.StreamHandler
    formatter: default
    stream: ext://sys.stdout
    level: INFO
  second_stream:
    class: logging.FileHandler
    formatter: with_app
    filename: "extra.log"
    level: DEBUG
root:
  handlers: [console]
  level: INFO
loggers:
  cgatcore.pipeline:
    handlers: [second_stream]
    filters: [name_filter]
    level: DEBUG
""")

        retval, stdout, stderr = self.run_command(
            "python {}/template_pipeline.py make all --log-config-filename={}".
            format(ROOT, log_config))

        self.check_files(present=self.expected_output_files + ["extra.log"],
                         absent=["pipeline.log", "shell.log"])

        self.assertFalse(
            iotools.is_empty(os.path.join(self.work_dir, "extra.log")))

        with open(os.path.join(self.work_dir, "extra.log")) as inf:
            self.assertTrue("DEBUG" in inf.read())

        self.assertTrue("DEBUG" not in stdout)
Example #4
def loadMotifInformation(infiles, outfile):
    '''load information about motifs into database.'''

    outf = P.get_temp_file(".")

    outf.write("motif\n")

    for infile in infiles:
        if iotools.is_empty(infile):
            continue
        motif = P.snip(infile, ".motif")
        outf.write("%s\n" % motif)

    outf.close()

    P.load(outf.name, outfile, "--allow-empty-file")

    os.unlink(outf.name)
Example #5
def loadMemeSummary(infiles, outfile):
    '''load information about motifs into database.'''

    outf = P.get_temp_file(".")

    outf.write("track\n")

    for infile in infiles:
        if iotools.is_empty(infile):
            continue
        motif = P.snip(infile, ".meme")
        outf.write("%s\n" % motif)

    outf.close()

    P.load(outf.name, outfile)

    os.unlink(outf.name)
Example #6
def loadMemeSummary(infiles, outfile):
    '''load information about motifs into database.'''

    outf = P.get_temp_file(".")

    outf.write("method\ttrack\n")

    for infile in infiles:
        if IOTools.is_empty(infile):
            continue
        method = re.match(r"(.+)\.dir/", infile).groups()[0]
        track = os.path.basename(".".join(infile.split(".")[:-1]))
        outf.write("%s\t%s\n" % (method, track))

    outf.close()

    P.load(outf.name, outfile)

    os.unlink(outf.name)
Example #7
def loadMemeChipSummary(infiles, outfile):
    '''load information about motifs into database.'''

    outf = P.get_temp_file(".")

    outf.write("track\tnpeaks\twidth\tmasking\tpath\n")

    for infile in infiles:
        if iotools.is_empty(infile):
            continue
        fn = P.snip(os.path.basename(infile), ".memechip")

        track, npeaks, width, masking = fn.split(".")
        outf.write("\t".join(map(str, (track, npeaks, width, masking, fn))) +
                   "\n")

    outf.close()

    P.load(outf.name, outfile)

    os.unlink(outf.name)
Example #8
def save_metric_data(meta_data, table_cache, schema, instance_id: int, session):

    logger = P.get_logger()
    metric_table_filter = None
    if "metric_no_upload" in meta_data:
        if meta_data["metric_no_upload"] == "*":
            logger.warning("upload turned off for metric {}".format(
                meta_data["metric_name"]))
            return
        else:
            metric_table_filter = re.compile(meta_data["metric_no_upload"])

    # multiple tablenames for multiple metric output
    #
    # Tables are added into schemas to avoid cluttering
    # the public namespace.
    # (if only blobs, no metric output file)
    if "metric_output_files" in meta_data:
        assert len(meta_data["metric_output_files"]) == \
            len(meta_data["metric_tablenames"])

        for output_file, tablename in zip(
                meta_data["metric_output_files"],
                meta_data["metric_tablenames"]):

            if metric_table_filter and metric_table_filter.search(tablename):
                logger.warning("upload for table {} turned off".format(
                    tablename))
                continue

            if not os.path.exists(output_file):
                logger.warning("output file {} does not exist - ignored".format(
                    output_file))
                continue

            if IOTools.is_empty(output_file):
                logger.warning("output file {} is empty - ignored".format(
                    output_file))
                continue

            try:
                table = pandas.read_csv(output_file,
                                        sep="\t",
                                        comment="#",
                                        skip_blank_lines=True)
            # ParserError subclasses ValueError, so catch the more
            # specific exception first
            except pandas.errors.ParserError as e:
                logger.warning("malformatted table {} can not be read: {}".format(
                    output_file, str(e)))
                continue
            except ValueError as e:
                logger.warning("table {} can not be read: {}".format(
                    output_file, str(e)))
                continue

            if table.empty:
                logger.warn("table {} is empty - ignored".format(output_file))
                continue

            tablename, table, dtypes = transform_table_before_upload(tablename,
                                                                     table,
                                                                     instance_id,
                                                                     meta_data,
                                                                     table_cache)

            if schema is None:
                tn = tablename
            else:
                tn = "{}.{}".format(schema, tablename)

            # add foreign key
            table["instance_id"] = instance_id
            logger.debug(f"saving data {table.shape} from {output_file} to table {tn} under {instance_id}")
            table_cache.add_table(table, tablename, dtypes)

    if "metric_blob_globs" in meta_data:
        metric_dir = meta_data["metric_outdir"]
        files = [glob.glob(os.path.join(metric_dir, x))
                 for x in meta_data["metric_blob_globs"]]
        files = IOTools.flatten(files)
        logger.debug(
            "uploading binary data in {} files from {} to "
            "table binary_data".format(len(files), metric_dir))
        table = []
        for fn in files:
            with IOTools.open_file(fn, "rb", encoding=None) as inf:
                data_row = BenchmarkBinaryData(
                    instance_id=instance_id,
                    filename=os.path.basename(fn),
                    path=fn,
                    data=inf.read())
                session.add(data_row)
            session.commit()

    if meta_data.get("metric_tableindices", None):
        table_cache.add_indices(meta_data["metric_tableindices"])
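
The guards in save_metric_data run from cheap to expensive: os.path.exists catches missing outputs, is_empty rejects zero-byte files before pandas is invoked, and table.empty catches files that parse to a header but no rows. A condensed sketch of the same validation chain (read_metric_table is a hypothetical helper; the pandas call mirrors the one above):

import os

import pandas


def read_metric_table(output_file):
    # hypothetical condensation of the guards above: return None when the
    # file is missing, zero bytes long, unparseable, or parses to no rows
    if not os.path.exists(output_file):
        return None
    if os.stat(output_file).st_size == 0:
        return None
    try:
        table = pandas.read_csv(output_file, sep="\t",
                                comment="#", skip_blank_lines=True)
    except (pandas.errors.ParserError, ValueError):
        return None
    return None if table.empty else table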
Example #9
def runMAST(infiles, outfile):
    '''run mast on all intervals and motifs.

    Collect all results for an E-value up to 10000 so that all
    sequences are output and MAST curves can be computed.

    10000 is a heuristic.

    '''

    # job_options = "-l mem_free=8000M"

    controlfile, dbfile, motiffiles = infiles

    if iotools.is_empty(dbfile) or len(motiffiles) == 0:
        iotools.touch_file(outfile)
        return

    if not os.path.exists(controlfile):
        raise ValueError("control file %s for %s does not exist" %
                         (controlfile, dbfile))

    # remove previous results
    if os.path.exists(outfile):
        os.remove(outfile)

    tmpdir = P.get_temp_dir(".")
    tmpfile = P.get_temp_filename(".")

    for motiffile in motiffiles:
        if iotools.is_empty(motiffile):
            L.info("skipping empty motif file %s" % motiffile)
            continue

        of = iotools.open_file(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - foreground ::\n" % motif)
        of.close()

        # mast bails if the number of nucleotides gets larger than
        # 2186800982?
        # To avoid this, run db and control file separately.
        statement = '''
        cat %(dbfile)s
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s -ev %(mast_evalue)f %(mast_options)s >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run(statement)

        of = iotools.open_file(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - background ::\n" % motif)
        of.close()

        statement = '''
        cat %(controlfile)s
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s -ev %(mast_evalue)f %(mast_options)s >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run(statement)

    P.run("gzip < %(tmpfile)s > %(outfile)s")

    shutil.rmtree(tmpdir)
    os.unlink(tmpfile)
Example #10
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--methods",
                      dest="methods",
                      type="choice",
                      action="append",
                      choices=("filter", "keep-first-base", "set-nh",
                               "set-sequence", "strip-sequence",
                               "strip-quality", "unstrip",
                               "unset-unmapped-mapq", "downsample-single",
                               "downsample-paired", "add-sequence-error"),
                      help="methods to apply [%default]")

    parser.add_option("--strip-method",
                      dest="strip_method",
                      type="choice",
                      choices=("all", "match"),
                      help="define which sequences/qualities to strip. "
                      "match means that stripping only applies to entries "
                      "without mismatches (requires NM tag to be present). "
                      "[%default]")

    parser.add_option("--filter-method",
                      dest="filter_methods",
                      action="append",
                      type="choice",
                      choices=('NM', 'CM', "mapped", "unique", "non-unique",
                               "remove-list", "keep-list", "error-rate",
                               "min-read-length", "min-average-base-quality"),
                      help="filter method to apply to remove alignments "
                      "from a bam file. Multiple methods can be supplied "
                      "[%default]")

    parser.add_option("--reference-bam-file",
                      dest="reference_bam",
                      type="string",
                      help="bam-file to filter with [%default]")

    parser.add_option("--force-output",
                      dest="force",
                      action="store_true",
                      help="force processing. Some methods such "
                      "as strip/unstrip will stop processing if "
                      "they think it not necessary "
                      "[%default]")

    parser.add_option("--output-sam",
                      dest="output_sam",
                      action="store_true",
                      help="output in sam format [%default]")

    parser.add_option("--first-fastq-file",
                      "-1",
                      dest="fastq_pair1",
                      type="string",
                      help="fastq file with read information for first "
                      "in pair or unpaired. Used for unstripping sequence "
                      "and quality scores [%default]")

    parser.add_option("--second-fastq-file",
                      "-2",
                      dest="fastq_pair2",
                      type="string",
                      help="fastq file with read information for second "
                      "in pair. Used for unstripping sequence "
                      "and quality scores  [%default]")

    parser.add_option("--downsample",
                      dest="downsample",
                      type="int",
                      help="Number of reads to downsample to")

    parser.add_option(
        "--filename-read-list",
        dest="filename_read_list",
        type="string",
        help="filename with list of reads to filter if 'keep-list' or "
        "'remove-list' filter method is chosen [%default]")

    parser.add_option(
        "--error-rate",
        dest="error_rate",
        type="float",
        help="error rate to use as filter. Reads with an error rate "
        "higher than the threshold will be removed [%default]")

    parser.add_option("--minimum-read-length",
                      dest="minimum_read_length",
                      type="int",
                      help="minimum read length when filtering [%default]")

    parser.add_option(
        "--minimum-average-base-quality",
        dest="minimum_average_base_quality",
        type="float",
        help="minimum average base quality when filtering [%default]")

    parser.set_defaults(
        methods=[],
        output_sam=False,
        reference_bam=None,
        filter_methods=[],
        strip_method="all",
        force=False,
        fastq_pair1=None,
        fastq_pair2=None,
        downsample=None,
        random_seed=None,
        filename_read_list=None,
        error_rate=None,
        minimum_read_length=0,
        minimum_average_base_quality=0,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if options.stdin != sys.stdin:
        bamfile = options.stdin.name
    elif args:
        bamfile = args[0]
        if len(args) > 1:
            raise ValueError("multiple bam files provided in arguments")
    else:
        bamfile = "-"

    if "remove-list" in options.filter_methods or "keep-list" in options.filter_methods:
        if "remove-list" in options.filter_methods and "keep-list" in options.filter_methods:
            raise ValueError(
                "it is not possible to specify both remove-list and keep-list")

        if not options.filename_read_list:
            raise ValueError(
                "remove-list/keep-list filtering requires "
                "--filename-read-list to be set")

        with iotools.open_file(options.filename_read_list) as inf:
            filter_query_names = set(
                [x.strip() for x in inf.readlines() if not x.startswith("#")])
        E.info("read query_sequence filter list with {} read names".format(
            len(filter_query_names)))

    if "error-rate" in options.filter_methods and not options.error_rate:
        raise ValueError(
            "filtering by error-rate requires --error-rate to be set")

    if "add-sequence-error" in options.methods and not options.error_rate:
        raise ValueError("--add-error-rate requires --error-rate to be set")

    E.info('processing %s' % bamfile)
    if bamfile != "-" and iotools.is_empty(bamfile):
        E.warn('ignoring empty file %s' % bamfile)
        E.stop()
        return

    if options.stdout != sys.stdout:
        output_bamfile = options.stdout.name
    else:
        output_bamfile = "-"
        if options.stdlog == sys.stdout:
            raise ValueError(
                "redirect log-stream to file (--log) if outputting to stdout")

    if options.output_sam:
        output_mode = "wh"
    else:
        output_mode = "wb"

    # reading bam from stdin does not work with only the "r" tag
    with pysam.AlignmentFile(bamfile, "rb") as pysam_in:
        with pysam.AlignmentFile(output_bamfile,
                                 output_mode,
                                 template=pysam_in) as pysam_out:
            process_bam(pysam_in, pysam_out, options)

    # write footer and output benchmark information.
    E.stop()
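
The is_empty check before pysam.AlignmentFile matters because a zero-byte file is not a valid BAM (there is no BGZF header or EOF block), so the script exits cleanly via E.stop() instead of crashing on open. A minimal sketch of that guard in isolation (open_bam_or_none is a hypothetical wrapper, assuming the same size-based emptiness test):

import os

import pysam


def open_bam_or_none(bamfile):
    # hypothetical sketch: skip zero-byte inputs, which pysam cannot
    # open as BAM; "-" means stdin and cannot be size-checked
    if bamfile != "-" and os.stat(bamfile).st_size == 0:
        return None
    return pysam.AlignmentFile(bamfile, "rb")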
Example #11
def BedFileVenn(infiles, outfile):
    '''merge :term:`bed` formatted *infiles* by intersection
    and write to *outfile*.

    Only intervals that overlap in all files are retained.
    Interval coordinates are given by the first file in *infiles*.

    Bed files are normalized (overlapping intervals within 
    a file are merged) before intersection. 

    Intervals are renumbered starting from 1.
    '''
    liver, testes = infiles
    liver_name = P.snip(os.path.basename(liver), ".replicated.bed")
    testes_name = P.snip(os.path.basename(testes), ".replicated.bed")
    to_cluster = True

    statement = '''cat %(liver)s %(testes)s | mergeBed -i stdin | awk 'OFS="\\t" {print $1,$2,$3,"CAPseq"NR}' > replicated_intervals/liver.testes.merge.bed;
                   echo "Total merged intervals" > %(outfile)s; cat replicated_intervals/liver.testes.merge.bed | wc -l >> %(outfile)s; 
                   echo "Liver & testes" >> %(outfile)s; intersectBed -a replicated_intervals/liver.testes.merge.bed -b %(liver)s -u | intersectBed -a stdin -b %(testes)s -u > replicated_intervals/liver.testes.shared.bed; cat replicated_intervals/liver.testes.shared.bed | wc -l >> %(outfile)s; 
                   echo "Testes only" >> %(outfile)s; intersectBed -a replicated_intervals/liver.testes.merge.bed -b %(liver)s -v > replicated_intervals/%(testes_name)s.liver.testes.unique.bed; cat replicated_intervals/%(testes_name)s.liver.testes.unique.bed | wc -l >> %(outfile)s; 
                   echo "Liver only" >> %(outfile)s; intersectBed -a replicated_intervals/liver.testes.merge.bed -b %(testes)s -v > replicated_intervals/%(liver_name)s.liver.testes.unique.bed; cat replicated_intervals/%(liver_name)s.liver.testes.unique.bed | wc -l >> %(outfile)s;                   
                   sed -i '{N;s/\\n/\\t/g}' %(outfile)s; '''

    if len(infiles) == 1:
        shutil.copyfile(infiles[0], outfile)

    elif len(infiles) == 2:

        if iotools.is_empty(infiles[0]) or iotools.is_empty(infiles[1]):
            iotools.touch_file(outfile)
        else:
            statement = '''
        intersectBed -u -a %s -b %s 
        | cut -f 1,2,3,4,5 
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        > %%(outfile)s 
        ''' % (infiles[0], infiles[1])
            P.run(statement)

    else:

        tmpfile = P.get_temp_filename(".")

        # need to merge incrementally
        fn = infiles[0]
        if iotools.is_empty(infiles[0]):
            iotools.touch_file(outfile)
            return

        statement = '''mergeBed -i %(fn)s > %(tmpfile)s'''
        P.run(statement)

        for fn in infiles[1:]:
            if iotools.is_empty(fn):
                iotools.touch_file(outfile)
                os.unlink(tmpfile)
                return

            statement = '''mergeBed -i %(fn)s | intersectBed -u -a %(tmpfile)s -b stdin > %(tmpfile)s.tmp; mv %(tmpfile)s.tmp %(tmpfile)s'''
            P.run(statement)

        statement = '''cat %(tmpfile)s
        | cut -f 1,2,3,4,5 
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        > %(outfile)s '''
        P.run(statement)

        os.unlink(tmpfile)