Example #1
0
def setup_example(config):
    """Copy the example data bundled with the sample database into
    '<destination>/zonkey_pipeline'.

    Only archive members located directly in 'examples/' are copied.
    Returns 0 on success, or 1 if output files already exist at the
    destination or the archive contains no example data.
    """
    root = os.path.join(config.destination, 'zonkey_pipeline')

    with tarfile.TarFile(config.tablefile) as tar_handle:
        example_files = []
        existing_files = []
        for member in tar_handle.getmembers():
            if os.path.dirname(member.name) == 'examples' and member.isfile():
                example_files.append(member)

                destination = fileutils.reroot_path(root, member.name)
                if os.path.exists(destination):
                    existing_files.append(destination)

        if existing_files:
            print_err("Output files already exist at destination:\n    - %s"
                      % ("\n    - ".join(map(repr, existing_files))))
            return 1
        elif not example_files:
            print_err("Sample database %r does not contain example data; "
                      "cannot proceed." % (config.tablefile,))
            return 1

        if not os.path.exists(root):
            fileutils.make_dirs(root)

        for member in example_files:
            destination = fileutils.reroot_path(root, member.name)
            # 'extractfile' yields a binary file object; copy in binary mode
            # to avoid newline translation, and close the source explicitly.
            src_handle = tar_handle.extractfile(member)
            try:
                with open(destination, 'wb') as out_handle:
                    shutil.copyfileobj(src_handle, out_handle)
            finally:
                src_handle.close()

    print_info("Successfully saved example data in %r" % (root,))

    return 0
Example #2
0
    def finalize(self):
        """Called by the pipeline at the termination of a run. By default,
        this function prints the location of the log-file if one was created
        during the run (e.g. if there were errors), and a summary of all nodes.
        """
        runtime = (self._end_time or 0) - (self._start_time or 0)

        # Overall outcome depends on whether any node ended in an error state
        if not self.states[self.ERROR]:
            print_info("Done ...")
        else:
            print_err("Done; but errors were detected ...")

        print_info()
        summary = [
            ("  Number of nodes:", sum(self.states)),
            ("  Number of done nodes:", self.states[self.DONE]),
            ("  Number of runable nodes:", self.states[self.RUNABLE]),
            ("  Number of queued nodes:", self.states[self.QUEUED]),
            ("  Number of outdated nodes:", self.states[self.OUTDATED]),
            ("  Number of failed nodes:", self.states[self.ERROR]),
            ("  Pipeline runtime:", _fmt_runtime(round(runtime))),
        ]

        for row in text.padded_table(summary):
            print_info(row)

        print_info("\nUse --list-output-files to view status of output files.")

        logfile = paleomix.logger.get_logfile()
        if logfile:
            print_debug("Log-file located at %r" % (logfile,))

        print_info()
Example #3
0
def setup_example(config):
    """Copy the example data bundled with the sample database into
    '<destination>/zonkey_pipeline'.

    Only archive members located directly in 'examples/' are copied.
    Returns 0 on success, or 1 if output files already exist at the
    destination or the archive contains no example data.
    """
    root = os.path.join(config.destination, 'zonkey_pipeline')

    with tarfile.TarFile(config.tablefile) as tar_handle:
        example_files = []
        existing_files = []
        for member in tar_handle.getmembers():
            if os.path.dirname(member.name) == 'examples' and member.isfile():
                example_files.append(member)

                destination = fileutils.reroot_path(root, member.name)
                if os.path.exists(destination):
                    existing_files.append(destination)

        if existing_files:
            print_err("Output files already exist at destination:\n    - %s"
                      % ("\n    - ".join(map(repr, existing_files))))
            return 1
        elif not example_files:
            print_err("Sample database %r does not contain example data; "
                      "cannot proceed." % (config.tablefile,))
            return 1

        if not os.path.exists(root):
            fileutils.make_dirs(root)

        for member in example_files:
            destination = fileutils.reroot_path(root, member.name)
            # 'extractfile' yields a binary file object; copy in binary mode
            # to avoid newline translation, and close the source explicitly.
            src_handle = tar_handle.extractfile(member)
            try:
                with open(destination, 'wb') as out_handle:
                    shutil.copyfileobj(src_handle, out_handle)
            finally:
                src_handle.close()

    print_info("Successfully saved example data in %r" % (root,))

    return 0
Example #4
0
    def finalize(self):
        """Called by the pipeline at the termination of a run. By default,
        this function prints the location of the log-file if one was created
        during the run (e.g. if there were errors), and a summary of all nodes.
        """
        # Both timestamps use the '_'-prefixed attributes; 'self.start_time'
        # (no underscore) was inconsistent with 'self._end_time' on the same
        # line and with the sibling implementation of this method.
        runtime = (self._end_time or 0) - (self._start_time or 0)

        if self.states[self.ERROR]:
            print_err("Done; but errors were detected ...")
        else:
            print_info("Done ...")

        print_info()
        rows = [("  Number of nodes:", sum(self.states)),
                ("  Number of done nodes:", self.states[self.DONE]),
                ("  Number of runable nodes:", self.states[self.RUNABLE]),
                ("  Number of queued nodes:", self.states[self.QUEUED]),
                ("  Number of outdated nodes:", self.states[self.OUTDATED]),
                ("  Number of failed nodes:", self.states[self.ERROR]),
                # Round to whole seconds for display, matching the sibling
                # implementation of this method.
                ("  Pipeline runtime:", _fmt_runtime(round(runtime)))]

        for line in text.padded_table(rows):
            print_info(line)

        print_info("\nUse --list-output-files to view status of output files.")

        logfile = paleomix.logger.get_logfile()
        if logfile:
            print_debug("Log-file located at %r" % (logfile,))

        print_info()
Example #5
0
def main(argv):
    """Entry point; parses the zonkey configuration from 'argv'.

    Returns 1 if the configuration could not be parsed.
    """
    try:
        config = zonkey_config.parse_config(argv)
        if config is None:
            return 1
    # 'except X as e' replaces the py2-only 'except X, e' form; equivalent
    # on Python 2.6+ and required for Python 3 compatibility.
    except zonkey_config.ConfigError as error:
        print_err(error)
        return 1
Example #6
0
def main(argv):
    """Entry point; parses the zonkey configuration from 'argv'.

    Returns 1 if the configuration could not be parsed.
    """
    try:
        config = zonkey_config.parse_config(argv)
        if config is None:
            return 1
    # 'except X as e' replaces the py2-only 'except X, e' form; equivalent
    # on Python 2.6+ and required for Python 3 compatibility.
    except zonkey_config.ConfigError as error:
        print_err(error)
        return 1
Example #7
0
def run(config, args, pipeline_variant):
    """Run the BAM/trim pipeline variant; validates the variant name and
    ensures the configured temp root exists.

    Raises ValueError for unknown variants; returns 1 if the temp root
    cannot be created.
    """
    if pipeline_variant not in ("bam", "trim"):
        raise ValueError("Unexpected BAM pipeline variant (%r)"
                         % (pipeline_variant,))

    if not os.path.exists(config.temp_root):
        try:
            os.makedirs(config.temp_root)
        # 'as' form replaces the py2-only 'except X, e' syntax.
        except OSError as error:
            print_err("ERROR: Could not create temp root:\n\t%s" % (error,))
            return 1
Example #8
0
def run(config, args, pipeline_variant):
    """Run the BAM/trim pipeline variant; validates the variant name and
    ensures the configured temp root exists.

    Raises ValueError for unknown variants; returns 1 if the temp root
    cannot be created.
    """
    if pipeline_variant not in ("bam", "trim"):
        raise ValueError("Unexpected BAM pipeline variant (%r)" %
                         (pipeline_variant, ))

    if not os.path.exists(config.temp_root):
        try:
            os.makedirs(config.temp_root)
        # 'as' form replaces the py2-only 'except X, e' syntax.
        except OSError as error:
            print_err("ERROR: Could not create temp root:\n\t%s" % (error, ))
            return 1
Example #9
0
def read_sample_sheets(filenames):
    """Collect FASTQ path templates from one or more Illumina SampleSheet
    files.

    Each entry in 'filenames' may be a directory (the sheet is then read
    from '<dir>/<_FILENAME>') or a path to a sheet itself.

    Returns a nested mapping {SampleID: {Index: {lane_key: path}}}, or
    None if a sheet is missing or malformed.  Duplicate lane keys are
    disambiguated with a numeric suffix.
    """
    records = {}
    for root in filenames:
        if os.path.isdir(root):
            filename = os.path.join(root, _FILENAME)
        else:
            # A file was given; treat its parent directory as the run folder
            root, filename = os.path.split(root)[0], root

        if not os.path.exists(filename):
            print_err("ERROR: Could not find SampleSheet file: %r" % filename)
            return None

        sample_sheet = read_alignment_records(filename)
        if sample_sheet is None:
            return None

        for record in sample_sheet:
            record["Lane"] = int(record["Lane"])
            path = "%(SampleID)s_%(Index)s_L%(Lane)03i_R{Pair}_*.fastq.gz" \
                % record
            record["Path"] = select_path(os.path.join(root, path))
            key = "%(FCID)s_%(Lane)s" % record

            libraries = records.setdefault(record["SampleID"], {})
            barcodes = libraries.setdefault(record["Index"], {})
            barcodes.setdefault(key, []).append(path)

    # Clean up names; generate unique names for duplicate lanes
    # ('values'/'list(items())' are identical to the py2 'itervalues'/'items'
    # forms for this use, but remain valid when the dict is mutated on py3)
    for libraries in records.values():
        for barcodes in libraries.values():
            for key, paths in list(barcodes.items()):
                if len(paths) == 1:
                    barcodes[key] = paths[0]
                    continue

                counter = 1
                for path in paths:
                    new_key = "%s_%i" % (key, counter)

                    while new_key in barcodes:
                        counter += 1
                        new_key = "%s_%i" % (key, counter)

                    barcodes[new_key] = path

                barcodes.pop(key)

    return records
Example #10
0
def read_sample_sheets(filenames):
    """Collect FASTQ path templates from one or more Illumina SampleSheet
    files.

    Each entry in 'filenames' may be a directory (the sheet is then read
    from '<dir>/<_FILENAME>') or a path to a sheet itself.

    Returns a nested mapping {SampleID: {Index: {lane_key: path}}}, or
    None if a sheet is missing or malformed.  Duplicate lane keys are
    disambiguated with a numeric suffix.
    """
    records = {}
    for root in filenames:
        if os.path.isdir(root):
            filename = os.path.join(root, _FILENAME)
        else:
            # A file was given; treat its parent directory as the run folder
            root, filename = os.path.split(root)[0], root

        if not os.path.exists(filename):
            print_err("ERROR: Could not find SampleSheet file: %r" % filename)
            return None

        sample_sheet = read_alignment_records(filename)
        if sample_sheet is None:
            return None

        for record in sample_sheet:
            record["Lane"] = int(record["Lane"])
            path = "%(SampleID)s_%(Index)s_L%(Lane)03i_R{Pair}_*.fastq.gz" \
                % record
            record["Path"] = select_path(os.path.join(root, path))
            key = "%(FCID)s_%(Lane)s" % record

            libraries = records.setdefault(record["SampleID"], {})
            barcodes = libraries.setdefault(record["Index"], {})
            barcodes.setdefault(key, []).append(path)

    # Clean up names; generate unique names for duplicate lanes
    # ('values'/'list(items())' are identical to the py2 'itervalues'/'items'
    # forms for this use, but remain valid when the dict is mutated on py3)
    for libraries in records.values():
        for barcodes in libraries.values():
            for key, paths in list(barcodes.items()):
                if len(paths) == 1:
                    barcodes[key] = paths[0]
                    continue

                counter = 1
                for path in paths:
                    new_key = "%s_%i" % (key, counter)

                    while new_key in barcodes:
                        counter += 1
                        new_key = "%s_%i" % (key, counter)

                    barcodes[new_key] = path

                barcodes.pop(key)

    return records
Example #11
0
def main(argv, pipeline="bam"):
    """Print a skeleton makefile for the BAM/trim pipeline, populated with
    any samples described by SampleSheet files found via the given paths.

    Returns 0 on success and 1 if a SampleSheet file is missing or
    malformed.
    """
    assert pipeline in ("bam", "trim"), pipeline

    options, paths = parse_args(argv)
    records = {}
    for root in paths:
        if os.path.isdir(root):
            filename = os.path.join(root, _FILENAME)
        else:
            root, filename = os.path.split(root)[0], root

        if not os.path.exists(filename):
            print_err("ERROR: Could not find SampleSheet file: %r" % filename)
            return 1

        # 'read_alignment_records' returns None (after printing an error)
        # for malformed sheets; the original iterated it unchecked, which
        # would raise a TypeError instead of exiting cleanly.
        sample_sheet = read_alignment_records(filename)
        if sample_sheet is None:
            return 1

        for record in sample_sheet:
            libraries = records.setdefault(record["SampleID"], {})
            barcodes = libraries.setdefault(record["Index"], [])

            record["Lane"] = int(record["Lane"])
            path = "%(SampleID)s_%(Index)s_L%(Lane)03i_R{Pair}_*.fastq.gz" \
                % record
            record["Path"] = select_path(os.path.join(root, path))
            barcodes.append(record)

    template = build_makefile(add_full_options=(pipeline == "bam"),
                              add_prefix_tmpl=(pipeline == "bam"))
    if options.minimal:
        template = strip_comments(template)

    print(template)

    # '.items()' is equivalent to the py2-only '.iteritems()' here
    for (sample, libraries) in records.items():
        print("%s:" % sample)
        print("  %s:" % sample)
        for (library, barcodes) in libraries.items():
            print("    %s:" % library)
            for record in barcodes:
                print("      {FCID}_{Lane}: {Path}".format(**record))
            print()
        print()

    if argv:
        print_info("Automatically generated makefile printed.\n"
                   "Please check for correctness before running pipeline.")
    return 0
Example #12
0
def read_alignment_records(filename):
    """Parse a comma-separated SampleSheet file into a list of row dicts.

    The first line must be a header containing at least the SampleID,
    Index, Lane, and FCID columns; blank lines are skipped.  Returns None
    (after printing an error) if the file is empty, lacks required
    columns, or contains a row with an unexpected number of fields.
    """
    rows = []
    with open(filename) as handle:
        first_line = handle.readline()
        if not first_line:
            print_err("ERROR: Empty SampleSheet.csv file: %r"
                      % (filename,))
            return None

        header = first_line.strip().split(",")
        required = set(("SampleID", "Index", "Lane", "FCID"))
        missing = required - set(header)
        if missing:
            print_err("ERROR: Required columns missing from SampleSheet file "
                      "%r: %s" % (filename, ", ".join(map(repr, missing))))
            return None

        for line_no, raw_line in enumerate(handle, start=2):
            stripped = raw_line.strip()
            if not stripped:
                continue

            fields = stripped.split(",")
            if len(fields) != len(header):
                print_err("Line %i in SampleSheet file %r does not contain "
                          "the expected number of columns; expected %i, but "
                          "found %i."
                          % (line_no, filename, len(header), len(fields)))
                return None

            rows.append(dict(zip(header, fields)))

    return rows
Example #13
0
def read_alignment_records(filename):
    """Parse a comma-separated SampleSheet file into a list of row dicts.

    The first line must be a header containing at least the SampleID,
    Index, Lane, and FCID columns; blank lines are skipped.  Returns None
    (after printing an error) if the file is empty, lacks required
    columns, or contains a row with an unexpected number of fields.
    """
    rows = []
    with open(filename) as handle:
        first_line = handle.readline()
        if not first_line:
            print_err("ERROR: Empty SampleSheet.csv file: %r" % (filename, ))
            return None

        header = first_line.strip().split(",")
        required = set(("SampleID", "Index", "Lane", "FCID"))
        missing = required - set(header)
        if missing:
            print_err("ERROR: Required columns missing from SampleSheet file "
                      "%r: %s" % (filename, ", ".join(map(repr, missing))))
            return None

        for line_no, raw_line in enumerate(handle, start=2):
            stripped = raw_line.strip()
            if not stripped:
                continue

            fields = stripped.split(",")
            if len(fields) != len(header):
                print_err("Line %i in SampleSheet file %r does not contain "
                          "the expected number of columns; expected %i, but "
                          "found %i." %
                          (line_no, filename, len(header), len(fields)))
                return None

            rows.append(dict(zip(header, fields)))

    return rows
Example #14
0
def main(argv, pipeline="bam"):
    """Entry point for the BAM/trim pipelines.

    Dispatches on the first positional argument (usage text, makefile
    generation, remapping, example data) before parsing the run
    configuration.  Returns an exit code (0 on success, 1 on usage or
    configuration errors).
    """
    assert pipeline in ("bam", "trim"), pipeline

    commands = ("makefile", "mkfile", "run",
                "dry_run", "dry-run", "dryrun",
                "remap", "example", "examples")

    if not argv or (argv[0] == "help"):
        _print_usage(pipeline)
        return 0
    elif argv[0] not in commands:
        _print_usage(pipeline)
        return 1
    elif argv[0] in ("mkfile", "makefile"):
        return bam_mkfile.main(argv[1:], pipeline=pipeline)
    elif argv[0] in ("remap", "remap_prefix"):
        # Import here to avoid circular dependency issues
        import paleomix.tools.bam_pipeline.remap as bam_remap

        return bam_remap.main(argv[1:])
    elif argv[0] in ("example", "examples"):
        return paleomix.resources.copy_example("bam_pipeline", argv[1:])

    try:
        config, args = bam_config.parse_config(argv, pipeline)

        if not args[1:]:
            print_err("Please specify at least one makefile!")
            print_err("Use --help for more information.")
            return 1
        elif args and args[0].startswith("dry"):
            config.dry_run = True
    # 'as' form replaces the py2-only 'except X, e' syntax.
    except bam_config.ConfigError as error:
        print_err(error)
        return 1
Example #15
0
def main(argv, pipeline="bam"):
    """Entry point for the BAM/trim pipelines.

    Dispatches on the first positional argument (usage text, makefile
    generation, remapping, example data) before parsing the run
    configuration.  Returns an exit code (0 on success, 1 on usage or
    configuration errors).
    """
    assert pipeline in ("bam", "trim"), pipeline

    commands = ("makefile", "mkfile", "run", "dry_run", "dry-run", "dryrun",
                "remap", "example", "examples")

    if not argv or (argv[0] == "help"):
        _print_usage(pipeline)
        return 0
    elif argv[0] not in commands:
        _print_usage(pipeline)
        return 1
    elif argv[0] in ("mkfile", "makefile"):
        return bam_mkfile.main(argv[1:], pipeline=pipeline)
    elif argv[0] in ("remap", "remap_prefix"):
        # Import here to avoid circular dependency issues
        import paleomix.tools.bam_pipeline.remap as bam_remap

        return bam_remap.main(argv[1:])
    elif argv[0] in ("example", "examples"):
        return paleomix.resources.copy_example("bam_pipeline", argv[1:])

    try:
        config, args = bam_config.parse_config(argv, pipeline)

        if not args[1:]:
            print_err("Please specify at least one makefile!")
            print_err("Use --help for more information.")
            return 1
        elif args and args[0].startswith("dry"):
            config.dry_run = True
    # 'as' form replaces the py2-only 'except X, e' syntax.
    except bam_config.ConfigError as error:
        print_err(error)
        return 1
Example #16
0
def main(argv):
    """Entry point; parses the run configuration from 'argv'.

    Returns 1 if the configuration could not be parsed.
    """
    try:
        config, args = parse_config(argv)
    # 'as' form replaces the py2-only 'except X, e' syntax.
    except ConfigError as error:
        print_err(error)
        return 1
Example #17
0

def run(config, args, pipeline_variant):
    """Set up and run the BAM/trim pipeline: validates the variant name,
    prepares the temp root, and reads the makefiles.

    Raises ValueError for unknown variants; returns 1 on temp-root or
    makefile errors.
    """
    if pipeline_variant not in ("bam", "trim"):
        raise ValueError("Unexpected BAM pipeline variant (%r)" %
                         (pipeline_variant, ))

    if not os.path.exists(config.temp_root):
        try:
            os.makedirs(config.temp_root)
        # 'as' form replaces the py2-only 'except X, e' syntax.
        except OSError as error:
            print_err("ERROR: Could not create temp root:\n\t%s" % (error, ))
            return 1

    if not os.access(config.temp_root, os.R_OK | os.W_OK | os.X_OK):
        print_err("ERROR: Insufficient permissions for temp root: '%s'" %
                  (config.temp_root, ))
        return 1

    # Init worker-threads before reading in any more data
    pipeline = Pypeline(config)

    try:
        print_info("Reading makefiles ...")
        makefiles = read_makefiles(config, args, pipeline_variant)
    except (MakefileError, paleomix.yaml.YAMLError, IOError) as error:
        print_err("Error reading makefiles:",
                  "\n  %s:\n   " % (error.__class__.__name__, ),
                  "\n    ".join(str(error).split("\n")))
        return 1

    logfile_template = time.strftime("bam_pipeline.%Y%m%d_%H%M%S_%%02i.log")
Example #18
0

def run(config, args, pipeline_variant):
    """Set up and run the BAM/trim pipeline: validates the variant name,
    prepares the temp root, and reads the makefiles.

    Raises ValueError for unknown variants; returns 1 on temp-root or
    makefile errors.
    """
    if pipeline_variant not in ("bam", "trim"):
        raise ValueError("Unexpected BAM pipeline variant (%r)"
                         % (pipeline_variant,))

    if not os.path.exists(config.temp_root):
        try:
            os.makedirs(config.temp_root)
        # 'as' form replaces the py2-only 'except X, e' syntax.
        except OSError as error:
            print_err("ERROR: Could not create temp root:\n\t%s" % (error,))
            return 1

    if not os.access(config.temp_root, os.R_OK | os.W_OK | os.X_OK):
        print_err("ERROR: Insufficient permissions for temp root: '%s'"
                  % (config.temp_root,))
        return 1

    # Init worker-threads before reading in any more data
    pipeline = Pypeline(config)

    try:
        print_info("Reading makefiles ...")
        makefiles = read_makefiles(config, args, pipeline_variant)
    except (MakefileError, paleomix.yaml.YAMLError, IOError) as error:
        print_err("Error reading makefiles:",
                  "\n  %s:\n   " % (error.__class__.__name__,),
                  "\n    ".join(str(error).split("\n")))
        return 1

    logfile_template = time.strftime("bam_pipeline.%Y%m%d_%H%M%S_%%02i.log")
Example #19
0
def main(argv):
    """Entry point; parses the run configuration from 'argv'.

    Returns 1 if the configuration could not be parsed.
    """
    try:
        config, args = parse_config(argv)
    # 'as' form replaces the py2-only 'except X, e' syntax.
    except ConfigError as error:
        print_err(error)
        return 1
Example #20
0
        # Update interpreter to match the one currently in use;
        # this is required since we may be running from a virtual env
        filename = os.path.join(argv[1], 'phylo_pipeline',
                                'synthesize_reads.py')

        # Split off the first line (the original shebang); 'lines' keeps
        # the remainder of the script unchanged
        with open(filename) as handle:
            header, lines = handle.read().split('\n', 1)

        with open(filename, 'w') as handle:
            # Re-write the script with a shebang naming this interpreter
            handle.write('#!%s\n' % (os.path.abspath(sys.executable, )))
            handle.write(lines)

        return 0
    elif (len(args) < 2) and ("mkfile" not in args and "makefile" not in args):
        print_err("\nPlease specify at least one makefile!")
        return 1

    # The first argument names the command(s) to run; makefile generation
    # is dispatched separately from pipeline execution
    commands = select_commands(args.pop(0))
    if any((cmd in ("makefile", "mkfile")) for (cmd, _) in commands):
        return mkfile.main(args[1:])

    if not os.path.exists(config.temp_root):
        try:
            os.makedirs(config.temp_root)
        except OSError, error:
            print_err("ERROR: Could not create temp root:\n\t%s" % (error, ))
            return 1

    if not os.access(config.temp_root, os.R_OK | os.W_OK | os.X_OK):
        print_err("ERROR: Insufficient permissions for temp root: '%s'" %
Example #21
0
        # Update interpreter to match the one currently in use;
        # this is required since we may be running from a virtual env
        filename = os.path.join(argv[1],
                                'phylo_pipeline',
                                'synthesize_reads.py')

        # Split off the first line (the original shebang); 'lines' keeps
        # the remainder of the script unchanged
        with open(filename) as handle:
            header, lines = handle.read().split('\n', 1)

        with open(filename, 'w') as handle:
            # Re-write the script with a shebang naming this interpreter
            handle.write('#!%s\n' % (os.path.abspath(sys.executable, )))
            handle.write(lines)

        return 0
    elif (len(args) < 2) and ("mkfile" not in args and "makefile" not in args):
        print_err("\nPlease specify at least one makefile!")
        return 1

    # The first argument names the command(s) to run; makefile generation
    # is dispatched separately from pipeline execution
    commands = select_commands(args.pop(0))
    if any((cmd in ("makefile", "mkfile")) for (cmd, _) in commands):
        return mkfile.main(args[1:])

    if not os.path.exists(config.temp_root):
        try:
            os.makedirs(config.temp_root)
        except OSError, error:
            print_err("ERROR: Could not create temp root:\n\t%s" % (error,))
            return 1

    if not os.access(config.temp_root, os.R_OK | os.W_OK | os.X_OK):
        print_err("ERROR: Insufficient permissions for temp root: '%s'"
Example #22
0
def setup_mito_mapping(config):
    """Write a BAM-pipeline makefile ('makefile.yaml') plus one FASTA file
    per mitochondrial reference from the sample database, under
    'config.destination'.

    Records whose 'meta' field contains "EXCLUDE" are skipped.  Returns 0
    on success, or 1 if any output file already exists.
    """
    genomes_root = os.path.join(config.destination, "genomes")
    if not os.path.exists(genomes_root):
        fileutils.make_dirs(genomes_root)

    mkfile_fpath = os.path.join(config.destination, "makefile.yaml")

    # '.items()' is equivalent to the py2-only '.iteritems()' here
    filenames = [mkfile_fpath]
    for name, record in sorted(config.database.mitochondria.items()):
        filenames.append(os.path.join(genomes_root, "%s.fasta"
                                      % (record.name,)))

    existing_filenames = [filename for filename in filenames
                          if os.path.exists(filename)]

    # A bit strict, but avoid accidental overwrites
    if existing_filenames:
        print_err("ERROR: Output file(s) already exists, "
                  "cannot proceed:\n    %s"
                  % ("\n    ".join(map(repr, existing_filenames),)))

        return 1

    with open(mkfile_fpath, "w") as mkfile:
        mkfile.write(bam_mkfile.build_makefile(add_prefix_tmpl=False,
                                               add_sample_tmpl=False))

        mkfile.write("\n\nPrefixes:\n")

        for name, record in sorted(config.database.mitochondria.items()):
            meta = (record.meta or "").upper()
            if "EXCLUDE" in meta:
                continue

            mkfile.write("  %s:\n" % (record.name,))
            mkfile.write("    Path: genomes/%s.fasta\n" % (record.name,))

            # Known samples contribute descriptive comments to the makefile
            info = config.database.samples.get(record.name)
            if info is not None:
                mkfile.write("    # Group: %s\n"
                             % (info.get('Group(3)', 'NA'),))
                mkfile.write("    # Species: %s\n"
                             % (info.get('Species', 'NA'),))
                mkfile.write("    # Sex: %s\n"
                             % (info.get('Sex', 'NA'),))
                mkfile.write("    # Publication: %s\n"
                             % (info.get('Publication', 'NA'),))
                mkfile.write("    # Sample ID: %s\n"
                             % (info.get('SampleID', 'NA'),))

            mkfile.write('\n')

            fasta_fpath = os.path.join(genomes_root,
                                       "%s.fasta" % (record.name,))

            with open(fasta_fpath, "w") as fasta_handle:
                # NOTE(review): the record is written verbatim; if it stems
                # from an alignment it may contain gap ('-') characters —
                # confirm whether these should be stripped first (a sibling
                # implementation rebuilds the record without gaps).
                fasta_handle.write(str(record))
                fasta_handle.write("\n")

        mkfile.write("\n")

    return 0
Example #23
0
def setup_mito_mapping(config):
    """Write a BAM-pipeline makefile ('makefile.yaml') plus one FASTA file
    per mitochondrial reference from the sample database, under
    'config.destination'.

    Records whose 'meta' field contains "EXCLUDE" are skipped; alignment
    gaps ('-') are stripped from the written sequences.  Returns 0 on
    success, or 1 if any output file already exists.
    """
    genomes_root = os.path.join(config.destination, "genomes")
    if not os.path.exists(genomes_root):
        fileutils.make_dirs(genomes_root)

    mkfile_fpath = os.path.join(config.destination, "makefile.yaml")

    # '.items()' is equivalent to the py2-only '.iteritems()' here
    filenames = [mkfile_fpath]
    for name, record in sorted(config.database.mitochondria.items()):
        filenames.append(os.path.join(genomes_root, "%s.fasta"
                                      % (record.name,)))

    existing_filenames = [filename for filename in filenames
                          if os.path.exists(filename)]

    # A bit strict, but avoid accidental overwrites
    if existing_filenames:
        print_err("ERROR: Output file(s) already exists, "
                  "cannot proceed:\n    %s"
                  % ("\n    ".join(map(repr, existing_filenames),)))

        return 1

    with open(mkfile_fpath, "w") as mkfile:
        mkfile.write(bam_mkfile.build_makefile(add_prefix_tmpl=False,
                                               add_sample_tmpl=False))

        mkfile.write("\n\nPrefixes:\n")

        for name, record in sorted(config.database.mitochondria.items()):
            meta = (record.meta or "").upper()
            if "EXCLUDE" in meta:
                continue

            mkfile.write("  %s:\n" % (record.name,))
            mkfile.write("    Path: genomes/%s.fasta\n" % (record.name,))

            # Known samples contribute descriptive comments to the makefile
            info = config.database.samples.get(record.name)
            if info is not None:
                mkfile.write("    # Group: %s\n"
                             % (info.get('Group(3)', 'NA'),))
                mkfile.write("    # Species: %s\n"
                             % (info.get('Species', 'NA'),))
                mkfile.write("    # Sex: %s\n"
                             % (info.get('Sex', 'NA'),))
                mkfile.write("    # Publication: %s\n"
                             % (info.get('Publication', 'NA'),))
                mkfile.write("    # Sample ID: %s\n"
                             % (info.get('SampleID', 'NA'),))

            mkfile.write('\n')

            fasta_fpath = os.path.join(genomes_root,
                                       "%s.fasta" % (record.name,))

            with open(fasta_fpath, "w") as fasta_handle:
                # Strip alignment gaps before writing the reference sequence
                record = FASTA(
                    name=record.name,
                    meta=None,
                    sequence=record.sequence.replace('-', ''))

                fasta_handle.write(str(record))
                fasta_handle.write("\n")

        mkfile.write("\n")

    return 0