Example #1
def check_fastq_files(filenames, required_offset, allow_empty=False):
    """Check that every FASTQ file uses the expected quality-score offset.

    For each file, the quality strings returned by _read_sequences are
    classified using fastq.classify_quality_strings, and the result is
    compared against 'required_offset'.

    Arguments:
      filenames -- iterable of paths to FASTQ files.
      required_offset -- the offset that the quality scores must be
                         compatible with; ambiguous files are accepted.
      allow_empty -- if true, files for which no quality strings were
                     collected are accepted instead of being reported.

    Raises NodeError if a file mixes both offsets, contains no quality
    scores (unless allowed, see above), or uses the wrong offset.
    """
    for filename in filenames:
        qualities = _read_sequences(filename)
        offsets = fastq.classify_quality_strings(qualities)
        if offsets == fastq.OFFSET_BOTH:
            raise NodeError("FASTQ file contains quality scores with both "
                            "quality offsets (33 and 64); file may be "
                            "unexpected format or corrupt. Please ensure "
                            "that this file contains valid FASTQ reads from a "
                            "single source.\n    Filename = %r" % (filename, ))
        elif offsets == fastq.OFFSET_MISSING:
            if allow_empty and not qualities:
                # An empty file is acceptable here; skip it, but keep
                # validating the remaining files rather than stopping early.
                continue

            raise NodeError("FASTQ file did not contain quality scores; file "
                            "may be unexpected format or corrupt. Ensure that "
                            "the file is a FASTQ file.\n    Filename = %r" %
                            (filename, ))
        elif offsets not in (fastq.OFFSET_AMBIGIOUS, required_offset):
            raise NodeError("FASTQ file contains quality scores with wrong "
                            "quality score offset (%i); expected reads with "
                            "quality score offset %i. Ensure that the "
                            "'QualityOffset' specified in the makefile "
                            "corresponds to the input.\n    Filename = %s" %
                            (offsets, required_offset, filename))
Example #2
def _collect_qualities(handle, filename):
    header = handle.readline()
    while header:
        sequence = handle.readline()
        seperator = handle.readline()
        qualities = handle.readline()

        if not header.startswith("@"):
            if header.startswith(">"):
                raise NodeError("Input file appears to be in FASTA format "
                                "(header starts with '>', expected '@'), "
                                "but only FASTQ files are supported\n"
                                "Filename = %r" % (filename, ))

            raise NodeError("Input file lacks FASTQ header (expected '@', "
                            "found %r), but only FASTQ files are supported\n"
                            "    Filename = %r" % (header[:1], filename))
        elif not qualities:
            raise NodeError("Partial record found; is not 4 lines long:\n"
                            "Filename = %r\n    Record = '%s'" %
                            (filename, header.rstrip()))
        elif not seperator.startswith("+"):
            raise NodeError("Input file lacks FASTQ seperator (expected '+', "
                            "found %r), but only FASTQ files are supported\n"
                            "    Filename = %r" % (seperator[:1], filename))
        elif len(sequence) != len(qualities):
            raise NodeError("Input file contains malformed FASTQ records; "
                            "length of sequence / qualities are not the "
                            "same.\n    Filename = %r\n    Record = '%s'" %
                            (filename, header.rstrip()))

        yield qualities
        header = handle.readline()
Example #3
def _validate_fasta_line(filename, linenum, line):
    """Raise NodeError if 'line' holds characters not in _VALID_CHARS.

    A line whose only unexpected character is a carriage-return is reported
    with a dedicated message suggesting conversion to unix line-endings.
    """
    unexpected = frozenset(line) - _VALID_CHARS
    if not unexpected:
        return

    if unexpected == frozenset('\r'):
        raise NodeError("FASTA file contains carriage-returns ('\\r')!\n"
                        "Please convert file to unix format, using e.g. "
                        "dos2unix.\n    Filename = %r\n" % (filename, ))

    found = "".join(unexpected)
    raise NodeError("FASTA sequence contains invalid characters\n"
                    "    Filename = %r\n    Line = %r\n"
                    "    Invalid characters = %r" %
                    (filename, linenum, found))
Example #4
def _read_sequences(filename):
    """Collects the sequences from a PHYLIP file, and returns the header,
    the names of the sequences, and the sequences themselves. The parser
    supports interleaved sequences (as produced by the pipeline), or simple
    sequential (each paired name and sequence on a single line) as produced
    by RAxML's reduce functionality. PHYLIP files containing multiple entries
    are not supported."""
    line, header = " ", None
    with open(filename) as handle:
        # Find header
        num_sequences = num_bases = 0
        while line:
            line = handle.readline()
            if line.strip():
                header = line
                num_sequences, num_bases = map(int, line.split())

        names = [None for _ in xrange(num_sequences)]
        sequences = [[] for _ in xrange(num_sequences)]

        line_num = 0
        while line:
            line = handle.readline()
            line_strip = line.strip()
            if line_strip:
                # The first N sequences are expected to contain sample names
                index = line_num % num_sequences
                if line_num < num_sequences:
                    name, line_strip = line_strip.split(None, 1)
                    names[index] = name

                line_num += 1

    if len(sequences) != num_sequences:
        message = ("Expected %i sequences, but found %i in PHYLIP file:\n"
                   "    Filename = %r") % (num_sequences, len(sequences),
        raise NodeError(message)

    for (index, fragments) in enumerate(sequences):
        sequences[index] = "".join(fragments)
        if len(sequences[index]) != num_bases:
            message = ("Expected %ibp sequences, found %ibp sequence for %r\n"
                       " Filename = %r") % (num_bases, len(
                           sequences[index]), names[index], filename)
            raise NodeError(message)

    return header, names, sequences
Example #5
def _validate_fasta_header(filename, linenum, line, cache):
    """Validate the sequence name in a FASTA header line.

    The name is the text before the first space, minus the leading character
    of the line. It must be non-empty, match _RE_REF_NAME, and not already be
    a key in 'cache'. On success, 'cache' is updated to map the name to
    'linenum'; otherwise NodeError is raised.
    """
    first_field = line.split(" ", 1)[0]
    name = first_field[1:]

    if not name:
        raise NodeError("FASTA sequence must have non-empty name\n"
                        "    Filename = %r\n    Line = %r\n" %
                        (filename, linenum))

    if not _RE_REF_NAME.match(name):
        raise NodeError("Invalid name for FASTA sequence: %r\n"
                        "    Filename = %r\n    Line = %r\n" %
                        (name, filename, linenum))

    if name in cache:
        raise NodeError("FASTA sequences have identical name\n"
                        "    Filename = %r\n    Name = %r\n"
                        "    Line 1 = %r\n    Line 2 = %r\n" %
                        (filename, name, linenum, cache[name]))

    cache[name] = linenum
Example #6
def _validate_fasta_line(filename, linenum, line):
    """Raise NodeError if 'line' holds characters not in _VALID_CHARS."""
    unexpected = frozenset(line) - _VALID_CHARS
    if not unexpected:
        return

    found = "".join(unexpected)
    raise NodeError("FASTA sequence contains invalid characters\n"
                    "    Filename = %r\n    Line = %r\n"
                    "    Invalid characters = %r" %
                    (filename, linenum, found))
Example #7
 def _teardown(self, config, temp):
     """Validate the MSA written by MAFFT, re-raising MSAError as NodeError."""
     # Validate output from MAFFT
     # The output file is looked up inside the temporary folder 'temp'.
     output_file = reroot_path(temp, self._output_file)
     # NOTE(review): the 'except' below has no matching 'try'; the statement
     # that reads / validates 'output_file' appears to be missing from this
     # block and must be restored before the method can be compiled.
     except MSAError, error:
         raise NodeError("Invalid MSA produced by MAFFT:\n%s" % (error,))
Example #8
def _read_partitions(filename):
    """Read a partition file, as produced by the pipeline itself, and
    returns a list of tuples containing the (start, end) coordinates;
    each line is expected to follow one of the following formats:

    DNA, Name = Start-End
    DNA, Name = Start

    Multiple regions, or skips are not supported.

    The start coordinate read from the file is decremented by one in the
    returned tuples, while the end coordinate is returned unchanged. Raises
    NodeError for lines matching neither format."""
    partitions = []
    with open(filename) as handle:
        # Lines are numbered from 1, so that errors refer to the line number
        # shown by text editors.
        for (line_num, line) in enumerate(handle, start=1):
            line = line.rstrip()
            result = _RE_PARTITION.match(line)
            if result:
                start, end = result.groups()
            else:
                # Fall back to the single-position format
                result = _RE_PARTITION_SINGLE.match(line)
                if not result:
                    message = ("Line %i in partitions file does not follow "
                               "expected format:\n"
                               "  Expected, either = 'DNA, Name = Start-End'\n"
                               "                or = 'DNA, Name = Start'\n"
                               "  Found = %r") % (line_num, line)
                    raise NodeError(message)
                start, = result.groups()
                end = start

            partitions.append((int(start) - 1, int(end)))
    return partitions
Example #9
 def _report_failure(cls, bed, fragment):
     """Raise NodeError describing a fragment of unexpected length.

     The message gives the region (bed.contig, bed.start, bed.end), the
     length of 'fragment', and the expected length (bed.end - bed.start).
     """
     details = (bed.contig, bed.start, bed.end,
                len(fragment), (bed.end - bed.start))
     raise NodeError("Failed to extract region from "
                     "reference sequence at %s:%i-%i; got "
                     "%i bp, but expected %i bp." % details)
Example #10
def _read_sequences(filename):
    """Stream 'filename' through the 'cat' command and sample its qualities.

    The quality strings collected by _collect_qualities from the output of
    the command are passed to sampling.reservoir_sampling (with a size of
    100000), and the result of that call is returned.

    Raises NodeError if the command exits with a non-zero return-code; any
    exception raised while reading is re-raised after the process has been
    terminated.
    """
    cat_call = factory.new("cat")
    cat_call.add_multiple_values((filename, ))
    cat_call = cat_call.finalized_call

    cat = None
    try:
        # Both pipes are required; STDOUT is parsed below, and STDERR is
        # included in the error message if the command fails.
        cat = subprocess.Popen(cat_call,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
        qualities = _collect_qualities(cat.stdout, filename)

        return sampling.reservoir_sampling(qualities, 100000)
    except:
        # Reading failed; stop the process, and clear 'cat' so that the
        # 'finally' clause does not replace the original exception.
        if cat:
            cat.kill()
            cat.wait()
            cat = None
        raise
    finally:
        rc_cat = cat.wait() if cat else 0
        if rc_cat:
            message = "Error running 'paleomix cat':\n" \
                      "  Unicat return-code = %i\n\n%s" \
                      % (rc_cat, cat.stderr.read())
            raise NodeError(message)
Example #11
def _check_bwa_prefix(prefix):
    """Raise NodeError if 'prefix' has v0.5.x index files under BWA v0.6+.

    Nothing is checked if the BWA version cannot be determined, or if the
    installed version is older than v0.6.0.
    """
    try:
        bwa_version = BWA_VERSION.version
    except versions.VersionRequirementError:
        return  # Ignored here, reported elsewhere

    if bwa_version >= (0, 6, 0):
        # These extensions are treated as markers of a v0.5.x index
        for extension in (".rbwt", ".rpac", ".rsa"):
            if os.path.exists(prefix + extension):
                raise NodeError("BWA version is v%s, but prefix appears to be created using v0.5.x!\n"
                                "\tPlease remove '%s.*' and rebuild index using 'bwa index %s'"
                                % (".".join(map(str, bwa_version)), prefix, prefix))
Example #12
def _read_sequences(filenames):
    """Yield (filename, msa) for each file, read in sorted filename order.

    Every MSA must contain the same groups as the first MSA read; otherwise
    NodeError is raised, listing the groups that differ.
    """
    expected_groups = None
    for filename in sorted(filenames):
        msa = read_msa(filename)

        # Compare against None, so that an empty first MSA still becomes the
        # reference that the following files are checked against.
        if expected_groups is None:
            expected_groups = set(msa)
        elif set(msa) != expected_groups:
            difference = expected_groups.symmetric_difference(msa)
            # Sorted to make the error message reproducible
            raise NodeError("Unexpected/missing groups for sequence (%s): %s"
                            % (filename, ", ".join(sorted(difference))))

        yield (filename, msa)
Example #13
 def _run(self, config, temp):
     """Run the command, adding mapDamage's own message to NodeErrors.

     If CommandNode._run raises NodeError and the command return-codes are
     exactly [1], lines in 'pipe_mapDamage.stdout' (in 'temp') mentioning low
     DNA damage levels are appended to the error, with "Warning:" replaced
     by "ERROR:". The (possibly extended) NodeError is then re-raised.
     """
     try:
         CommandNode._run(self, config, temp)
     except NodeError as error:
         err_message = "DNA damage levels are too low"
         if self._command.join() == [1]:
             fpath = os.path.join(temp, "pipe_mapDamage.stdout")
             with open(fpath) as handle:
                 for line in handle:
                     if err_message in line:
                         line = line.strip().replace("Warning:", "ERROR:")
                         error = NodeError("%s\n\n%s" % (error, line))
         raise error
Example #14
def test_run__exceptions():
    """Generator test; yields one test function per node stage and exception."""
    cfg_mock = flexmock(temp_root = "/tmp")
    # NOTE(review): 'key', 'exception' and 'expectation' are not used by the
    # test function below; the lines that make the mocked node raise
    # 'exception' from the method named by 'key', and that check for
    # 'expectation', appear to be missing -- TODO confirm and restore.
    def build_tests(key, exception, expectation):
        def test_function():
            node_mock = flexmock(Node())
            with MonkeypatchCreateTempDir():
                node_mock.run(cfg_mock) # pylint: disable=E1103

        return test_function

    # Each stage is paired with an unexpected exception type (TypeError) and
    # with a NodeError.
    for key in ('_setup', '_run', '_teardown'):
        yield build_tests(key, TypeError("The castle AAARGH!"), NodeUnhandledException)
        yield build_tests(key, NodeError("He's a very naughty boy!"), NodeError)
Example #15
def _check_bwa_prefix(prefix):
    """Checks that a given prefix is compatible with the currently
    installed version of BWA. This is required in order to allow
    auto-indexing of prefixes, as indexes produced by v0.5.x and
    by 0.6+ are not only incompatible, but differ in the files
    produced, with 0.5.x producing a handful of additional files.

    As a consequence, simply using normal input-file dependencies
    would result in prefixes being re-indexed if the version of
    BWA was changed from 0.6+ to 0.5.x, and in failures during
    runtime if the version was changed from 0.5.x to 0.6+.

    This function treats a difference between the version of BWA
    installed and the version implied by the prefix files as an
    error, which therefore requires user intervention.

    Raises NodeError if the prefix files imply a different version."""
    if prefix in _PREFIXES_CHECKED:
        return
    # NOTE(review): nothing visible in this function records 'prefix' in
    # _PREFIXES_CHECKED; confirm where checked prefixes are registered.

    try:
        bwa_version = BWA_VERSION.version
    except versions.VersionRequirementError:
        return  # Ignored here, reported elsewhere

    # Files unique to v0.5.x
    v05x_files = set((prefix + ext) for ext in (".rbwt", ".rpac", ".rsa"))
    # Files common to v0.5.x, v0.6.x, and v0.7.x
    common_files = set(
        (prefix + ext) for ext in (".amb", ".ann", ".bwt", ".pac", ".sa"))
    all_files = v05x_files | common_files
    current_files = all_files - set(missing_files(all_files))

    expected_version = None
    # Only prefixes for which (some) index files exist can be mismatched
    if current_files & common_files:
        has_v05x_files = bool(current_files & v05x_files)
        if bwa_version >= (0, 6, 0):
            if has_v05x_files:
                expected_version = "v0.5.x"
        elif not has_v05x_files:
            expected_version = "v0.6.x or later"

    if expected_version:
        raise NodeError("BWA version is v%s, but prefix appears to be created using %s!\n"
                        "  Your copy of BWA may have changed, or you may be using the wrong\n"
                        "  prefix. To resolve this issue, either change your prefix, re-install\n"
                        "  BWA %s, or remove the prefix files at\n"
                        "    $ ls %s.*"
                        % (".".join(map(str, bwa_version)), expected_version, expected_version, prefix))
Example #16
    def _setup(self, _config, _temp):
        """Check that each input FASTA file's index lists every expected sequence.

        For each file in self._infiles, the first tab-separated column of the
        corresponding '.fai' file is compared against self._sequences;
        NodeError is raised naming (at most three of) the absent sequences.
        """
        for filename in self._infiles.itervalues():
            with open(filename + ".fai") as handle:
                indexed = set(line.split("\t", 1)[0] for line in handle)

                absent = list(self._sequences - indexed)
                if not absent:
                    continue

                # Keep the error message short
                if len(absent) > 3:
                    absent = absent[:3]

                template = ("FASTA file does not contain expected "
                            "sequences:\n  File =  %r\n  "
                            "Sequences = %s\n")
                raise NodeError(template % (filename, ", ".join(absent)))
Example #17
def test_run__error_log__node_error():
    """Generator test; checks that a failing run writes a 'pipe.errors' log."""
    def _do_test_run__error_log__node_error(temp_folder, exception):
        temp = os.path.join(temp_folder, "xTMPx")
        cfg_mock = flexmock(temp_root=temp_folder)
        node_mock = flexmock(Node())
        # NOTE(review): the expectation below ends in a line continuation that
        # is followed by a blank line, and 'temp' / 'exception' are otherwise
        # unused; the remainder of the mock set-up appears to be missing.
        node_mock.should_receive("_create_temp_dir").with_args(cfg_mock) \

        assert_raises(NodeError, node_mock.run, cfg_mock)  # pylint: disable=E1103
        # The error log is expected inside the "xTMPx" sub-folder
        log_file = os.path.join(temp_folder, "xTMPx", "pipe.errors")
        assert os.path.exists(log_file)
        assert_in("Errors =", get_file_contents(log_file))

    yield _do_test_run__error_log__node_error, NodeError("ARGH!")
    yield _do_test_run__error_log__node_error, OSError("ARGH!")
Example #18
    # Builds the Bowtie2 alignment command and returns a dictionary with the
    # keys "commands", "order", "threads" and "dependencies".
    # NOTE(review): this block is incomplete as shown; the remainder of the
    # signature, the closing of several calls, the 'else:' belonging to the
    # 'raise' below, and the end of the returned dictionary are not visible.
    def customize(cls,
        aln = _bowtie2_template(
            ("bowtie2", ),
            # Setting IN_FILE_2 to None makes AtomicCmd ignore this key
            IN_FILE_2=input_file_2 or None,
        aln.set_option("-x", prefix)

        # "-U" is set when only input 1 is given; "-1" / "-2" when both are
        if input_file_1 and not input_file_2:
            aln.set_option("-U", "%(IN_FILE_1)s")
        elif input_file_1 and input_file_2:
            aln.set_option("-1", "%(IN_FILE_1)s")
            aln.set_option("-2", "%(IN_FILE_2)s")
            raise NodeError(
                "Input 1, OR both input 1 and input 2 must be specified for Bowtie2 node"

        max_threads = _get_max_threads(reference, threads)
        aln.set_option("--threads", max_threads)

        order, commands = _process_output(aln,
                                                       and input_file_2))
        commands["aln"] = aln

        return {
            "commands": commands,
            "order": ["aln"] + order,
            "threads": max_threads,
            "dependencies": dependencies
Example #19
    def _read_coverage_tables(cls, key, filenames):
        """Sum the "Hits" and "M" columns found under 'key' in each table.

        Each file is loaded with read_coverage_table, and the entry at 'key'
        is looked up using get_in; NodeError is raised if a file has no such
        entry. Returns the tuple (hits, nts).
        """
        total_hits = total_nts = 0
        for filename in filenames:
            table = {}
            read_coverage_table(table, filename)

            per_contig = get_in(table, key)
            if per_contig is None:
                raise NodeError("Error reading table %r; row not found:"
                                "\n   %s   ...\n\nIf files have been renamed "
                                "during the run, then please remove this file "
                                "in that it may be re-generated.\nHowever, "
                                "note that read-group tags in the BAM files "
                                "may not be correct!"
                                % (filename, "   ".join(key)))

            for row in per_contig.itervalues():
                total_hits += row["Hits"]
                total_nts += row["M"]

        return total_hits, total_nts
Example #20
def test_run__error_log__node_error():
    """Generator test; checks that a failing run writes a 'pipe.errors' log."""
    def _do_test_run__error_log__node_error(temp_folder, exception):
        cfg_mock = flexmock(temp_root = temp_folder)
        node_mock = flexmock(Node())
        # NOTE(review): the 'try:' matching the 'except' below appears to be
        # missing, and 'exception' is never used; as shown, the final
        # 'assert False' is also reached after the except-block has run.

            os.mkdir(os.path.join(temp_folder, "xTMPx"))
            with MonkeypatchCreateTempDir(root = temp_folder, subfolder = "xTMPx"):
                # pylint: disable=E1103
                node_mock.run(cfg_mock) # pragma: no coverage
        except NodeError:
            # The error log is expected inside the "xTMPx" sub-folder
            log_file = os.path.join(temp_folder, "xTMPx", "pipe.errors")
            assert os.path.exists(log_file)
            assert_in("Errors =", get_file_contents(log_file))
        assert False # pragma: no coverage
    yield _do_test_run__error_log__node_error, NodeError("ARGH!")
    yield _do_test_run__error_log__node_error, OSError("ARGH!")
Example #21
def test_run__exceptions():
    """Generator test; yields one test function per node stage and exception."""
    cfg_mock = flexmock(temp_root=_DUMMY_TEMP_ROOT)

    # NOTE(review): 'key' and 'exception' are not used by the test function
    # below, and the expectation on '_create_temp_dir' ends in a dangling
    # line continuation; part of the mock set-up appears to be missing.
    def build_tests(key, exception, expectation):
        def test_function():
            node_mock = flexmock(Node())
            node_mock.should_receive('_create_temp_dir').with_args(cfg_mock) \

            assert_raises(expectation, node_mock.run, cfg_mock)  # pylint: disable=E1103

        return test_function

    # NOTE(review): the print statement below looks like left-over debugging
    # output, and the two calls in the loop are missing their final argument
    # (presumably the expected exception type -- TODO confirm).
    print "foo"
    for key in ('_setup', '_run', '_teardown'):
        yield build_tests(key, TypeError("The castle AAARGH!"),
        yield build_tests(key, NodeError("He's a very naughty boy!"),
Example #22
    def _process_reads(cls, observed_reads, output_files):
        """Raise NodeError for the first read observed in more than one file.

        Arguments:
          observed_reads -- dictionary mapping 3-tuples, the first value of
                            which is the read name, to the collection of
                            files in which that read was observed.
          output_files -- paths listed in the error message as files to
                          'touch' in order to override the check.
        """
        for ((name, _, _), fpaths) in observed_reads.iteritems():
            if len(fpaths) <= 1:
                continue

            message = ["Read %r found in multiple files:" % (name, )]
            message.extend("  - %r" % (fpath, ) for fpath in fpaths)
            # The explanation must be added to the message explicitly; a bare
            # string literal is evaluated and discarded.
            message.append("")
            message.append(
                "This indicates that the same data files have "
                "been included multiple times in the project. "
                "Please review the input files used in this "
                "project, to ensure that each set of data is "
                "included only once.\n\n"
                "If this is not the case, then execute the "
                "following command(s) to mark this test as having "
                "succeeded:")

            message.extend("$ touch '%s'" % (fpath, )
                           for fpath in output_files)

            raise NodeError("\n".join(message))
Example #23
def check_fasta_file(filename):
    """Validate the layout of the FASTA file 'filename'.

    The file is read line by line, tracking whether the previous line was a
    header, sequence or empty line (or if nothing has been read yet). Headers
    are checked using _validate_fasta_header, and sequence lines using
    _validate_fasta_line.

    Raises NodeError if a header is followed by an empty line or by another
    header, if sequence data precedes the first header, if a sequence
    contains empty lines, if the lines of a sequence (other than the last)
    differ in length, or if the file contains no sequences.
    """
    with open(filename) as handle:
        namecache = {}
        state, linelength, linelengthchanged = _NA, None, False
        for linenum, line in enumerate(handle, start=1):
            line = line.rstrip('\n\r')

            if not line:
                if state in (_NA, _IN_WHITESPACE):
                    # Leading and repeated empty lines are ignored
                    continue
                elif state == _IN_HEADER:
                    raise NodeError("Expected FASTA sequence, found empty line"
                                    "\n    Filename = %r\n    Line = %r" %
                                    (filename, linenum))
                elif state == _IN_SEQUENCE:
                    state = _IN_WHITESPACE
                else:
                    assert False
            elif line.startswith(">"):
                if state in (_NA, _IN_SEQUENCE, _IN_WHITESPACE):
                    _validate_fasta_header(filename, linenum, line, namecache)
                    state = _IN_HEADER
                    linelength = None
                    linelengthchanged = False
                elif state == _IN_HEADER:
                    raise NodeError("Empty sequences not allowed\n"
                                    "    Filename = %r\n    Line = %r" %
                                    (filename, linenum - 1))
                else:
                    assert False
            else:
                if state == _NA:
                    raise NodeError("Expected FASTA header, found %r\n"
                                    "    Filename = %r\n    Line = %r" %
                                    (line, filename, linenum))
                elif state == _IN_HEADER:
                    _validate_fasta_line(filename, linenum, line)
                    linelength = len(line)
                    state = _IN_SEQUENCE
                elif state == _IN_SEQUENCE:
                    _validate_fasta_line(filename, linenum, line)
                    # If the length has changed, then that line must be the
                    # last line in the record, which may be shorter due to the
                    # sequence length. This is because the FAI index format
                    # expects that each line has the same length.
                    if linelengthchanged or (linelength < len(line)):
                        raise NodeError("Lines in FASTA files must be of same "
                                        "length\n    Filename = %r\n"
                                        "    Line = %r" % (filename, linenum))
                    elif linelength != len(line):
                        linelengthchanged = True
                elif state == _IN_WHITESPACE:
                    raise NodeError("Empty lines not allowed in sequences\n"
                                    "    Filename = %r\n    Line = %r" %
                                    (filename, linenum))
                else:
                    assert False

        # Either nothing was read, or the file ended directly after a header
        if state in (_NA, _IN_HEADER):
            raise NodeError("File does not contain any sequences:\n"
                            "    Filename = %r" % (filename, ))