Example #1
def files_re_match(file1, file2, attributes=None):
    """Check the contents of 2 files for differences using re.match."""
    attributes = attributes or {}
    join_char = ''
    to_strip = os.linesep
    compressed_formats = get_compressed_formats(attributes)
    try:
        with get_fileobj(file2, compressed_formats=compressed_formats) as fh:
            history_data = fh.readlines()
        with get_fileobj(file1, compressed_formats=compressed_formats) as fh:
            local_file = fh.readlines()
    except UnicodeDecodeError:
        join_char = b''
        to_strip = os.linesep.encode('utf-8')
        with open(file2, 'rb') as fh:
            history_data = fh.readlines()
        with open(file1, 'rb') as fh:
            local_file = fh.readlines()
    assert len(local_file) == len(history_data), 'Data File and Regular Expression File contain a different number of lines (%d != %d)\nHistory Data (first 40 lines):\n%s' % (len(local_file), len(history_data), join_char.join(history_data[:40]))
    if attributes.get('sort', False):
        history_data.sort()
    lines_diff = int(attributes.get('lines_diff', 0))
    line_diff_count = 0
    diffs = []
    for regex_line, data_line in zip(local_file, history_data):
        regex_line = regex_line.rstrip(to_strip)
        data_line = data_line.rstrip(to_strip)
        if not re.match(regex_line, data_line):
            line_diff_count += 1
            diffs.append(f'Regular Expression: {regex_line}, Data file: {data_line}\n')
    if line_diff_count > lines_diff:
        raise AssertionError("Regular expression did not match data file (allowed variants=%i):\n%s" % (lines_diff, "".join(diffs)))
Example #2
def files_contains(file1, file2, attributes=None):
    """Check the contents of file2 for substrings found in file1, on a per-line basis."""
    # TODO: allow forcing ordering of contains
    attributes = attributes or {}
    to_strip = os.linesep
    compressed_formats = get_compressed_formats(attributes)
    try:
        with get_fileobj(file2, compressed_formats=compressed_formats) as fh:
            history_data = fh.read()
        with get_fileobj(file1, compressed_formats=compressed_formats) as fh:
            local_file = fh.readlines()
    except UnicodeDecodeError:
        to_strip = os.linesep.encode('utf-8')
        with open(file2, 'rb') as fh:
            history_data = fh.read()
        with open(file1, 'rb') as fh:
            local_file = fh.readlines()
    lines_diff = int(attributes.get('lines_diff', 0))
    line_diff_count = 0
    for contains in local_file:
        contains = contains.rstrip(to_strip)
        if contains not in history_data:
            line_diff_count += 1
        if line_diff_count > lines_diff:
            raise AssertionError(f"Failed to find '{contains}' in history data. (lines_diff={lines_diff}).")
Example #3
def get_headers(fname,
                sep,
                count=60,
                is_multi_byte=False,
                comment_designator=None):
    """
    Returns a list with the first 'count' lines split by 'sep', ignoring lines
    starting with 'comment_designator'

    >>> fname = get_test_fname('complete.bed')
    >>> get_headers(fname,'\\t')
    [['chr7', '127475281', '127491632', 'NM_000230', '0', '+', '127486022', '127488767', '0', '3', '29,172,3225,', '0,10713,13126,'], ['chr7', '127486011', '127488900', 'D49487', '0', '+', '127486022', '127488767', '0', '2', '155,490,', '0,2399']]
    >>> fname = get_test_fname('test.gff')
    >>> get_headers(fname, '\\t', count=5, comment_designator='#')
    [[''], ['chr7', 'bed2gff', 'AR', '26731313', '26731437', '.', '+', '.', 'score'], ['chr7', 'bed2gff', 'AR', '26731491', '26731536', '.', '+', '.', 'score'], ['chr7', 'bed2gff', 'AR', '26731541', '26731649', '.', '+', '.', 'score'], ['chr7', 'bed2gff', 'AR', '26731659', '26731841', '.', '+', '.', 'score']]
    """
    headers = []
    with compression_utils.get_fileobj(fname) as in_file:
        idx = 0
        for line in in_file:
            line = line.rstrip('\n\r')
            if is_multi_byte:
                # TODO: fix this - sep is never found in line
                line = unicodify(line, 'utf-8')
                sep = sep.encode('utf-8')
                if comment_designator is not None and comment_designator != '':
                    comment_designator = comment_designator.encode('utf-8')
            if comment_designator is not None and comment_designator != '' and line.startswith(
                    comment_designator):
                continue
            headers.append(line.split(sep))
            idx += 1
            if idx == count:
                break
    return headers
Example #4
def get_file_peek( file_name, is_multi_byte=False, WIDTH=256, LINE_COUNT=5, skipchars=None, line_wrap=True ):
    """
    Returns the first LINE_COUNT lines wrapped to WIDTH

    >>> fname = get_test_fname('4.bed')
    >>> get_file_peek(fname, LINE_COUNT=1)
    u'chr22\\t30128507\\t31828507\\tuc003bnx.1_cds_2_0_chr22_29227_f\\t0\\t+\\n'
    """
    # Set size for file.readline() to a negative number to force it to
    # read until either a newline or EOF.  Needed for datasets with very
    # long lines.
    if WIDTH == 'unlimited':
        WIDTH = -1
    if skipchars is None:
        skipchars = []
    lines = []
    count = 0
    file_type = None
    data_checked = False
    temp = compression_utils.get_fileobj( file_name, "U" )
    try:
        while count < LINE_COUNT:
            line = temp.readline( WIDTH )
            if line and not is_multi_byte and not data_checked:
                # See if we have a compressed or binary file
                for char in line:
                    if ord( char ) > 128:
                        file_type = 'binary'
                        break
                data_checked = True
                if file_type == 'binary':
                    break
            if not line_wrap:
                if line.endswith('\n'):
                    line = line[:-1]
                else:
                    while True:
                        i = temp.read(1)
                        if not i or i == '\n':
                            break
            skip_line = False
            for skipchar in skipchars:
                if line.startswith( skipchar ):
                    skip_line = True
                    break
            if not skip_line:
                lines.append( line )
                count += 1
    finally:
        temp.close()
    if file_type == 'binary':
        text = "%s file" % file_type
    else:
        try:
            text = util.unicodify( '\n'.join( lines ) )
        except UnicodeDecodeError:
            text = "binary/unknown file"
    return text
Example #5
def get_file_peek(file_name, is_multi_byte=False, WIDTH=256, LINE_COUNT=5, skipchars=None, line_wrap=True):
    """
    Returns the first LINE_COUNT lines wrapped to WIDTH.

    :param is_multi_byte: deprecated
    :type  is_multi_byte: bool

    >>> def assert_peek_is(file_name, expected, *args, **kwd):
    ...     path = get_test_fname(file_name)
    ...     peek = get_file_peek(path, *args, **kwd)
    ...     assert peek == expected, "%s != %s" % (peek, expected)
    >>> assert_peek_is('0_nonewline', u'0')
    >>> assert_peek_is('0.txt', u'0\\n')
    >>> assert_peek_is('4.bed', u'chr22\\t30128507\\t31828507\\tuc003bnx.1_cds_2_0_chr22_29227_f\\t0\\t+\\n', LINE_COUNT=1)
    >>> assert_peek_is('1.bed', u'chr1\\t147962192\\t147962580\\tCCDS989.1_cds_0_0_chr1_147962193_r\\t0\\t-\\nchr1\\t147984545\\t147984630\\tCCDS990.1_cds_0_0_chr1_147984546_f\\t0\\t+\\n', LINE_COUNT=2)
    """
    # Set size for file.readline() to a negative number to force it to
    # read until either a newline or EOF.  Needed for datasets with very
    # long lines.
    if WIDTH == 'unlimited':
        WIDTH = -1
    if skipchars is None:
        skipchars = []
    lines = []
    count = 0

    last_line_break = False
    with compression_utils.get_fileobj(file_name, "U") as temp:
        while count < LINE_COUNT:
            try:
                line = temp.readline(WIDTH)
            except UnicodeDecodeError:
                return "binary file"
            if line == "":
                break
            last_line_break = False
            if line.endswith('\n'):
                line = line[:-1]
                last_line_break = True
            elif not line_wrap:
                while True:
                    i = temp.read(1)
                    if i == '\n':
                        last_line_break = True
                    if not i or i == '\n':
                        break
            skip_line = False
            for skipchar in skipchars:
                if line.startswith(skipchar):
                    skip_line = True
                    break
            if not skip_line:
                lines.append(line)
                count += 1
    return '\n'.join(lines) + ('\n' if last_line_break else '')
Example #6
def iter_headers(fname, sep, count=60, comment_designator=None):
    with compression_utils.get_fileobj(fname) as in_file:
        idx = 0
        for line in in_file:
            line = line.rstrip('\n\r')
            if comment_designator is not None and comment_designator != '' and line.startswith(comment_designator):
                continue
            yield line.split(sep)
            idx += 1
            if idx == count:
                break
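compression_utils.get_fileobj transparently opens plain or compressed files and is part of Galaxy, not shown here. A self-contained sketch of the same header-iteration pattern over a plain text file (iter_headers_plain and the temporary file below are only for illustration):

import itertools
import tempfile

def iter_headers_plain(path, sep, count=60, comment_designator=None):
    # Yield the first `count` non-comment lines of `path`, split on `sep`.
    with open(path) as in_file:
        kept = (
            line.rstrip('\n\r').split(sep)
            for line in in_file
            if not (comment_designator and line.startswith(comment_designator))
        )
        yield from itertools.islice(kept, count)

with tempfile.NamedTemporaryFile('w', suffix='.bed', delete=False) as tmp:
    tmp.write('# a comment\nchr7\t127475281\t127491632\nchr7\t127486011\t127488900\n')

print(list(iter_headers_plain(tmp.name, '\t', count=2, comment_designator='#')))
# [['chr7', '127475281', '127491632'], ['chr7', '127486011', '127488900']]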
Example #7
def files_re_match_multiline(file1, file2, attributes=None):
    """Check the contents of 2 files for differences using re.match in multiline mode."""
    attributes = attributes or {}
    join_char = ''
    compressed_formats = get_compressed_formats(attributes)
    try:
        with get_fileobj(file2, compressed_formats=compressed_formats) as fh:
            history_data = fh.readlines()
        with get_fileobj(file1, compressed_formats=compressed_formats) as fh:
            local_file = fh.read()
    except UnicodeDecodeError:
        join_char = b''
        with open(file2, 'rb') as fh:
            history_data = fh.readlines()
        with open(file1, 'rb') as fh:
            local_file = fh.read()
    if attributes.get('sort', False):
        history_data.sort()
    history_data = join_char.join(history_data)
    # lines_diff not applicable to multiline matching
    assert re.match(local_file, history_data, re.MULTILINE), "Multiline Regular expression did not match data file"
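For the multiline variant, the whole expected file becomes one regular expression matched against the joined data. A minimal illustration (pattern and data below are made up):

import re

multiline_pattern = r"header\tstart\tend\nchr\d+\t\d+\t\d+\n"
joined_data = "header\tstart\tend\nchr7\t127475281\t127491632\n"

assert re.match(multiline_pattern, joined_data, re.MULTILINE), \
    "Multiline regular expression did not match data"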
Example #8
def count_data_lines(self, dataset):
    """
    Count the number of lines of data in dataset,
    skipping all blank lines and comments.
    """
    data_lines = 0
    with compression_utils.get_fileobj(dataset.file_name) as in_file:
        for line in in_file:
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
    return data_lines
Example #9
def estimate_file_lines(self, dataset):
    """
    Perform a rough estimate by extrapolating number of lines from a small read.
    """
    sample_size = 1048576
    try:
        with compression_utils.get_fileobj(dataset.file_name) as dataset_fh:
            dataset_read = dataset_fh.read(sample_size)
        sample_lines = dataset_read.count('\n')
        return int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
    except UnicodeDecodeError:
        log.error(f'Unable to estimate lines in file {dataset.file_name}')
        return None
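The estimate simply scales the number of newlines seen in a fixed-size sample by the ratio of the full file size to the sample size; the numbers below are illustrative:

sample_size = 1048576     # bytes read from the start of the dataset (1 MiB)
sample_lines = 13107      # newlines counted in that sample
file_size = 8388608       # dataset.get_size() for an 8 MiB file

# Extrapolate: sample_lines * (file_size / sample_size)
estimated_lines = int(sample_lines * (float(file_size) / float(sample_size)))
print(estimated_lines)    # 104856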
Example #10
def iter_headers(fname_or_file_prefix, sep, count=60, comment_designator=None):
    idx = 0
    if isinstance(fname_or_file_prefix, FilePrefix):
        file_iterator = fname_or_file_prefix.line_iterator()
    else:
        file_iterator = compression_utils.get_fileobj(fname_or_file_prefix)
    for line in file_iterator:
        line = line.rstrip('\n\r')
        if comment_designator is not None and comment_designator != '' and line.startswith(comment_designator):
            continue
        yield line.split(sep)
        idx += 1
        if idx == count:
            break
Example #11
def iter_headers(fname_or_file_prefix, sep, count=60, comment_designator=None):
    idx = 0
    if isinstance(fname_or_file_prefix, FilePrefix):
        file_iterator = fname_or_file_prefix.line_iterator()
    else:
        file_iterator = compression_utils.get_fileobj(fname_or_file_prefix)
    for line in file_iterator:
        line = line.rstrip('\n\r')
        if comment_designator is not None and comment_designator != '' and line.startswith(comment_designator):
            continue
        yield line.split(sep)
        idx += 1
        if idx == count:
            break
Example #12
def iter_headers(fname, sep, count=60, is_multi_byte=False, comment_designator=None):
    with compression_utils.get_fileobj(fname) as in_file:
        idx = 0
        for line in in_file:
            line = line.rstrip('\n\r')
            if is_multi_byte:
                # TODO: fix this - sep is never found in line
                line = unicodify(line, 'utf-8')
                sep = sep.encode('utf-8')
                if comment_designator is not None and comment_designator != '':
                    comment_designator = comment_designator.encode('utf-8')
            if comment_designator is not None and comment_designator != '' and line.startswith(comment_designator):
                continue
            yield line.split(sep)
            idx += 1
            if idx == count:
                break
Example #13
def count_data_lines(self, dataset):
    """
    Count the number of lines of data in dataset,
    skipping all blank lines and comments.
    """
    data_lines = 0
    with compression_utils.get_fileobj(dataset.file_name) as in_file:
        # FIXME: Potential encoding issue can prevent the ability to iterate over lines
        # causing set_meta process to fail otherwise OK jobs. A better solution than
        # a silent try/except is desirable.
        try:
            for line in in_file:
                line = line.strip()
                if line and not line.startswith('#'):
                    data_lines += 1
        except Exception:
            pass
    return data_lines
Example #14
def get_file_peek(file_name, is_multi_byte=False, WIDTH=256, LINE_COUNT=5, skipchars=None, line_wrap=True):
    """
    Returns the first LINE_COUNT lines wrapped to WIDTH.

    :param is_multi_byte: deprecated
    :type  is_multi_byte: bool

    >>> fname = get_test_fname('4.bed')
    >>> get_file_peek(fname, LINE_COUNT=1)
    u'chr22\\t30128507\\t31828507\\tuc003bnx.1_cds_2_0_chr22_29227_f\\t0\\t+\\n'
    """
    # Set size for file.readline() to a negative number to force it to
    # read until either a newline or EOF.  Needed for datasets with very
    # long lines.
    if WIDTH == 'unlimited':
        WIDTH = -1
    if skipchars is None:
        skipchars = []
    lines = []
    count = 0
    with compression_utils.get_fileobj(file_name, "U") as temp:
        while count < LINE_COUNT:
            try:
                line = temp.readline(WIDTH)
            except UnicodeDecodeError:
                return "binary file"
            if not line_wrap:
                if line.endswith('\n'):
                    line = line[:-1]
                else:
                    while True:
                        i = temp.read(1)
                        if not i or i == '\n':
                            break
            skip_line = False
            for skipchar in skipchars:
                if line.startswith(skipchar):
                    skip_line = True
                    break
            if not skip_line:
                lines.append(line)
                count += 1
    return '\n'.join(lines)
Example #15
def get_file_peek(file_name, is_multi_byte=False, WIDTH=256, LINE_COUNT=5, skipchars=None, line_wrap=True):
    """
    Returns the first LINE_COUNT lines wrapped to WIDTH.

    :param is_multi_byte: deprecated
    :type  is_multi_byte: bool

    >>> fname = get_test_fname('4.bed')
    >>> get_file_peek(fname, LINE_COUNT=1)
    u'chr22\\t30128507\\t31828507\\tuc003bnx.1_cds_2_0_chr22_29227_f\\t0\\t+\\n'
    """
    # Set size for file.readline() to a negative number to force it to
    # read until either a newline or EOF.  Needed for datasets with very
    # long lines.
    if WIDTH == 'unlimited':
        WIDTH = -1
    if skipchars is None:
        skipchars = []
    lines = []
    count = 0
    with compression_utils.get_fileobj(file_name, "U") as temp:
        while count < LINE_COUNT:
            try:
                line = temp.readline(WIDTH)
            except UnicodeDecodeError:
                return "binary file"
            if not line_wrap:
                if line.endswith('\n'):
                    line = line[:-1]
                else:
                    while True:
                        i = temp.read(1)
                        if not i or i == '\n':
                            break
            skip_line = False
            for skipchar in skipchars:
                if line.startswith(skipchar):
                    skip_line = True
                    break
            if not skip_line:
                lines.append(line)
                count += 1
    return '\n'.join(lines)
Example #16
def count_data_lines(self, dataset):
    """
    Count the number of lines of data in dataset,
    skipping all blank lines and comments.
    """
    CHUNK_SIZE = 2 ** 15  # 32Kb
    data_lines = 0
    with compression_utils.get_fileobj(dataset.file_name) as in_file:
        # FIXME: Potential encoding issue can prevent the ability to iterate over lines
        # causing set_meta process to fail otherwise OK jobs. A better solution than
        # a silent try/except is desirable.
        try:
            for line in iter_start_of_line(in_file, CHUNK_SIZE):
                line = line.strip()
                if line and not line.startswith('#'):
                    data_lines += 1
        except UnicodeDecodeError:
            log.error(f'Unable to count lines in file {dataset.file_name}')
            return None
    return data_lines
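iter_start_of_line is a Galaxy helper that is not shown in these examples. The sketch below is only an assumption about its behavior (yield at most CHUNK_SIZE characters from the start of each line so that extremely long lines never have to be held in memory), not the actual implementation:

import io

def iter_start_of_line(fh, chunk_size):
    # Assumed behavior: yield up to chunk_size characters from the start of
    # each line, then skip the rest of the line.
    while True:
        start = fh.readline(chunk_size)
        if not start:
            break
        yield start
        # If the line was longer than chunk_size, consume the remainder.
        while start and not start.endswith('\n'):
            start = fh.readline(chunk_size)

fh = io.StringIO('short line\n' + 'x' * 100 + '\n# comment\n')
print([chunk[:10] for chunk in iter_start_of_line(fh, 32)])
# ['short line', 'xxxxxxxxxx', '# comment\n']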
Example #17
def iter_headers(fname,
                 sep,
                 count=60,
                 is_multi_byte=False,
                 comment_designator=None):
    with compression_utils.get_fileobj(fname) as in_file:
        idx = 0
        for line in in_file:
            line = line.rstrip('\n\r')
            if is_multi_byte:
                # TODO: fix this - sep is never found in line
                line = unicodify(line, 'utf-8')
                sep = sep.encode('utf-8')
                if comment_designator is not None and comment_designator != '':
                    comment_designator = comment_designator.encode('utf-8')
            if comment_designator is not None and comment_designator != '' and line.startswith(
                    comment_designator):
                continue
            yield line.split(sep)
            idx += 1
            if idx == count:
                break
Example #18
def get_headers( fname, sep, count=60, is_multi_byte=False ):
    """
    Returns a list with the first 'count' lines split by 'sep'

    >>> fname = get_test_fname('complete.bed')
    >>> get_headers(fname,'\\t')
    [['chr7', '127475281', '127491632', 'NM_000230', '0', '+', '127486022', '127488767', '0', '3', '29,172,3225,', '0,10713,13126,'], ['chr7', '127486011', '127488900', 'D49487', '0', '+', '127486022', '127488767', '0', '2', '155,490,', '0,2399']]
    """
    headers = []
    in_file = compression_utils.get_fileobj(fname)
    try:
        for idx, line in enumerate(in_file):
            line = line.rstrip('\n\r')
            if is_multi_byte:
                # TODO: fix this - sep is never found in line
                line = unicodify( line, 'utf-8' )
                sep = sep.encode( 'utf-8' )
            headers.append( line.split(sep) )
            if idx == count:
                break
    finally:
        in_file.close()
    return headers
Example #19
def get_file_peek(file_name,
                  is_multi_byte=False,
                  WIDTH=256,
                  LINE_COUNT=5,
                  skipchars=None,
                  line_wrap=True):
    """
    Returns the first LINE_COUNT lines wrapped to WIDTH

    >>> fname = get_test_fname('4.bed')
    >>> get_file_peek(fname, LINE_COUNT=1)
    u'chr22\\t30128507\\t31828507\\tuc003bnx.1_cds_2_0_chr22_29227_f\\t0\\t+\\n'
    """
    # Set size for file.readline() to a negative number to force it to
    # read until either a newline or EOF.  Needed for datasets with very
    # long lines.
    if WIDTH == 'unlimited':
        WIDTH = -1
    if skipchars is None:
        skipchars = []
    lines = []
    count = 0
    file_type = None
    data_checked = False
    temp = compression_utils.get_fileobj(file_name, "U")
    try:
        while count < LINE_COUNT:
            line = temp.readline(WIDTH)
            if line and not is_multi_byte and not data_checked:
                # See if we have a compressed or binary file
                for char in line:
                    if ord(char) > 128:
                        file_type = 'binary'
                        break
                data_checked = True
                if file_type == 'binary':
                    break
            if not line_wrap:
                if line.endswith('\n'):
                    line = line[:-1]
                else:
                    while True:
                        i = temp.read(1)
                        if not i or i == '\n':
                            break
            skip_line = False
            for skipchar in skipchars:
                if line.startswith(skipchar):
                    skip_line = True
                    break
            if not skip_line:
                lines.append(line)
                count += 1
    finally:
        temp.close()
    if file_type == 'binary':
        text = "%s file" % file_type
    else:
        try:
            text = util.unicodify('\n'.join(lines))
        except UnicodeDecodeError:
            text = "binary/unknown file"
    return text
Example #20
def files_diff(file1, file2, attributes=None):
    """Check the contents of 2 files for differences."""
    def get_lines_diff(diff):
        count = 0
        for line in diff:
            if (line.startswith('+') and not line.startswith('+++')) or (line.startswith('-') and not line.startswith('---')):
                count += 1
        return count

    if not filecmp.cmp(file1, file2, shallow=False):
        if attributes is None:
            attributes = {}
        decompress = attributes.get("decompress", None)
        if decompress:
            # None means all compressed formats are allowed
            compressed_formats = None
        else:
            compressed_formats = []
        is_pdf = False
        try:
            with get_fileobj(file2, compressed_formats=compressed_formats) as fh:
                history_data = fh.readlines()
            with get_fileobj(file1, compressed_formats=compressed_formats) as fh:
                local_file = fh.readlines()
        except UnicodeDecodeError:
            if file1.endswith('.pdf') or file2.endswith('.pdf'):
                is_pdf = True
                # Replace non-Unicode characters using unicodify(),
                # difflib.unified_diff doesn't work on list of bytes
                history_data = [unicodify(l) for l in get_fileobj(file2, mode='rb', compressed_formats=compressed_formats)]
                local_file = [unicodify(l) for l in get_fileobj(file1, mode='rb', compressed_formats=compressed_formats)]
            else:
                raise AssertionError("Binary data detected, not displaying diff")
        if attributes.get('sort', False):
            local_file.sort()
            history_data.sort()
        allowed_diff_count = int(attributes.get('lines_diff', 0))
        diff = list(difflib.unified_diff(local_file, history_data, "local_file", "history_data"))
        diff_lines = get_lines_diff(diff)
        if diff_lines > allowed_diff_count:
            if 'GALAXY_TEST_RAW_DIFF' in os.environ:
                diff_slice = diff
            else:
                if len(diff) < 60:
                    diff_slice = diff[0:40]
                else:
                    diff_slice = diff[:25] + ["********\n", "*SNIP *\n", "********\n"] + diff[-25:]
            # FIXME: This pdf stuff is rather special cased and has not been updated to consider lines_diff
            # due to unknown desired behavior when used in conjunction with a non-zero lines_diff
            # PDF forgiveness can probably be handled better by not special casing by __extension__ here
            # and instead using lines_diff or a regular expression matching
            # or by creating and using a specialized pdf comparison function
            if is_pdf:
                # PDF files contain creation dates, modification dates, ids and descriptions that change with each
                # new file, so we need to handle these differences.  As long as the rest of the PDF file does
                # not differ we're ok.
                valid_diff_strs = ['description', 'createdate', 'creationdate', 'moddate', 'id', 'producer', 'creator']
                valid_diff = False
                invalid_diff_lines = 0
                for line in diff_slice:
                    # Make sure to lower case strings before checking.
                    line = line.lower()
                    # Diff lines will always start with a + or - character, but handle special cases: '--- local_file \n', '+++ history_data \n'
                    if (line.startswith('+') or line.startswith('-')) and line.find('local_file') < 0 and line.find('history_data') < 0:
                        for vdf in valid_diff_strs:
                            if line.find(vdf) < 0:
                                valid_diff = False
                            else:
                                valid_diff = True
                                # Stop checking as soon as we know we have a valid difference
                                break
                        if not valid_diff:
                            invalid_diff_lines += 1
                log.info("## files diff on '%s' and '%s': lines_diff = %d, found diff = %d, found pdf invalid diff = %d" % (file1, file2, allowed_diff_count, diff_lines, invalid_diff_lines))
                if invalid_diff_lines > allowed_diff_count:
                    # Print out diff_slice so we can see what failed
                    log.info("###### diff_slice ######")
                    raise AssertionError("".join(diff_slice))
            else:
                log.info("## files diff on '%s' and '%s': lines_diff = %d, found diff = %d" % (file1, file2, allowed_diff_count, diff_lines))
                raise AssertionError("".join(diff_slice))
Example #21
def files_diff(file1, file2, attributes=None):
    """Check the contents of 2 files for differences."""
    def get_lines_diff( diff ):
        count = 0
        for line in diff:
            if ( line.startswith( '+' ) and not line.startswith( '+++' ) ) or ( line.startswith( '-' ) and not line.startswith( '---' ) ):
                count += 1
        return count
    if not filecmp.cmp( file1, file2 ):
        files_differ = False
        if attributes is None:
            attributes = {}
        decompress = attributes.get("decompress", None)
        if not decompress:
            local_file = open( file1, 'U' ).readlines()
            history_data = open( file2, 'U' ).readlines()
        else:
            local_file = get_fileobj( file1, 'U' ).readlines()
            history_data = get_fileobj( file2, 'U' ).readlines()
        if attributes.get( 'sort', False ):
            history_data.sort()
        # Why even bother with the check loop below, why not just use the diff output? This seems wasteful.
        if len( local_file ) == len( history_data ):
            for i in range( len( history_data ) ):
                if local_file[i].rstrip( '\r\n' ) != history_data[i].rstrip( '\r\n' ):
                    files_differ = True
                    break
        else:
            files_differ = True
        if files_differ:
            allowed_diff_count = int(attributes.get( 'lines_diff', 0 ))
            diff = list( difflib.unified_diff( local_file, history_data, "local_file", "history_data" ) )
            diff_lines = get_lines_diff( diff )
            if diff_lines > allowed_diff_count:
                if 'GALAXY_TEST_RAW_DIFF' in os.environ:
                    diff_slice = diff
                else:
                    if len(diff) < 60:
                        diff_slice = diff[0:40]
                    else:
                        diff_slice = diff[:25] + ["********\n", "*SNIP *\n", "********\n"] + diff[-25:]
                # FIXME: This pdf stuff is rather special cased and has not been updated to consider lines_diff
                # due to unknown desired behavior when used in conjunction with a non-zero lines_diff
                # PDF forgiveness can probably be handled better by not special casing by __extension__ here
                # and instead using lines_diff or a regular expression matching
                # or by creating and using a specialized pdf comparison function
                if file1.endswith( '.pdf' ) or file2.endswith( '.pdf' ):
                    # PDF files contain creation dates, modification dates, ids and descriptions that change with each
                    # new file, so we need to handle these differences.  As long as the rest of the PDF file does
                    # not differ we're ok.
                    valid_diff_strs = [ 'description', 'createdate', 'creationdate', 'moddate', 'id', 'producer', 'creator' ]
                    valid_diff = False
                    invalid_diff_lines = 0
                    for line in diff_slice:
                        # Make sure to lower case strings before checking.
                        line = line.lower()
                        # Diff lines will always start with a + or - character, but handle special cases: '--- local_file \n', '+++ history_data \n'
                        if ( line.startswith( '+' ) or line.startswith( '-' ) ) and line.find( 'local_file' ) < 0 and line.find( 'history_data' ) < 0:
                            for vdf in valid_diff_strs:
                                if line.find( vdf ) < 0:
                                    valid_diff = False
                                else:
                                    valid_diff = True
                                    # Stop checking as soon as we know we have a valid difference
                                    break
                            if not valid_diff:
                                invalid_diff_lines += 1
                    log.info('## files diff on %s and %s lines_diff=%d, found diff = %d, found pdf invalid diff = %d' % (file1, file2, allowed_diff_count, diff_lines, invalid_diff_lines))
                    if invalid_diff_lines > allowed_diff_count:
                        # Print out diff_slice so we can see what failed
                        log.info("###### diff_slice ######")
                        raise AssertionError( "".join( diff_slice ) )
                else:
                    log.info('## files diff on %s and %s lines_diff=%d, found diff = %d' % (file1, file2, allowed_diff_count, diff_lines))
                    for line in diff_slice:
                        for char in line:
                            if ord( char ) > 128:
                                raise AssertionError( "Binary data detected, not displaying diff" )
                    raise AssertionError( "".join( diff_slice )  )