def files_re_match(file1, file2, attributes=None):
    """Check the contents of 2 files for differences using re.match."""
    attributes = attributes or {}
    join_char = ''
    to_strip = os.linesep
    compressed_formats = get_compressed_formats(attributes)
    try:
        with get_fileobj(file2, compressed_formats=compressed_formats) as fh:
            history_data = fh.readlines()
        with get_fileobj(file1, compressed_formats=compressed_formats) as fh:
            local_file = fh.readlines()
    except UnicodeDecodeError:
        join_char = b''
        to_strip = os.linesep.encode('utf-8')
        with open(file2, 'rb') as fh:
            history_data = fh.readlines()
        with open(file1, 'rb') as fh:
            local_file = fh.readlines()
    assert len(local_file) == len(history_data), \
        'Data File and Regular Expression File contain a different number of lines (%d != %d)\nHistory Data (first 40 lines):\n%s' % (
            len(local_file), len(history_data), join_char.join(history_data[:40]))
    if attributes.get('sort', False):
        history_data.sort()
    lines_diff = int(attributes.get('lines_diff', 0))
    line_diff_count = 0
    diffs = []
    for regex_line, data_line in zip(local_file, history_data):
        regex_line = regex_line.rstrip(to_strip)
        data_line = data_line.rstrip(to_strip)
        if not re.match(regex_line, data_line):
            line_diff_count += 1
            diffs.append(f'Regular Expression: {regex_line}, Data file: {data_line}\n')
        if line_diff_count > lines_diff:
            raise AssertionError("Regular expression did not match data file (allowed variants=%i):\n%s" % (lines_diff, "".join(diffs)))

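# Illustrative sketch (not part of the original module) of how files_re_match
# is typically invoked: file1 holds one regular expression per line, file2
# holds the data lines they must match, in the same order. The file names and
# contents below are hypothetical.
def _example_files_re_match():
    import tempfile
    with tempfile.NamedTemporaryFile('w', suffix='.regex', delete=False) as regex_file:
        regex_file.write(r'chr\d+\t\d+\t\d+' + '\n')
    with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False) as data_file:
        data_file.write('chr7\t127475281\t127491632\n')
    # Raises AssertionError if more than lines_diff lines fail to match.
    files_re_match(regex_file.name, data_file.name, attributes={'lines_diff': 0})
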
def files_contains(file1, file2, attributes=None):
    """Check the contents of file2 for substrings found in file1, on a per-line basis."""
    # TODO: allow forcing ordering of contains
    attributes = attributes or {}
    to_strip = os.linesep
    compressed_formats = get_compressed_formats(attributes)
    try:
        with get_fileobj(file2, compressed_formats=compressed_formats) as fh:
            history_data = fh.read()
        with get_fileobj(file1, compressed_formats=compressed_formats) as fh:
            local_file = fh.readlines()
    except UnicodeDecodeError:
        to_strip = os.linesep.encode('utf-8')
        with open(file2, 'rb') as fh:
            history_data = fh.read()
        with open(file1, 'rb') as fh:
            local_file = fh.readlines()
    lines_diff = int(attributes.get('lines_diff', 0))
    line_diff_count = 0
    for contains in local_file:
        contains = contains.rstrip(to_strip)
        if contains not in history_data:
            line_diff_count += 1
        if line_diff_count > lines_diff:
            raise AssertionError(f"Failed to find '{contains}' in history data. (lines_diff={lines_diff}).")

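# Illustrative sketch (not part of the original module): files_contains checks
# that every line of file1 occurs somewhere in file2 as a substring. Paths and
# contents here are hypothetical.
def _example_files_contains():
    import tempfile
    with tempfile.NamedTemporaryFile('w', delete=False) as wanted:
        wanted.write('NM_000230\nD49487\n')
    with tempfile.NamedTemporaryFile('w', delete=False) as output:
        output.write('chr7\t127475281\t127491632\tNM_000230\n'
                     'chr7\t127486011\t127488900\tD49487\n')
    # Each line of the first file must appear in the second file's contents;
    # up to lines_diff misses are tolerated.
    files_contains(wanted.name, output.name, attributes={'lines_diff': 0})
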
def get_headers(fname, sep, count=60, is_multi_byte=False, comment_designator=None):
    """
    Returns a list with the first 'count' lines split by 'sep', ignoring lines
    starting with 'comment_designator'

    >>> fname = get_test_fname('complete.bed')
    >>> get_headers(fname, '\\t')
    [['chr7', '127475281', '127491632', 'NM_000230', '0', '+', '127486022', '127488767', '0', '3', '29,172,3225,', '0,10713,13126,'], ['chr7', '127486011', '127488900', 'D49487', '0', '+', '127486022', '127488767', '0', '2', '155,490,', '0,2399']]
    >>> fname = get_test_fname('test.gff')
    >>> get_headers(fname, '\\t', count=5, comment_designator='#')
    [[''], ['chr7', 'bed2gff', 'AR', '26731313', '26731437', '.', '+', '.', 'score'], ['chr7', 'bed2gff', 'AR', '26731491', '26731536', '.', '+', '.', 'score'], ['chr7', 'bed2gff', 'AR', '26731541', '26731649', '.', '+', '.', 'score'], ['chr7', 'bed2gff', 'AR', '26731659', '26731841', '.', '+', '.', 'score']]
    """
    headers = []
    with compression_utils.get_fileobj(fname) as in_file:
        idx = 0
        for line in in_file:
            line = line.rstrip('\n\r')
            if is_multi_byte:
                # TODO: fix this - sep is never found in line
                line = unicodify(line, 'utf-8')
                sep = sep.encode('utf-8')
                if comment_designator is not None and comment_designator != '':
                    comment_designator = comment_designator.encode('utf-8')
            if comment_designator is not None and comment_designator != '' and line.startswith(
                    comment_designator):
                continue
            headers.append(line.split(sep))
            idx += 1
            if idx == count:
                break
    return headers

def get_file_peek( file_name, is_multi_byte=False, WIDTH=256, LINE_COUNT=5, skipchars=None, line_wrap=True ):
    """
    Returns the first LINE_COUNT lines wrapped to WIDTH

    >>> fname = get_test_fname('4.bed')
    >>> get_file_peek(fname, LINE_COUNT=1)
    u'chr22\\t30128507\\t31828507\\tuc003bnx.1_cds_2_0_chr22_29227_f\\t0\\t+\\n'
    """
    # Set size for file.readline() to a negative number to force it to
    # read until either a newline or EOF. Needed for datasets with very
    # long lines.
    if WIDTH == 'unlimited':
        WIDTH = -1
    if skipchars is None:
        skipchars = []
    lines = []
    count = 0
    file_type = None
    data_checked = False
    temp = compression_utils.get_fileobj( file_name, "U" )
    try:
        while count < LINE_COUNT:
            line = temp.readline( WIDTH )
            if line and not is_multi_byte and not data_checked:
                # See if we have a compressed or binary file
                for char in line:
                    if ord( char ) > 128:
                        file_type = 'binary'
                        break
                data_checked = True
            if file_type == 'binary':
                break
            if not line_wrap:
                if line.endswith('\n'):
                    line = line[:-1]
                else:
                    while True:
                        i = temp.read(1)
                        if not i or i == '\n':
                            break
            skip_line = False
            for skipchar in skipchars:
                if line.startswith( skipchar ):
                    skip_line = True
                    break
            if not skip_line:
                lines.append( line )
                count += 1
    finally:
        temp.close()
    if file_type == 'binary':
        text = "%s file" % file_type
    else:
        try:
            text = util.unicodify( '\n'.join( lines ) )
        except UnicodeDecodeError:
            text = "binary/unknown file"
    return text

def get_file_peek(file_name, is_multi_byte=False, WIDTH=256, LINE_COUNT=5, skipchars=None, line_wrap=True):
    """
    Returns the first LINE_COUNT lines wrapped to WIDTH.

    :param is_multi_byte: deprecated
    :type is_multi_byte: bool

    >>> def assert_peek_is(file_name, expected, *args, **kwd):
    ...     path = get_test_fname(file_name)
    ...     peek = get_file_peek(path, *args, **kwd)
    ...     assert peek == expected, "%s != %s" % (peek, expected)
    >>> assert_peek_is('0_nonewline', u'0')
    >>> assert_peek_is('0.txt', u'0\\n')
    >>> assert_peek_is('4.bed', u'chr22\\t30128507\\t31828507\\tuc003bnx.1_cds_2_0_chr22_29227_f\\t0\\t+\\n', LINE_COUNT=1)
    >>> assert_peek_is('1.bed', u'chr1\\t147962192\\t147962580\\tCCDS989.1_cds_0_0_chr1_147962193_r\\t0\\t-\\nchr1\\t147984545\\t147984630\\tCCDS990.1_cds_0_0_chr1_147984546_f\\t0\\t+\\n', LINE_COUNT=2)
    """
    # Set size for file.readline() to a negative number to force it to
    # read until either a newline or EOF. Needed for datasets with very
    # long lines.
    if WIDTH == 'unlimited':
        WIDTH = -1
    if skipchars is None:
        skipchars = []
    lines = []
    count = 0
    last_line_break = False
    with compression_utils.get_fileobj(file_name, "U") as temp:
        while count < LINE_COUNT:
            try:
                line = temp.readline(WIDTH)
            except UnicodeDecodeError:
                return "binary file"
            if line == "":
                break
            last_line_break = False
            if line.endswith('\n'):
                line = line[:-1]
                last_line_break = True
            elif not line_wrap:
                while True:
                    i = temp.read(1)
                    if i == '\n':
                        last_line_break = True
                    if not i or i == '\n':
                        break
            skip_line = False
            for skipchar in skipchars:
                if line.startswith(skipchar):
                    skip_line = True
                    break
            if not skip_line:
                lines.append(line)
                count += 1
    return '\n'.join(lines) + ('\n' if last_line_break else '')

def iter_headers(fname, sep, count=60, comment_designator=None):
    with compression_utils.get_fileobj(fname) as in_file:
        idx = 0
        for line in in_file:
            line = line.rstrip('\n\r')
            if comment_designator is not None and comment_designator != '' and line.startswith(comment_designator):
                continue
            yield line.split(sep)
            idx += 1
            if idx == count:
                break

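# Illustrative sketch (not part of the original module): consuming iter_headers
# lazily, e.g. to sniff the column count of a tab-separated file. The path used
# here is hypothetical.
def _example_iter_headers(path='data.tsv'):
    for fields in iter_headers(path, '\t', count=10, comment_designator='#'):
        # Each yielded item is one line split on the separator.
        print(len(fields), fields)
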
def files_re_match_multiline(file1, file2, attributes=None):
    """Check the contents of 2 files for differences using re.match in multiline mode."""
    attributes = attributes or {}
    join_char = ''
    compressed_formats = get_compressed_formats(attributes)
    try:
        with get_fileobj(file2, compressed_formats=compressed_formats) as fh:
            history_data = fh.readlines()
        with get_fileobj(file1, compressed_formats=compressed_formats) as fh:
            local_file = fh.read()
    except UnicodeDecodeError:
        join_char = b''
        with open(file2, 'rb') as fh:
            history_data = fh.readlines()
        with open(file1, 'rb') as fh:
            local_file = fh.read()
    if attributes.get('sort', False):
        history_data.sort()
    history_data = join_char.join(history_data)
    # lines_diff not applicable to multiline matching
    assert re.match(local_file, history_data, re.MULTILINE), "Multiline Regular expression did not match data file"

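# Illustrative sketch (not part of the original module): here file1 holds a
# single multiline regular expression, applied with re.MULTILINE against the
# whole of file2. Paths and contents are hypothetical.
def _example_files_re_match_multiline():
    import tempfile
    with tempfile.NamedTemporaryFile('w', delete=False) as pattern:
        # The pattern text spells out \t and \n as regex escapes.
        pattern.write(r'chr\d+\t\d+\t\d+\n' * 2)
    with tempfile.NamedTemporaryFile('w', delete=False) as data:
        data.write('chr7\t127475281\t127491632\nchr7\t127486011\t127488900\n')
    files_re_match_multiline(pattern.name, data.name)
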
def count_data_lines(self, dataset):
    """
    Count the number of lines of data in dataset,
    skipping all blank lines and comments.
    """
    data_lines = 0
    with compression_utils.get_fileobj(dataset.file_name) as in_file:
        for line in in_file:
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
    return data_lines

def estimate_file_lines(self, dataset):
    """
    Perform a rough estimate by extrapolating number of lines from a small read.
    """
    sample_size = 1048576
    try:
        with compression_utils.get_fileobj(dataset.file_name) as dataset_fh:
            dataset_read = dataset_fh.read(sample_size)
        sample_lines = dataset_read.count('\n')
        return int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
    except UnicodeDecodeError:
        log.error(f'Unable to estimate lines in file {dataset.file_name}')
        return None

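# Worked example of the extrapolation above: if the first 1 MiB sample
# (sample_size = 1048576 bytes) contains 5000 newlines and the dataset is
# 10 MiB (10485760 bytes), the estimate is
#     int(5000 * (10485760 / 1048576)) == 50000 lines.
# The estimate assumes line lengths in the sample are representative of the
# whole file; it degrades for files with highly non-uniform lines.
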
def iter_headers(fname_or_file_prefix, sep, count=60, comment_designator=None):
    idx = 0
    if isinstance(fname_or_file_prefix, FilePrefix):
        file_iterator = fname_or_file_prefix.line_iterator()
    else:
        file_iterator = compression_utils.get_fileobj(fname_or_file_prefix)
    for line in file_iterator:
        line = line.rstrip('\n\r')
        if comment_designator is not None and comment_designator != '' and line.startswith(comment_designator):
            continue
        yield line.split(sep)
        idx += 1
        if idx == count:
            break

def iter_headers(fname, sep, count=60, is_multi_byte=False, comment_designator=None):
    with compression_utils.get_fileobj(fname) as in_file:
        idx = 0
        for line in in_file:
            line = line.rstrip('\n\r')
            if is_multi_byte:
                # TODO: fix this - sep is never found in line
                line = unicodify(line, 'utf-8')
                sep = sep.encode('utf-8')
                if comment_designator is not None and comment_designator != '':
                    comment_designator = comment_designator.encode('utf-8')
            if comment_designator is not None and comment_designator != '' and line.startswith(comment_designator):
                continue
            yield line.split(sep)
            idx += 1
            if idx == count:
                break

def count_data_lines(self, dataset):
    """
    Count the number of lines of data in dataset,
    skipping all blank lines and comments.
    """
    data_lines = 0
    with compression_utils.get_fileobj(dataset.file_name) as in_file:
        # FIXME: Potential encoding issue can prevent the ability to iterate over lines
        # causing set_meta process to fail otherwise OK jobs. A better solution than
        # a silent try/except is desirable.
        try:
            for line in in_file:
                line = line.strip()
                if line and not line.startswith('#'):
                    data_lines += 1
        except Exception:
            pass
    return data_lines

def get_file_peek(file_name, is_multi_byte=False, WIDTH=256, LINE_COUNT=5, skipchars=None, line_wrap=True):
    """
    Returns the first LINE_COUNT lines wrapped to WIDTH.

    :param is_multi_byte: deprecated
    :type is_multi_byte: bool

    >>> fname = get_test_fname('4.bed')
    >>> get_file_peek(fname, LINE_COUNT=1)
    u'chr22\\t30128507\\t31828507\\tuc003bnx.1_cds_2_0_chr22_29227_f\\t0\\t+\\n'
    """
    # Set size for file.readline() to a negative number to force it to
    # read until either a newline or EOF. Needed for datasets with very
    # long lines.
    if WIDTH == 'unlimited':
        WIDTH = -1
    if skipchars is None:
        skipchars = []
    lines = []
    count = 0
    with compression_utils.get_fileobj(file_name, "U") as temp:
        while count < LINE_COUNT:
            try:
                line = temp.readline(WIDTH)
            except UnicodeDecodeError:
                return "binary file"
            if not line_wrap:
                if line.endswith('\n'):
                    line = line[:-1]
                else:
                    while True:
                        i = temp.read(1)
                        if not i or i == '\n':
                            break
            skip_line = False
            for skipchar in skipchars:
                if line.startswith(skipchar):
                    skip_line = True
                    break
            if not skip_line:
                lines.append(line)
                count += 1
    return '\n'.join(lines)

def count_data_lines(self, dataset):
    """
    Count the number of lines of data in dataset,
    skipping all blank lines and comments.
    """
    CHUNK_SIZE = 2 ** 15  # 32Kb
    data_lines = 0
    with compression_utils.get_fileobj(dataset.file_name) as in_file:
        # FIXME: Potential encoding issue can prevent the ability to iterate over lines
        # causing set_meta process to fail otherwise OK jobs. A better solution than
        # a silent try/except is desirable.
        try:
            for line in iter_start_of_line(in_file, CHUNK_SIZE):
                line = line.strip()
                if line and not line.startswith('#'):
                    data_lines += 1
        except UnicodeDecodeError:
            log.error(f'Unable to count lines in file {dataset.file_name}')
            return None
    return data_lines

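# iter_start_of_line is not defined in this section. A minimal sketch (an
# assumption, not the actual helper) of what it could look like: yield at most
# chunk_size characters from the start of each line, so the blank-line and
# '#'-comment checks above work without materializing very long lines.
def _iter_start_of_line_sketch(fh, chunk_size):
    while True:
        start = fh.readline(chunk_size)  # at most chunk_size chars of the line
        if not start:
            break
        # Discard the remainder of an over-long line before yielding its start.
        rest = start
        while rest and not rest.endswith('\n'):
            rest = fh.readline(chunk_size)
        yield start
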
def iter_headers(fname, sep, count=60, is_multi_byte=False, comment_designator=None):
    with compression_utils.get_fileobj(fname) as in_file:
        idx = 0
        for line in in_file:
            line = line.rstrip('\n\r')
            if is_multi_byte:
                # TODO: fix this - sep is never found in line
                line = unicodify(line, 'utf-8')
                sep = sep.encode('utf-8')
                if comment_designator is not None and comment_designator != '':
                    comment_designator = comment_designator.encode('utf-8')
            if comment_designator is not None and comment_designator != '' and line.startswith(
                    comment_designator):
                continue
            yield line.split(sep)
            idx += 1
            if idx == count:
                break

def get_headers( fname, sep, count=60, is_multi_byte=False ):
    """
    Returns a list with the first 'count' lines split by 'sep'

    >>> fname = get_test_fname('complete.bed')
    >>> get_headers(fname,'\\t')
    [['chr7', '127475281', '127491632', 'NM_000230', '0', '+', '127486022', '127488767', '0', '3', '29,172,3225,', '0,10713,13126,'], ['chr7', '127486011', '127488900', 'D49487', '0', '+', '127486022', '127488767', '0', '2', '155,490,', '0,2399']]
    """
    headers = []
    in_file = compression_utils.get_fileobj(fname)
    try:
        for idx, line in enumerate(in_file):
            line = line.rstrip('\n\r')
            if is_multi_byte:
                # TODO: fix this - sep is never found in line
                line = unicodify( line, 'utf-8' )
                sep = sep.encode( 'utf-8' )
            headers.append( line.split(sep) )
            if idx == count:
                break
    finally:
        in_file.close()
    return headers

def get_file_peek(file_name, is_multi_byte=False, WIDTH=256, LINE_COUNT=5, skipchars=None, line_wrap=True):
    """
    Returns the first LINE_COUNT lines wrapped to WIDTH

    >>> fname = get_test_fname('4.bed')
    >>> get_file_peek(fname, LINE_COUNT=1)
    u'chr22\\t30128507\\t31828507\\tuc003bnx.1_cds_2_0_chr22_29227_f\\t0\\t+\\n'
    """
    # Set size for file.readline() to a negative number to force it to
    # read until either a newline or EOF. Needed for datasets with very
    # long lines.
    if WIDTH == 'unlimited':
        WIDTH = -1
    if skipchars is None:
        skipchars = []
    lines = []
    count = 0
    file_type = None
    data_checked = False
    temp = compression_utils.get_fileobj(file_name, "U")
    try:
        while count < LINE_COUNT:
            line = temp.readline(WIDTH)
            if line and not is_multi_byte and not data_checked:
                # See if we have a compressed or binary file
                for char in line:
                    if ord(char) > 128:
                        file_type = 'binary'
                        break
                data_checked = True
            if file_type == 'binary':
                break
            if not line_wrap:
                if line.endswith('\n'):
                    line = line[:-1]
                else:
                    while True:
                        i = temp.read(1)
                        if not i or i == '\n':
                            break
            skip_line = False
            for skipchar in skipchars:
                if line.startswith(skipchar):
                    skip_line = True
                    break
            if not skip_line:
                lines.append(line)
                count += 1
    finally:
        temp.close()
    if file_type == 'binary':
        text = "%s file" % file_type
    else:
        try:
            text = util.unicodify('\n'.join(lines))
        except UnicodeDecodeError:
            text = "binary/unknown file"
    return text

def files_diff(file1, file2, attributes=None):
    """Check the contents of 2 files for differences."""
    def get_lines_diff(diff):
        count = 0
        for line in diff:
            if (line.startswith('+') and not line.startswith('+++')) or (line.startswith('-') and not line.startswith('---')):
                count += 1
        return count

    if not filecmp.cmp(file1, file2, shallow=False):
        if attributes is None:
            attributes = {}
        decompress = attributes.get("decompress", None)
        if decompress:
            # None means all compressed formats are allowed
            compressed_formats = None
        else:
            compressed_formats = []
        is_pdf = False
        try:
            with get_fileobj(file2, compressed_formats=compressed_formats) as fh:
                history_data = fh.readlines()
            with get_fileobj(file1, compressed_formats=compressed_formats) as fh:
                local_file = fh.readlines()
        except UnicodeDecodeError:
            if file1.endswith('.pdf') or file2.endswith('.pdf'):
                is_pdf = True
                # Replace non-Unicode characters using unicodify(),
                # difflib.unified_diff doesn't work on list of bytes
                history_data = [unicodify(l) for l in get_fileobj(file2, mode='rb', compressed_formats=compressed_formats)]
                local_file = [unicodify(l) for l in get_fileobj(file1, mode='rb', compressed_formats=compressed_formats)]
            else:
                raise AssertionError("Binary data detected, not displaying diff")
        if attributes.get('sort', False):
            local_file.sort()
            history_data.sort()
        allowed_diff_count = int(attributes.get('lines_diff', 0))
        diff = list(difflib.unified_diff(local_file, history_data, "local_file", "history_data"))
        diff_lines = get_lines_diff(diff)
        if diff_lines > allowed_diff_count:
            if 'GALAXY_TEST_RAW_DIFF' in os.environ:
                diff_slice = diff
            else:
                if len(diff) < 60:
                    diff_slice = diff[0:40]
                else:
                    diff_slice = diff[:25] + ["********\n", "*SNIP *\n", "********\n"] + diff[-25:]
            # FIXME: This pdf stuff is rather special cased and has not been updated to consider lines_diff
            # due to unknown desired behavior when used in conjunction with a non-zero lines_diff
            # PDF forgiveness can probably be handled better by not special casing by __extension__ here
            # and instead using lines_diff or a regular expression matching
            # or by creating and using a specialized pdf comparison function
            if is_pdf:
                # PDF files contain creation dates, modification dates, ids and descriptions that change with each
                # new file, so we need to handle these differences. As long as the rest of the PDF file does
                # not differ we're ok.
                valid_diff_strs = ['description', 'createdate', 'creationdate', 'moddate', 'id', 'producer', 'creator']
                valid_diff = False
                invalid_diff_lines = 0
                for line in diff_slice:
                    # Make sure to lower case strings before checking.
                    line = line.lower()
                    # Diff lines will always start with a + or - character, but handle special cases: '--- local_file \n', '+++ history_data \n'
                    if (line.startswith('+') or line.startswith('-')) and line.find('local_file') < 0 and line.find('history_data') < 0:
                        for vdf in valid_diff_strs:
                            if line.find(vdf) < 0:
                                valid_diff = False
                            else:
                                valid_diff = True
                                # Stop checking as soon as we know we have a valid difference
                                break
                        if not valid_diff:
                            invalid_diff_lines += 1
                log.info("## files diff on '%s' and '%s': lines_diff = %d, found diff = %d, found pdf invalid diff = %d" % (file1, file2, allowed_diff_count, diff_lines, invalid_diff_lines))
                if invalid_diff_lines > allowed_diff_count:
                    # Print out diff_slice so we can see what failed
                    log.info("###### diff_slice ######")
                    raise AssertionError("".join(diff_slice))
            else:
                log.info("## files diff on '%s' and '%s': lines_diff = %d, found diff = %d" % (file1, file2, allowed_diff_count, diff_lines))
                raise AssertionError("".join(diff_slice))

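# Illustrative sketch (not part of the original module): typical files_diff
# invocations. Paths are hypothetical.
#
#   files_diff('expected.bed', 'output.bed')                   # exact match required
#   files_diff('expected.bed', 'output.bed',
#              attributes={'lines_diff': 4})                   # tolerate up to 4 differing diff lines
#   files_diff('expected.txt.gz', 'output.txt.gz',
#              attributes={'decompress': True, 'sort': True})  # compare decompressed, order-insensitive
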
def files_diff(file1, file2, attributes=None):
    """Check the contents of 2 files for differences."""
    def get_lines_diff( diff ):
        count = 0
        for line in diff:
            if ( line.startswith( '+' ) and not line.startswith( '+++' ) ) or ( line.startswith( '-' ) and not line.startswith( '---' ) ):
                count += 1
        return count

    if not filecmp.cmp( file1, file2 ):
        files_differ = False
        if attributes is None:
            attributes = {}
        decompress = attributes.get("decompress", None)
        if not decompress:
            local_file = open( file1, 'U' ).readlines()
            history_data = open( file2, 'U' ).readlines()
        else:
            local_file = get_fileobj( file1, 'U' ).readlines()
            history_data = get_fileobj( file2, 'U' ).readlines()
        if attributes.get( 'sort', False ):
            history_data.sort()
        # Why even bother with the check loop below, why not just use the diff output? This seems wasteful.
        if len( local_file ) == len( history_data ):
            for i in range( len( history_data ) ):
                if local_file[i].rstrip( '\r\n' ) != history_data[i].rstrip( '\r\n' ):
                    files_differ = True
                    break
        else:
            files_differ = True
        if files_differ:
            allowed_diff_count = int(attributes.get( 'lines_diff', 0 ))
            diff = list( difflib.unified_diff( local_file, history_data, "local_file", "history_data" ) )
            diff_lines = get_lines_diff( diff )
            if diff_lines > allowed_diff_count:
                if 'GALAXY_TEST_RAW_DIFF' in os.environ:
                    diff_slice = diff
                else:
                    if len(diff) < 60:
                        diff_slice = diff[0:40]
                    else:
                        diff_slice = diff[:25] + ["********\n", "*SNIP *\n", "********\n"] + diff[-25:]
                # FIXME: This pdf stuff is rather special cased and has not been updated to consider lines_diff
                # due to unknown desired behavior when used in conjunction with a non-zero lines_diff
                # PDF forgiveness can probably be handled better by not special casing by __extension__ here
                # and instead using lines_diff or a regular expression matching
                # or by creating and using a specialized pdf comparison function
                if file1.endswith( '.pdf' ) or file2.endswith( '.pdf' ):
                    # PDF files contain creation dates, modification dates, ids and descriptions that change with each
                    # new file, so we need to handle these differences. As long as the rest of the PDF file does
                    # not differ we're ok.
                    valid_diff_strs = [ 'description', 'createdate', 'creationdate', 'moddate', 'id', 'producer', 'creator' ]
                    valid_diff = False
                    invalid_diff_lines = 0
                    for line in diff_slice:
                        # Make sure to lower case strings before checking.
                        line = line.lower()
                        # Diff lines will always start with a + or - character, but handle special cases: '--- local_file \n', '+++ history_data \n'
                        if ( line.startswith( '+' ) or line.startswith( '-' ) ) and line.find( 'local_file' ) < 0 and line.find( 'history_data' ) < 0:
                            for vdf in valid_diff_strs:
                                if line.find( vdf ) < 0:
                                    valid_diff = False
                                else:
                                    valid_diff = True
                                    # Stop checking as soon as we know we have a valid difference
                                    break
                            if not valid_diff:
                                invalid_diff_lines += 1
                    log.info('## files diff on %s and %s lines_diff=%d, found diff = %d, found pdf invalid diff = %d' % (file1, file2, allowed_diff_count, diff_lines, invalid_diff_lines))
                    if invalid_diff_lines > allowed_diff_count:
                        # Print out diff_slice so we can see what failed
                        log.info("###### diff_slice ######")
                        raise AssertionError( "".join( diff_slice ) )
                else:
                    log.info('## files diff on %s and %s lines_diff=%d, found diff = %d' % (file1, file2, allowed_diff_count, diff_lines))
                    for line in diff_slice:
                        for char in line:
                            if ord( char ) > 128:
                                raise AssertionError( "Binary data detected, not displaying diff" )
                    raise AssertionError( "".join( diff_slice ) )