import codecs
import pprint

import hughml


def _compare_scanners(py_data, c_data, verbose):
    # Scan the same input with the pure-Python loader and the C loader and
    # check that both produce identical token streams.
    py_tokens = list(hughml.scan(py_data, Loader=hughml.PyLoader))
    c_tokens = []
    try:
        for token in hughml.scan(c_data, Loader=hughml.CLoader):
            c_tokens.append(token)
        assert len(py_tokens) == len(c_tokens), (len(py_tokens), len(c_tokens))
        for py_token, c_token in zip(py_tokens, c_tokens):
            assert py_token.__class__ == c_token.__class__, (py_token, c_token)
            if hasattr(py_token, 'value'):
                assert py_token.value == c_token.value, (py_token, c_token)
            if isinstance(py_token, hughml.StreamEndToken):
                continue
            # Compare the start/end marks (index, line, column) of each token.
            py_start = (py_token.start_mark.index,
                        py_token.start_mark.line, py_token.start_mark.column)
            py_end = (py_token.end_mark.index,
                      py_token.end_mark.line, py_token.end_mark.column)
            c_start = (c_token.start_mark.index,
                       c_token.start_mark.line, c_token.start_mark.column)
            c_end = (c_token.end_mark.index,
                     c_token.end_mark.line, c_token.end_mark.column)
            assert py_start == c_start, (py_start, c_start)
            assert py_end == c_end, (py_end, c_end)
    finally:
        if verbose:
            print("PY_TOKENS:")
            pprint.pprint(py_tokens)
            print("C_TOKENS:")
            pprint.pprint(c_tokens)
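
# Hedged usage sketch for _compare_scanners: the filename below is a
# hypothetical placeholder. The same raw data is handed to both scanners,
# and verbose=True dumps both token lists when the comparison fails.
def _example_compare_scanners(filename='example.hughml'):
    with open(filename, 'rb') as stream:
        data = stream.read()
    # Both the pure-Python and the C scanner receive the same data.
    _compare_scanners(data, data, verbose=True)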
def test_scanner(data_filename, canonical_filename, verbose=False):
    for filename in [data_filename, canonical_filename]:
        tokens = []
        try:
            for token in hughml.scan(open(filename, 'rb')):
                tokens.append(token.__class__.__name__)
        finally:
            if verbose:
                pprint.pprint(tokens)
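
# Hedged usage sketch for test_scanner: both filenames are hypothetical
# placeholders for a data fixture and its canonical counterpart; the call
# only checks that each document scans without raising, and prints the token
# class names in verbose mode.
def _example_test_scanner():
    test_scanner('fixture.hughml', 'fixture.canonical', verbose=True)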
def test_tokens(data_filename, tokens_filename, verbose=False):
    tokens1 = []
    tokens2 = open(tokens_filename, 'r').read().split()
    try:
        for token in hughml.scan(open(data_filename, 'rb')):
            if not isinstance(
                    token, (hughml.StreamStartToken, hughml.StreamEndToken)):
                tokens1.append(_replaces[token.__class__])
    finally:
        if verbose:
            print("TOKENS1:", ' '.join(tokens1))
            print("TOKENS2:", ' '.join(tokens2))
    assert len(tokens1) == len(tokens2), (tokens1, tokens2)
    for token1, token2 in zip(tokens1, tokens2):
        assert token1 == token2, (token1, token2)
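
# test_tokens relies on a module-level _replaces mapping defined elsewhere,
# pairing each token class with the short marker expected in the .tokens
# fixtures. The sketch below is an assumption for illustration only (both the
# name _example_replaces and the marker strings), not the canonical table.
_example_replaces = {
    hughml.DocumentStartToken: '---',
    hughml.DocumentEndToken: '...',
    hughml.ScalarToken: '_',
    hughml.KeyToken: '?',
    hughml.ValueToken: ':',
}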
def highlight(self):
    input = self.input.read()
    # Decode the input, honoring a UTF-16 BOM if present; default to UTF-8.
    if input.startswith(codecs.BOM_UTF16_LE):
        input = unicode(input, 'utf-16-le')
    elif input.startswith(codecs.BOM_UTF16_BE):
        input = unicode(input, 'utf-16-be')
    else:
        input = unicode(input, 'utf-8')
    substitutions = self.style.substitutions
    tokens = hughml.scan(input)
    events = hughml.parse(input)
    markers = []
    # Collect insertion markers for tokens. The weight fields (+2/-2 for
    # tokens, +1/-1 for events) control ordering at equal indices so that
    # token markup nests inside event markup.
    number = 0
    for token in tokens:
        number += 1
        if token.start_mark.index != token.end_mark.index:
            cls = token.__class__
            if (cls, -1) in substitutions:
                markers.append([token.start_mark.index, +2, number,
                                substitutions[cls, -1]])
            if (cls, +1) in substitutions:
                markers.append([token.end_mark.index, -2, number,
                                substitutions[cls, +1]])
    # Collect insertion markers for parser events.
    number = 0
    for event in events:
        number += 1
        cls = event.__class__
        if (cls, -1) in substitutions:
            markers.append([event.start_mark.index, +1, number,
                            substitutions[cls, -1]])
        if (cls, +1) in substitutions:
            markers.append([event.end_mark.index, -1, number,
                            substitutions[cls, +1]])
    markers.sort()
    markers.reverse()
    # Walk the document backwards, applying the literal replaces to each text
    # chunk and splicing in the markup at the recorded indices.
    chunks = []
    position = len(input)
    for index, weight1, weight2, substitution in markers:
        if index < position:
            chunk = input[index:position]
            for substring, replacement in self.style.replaces:
                chunk = chunk.replace(substring, replacement)
            chunks.append(chunk)
            position = index
        chunks.append(substitution)
    chunks.reverse()
    result = u''.join(chunks)
    if self.style.header:
        self.output.write(self.style.header)
    self.output.write(result.encode('utf-8'))
    if self.style.footer:
        self.output.write(self.style.footer)
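
# highlight() expects self.style to expose four attributes. The class below
# is a minimal sketch of that contract under assumed HTML-flavored values;
# the class name, markup, and escape table are illustrative assumptions, not
# the styles shipped with the tool.
class _ExampleStyle(object):
    # (class, -1) -> markup emitted before a token/event,
    # (class, +1) -> markup emitted after it.
    substitutions = {
        (hughml.ScalarToken, -1): u'<span class="scalar">',
        (hughml.ScalarToken, +1): u'</span>',
    }
    # Literal (substring, replacement) pairs applied to the document text.
    replaces = [(u'&', u'&amp;'), (u'<', u'&lt;'), (u'>', u'&gt;')]
    header = u'<pre>'
    footer = u'</pre>'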
def canonical_scan(stream):
    return hughml.scan(stream, Loader=CanonicalLoader)
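
# Hedged usage sketch for canonical_scan: CanonicalLoader is defined
# elsewhere and the filename is a hypothetical placeholder; the helper just
# materializes the token stream of a canonical-format document.
def _example_canonical_scan(filename='fixture.canonical'):
    with open(filename, 'rb') as stream:
        return list(canonical_scan(stream))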