def test_unicode_text_lines_replaces_null_bytes_with_space(self):
    """Check that NUL bytes in a text file are decoded as spaces."""
    input_path = self.get_test_loc(
        'analysis/text-with-trailing-null-bytes.txt')
    expected_path = self.get_test_loc(
        'analysis/text-with-trailing-null-bytes.txt.expected')
    actual = list(unicode_text_lines(input_path))
    check_text_lines(actual, expected_path, regen=False)
def test_unicode_text_lines_handles_weird_xml_encodings(self):
    """Check decoding of a POM file that uses an unusual XML encoding."""
    pom = self.get_test_loc(
        'analysis/weird_encoding/easyconf-0.9.0.pom')
    decoded = list(unicode_text_lines(pom))
    expected_path = self.get_test_loc(
        'analysis/weird_encoding/easyconf-0.9.0.pom.expected')
    check_text_lines(decoded, expected_path)
def test_unicode_text_lines_handles_weird_xml_encodings(self):
    """Compare decoded lines of an oddly-encoded POM with pickled expectations."""
    pom = self.get_test_loc('analysis/weird_encoding/easyconf-0.9.0.pom')
    decoded = list(unicode_text_lines(pom))
    expected_loc = self.get_test_loc(
        'analysis/weird_encoding/easyconf-0.9.0.pom.expected')
    with open(expected_loc, 'rb') as expect_file:
        expected = cPickle.load(expect_file)
    assert expected == decoded
def demarkup(location):
    """
    Yield unicode text lines for the file at `location`, lightly stripping
    markup if the file is some kind of markup such as HTML, XML or PHP.
    Whitespace is collapsed to a single space.
    """
    from textcode.analysis import unicode_text_lines

    # Opening-tag name fragments worth preserving: these typically flag
    # license, copyright or authorship text. The <s>/</s> entries come from
    # Debian copyright files.
    kept_tags = (
        'lic', 'copy', 'auth', 'contr', 'leg', 'inc', '@', '<s>', '</s>',
    )

    # Split a line on opening/closing tags (up to the first ">" or the first
    # whitespace, whichever comes first), on entities, or on "href". The
    # pattern is built so that ' '.join(splitter(a)) round-trips a.
    splitter = re.compile(
        r'(</?[^\s></]+(?:>|\s)?|&[^\s&]+;|href)', re.IGNORECASE).split

    for line in unicode_text_lines(location):
        kept = []
        for token in splitter(line):
            low = token.lower()
            looks_like_markup = low.startswith(('<', '&', 'href'))
            if looks_like_markup and not any(k in low for k in kept_tags):
                # drop markup tokens that carry no kept-tag marker
                continue
            kept.append(token)
        yield u' '.join(kept)
def cpp_includes(location, **kwargs):
    """
    Return a mapping of the #include statements collected from the C/C++
    source file at `location`, or None if the file is not C source.
    """
    file_type = contenttype.get_type(location)
    if not file_type.is_c_source:
        return
    includes = []
    for text_line in analysis.unicode_text_lines(location):
        includes.extend(cpp_includes_re().findall(text_line))
    return dict(cpp_includes=includes)
def demarkup(location):
    """
    Yield unicode text lines for the file at `location` with markup (HTML,
    XML, PHP, etc.) lightly stripped and whitespace collapsed to one space.
    """
    # Imported lazily to avoid a module-level import cycle.
    from textcode.analysis import unicode_text_lines

    for text_line in unicode_text_lines(location):
        yield demarkup_text(text_line)
def test_unicode_text_lines_handles_weird_xml_encodings(self):
    """
    Check decoding of an oddly-encoded POM file against JSON-serialized
    expected lines. Flip `regen` to True to regenerate the expectation file.
    """
    test_file = self.get_test_loc(
        'analysis/weird_encoding/easyconf-0.9.0.pom')
    result = list(unicode_text_lines(test_file))
    expected_file = self.get_test_loc(
        'analysis/weird_encoding/easyconf-0.9.0.pom.expected')
    regen = False
    if regen:
        # json.dump writes str: the file must be opened in text mode.
        # The previous 'wb' mode raises TypeError on Python 3.
        with open(expected_file, 'w') as tf:
            json.dump(result, tf)
    with open(expected_file) as tf:
        expected = json.load(tf)
    assert expected == result
def get_tokens(location, template):
    """
    Return a list of tokens from the file at `location`, tokenized with the
    template tokenizer when `template` is True and with the plain text
    tokenizer otherwise. Return an empty list if `location` does not exist.
    """
    location = os.path.abspath(location)
    if not exists(location):
        return []
    tokenize = template_tknzr if template else text_tknzr
    return list(tokenize(analysis.unicode_text_lines(location)))
def __init__(self, lockfile, print_errors=True):
    """
    Parse the Gemfile-style `lockfile` at construction time, populating
    `self.dependencies`, `self.all_gems`, `self.platforms`, `self.sources`
    and `self.specs`.
    """
    self.lockfile = lockfile
    self.print_errors = print_errors
    # map of a line start string to the next parsing state function
    self.STATES = {
        DEPENDENCIES: self.parse_dependency,
        PLATFORMS: self.parse_platform,
        GIT: self.parse_options,
        PATH: self.parse_options,
        SVN: self.parse_options,
        GEM: self.parse_options,
        SPECS: self.parse_spec
    }
    # the final tree of dependencies, keyed by name
    self.dependencies = OrderedDict()
    # a flat dict of all gems, keyed by name
    self.all_gems = OrderedDict()
    self.platforms = []
    self.sources = []
    self.specs = {}
    # init parsing state
    self.reset_state()
    # parse proper: the lockfile is a line-oriented state machine where a
    # section header line selects the handler for the lines that follow
    for line in analysis.unicode_text_lines(lockfile):
        line = line.rstrip()
        # reset state: a blank line ends the current section
        if not line:
            self.reset_state()
            continue
        # switch to new state when the line is a known section header;
        # remember the source type for sections that describe gem sources
        if line in self.STATES:
            if line in GEM_TYPES:
                self.current_type = line
            self.state = self.STATES[line]
            continue
        # process state: hand a regular line to the current section handler
        if self.state:
            self.state(line)
    # finally refine the collected data
    self.refine()
def get_tokens(location, template, use_cache=False):
    """
    Return a list of tokens from the rule text file at `location`, tokenized
    with the template tokenizer when `template` is True and with the plain
    text tokenizer otherwise. When `use_cache` is True, reuse tokens cached
    in `cache_dir` keyed by file name, and cache freshly computed tokens.

    Raise a RuleWithNoTokenError if `location` does not exist.
    """
    location = os.path.abspath(location)
    if not exists(location):
        raise RuleWithNoTokenError(
            'Rule text location does not exist: %(location)r' % locals())

    file_name = fileutils.file_name(location)
    cached_tokens = os.path.join(cache_dir, file_name)
    if use_cache and os.path.exists(cached_tokens):
        # TODO: the cache is keyed on file name only; add a freshness check
        # (mtime or checksum) so stale cached tokens are not reused.
        tokens = list(load_tokens(cached_tokens))
    else:
        # Explicit conditional expression instead of the fragile
        # `cond and a or b` idiom.
        tokenizr = template_tknzr if template else text_tknzr
        lines = analysis.unicode_text_lines(location)
        tokens = list(tokenizr(lines))
        if use_cache:
            dump_tokens(cached_tokens, tokens)
    return tokens