def test_unicode_text_lines_replaces_null_bytes_with_space(self):
    test_file = self.get_test_loc('analysis/text-with-trailing-null-bytes.txt')
    result = list(unicode_text_lines(test_file))
    expected_file = self.get_test_loc('analysis/text-with-trailing-null-bytes.txt.expected')
    check_text_lines(result, expected_file, regen=False)
def test_unicode_text_lines_handles_weird_xml_encodings(self):
    test_file = self.get_test_loc('analysis/weird_encoding/easyconf-0.9.0.pom')
    result = list(unicode_text_lines(test_file))
    expected_file = self.get_test_loc('analysis/weird_encoding/easyconf-0.9.0.pom.expected')
    check_text_lines(result, expected_file)
def test_unicode_text_lines_handles_weird_xml_encodings(self):
    test_file = self.get_test_loc('analysis/weird_encoding/easyconf-0.9.0.pom')
    result = list(unicode_text_lines(test_file))
    expected_file = self.get_test_loc('analysis/weird_encoding/easyconf-0.9.0.pom.expected')
    with open(expected_file, 'rb') as tf:
        expected = cPickle.load(tf)
    assert expected == result
Example #4
def demarkup(location):
    """
    Return an iterator of unicode text lines for the file at `location` lightly
    stripping markup if the file is some kind of markup, such as HTML, XML, PHP,
    etc. The whitespaces are collapsed to one space.
    """
    import re
    from textcode.analysis import unicode_text_lines

    # keep the opening tag name of certain tags that contain these strings
    # note: <s> and </s> come from Debian copyright files
    kept_tags = ('lic', 'copy', 'auth', 'contr', 'leg', 'inc', '@', '<s>',
                 '</s>')

    # find opening and closing tags, entities, or the first whitespace,
    # whichever comes first
    # this regex is such that ' '.join(tags_ents(a)) == a

    tags_ents = re.compile(r'(</?[^\s></]+(?:>|\s)?|&[^\s&]+;|href)',
                           re.IGNORECASE).split

    for line in unicode_text_lines(location):
        cleaned = []
        for token in tags_ents(line):
            token_lower = token.lower()
            if token_lower.startswith(('<', '&', 'href')) and not any(
                    k in token_lower for k in kept_tags):
                continue
            cleaned.append(token)
        yield u' '.join(cleaned)
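
A minimal usage sketch for demarkup(); the file path below is hypothetical:

# Hypothetical usage: print each lightly de-markup'ed line of an HTML file.
# 'samples/index.html' is a made-up path for illustration.
for cleaned_line in demarkup('samples/index.html'):
    print(cleaned_line)
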
def cpp_includes(location, **kwargs):
    """Collect the #includes statements in a C/C++ file."""
    T = contenttype.get_type(location)
    if not T.is_c_source:
        return
    results = []
    for line in analysis.unicode_text_lines(location):
        for inc in cpp_includes_re().findall(line):
            results.append(inc)
    return dict(cpp_includes=results)
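
The cpp_includes_re() helper is referenced above but not shown; the following is a plausible sketch of such a helper (an assumption, not the original implementation), returning a compiled regex that captures the target of each #include directive:

import re

def cpp_includes_re():
    # Assumed helper: matches `#include <foo/bar.h>` and `#include "foo.h"`,
    # capturing the included path without the <> or "" delimiters.
    return re.compile(r'#\s*include\s+[<"]([^>"]+)[>"]')
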
Example #6
def demarkup(location):
    """
    Return an iterator of unicode text lines for the file at `location` lightly
    stripping markup if the file is some kind of markup, such as HTML, XML, PHP,
    etc. The whitespaces are collapsed to one space.
    """
    from textcode.analysis import unicode_text_lines

    for line in unicode_text_lines(location):
        yield demarkup_text(line)
def test_unicode_text_lines_handles_weird_xml_encodings(self):
    test_file = self.get_test_loc('analysis/weird_encoding/easyconf-0.9.0.pom')
    result = list(unicode_text_lines(test_file))
    expected_file = self.get_test_loc('analysis/weird_encoding/easyconf-0.9.0.pom.expected')
    regen = False
    if regen:
        with open(expected_file, 'wb') as tf:
            json.dump(result, tf)
    with open(expected_file, 'rb') as tf:
        expected = json.load(tf)
    assert expected == result
Example #8
def get_tokens(location, template):
    """
    Return a list of tokens from a file at location using the tokenizer
    function.
    """
    location = os.path.abspath(location)
    if not exists(location):
        return []

    tokenizr = template_tknzr if template else text_tknzr
    lines = analysis.unicode_text_lines(location)
    return list(tokenizr(lines))
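
A hedged usage sketch; the rule file path below is a placeholder:

# Hypothetical call: tokenize a plain-text rule file as non-template text.
tokens = get_tokens('rules/mit_license.RULE', template=False)
print(len(tokens))
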
Example #10
    def __init__(self, lockfile, print_errors=True):
        self.lockfile = lockfile
        self.print_errors = print_errors
        # map of a line start string to the next parsing state function
        self.STATES = {
            DEPENDENCIES: self.parse_dependency,
            PLATFORMS: self.parse_platform,
            GIT: self.parse_options,
            PATH: self.parse_options,
            SVN: self.parse_options,
            GEM: self.parse_options,
            SPECS: self.parse_spec
        }

        # the final tree of dependencies, keyed by name
        self.dependencies = OrderedDict()

        # a flat dict of all gems, keyed by name
        self.all_gems = OrderedDict()

        self.platforms = []

        self.sources = []
        self.specs = {}

        # init parsing state
        self.reset_state()

        # parse proper
        for line in analysis.unicode_text_lines(lockfile):
            line = line.rstrip()

            # reset state
            if not line:
                self.reset_state()
                continue

            # switch to new state
            if line in self.STATES:
                if line in GEM_TYPES:
                    self.current_type = line
                self.state = self.STATES[line]
                continue

            # process state
            if self.state:
                self.state(line)

        # finally refine the collected data
        self.refine()
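
Assuming this __init__ belongs to a Gemfile.lock parser class (the class name below is only a guess for illustration), usage could look like:

# Hypothetical usage: parse a Gemfile.lock and list the gems found.
# GemfileLockParser is an assumed class name; the path is made up.
parser = GemfileLockParser('Gemfile.lock')
for name, gem in parser.all_gems.items():
    print(name, gem)
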
def get_tokens(location, template, use_cache=False):
    """
    Return a list of tokens from a file at location using the tokenizer
    function.
    """
    location = os.path.abspath(location)
    if not exists(location):
        raise RuleWithNoTokenError('Rule text location does not exist: %(location)r' % locals())
#        return []

    file_name = fileutils.file_name(location)
    cached_tokens = os.path.join(cache_dir, file_name)
    if use_cache and os.path.exists(cached_tokens):
        # TODO: improve cache check
        tokens = list(load_tokens(cached_tokens))
    else:
        tokenizr = template_tknzr if template else text_tknzr
        lines = analysis.unicode_text_lines(location)
        tokens = list(tokenizr(lines))
        if use_cache:
            dump_tokens(cached_tokens, tokens)
    return tokens
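
The dump_tokens() and load_tokens() cache helpers are referenced above but not shown; a minimal sketch of what they could look like (an assumption, not the original code), using pickle for serialization:

import pickle

def dump_tokens(path, tokens):
    # Assumed helper: persist a token list to a cache file.
    with open(path, 'wb') as cache_file:
        pickle.dump(list(tokens), cache_file)

def load_tokens(path):
    # Assumed helper: yield tokens back from a cache file.
    with open(path, 'rb') as cache_file:
        for token in pickle.load(cache_file):
            yield token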