def test_multiline_comment(self):
    """Multi-line spans should close at the end of one line and reopen at
    the beginning of the next."""
    first_comment = Region('c')
    second_comment = Region('c')
    line = LINE
    # (offset, is_start, payload) triples covering a 4-line C comment:
    tags = [(0, True, first_comment),
            (79, False, first_comment),
            (80, False, line),
            (80, True, second_comment),
            (151, False, line),
            (222, False, line),
            (284, False, second_comment),
            (285, False, line),
            (286, False, line)]
    text = u"""/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
"""
    lines = split_content_lines(text)
    offsets = build_offset_map(lines)
    tag_stream = tags_per_line(balanced_tags(tags))
    actual_lines = []
    for text_line, line_tags, offset in zip(lines, tag_stream, offsets):
        actual_lines.append(
            html_line(text_line.rstrip('\r\n'), line_tags, offset))
    expected_lines = [
        '<span class="c">/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */</span>',
        '<span class="c">/* This Source Code Form is subject to the terms of the Mozilla Public</span>',
        '<span class="c"> * License, v. 2.0. If a copy of the MPL was not distributed with this</span>',
        '<span class="c"> * file, You can obtain one at http://mozilla.org/MPL/2.0/. */</span>',
        ''
    ]
    eq_(actual_lines, expected_lines)
def rev(tree, revision, path):
    """Display a page showing the file at path at specified revision by
    obtaining the contents from version control.
    """
    config = current_app.dxr_config
    tree_config = config.trees[tree]
    abs_path = join(tree_config.source_folder, path)
    contents = file_contents_at_rev(abs_path, revision)
    # Guard clause: no such file at that revision.
    if contents is None:
        raise NotFound

    image_rev = None
    if is_binary_image(path):
        # Binary images are rendered from a URL rather than inline text.
        is_text = False
        contents = ''
        image_rev = revision
    else:
        is_text, contents = decode_data(contents, tree_config.source_encoding)
        if not is_text:
            contents = ''
        elif is_textual_image(path):
            image_rev = revision

    # We do some wrapping to mimic the JSON returned by an ES lines query.
    wrapped_lines = [{'content': line}
                     for line in split_content_lines(contents)]
    return _browse_file(tree,
                        path,
                        wrapped_lines,
                        {},
                        config,
                        not is_text,
                        contents=contents,
                        image_rev=image_rev)
def text_to_html_lines(text, refs=(), regions=()):
    """Run the full pipeline, and return a list of htmlified lines of
    ``text`` with markup interspersed for ``regions``."""
    lines = split_content_lines(text)
    offsets = build_offset_map(lines)
    tag_stream = tags_per_line(finished_tags(lines, refs, regions))
    result = []
    for text_line, line_tags, offset in zip(lines, tag_stream, offsets):
        result.append(html_line(text_line, line_tags, offset))
    return result
def test_unusual_whitespace():
    """Ensure that vertical tabs and form feeds are treated as ordinary
    whitespace and not as line endings"""
    expected = [
        u"This contains 3 lines\n",
        u"This line has a vertical tab \v and a form feed \f in it\n",
        u"This is the last line\n"
    ]
    # Joining and re-splitting must round-trip: \v and \f are not
    # line boundaries.
    eq_(split_content_lines(u''.join(expected)), expected)
def test_line_boundaries():
    """Make sure we find the correct line boundaries with all sorts of line
    endings, even in files that don't end with a newline."""
    boundaries = line_boundaries(split_content_lines('abc\ndef\r\nghi\rjkl'))
    eq_([(point, is_start) for point, is_start, _ in boundaries],
        [(4, False), (9, False), (13, False), (16, False)])
def text_to_html_lines(text, refs=(), regions=()):
    """Run the full pipeline, and return a list of htmlified lines of
    ``text`` with markup interspersed for ``regions``."""
    lines = split_content_lines(text)
    tagged = tags_per_line(finished_tags(lines, refs, regions))
    return [html_line(line, line_tags, line_offset)
            for line, line_tags, line_offset
            in zip(lines, tagged, build_offset_map(lines))]
def test_line_boundaries():
    """Make sure we find the correct line boundaries with all sorts of line
    endings, even in files that don't end with a newline."""
    lines = split_content_lines('abc\ndef\r\nghi\rjkl')
    actual = []
    for point, is_start, _ in line_boundaries(lines):
        actual.append((point, is_start))
    eq_(actual, [(4, False), (9, False), (13, False), (16, False)])
def _line_offsets(self):
    """Return (and cache) a list mapping 1-based line numbers to from-BOF
    Unicode offsets."""
    # Compute lazily on first access, then memoize on the instance.
    if not hasattr(self, '_line_offset_list'):
        if not self.contains_text():
            raise ValueError("Can't get line offsets for a file that isn't"
                             " text.")
        if self.contents is None:
            lines = []
        else:
            lines = split_content_lines(self.contents)
        self._line_offset_list = build_offset_map(lines)
    return self._line_offset_list
def _line_offsets(self):
    """Return (and cache) a list mapping 1-based line numbers to from-BOF
    Unicode offsets."""
    # Lazily computed on first access and cached on the instance.
    if not hasattr(self, '_line_offset_list'):
        if not self.contains_text():
            raise ValueError("Can't get line offsets for a file that isn't"
                             " text.")
        # contents can be None even for text files; treat that as empty.
        lines = split_content_lines(
            self.contents) if self.contents is not None else []
        self._line_offset_list = build_offset_map(lines)
    return self._line_offset_list
def ast_parse(contents):
    """Return the abstract syntax parse tree of some Python file contents,
    stripped of the encoding cookie, if any.

    Solves a problem where compiling a unicode string with an encoding
    declaration is a SyntaxError in Python 2 (issue #22221).
    """
    cleaned = []
    for line_number, line in enumerate(split_content_lines(contents)):
        # The encoding declaration is only meaningful in the top two lines.
        if line_number < 2 and encoding_re.match(line):
            cleaned.append(u'\n')
        else:
            cleaned.append(line)
    return ast.parse(u''.join(cleaned))
def idl(self): """Parse the IDL file and resolve dependencies. If successful, return an IdlVisitor object which has visited the AST and has refs and needles ready for ES. Otherwise, on exception, return None.""" # Don't try again if we already excepted. if not self._idl and not self._had_idl_exception: try: self._idl = IdlVisitor(self.parser, self.contents, split_content_lines(self.contents), self.path, self.absolute_path(), self.plugin_config.include_folders, self.plugin_config.header_path, self.tree) except IDLError: self._had_idl_exception = True return self._idl
def ast_parse(contents):
    """Return the abstract syntax parse tree of some Python file contents,
    stripped of the encoding cookie, if any.

    Solves a problem where compiling a unicode string with an encoding
    declaration is a SyntaxError in Python 2 (issue #22221).
    """
    return ast.parse(
        u''.join(
            # The encoding declaration is only meaningful in the top two
            # lines, so blank out (don't delete, to preserve line numbers)
            # any match found there.
            u'\n' if i < 2 and encoding_re.match(line) else line
            for i, line in enumerate(split_content_lines(contents))
        )
    )
def test_multiline_comment(self):
    """Multi-line spans should close at the end of one line and reopen at
    the beginning of the next."""
    c = Region('c')
    c2 = Region('c')
    l = LINE
    # (offset, is_start, payload) triples: one region per physical line of
    # the comment, interleaved with LINE boundary tags.
    tags = [(0, True, c),
            (79, False, c),
            (80, False, l),
            (80, True, c2),
            (151, False, l),
            (222, False, l),
            (284, False, c2),
            (285, False, l),
            (286, False, l)]
    text = u"""/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
"""
    lines = split_content_lines(text)
    offsets = build_offset_map(lines)
    actual_lines = [html_line(text_line.rstrip('\r\n'), e, offset)
                    for text_line, e, offset in
                    zip(lines, tags_per_line(balanced_tags(tags)), offsets)]
    expected_lines = [
        '<span class="c">/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */</span>',
        '<span class="c">/* This Source Code Form is subject to the terms of the Mozilla Public</span>',
        '<span class="c"> * License, v. 2.0. If a copy of the MPL was not distributed with this</span>',
        '<span class="c"> * file, You can obtain one at http://mozilla.org/MPL/2.0/. */</span>',
        '']
    eq_(actual_lines, expected_lines)
def index_file(tree, tree_indexers, path, es, index):
    """Index a single file into ES, and build a static HTML representation
    of it.

    For the moment, we execute plugins in series, figuring that we have
    plenty of files to keep our processors busy in most trees that take
    very long. I'm a little afraid of the cost of passing potentially large
    TreesToIndex to worker processes. That goes at 52MB/s on my OS X
    laptop, measuring by the size of the pickled object and including the
    pickling and unpickling time.

    :arg path: Absolute path to the file to index
    :arg index: The ES index name

    """
    try:
        contents = unicode_contents(path, tree.source_encoding)
    except IOError as exc:
        if exc.errno == ENOENT and islink(path):
            # It's just a bad symlink (or a symlink that was swiped out
            # from under us--whatever)
            return
        else:
            raise

    # Just like index_folders, if the path is not in UTF-8, then
    # elasticsearch will not accept the path, so just move on.
    rel_path = relpath(path, tree.source_folder)
    is_text = isinstance(contents, unicode)
    is_link = islink(path)
    # Index by line if the contents are text and the path is not a symlink.
    index_by_line = is_text and not is_link
    if index_by_line:
        lines = split_content_lines(contents)
        num_lines = len(lines)
        # One needles dict and one annotations list per line:
        needles_by_line = [{} for _ in xrange(num_lines)]
        annotations_by_line = [[] for _ in xrange(num_lines)]
        refses, regionses = [], []
    needles = {}
    linkses = []

    for tree_indexer in tree_indexers:
        file_to_index = tree_indexer.file_to_index(rel_path, contents)
        if file_to_index.is_interesting():
            # Per-file stuff:
            append_update(needles, file_to_index.needles())
            if not is_link:
                linkses.append(file_to_index.links())

            # Per-line stuff:
            if index_by_line:
                refses.append(file_to_index.refs())
                regionses.append(file_to_index.regions())
                append_update_by_line(needles_by_line,
                                      file_to_index.needles_by_line())
                append_by_line(annotations_by_line,
                               file_to_index.annotations_by_line())

    def docs():
        """Yield documents for bulk indexing.

        Big Warning: docs also clears the contents of all elements of
        needles_by_line because they will no longer be used.
        """
        # Index a doc of type 'file' so we can build folder listings.
        # At the moment, we send to ES in the same worker that does the
        # indexing. We could interpose an external queueing system, but I'm
        # willing to potentially sacrifice a little speed here for the easy
        # management of self-throttling.
        file_info = stat(path)
        folder_name, file_name = split(rel_path)
        # Hard-code the keys that are hard-coded in the browse()
        # controller. Merge with the pluggable ones from needles:
        doc = dict(# Some non-array fields:
                   folder=folder_name,
                   name=file_name,
                   size=file_info.st_size,
                   is_folder=False,
                   # And these, which all get mashed into arrays:
                   **needles)
        links = dictify_links(chain.from_iterable(linkses))
        if links:
            doc['links'] = links
        yield es.index_op(doc, doc_type=FILE)

        # Index all the lines.
        if index_by_line:
            for total, annotations_for_this_line, tags in izip(
                    needles_by_line,
                    annotations_by_line,
                    es_lines(finished_tags(lines,
                                           chain.from_iterable(refses),
                                           chain.from_iterable(regionses)))):
                # Duplicate the file-wide needles into this line:
                total.update(needles)

                # We bucket tags into refs and regions for ES because later
                # at request time we want to be able to merge them
                # individually with those from skimmers.
                refs_and_regions = bucket(
                    tags,
                    lambda index_obj: "regions" if isinstance(
                        index_obj['payload'], basestring) else "refs")
                if 'refs' in refs_and_regions:
                    total['refs'] = refs_and_regions['refs']
                if 'regions' in refs_and_regions:
                    total['regions'] = refs_and_regions['regions']
                if annotations_for_this_line:
                    total['annotations'] = annotations_for_this_line
                yield es.index_op(total)

                # Because needles_by_line holds a reference, total is not
                # garbage collected. Since we won't use it again, we can
                # clear the contents, saving substantial memory on long
                # files.
                total.clear()

    # Indexing a 277K-line file all in one request makes ES time out (>60s),
    # so we chunk it up. 300 docs is optimal according to the benchmarks in
    # https://bugzilla.mozilla.org/show_bug.cgi?id=1122685. So large docs
    # like images don't make our chunk sizes ridiculous, there's a size
    # ceiling as well: 10000 is based on the 300 and an average of 31 chars
    # per line.
    for chunk in bulk_chunks(docs(),
                             docs_per_chunk=300,
                             bytes_per_chunk=10000):
        es.bulk(chunk, index=index, doc_type=LINE)
def index_file(tree, tree_indexers, path, es, index):
    """Index a single file into ES, and build a static HTML representation
    of it.

    For the moment, we execute plugins in series, figuring that we have
    plenty of files to keep our processors busy in most trees that take
    very long. I'm a little afraid of the cost of passing potentially large
    TreesToIndex to worker processes. That goes at 52MB/s on my OS X
    laptop, measuring by the size of the pickled object and including the
    pickling and unpickling time.

    :arg path: Bytestring absolute path to the file to index
    :arg index: The ES index name

    """
    try:
        contents = unicode_contents(path, tree.source_encoding)
    except IOError as exc:
        if exc.errno == ENOENT and islink(path):
            # It's just a bad symlink (or a symlink that was swiped out
            # from under us--whatever)
            return
        else:
            raise

    # Just like index_folders, if the path is not in UTF-8, then
    # elasticsearch will not accept the path, so just move on.
    rel_path = relpath(path, tree.source_folder)
    is_text = isinstance(contents, unicode)
    is_link = islink(path)
    # Index by line if the contents are text and the path is not a symlink.
    index_by_line = is_text and not is_link
    if index_by_line:
        lines = split_content_lines(contents)
        num_lines = len(lines)
        # One needles dict and one annotations list per line:
        needles_by_line = [{} for _ in xrange(num_lines)]
        annotations_by_line = [[] for _ in xrange(num_lines)]
        refses, regionses = [], []
    needles = {}
    linkses = []

    for tree_indexer in tree_indexers:
        file_to_index = tree_indexer.file_to_index(rel_path, contents)
        if file_to_index.is_interesting():
            # Per-file stuff:
            append_update(needles, file_to_index.needles())
            if not is_link:
                linkses.append(file_to_index.links())

            # Per-line stuff:
            if index_by_line:
                refses.append(file_to_index.refs())
                regionses.append(file_to_index.regions())
                append_update_by_line(needles_by_line,
                                      file_to_index.needles_by_line())
                append_by_line(annotations_by_line,
                               file_to_index.annotations_by_line())

    def docs():
        """Yield documents for bulk indexing.

        Big Warning: docs also clears the contents of all elements of
        needles_by_line because they will no longer be used.
        """
        # Index a doc of type 'file' so we can build folder listings.
        # At the moment, we send to ES in the same worker that does the
        # indexing. We could interpose an external queueing system, but I'm
        # willing to potentially sacrifice a little speed here for the easy
        # management of self-throttling.
        file_info = stat(path)
        folder_name, file_name = split(rel_path)
        # Hard-code the keys that are hard-coded in the browse()
        # controller. Merge with the pluggable ones from needles:
        doc = dict(
            # Some non-array fields:
            folder=unicode_for_display(folder_name),
            name=unicode_for_display(file_name),
            size=file_info.st_size,
            is_folder=False,
            # And these, which all get mashed into arrays:
            **needles)
        links = dictify_links(chain.from_iterable(linkses))
        if links:
            doc['links'] = links
        yield es.index_op(doc, doc_type=FILE)

        # Index all the lines.
        if index_by_line:
            for total, annotations_for_this_line, tags in izip(
                    needles_by_line,
                    annotations_by_line,
                    es_lines(
                        finished_tags(lines,
                                      chain.from_iterable(refses),
                                      chain.from_iterable(regionses)))):
                # Duplicate the file-wide needles into this line:
                total.update(needles)

                # We bucket tags into refs and regions for ES because later
                # at request time we want to be able to merge them
                # individually with those from skimmers.
                refs_and_regions = bucket(
                    tags,
                    lambda index_obj: "regions" if isinstance(
                        index_obj['payload'], basestring) else "refs")
                if 'refs' in refs_and_regions:
                    total['refs'] = refs_and_regions['refs']
                if 'regions' in refs_and_regions:
                    total['regions'] = refs_and_regions['regions']
                if annotations_for_this_line:
                    total['annotations'] = annotations_for_this_line
                yield es.index_op(total)

                # Because needles_by_line holds a reference, total is not
                # garbage collected. Since we won't use it again, we can
                # clear the contents, saving substantial memory on long
                # files.
                total.clear()

    # Indexing a 277K-line file all in one request makes ES time out (>60s),
    # so we chunk it up. 300 docs is optimal according to the benchmarks in
    # https://bugzilla.mozilla.org/show_bug.cgi?id=1122685. So large docs
    # like images don't make our chunk sizes ridiculous, there's a size
    # ceiling as well: 10000 is based on the 300 and an average of 31 chars
    # per line.
    for chunk in bulk_chunks(docs(),
                             docs_per_chunk=300,
                             bytes_per_chunk=10000):
        es.bulk(chunk, index=index, doc_type=LINE)
def needles_by_line(self):
    """Fill out line number and content for every line."""
    line_number = 0
    for text in split_content_lines(self.contents):
        line_number += 1
        yield [('number', line_number), ('content', text)]