def docs():
    """Yield documents for bulk indexing.

    Big Warning: docs also clears the contents of all elements of
    needles_by_line because they will no longer be used.

    """
    # Index a doc of type 'file' so we can build folder listings.
    # At the moment, we send to ES in the same worker that does the
    # indexing. We could interpose an external queueing system, but I'm
    # willing to potentially sacrifice a little speed here for the easy
    # management of self-throttling.
    file_info = stat(path)
    folder_name, file_name = split(rel_path)
    # Hard-code the keys that are hard-coded in the browse()
    # controller. Merge with the pluggable ones from needles:
    doc = dict(# Some non-array fields:
               folder=folder_name,
               name=file_name,
               size=file_info.st_size,
               modified=datetime.fromtimestamp(file_info.st_mtime),
               is_folder=False,

               # And these, which all get mashed into arrays:
               **needles)
    links = dictify_links(chain.from_iterable(linkses))
    if links:
        doc['links'] = links
    yield es.index_op(doc, doc_type=FILE)

    # Index all the lines.
    if index_by_line:
        for total, annotations_for_this_line, tags in izip(
                needles_by_line,
                annotations_by_line,
                es_lines(finished_tags(lines,
                                       chain.from_iterable(refses),
                                       chain.from_iterable(regionses)))):
            # Duplicate the file-wide needles into this line:
            total.update(needles)

            # We bucket tags into refs and regions for ES because later at
            # request time we want to be able to merge them individually
            # with those from skimmers.
            refs_and_regions = bucket(
                tags,
                lambda index_obj: "regions" if isinstance(
                    index_obj['payload'], basestring) else "refs")
            if 'refs' in refs_and_regions:
                total['refs'] = refs_and_regions['refs']
            if 'regions' in refs_and_regions:
                total['regions'] = refs_and_regions['regions']
            if annotations_for_this_line:
                total['annotations'] = annotations_for_this_line
            yield es.index_op(total)

            # Because needles_by_line holds a reference, total is not
            # garbage collected. Since we won't use it again, we can clear
            # the contents, saving substantial memory on long files.
            total.clear()
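A later revision of docs() below inlines the link-shaping step, which suggests roughly what dictify_links does. Here is a minimal sketch under that assumption; the helper name comes from the call above, but this body is inferred, not the actual implementation:

# Sketch of dictify_links, inferred from the inlined comprehension in the
# next revision of docs(); an assumption, not the real implementation.
def dictify_links(links):
    """Turn (order, heading, [(icon, title, href), ...]) tuples into the
    dicts stored on the FILE document's 'links' field."""
    return [{'order': order,
             'heading': heading,
             'items': [{'icon': icon, 'title': title, 'href': href}
                       for icon, title, href in items]}
            for order, heading, items in links]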
def docs():
    """Yield documents for bulk indexing."""
    # Index a doc of type 'file' so we can build folder listings.
    # At the moment, we send to ES in the same worker that does the
    # indexing. We could interpose an external queueing system, but I'm
    # willing to potentially sacrifice a little speed here for the easy
    # management of self-throttling.
    file_info = stat(path)
    folder_name, file_name = split(rel_path)
    # Hard-code the keys that are hard-coded in the browse()
    # controller. Merge with the pluggable ones from needles:
    doc = dict(# Some non-array fields:
               folder=folder_name,
               name=file_name,
               size=file_info.st_size,
               modified=datetime.fromtimestamp(file_info.st_mtime),
               is_folder=False,

               # And these, which all get mashed into arrays:
               **needles)
    links = [{'order': order,
              'heading': heading,
              'items': [{'icon': icon,
                         'title': title,
                         'href': href} for icon, title, href in items]}
             for order, heading, items in chain.from_iterable(linkses)]
    if links:
        doc['links'] = links
    yield es.index_op(doc, doc_type=FILE)

    # Index all the lines.
    if index_by_line:
        for total, annotations_for_this_line, tags in izip(
                needles_by_line,
                annotations_by_line,
                es_lines(finished_tags(lines,
                                       chain.from_iterable(refses),
                                       chain.from_iterable(regionses)))):
            # Duplicate the file-wide needles into this line:
            total.update(needles)

            # We bucket tags into refs and regions for ES because later at
            # request time we want to be able to merge them individually
            # with those from skimmers.
            refs_and_regions = bucket(
                tags,
                lambda index_obj: "regions" if isinstance(
                    index_obj['payload'], basestring) else "refs")
            if 'refs' in refs_and_regions:
                total['refs'] = refs_and_regions['refs']
            if 'regions' in refs_and_regions:
                total['regions'] = refs_and_regions['regions']
            if annotations_for_this_line:
                total['annotations'] = annotations_for_this_line
            yield es.index_op(total)
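docs() relies on a bucket() helper to split tags into 'refs' and 'regions'. Its usage (membership tests plus subscripting) only requires a dict of lists, so a plausible sketch, assuming it groups items by the key function:

from collections import defaultdict

# Hypothetical stand-in for bucket(): group items into lists keyed by
# key_function(item). The real helper may differ; docs() only needs the
# returned mapping to support `in` tests and subscripting.
def bucket(items, key_function):
    buckets = defaultdict(list)
    for item in items:
        buckets[key_function(item)].append(item)
    return buckets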
def text_to_html_lines(text, refs=(), regions=()):
    """Run the full pipeline, and return a list of htmlified lines of
    ``text`` with markup interspersed for ``refs`` and ``regions``."""
    lines = split_content_lines(text)
    offsets = build_offset_map(lines)
    return [html_line(text_line, e, o)
            for (text_line, e, o) in
            zip(lines,
                tags_per_line(finished_tags(lines, refs, regions)),
                offsets)]
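The variant of text_to_html_lines below inlines these two helpers, which suggests minimal definitions like the following; treat the bodies as assumptions inferred from that variant rather than the canonical source:

# Inferred from the splitlines/cumulative_sum variant below; assumptions,
# not the canonical definitions.
def split_content_lines(text):
    # Keep the terminators so each line's length includes its newline.
    return text.splitlines(True)

def build_offset_map(lines):
    # Starting offset of each line within the file.
    return cumulative_sum(map(len, lines))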
def text_to_html_lines(text, refs=(), regions=()):
    """Run the full pipeline, and return a list of htmlified lines of
    ``text`` with markup interspersed for ``refs`` and ``regions``."""
    lines = text.splitlines(True)
    offsets = cumulative_sum(map(len, lines))
    return [html_line(text_line, e, o)
            for (text_line, e, o) in
            zip(lines,
                tags_per_line(finished_tags(lines, refs, regions)),
                offsets)]
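cumulative_sum has to produce one offset per line for the zip with html_line. A sketch, assuming html_line() wants each line's starting offset from the beginning of the file, so the series begins at 0; if it wants ending offsets instead, the yield would move after the addition:

# Assumed behavior: running totals starting at 0, one per input, so each
# line is paired with its starting offset in the file.
def cumulative_sum(nums):
    total = 0
    for num in nums:
        yield total
        total += num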
def docs():
    """Yield documents for bulk indexing."""
    # Index a doc of type 'file' so we can build folder listings.
    # At the moment, we send to ES in the same worker that does the
    # indexing. We could interpose an external queueing system, but I'm
    # willing to potentially sacrifice a little speed here for the easy
    # management of self-throttling.
    file_info = stat(path)
    folder_name, file_name = split(rel_path)
    # Hard-code the keys that are hard-coded in the browse()
    # controller. Merge with the pluggable ones from needles:
    doc = dict(# Some non-array fields:
               folder=folder_name,
               name=file_name,
               size=file_info.st_size,
               modified=datetime.fromtimestamp(file_info.st_mtime),
               is_folder=False,

               # And these, which all get mashed into arrays:
               **needles)
    links = [{'order': order,
              'heading': heading,
              'items': [{'icon': icon,
                         'title': title,
                         'href': href} for icon, title, href in items]}
             for order, heading, items in chain.from_iterable(linkses)]
    if links:
        doc['links'] = links
    yield es.index_op(doc, doc_type=FILE)

    # Index all the lines. If it's an empty file (no lines), don't bother
    # ES. It hates empty dicts.
    if is_text and needles_by_line:
        for total, annotations_for_this_line, tags in izip(
                needles_by_line,
                annotations_by_line,
                es_lines(finished_tags(lines,
                                       chain.from_iterable(refses),
                                       chain.from_iterable(regionses)))):
            # Duplicate the file-wide needles into this line:
            total.update(needles)

            # We bucket tags into refs and regions for ES because later at
            # request time we want to be able to merge them individually
            # with those from skimmers.
            refs_and_regions = bucket(
                tags,
                lambda index_obj: "regions" if isinstance(
                    index_obj['payload'], basestring) else "refs")
            if 'refs' in refs_and_regions:
                total['refs'] = refs_and_regions['refs']
            if 'regions' in refs_and_regions:
                total['regions'] = refs_and_regions['regions']
            if annotations_for_this_line:
                total['annotations'] = annotations_for_this_line
            yield es.index_op(total)
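The index_op() calls look like pyelasticsearch's bulk-indexing API. If that is the client behind es, the generator would be consumed along these lines; the index name and the LINE default doc type are assumptions for illustration:

# Hypothetical consumer, assuming es is a pyelasticsearch ElasticSearch
# client, whose bulk() accepts an iterable of index_op() results. FILE docs
# carry their own doc_type above; LINE is assumed as the default for the
# per-line docs.
es.bulk(docs(), index='dxr_some_tree', doc_type=LINE)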
def _browse_file(tree, path, line_docs, file_doc, config, is_binary,
                 date=None, contents=None, image_rev=None):
    """Return a rendered page displaying a source file.

    :arg string tree: name of tree on which file is found
    :arg string path: relative path from tree root of file
    :arg list line_docs: LINE documents as defined in the mapping of core.py,
        where the `content` field is dereferenced
    :arg file_doc: the FILE document as defined in core.py
    :arg config: TreeConfig object of this tree
    :arg is_binary: Whether file is binary or not
    :arg date: a formatted string representing the generated date, default to
        now
    :arg string contents: the contents of the source file, defaults to joining
        the `content` field of all line_docs
    :arg image_rev: revision number of a textual or binary image, for images
        displayed at a certain rev

    """
    def process_link_templates(sections):
        """Look for {{line}} in the links of given sections, and duplicate
        them onto a 'template' field.

        """
        for section in sections:
            for link in section['items']:
                if '{{line}}' in link['href']:
                    link['template'] = link['href']
                    link['href'] = link['href'].replace('{{line}}', '')

    def sidebar_links(sections):
        """Return data structure to build nav sidebar from. ::

            [('Section Name', [{'icon': ..., 'title': ..., 'href': ...}])]

        """
        process_link_templates(sections)
        # Sort by order, resolving ties by section name:
        return sorted(sections, key=lambda section: (section['order'],
                                                     section['heading']))

    if not date:
        # Then assume that the file is generated now. Remark: we can't use
        # this as the default param because that is only evaluated once, so
        # the same time would always be used.
        date = datetime.utcnow().strftime("%a, %d %b %Y %H:%M:%S +0000")

    common = _build_common_file_template(tree, path, is_binary, date, config)
    links = file_doc.get('links', [])
    if is_binary_image(path):
        return render_template(
            'image_file.html',
            **merge(common, {
                'sections': sidebar_links(links),
                'revision': image_rev}))
    elif is_binary:
        return render_template(
            'text_file.html',
            **merge(common, {
                'lines': [],
                'is_binary': True,
                'sections': sidebar_links(links)}))
    else:
        # We concretize the lines into a list because we iterate over it
        # multiple times.
        lines = [doc['content'] for doc in line_docs]
        if not contents:
            # If contents are not provided, we can reconstruct them by
            # stitching the lines together.
            contents = ''.join(lines)
        offsets = build_offset_map(lines)
        tree_config = config.trees[tree]
        if is_textual_image(path) and image_rev:
            # Add a link to view textual images on revs:
            links.extend(dictify_links([
                (4, 'Image', [('svgview', 'View',
                               url_for('.raw_rev',
                                       tree=tree_config.name,
                                       path=path,
                                       revision=image_rev))])]))
        # Construct skimmer objects for all enabled plugins that define a
        # file_to_skim class.
        skimmers = [plugin.file_to_skim(path, contents, plugin.name,
                                        tree_config, file_doc, line_docs)
                    for plugin in tree_config.enabled_plugins
                    if plugin.file_to_skim]
        skim_links, refses, regionses, annotationses = skim_file(
            skimmers, len(line_docs))
        index_refs = (Ref.es_to_triple(ref, tree_config)
                      for ref in chain.from_iterable(
                          doc.get('refs', []) for doc in line_docs))
        index_regions = (Region.es_to_triple(region)
                         for region in chain.from_iterable(
                             doc.get('regions', []) for doc in line_docs))
        tags = finished_tags(lines,
                             chain(chain.from_iterable(refses), index_refs),
                             chain(chain.from_iterable(regionses),
                                   index_regions))
        return render_template(
            'text_file.html',
            **merge(common, {
                # Someday, it would be great to stream this and not
                # concretize the whole thing in RAM. The template will have
                # to quit looping through the whole thing 3 times.
                'lines': [(html_line(doc['content'], tags_in_line, offset),
                           doc.get('annotations', []) + skim_annotations)
                          for doc, tags_in_line, offset, skim_annotations in
                          izip(line_docs, tags_per_line(tags), offsets,
                               annotationses)],
                'sections': sidebar_links(links + skim_links),
                'query': request.args.get('q', ''),
                'bubble': request.args.get('redirect_type')}))
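Each branch of _browse_file() builds its keyword arguments through a merge() helper. Its usage only requires combining dicts into one mapping, so a plausible sketch, assuming later dicts win on key collisions:

# Assumed shape of merge(): fold dicts left to right into a fresh dict
# without mutating the inputs; later keys win. Inferred from usage only.
def merge(*dicts):
    result = {}
    for d in dicts:
        result.update(d)
    return result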
def _browse_file(tree, path, line_docs, file_doc, config, date=None,
                 contents=None):
    """Return a rendered page displaying a source file.

    :arg string tree: name of tree on which file is found
    :arg string path: relative path from tree root of file
    :arg list line_docs: LINE documents as defined in the mapping of core.py,
        where the `content` field is dereferenced
    :arg file_doc: the FILE document as defined in core.py
    :arg config: TreeConfig object of this tree
    :arg date: a formatted string representing the generated date, default to
        now
    :arg string contents: the contents of the source file, defaults to joining
        the `content` field of all line_docs

    """
    def sidebar_links(sections):
        """Return data structure to build nav sidebar from. ::

            [('Section Name', [{'icon': ..., 'title': ..., 'href': ...}])]

        """
        # Sort by order, resolving ties by section name:
        return sorted(sections, key=lambda section: (section['order'],
                                                     section['heading']))

    if not date:
        # Then assume that the file is generated now. Remark: we can't use
        # this as the default param because that is only evaluated once, so
        # the same time would always be used.
        date = datetime.utcnow().strftime("%a, %d %b %Y %H:%M:%S +0000")

    common = _build_common_file_template(tree, path, date, config)
    links = file_doc.get('links', [])
    if is_binary_image(path):
        return render_template('image_file.html', **common)
    else:
        # We don't allow browsing binary files, so this must be a text file.
        # We concretize the lines into a list because we iterate over it
        # multiple times.
        lines = [doc['content'] for doc in line_docs]
        if not contents:
            # If contents are not provided, we can reconstruct them by
            # stitching the lines together.
            contents = ''.join(lines)
        offsets = cumulative_sum(imap(len, lines))
        tree_config = config.trees[tree]
        # Construct skimmer objects for all enabled plugins that define a
        # file_to_skim class.
        skimmers = [plugin.file_to_skim(path, contents, plugin.name,
                                        tree_config, file_doc, line_docs)
                    for plugin in tree_config.enabled_plugins
                    if plugin.file_to_skim]
        skim_links, refses, regionses, annotationses = skim_file(
            skimmers, len(line_docs))
        index_refs = (Ref.es_to_triple(ref, tree_config)
                      for ref in chain.from_iterable(
                          doc.get('refs', []) for doc in line_docs))
        index_regions = (Region.es_to_triple(region)
                         for region in chain.from_iterable(
                             doc.get('regions', []) for doc in line_docs))
        tags = finished_tags(lines,
                             chain(chain.from_iterable(refses), index_refs),
                             chain(chain.from_iterable(regionses),
                                   index_regions))
        return render_template(
            'text_file.html',
            **merge(common, {
                # Someday, it would be great to stream this and not
                # concretize the whole thing in RAM. The template will have
                # to quit looping through the whole thing 3 times.
                'lines': [(html_line(doc['content'], tags_in_line, offset),
                           doc.get('annotations', []) + skim_annotations)
                          for doc, tags_in_line, offset, skim_annotations in
                          izip(line_docs, tags_per_line(tags), offsets,
                               annotationses)],
                'is_text': True,
                'sections': sidebar_links(links + skim_links)}))
def _browse_file(tree, path, line_docs, file_doc, config, date=None,
                 contents=None):
    """Return a rendered page displaying a source file.

    :arg string tree: name of tree on which file is found
    :arg string path: relative path from tree root of file
    :arg list line_docs: LINE documents as defined in the mapping of core.py,
        where the `content` field is dereferenced
    :arg file_doc: the FILE document as defined in core.py
    :arg config: TreeConfig object of this tree
    :arg date: a formatted string representing the generated date, default to
        now
    :arg string contents: the contents of the source file, defaults to joining
        the `content` field of all line_docs

    """
    def sidebar_links(sections):
        """Return data structure to build nav sidebar from. ::

            [('Section Name', [{'icon': ..., 'title': ..., 'href': ...}])]

        """
        # Sort by order, resolving ties by section name:
        return sorted(sections, key=lambda section: (section['order'],
                                                     section['heading']))

    if not date:
        # Then assume that the file is generated now. Remark: we can't use
        # this as the default param because that is only evaluated once, so
        # the same time would always be used.
        date = datetime.utcnow().strftime("%a, %d %b %Y %H:%M:%S +0000")

    common = _build_common_file_template(tree, path, date, config)
    links = file_doc.get('links', [])
    if is_image(path):
        return render_template('image_file.html', **common)
    else:
        # We don't allow browsing binary files, so this must be a text file.
        # We concretize the lines into a list because we iterate over it
        # multiple times.
        lines = [doc['content'] for doc in line_docs]
        if not contents:
            # If contents are not provided, we can reconstruct them by
            # stitching the lines together.
            contents = ''.join(lines)
        offsets = cumulative_sum(imap(len, lines))
        tree_config = config.trees[tree]
        # Construct skimmer objects for all enabled plugins that define a
        # file_to_skim class.
        skimmers = [plugin.file_to_skim(path, contents, plugin.name,
                                        tree_config, file_doc, line_docs)
                    for plugin in tree_config.enabled_plugins
                    if plugin.file_to_skim]
        skim_links, refses, regionses, annotationses = skim_file(
            skimmers, len(line_docs))
        index_refs = (Ref.es_to_triple(ref, tree_config)
                      for ref in chain.from_iterable(
                          doc.get('refs', []) for doc in line_docs))
        index_regions = (Region.es_to_triple(region)
                         for region in chain.from_iterable(
                             doc.get('regions', []) for doc in line_docs))
        tags = finished_tags(lines,
                             chain(chain.from_iterable(refses), index_refs),
                             chain(chain.from_iterable(regionses),
                                   index_regions))
        return render_template(
            'text_file.html',
            **merge(common, {
                # Someday, it would be great to stream this and not
                # concretize the whole thing in RAM. The template will have
                # to quit looping through the whole thing 3 times.
                'lines': [(html_line(doc['content'], tags_in_line, offset),
                           doc.get('annotations', []) + skim_annotations)
                          for doc, tags_in_line, offset, skim_annotations in
                          izip(line_docs, tags_per_line(tags), offsets,
                               annotationses)],
                'is_text': True,
                'sections': sidebar_links(links + skim_links)}))