Example #1
    def docs():
        """Yield documents for bulk indexing.

        Big Warning: docs also clears the contents of all elements of
        needles_by_line because they will no longer be used.
        """
        # Index a doc of type 'file' so we can build folder listings.
        # At the moment, we send to ES in the same worker that does the
        # indexing. We could interpose an external queueing system, but I'm
        # willing to potentially sacrifice a little speed here for the easy
        # management of self-throttling.
        file_info = stat(path)
        folder_name, file_name = split(rel_path)
        # Hard-code the keys that are hard-coded in the browse()
        # controller. Merge with the pluggable ones from needles:
        doc = dict(  # Some non-array fields:
                    folder=folder_name,
                    name=file_name,
                    size=file_info.st_size,
                    modified=datetime.fromtimestamp(file_info.st_mtime),
                    is_folder=False,

                    # And these, which all get mashed into arrays:
                    **needles)
        links = dictify_links(chain.from_iterable(linkses))
        if links:
            doc['links'] = links
        yield es.index_op(doc, doc_type=FILE)

        # Index all the lines.
        if index_by_line:
            for total, annotations_for_this_line, tags in izip(
                    needles_by_line,
                    annotations_by_line,
                    es_lines(finished_tags(lines,
                                           chain.from_iterable(refses),
                                           chain.from_iterable(regionses)))):
                # Duplicate the file-wide needles into this line:
                total.update(needles)

                # We bucket tags into refs and regions for ES because later at
                # request time we want to be able to merge them individually
                # with those from skimmers.
                refs_and_regions = bucket(tags, lambda index_obj: "regions" if
                                          isinstance(index_obj['payload'], basestring) else
                                          "refs")
                if 'refs' in refs_and_regions:
                    total['refs'] = refs_and_regions['refs']
                if 'regions' in refs_and_regions:
                    total['regions'] = refs_and_regions['regions']
                if annotations_for_this_line:
                    total['annotations'] = annotations_for_this_line
                yield es.index_op(total)

                # Because needles_by_line holds a reference, total is not
                # garbage collected. Since we won't use it again, we can clear
                # the contents, saving substantial memory on long files.
                total.clear()
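
The dictify_links helper isn't defined in this snippet, but Example #2 below inlines what appears to be the equivalent construction. A minimal sketch under that assumption (the project's real helper may differ):

    def dictify_links(triples):
        # Hypothetical reconstruction, inferred from the inlined
        # comprehension in Example #2: expand (order, heading, items)
        # triples, where items holds (icon, title, href) tuples, into
        # the dict shape stored on the FILE doc.
        return [{'order': order,
                 'heading': heading,
                 'items': [{'icon': icon, 'title': title, 'href': href}
                           for icon, title, href in items]}
                for order, heading, items in triples]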
Example #2
    def docs():
        """Yield documents for bulk indexing."""
        # Index a doc of type 'file' so we can build folder listings.
        # At the moment, we send to ES in the same worker that does the
        # indexing. We could interpose an external queueing system, but I'm
        # willing to potentially sacrifice a little speed here for the easy
        # management of self-throttling.
        file_info = stat(path)
        folder_name, file_name = split(rel_path)
        # Hard-code the keys that are hard-coded in the browse()
        # controller. Merge with the pluggable ones from needles:
        doc = dict(  # Some non-array fields:
            folder=folder_name,
            name=file_name,
            size=file_info.st_size,
            modified=datetime.fromtimestamp(file_info.st_mtime),
            is_folder=False,

            # And these, which all get mashed into arrays:
            **needles)
        links = [{'order': order,
                  'heading': heading,
                  'items': [{'icon': icon,
                             'title': title,
                             'href': href}
                            for icon, title, href in items]}
                 for order, heading, items in chain.from_iterable(linkses)]
        if links:
            doc['links'] = links
        yield es.index_op(doc, doc_type=FILE)

        # Index all the lines.
        if index_by_line:
            for total, annotations_for_this_line, tags in izip(
                    needles_by_line, annotations_by_line,
                    es_lines(
                        finished_tags(lines, chain.from_iterable(refses),
                                      chain.from_iterable(regionses)))):
                # Duplicate the file-wide needles into this line:
                total.update(needles)

                # We bucket tags into refs and regions for ES because later at
                # request time we want to be able to merge them individually
                # with those from skimmers.
                refs_and_regions = bucket(
                    tags, lambda index_obj: "regions" if isinstance(
                        index_obj['payload'], basestring) else "refs")
                if 'refs' in refs_and_regions:
                    total['refs'] = refs_and_regions['refs']
                if 'regions' in refs_and_regions:
                    total['regions'] = refs_and_regions['regions']
                if annotations_for_this_line:
                    total['annotations'] = annotations_for_this_line
                yield es.index_op(total)
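
Both variants above lean on a bucket helper to split tags into refs and regions. A minimal stand-in, assuming it groups items into a dict of lists keyed by the key function's result, which is consistent with the 'refs' in refs_and_regions membership tests:

    from collections import defaultdict

    def bucket(items, key):
        # Hypothetical stand-in for the project's bucket(); keys with no
        # matching items simply stay absent from the result.
        groups = defaultdict(list)
        for item in items:
            groups[key(item)].append(item)
        return dict(groups)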
Example #3
def text_to_html_lines(text, refs=(), regions=()):
    """Run the full pipeline, and return a list of htmlified lines of ``text``
    with markup interspersed for ``refs`` and ``regions``."""
    lines = split_content_lines(text)
    offsets = build_offset_map(lines)
    return [
        html_line(text_line, e, o) for (text_line, e, o) in zip(
            lines, tags_per_line(finished_tags(lines, refs, regions)), offsets)
    ]
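
split_content_lines and build_offset_map aren't shown here, but Examples #4 and #5 below inline what looks like the same pipeline. Hypothetical shims under that reading:

    def split_content_lines(text):
        # Keep the line terminators so the offsets stay accurate.
        return text.splitlines(True)

    def build_offset_map(lines):
        # Assumption: each offset marks where a line starts in the file,
        # i.e. an exclusive running sum of line lengths starting at 0.
        total = 0
        for line in lines:
            yield total
            total += len(line)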
Example #4
def text_to_html_lines(text, refs=(), regions=()):
    """Run the full pipeline, and return a list of htmlified lines of ``text``
    with markup interspersed for ``refs`` and ``regions``."""
    lines = text.splitlines(True)
    offsets = cumulative_sum(map(len, lines))
    return [html_line(text_line, e, o) for (text_line, e, o) in
            zip(lines, tags_per_line(finished_tags(lines,
                                                   refs,
                                                   regions)), offsets)]
Example #5
def text_to_html_lines(text, refs=(), regions=()):
    """Run the full pipeline, and return a list of htmlified lines of ``text``
    with markup interspersed for ``refs`` and ``regions``."""
    lines = text.splitlines(True)
    offsets = cumulative_sum(map(len, lines))
    return [
        html_line(text_line, e, o) for (text_line, e, o) in zip(
            lines, tags_per_line(finished_tags(lines, refs, regions)), offsets)
    ]
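
A hedged usage sketch: html_line and the tag pipeline are project-internal, so only the call shape is shown, not the exact markup of each returned line:

    # Illustrative call; with no refs or regions the pipeline should just
    # escape and wrap each source line.
    for html in text_to_html_lines(u'def f(x):\n    return x\n'):
        print html  # one htmlified string per input line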
Example #6
    def docs():
        """Yield documents for bulk indexing."""
        # Index a doc of type 'file' so we can build folder listings.
        # At the moment, we send to ES in the same worker that does the
        # indexing. We could interpose an external queueing system, but I'm
        # willing to potentially sacrifice a little speed here for the easy
        # management of self-throttling.
        file_info = stat(path)
        folder_name, file_name = split(rel_path)
        # Hard-code the keys that are hard-coded in the browse()
        # controller. Merge with the pluggable ones from needles:
        doc = dict(  # Some non-array fields:
                    folder=folder_name,
                    name=file_name,
                    size=file_info.st_size,
                    modified=datetime.fromtimestamp(file_info.st_mtime),
                    is_folder=False,

                    # And these, which all get mashed into arrays:
                    **needles)
        links = [{'order': order,
                  'heading': heading,
                  'items': [{'icon': icon,
                             'title': title,
                             'href': href}
                            for icon, title, href in items]}
                 for order, heading, items in chain.from_iterable(linkses)]
        if links:
            doc['links'] = links
        yield es.index_op(doc, doc_type=FILE)

        # Index all the lines. If it's an empty file (no lines), don't bother
        # ES. It hates empty dicts.
        if is_text and needles_by_line:
            for total, annotations_for_this_line, tags in izip(
                    needles_by_line,
                    annotations_by_line,
                    es_lines(finished_tags(lines,
                                           chain.from_iterable(refses),
                                           chain.from_iterable(regionses)))):
                # Duplicate the file-wide needles into this line:
                total.update(needles)

                # We bucket tags into refs and regions for ES because later at
                # request time we want to be able to merge them individually
                # with those from skimmers.
                refs_and_regions = bucket(tags, lambda index_obj: "regions" if
                                          isinstance(index_obj['payload'], basestring) else
                                          "refs")
                if 'refs' in refs_and_regions:
                    total['refs'] = refs_and_regions['refs']
                if 'regions' in refs_and_regions:
                    total['regions'] = refs_and_regions['regions']
                if annotations_for_this_line:
                    total['annotations'] = annotations_for_this_line
                yield es.index_op(total)
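
These docs() generators are presumably drained by a bulk call; es.index_op suggests a pyelasticsearch client. A hedged sketch of the consuming side, where the URL, index name, chunk size, and 'line' doc type stand in for the project's actual wiring:

    from pyelasticsearch import ElasticSearch, bulk_chunks

    es = ElasticSearch('http://localhost:9200/')
    # 'line' stands in for the project's LINE doc-type constant; it is
    # the default type for each op (the FILE op above carries its own).
    for chunk in bulk_chunks(docs(), docs_per_chunk=500):
        es.bulk(chunk, index='code', doc_type='line')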
Example #7
def text_to_html_lines(text, refs=(), regions=()):
    """Run the full pipeline, and return a list of htmlified lines of ``text``
    with markup interspersed for ``refs`` and ``regions``."""
    lines = split_content_lines(text)
    offsets = build_offset_map(lines)
    return [html_line(text_line, e, o) for (text_line, e, o) in
            zip(lines, tags_per_line(finished_tags(lines,
                                                   refs,
                                                   regions)), offsets)]
Example #8
def _browse_file(tree, path, line_docs, file_doc, config, is_binary,
                 date=None, contents=None, image_rev=None):
    """Return a rendered page displaying a source file.

    :arg string tree: name of tree on which file is found
    :arg string path: relative path from tree root of file
    :arg list line_docs: LINE documents as defined in the mapping of core.py,
        where the `content` field is dereferenced
    :arg file_doc: the FILE document as defined in core.py
    :arg config: TreeConfig object of this tree
    :arg is_binary: Whether file is binary or not
    :arg date: a formatted string representing the generation date; defaults to now
    :arg string contents: the contents of the source file, defaults to joining
        the `content` field of all line_docs
    :arg image_rev: revision number of a textual or binary image, for images
        displayed at a certain rev
    """
    def process_link_templates(sections):
        """Look for {{line}} in the links of given sections, and duplicate them onto
        a 'template' field.
        """
        for section in sections:
            for link in section['items']:
                if '{{line}}' in link['href']:
                    link['template'] = link['href']
                    link['href'] = link['href'].replace('{{line}}', '')

    def sidebar_links(sections):
        """Return data structure to build nav sidebar from. ::

            [('Section Name', [{'icon': ..., 'title': ..., 'href': ...}])]

        """
        process_link_templates(sections)
        # Sort by order, resolving ties by section name:
        return sorted(sections, key=lambda section: (section['order'],
                                                     section['heading']))

    if not date:
        # Then assume that the file is generated now. Note: we can't use this
        # as the default param because that is only evaluated once, so the same
        # time would always be used.
        date = datetime.utcnow().strftime("%a, %d %b %Y %H:%M:%S +0000")

    common = _build_common_file_template(tree, path, is_binary, date, config)
    links = file_doc.get('links', [])
    if is_binary_image(path):
        return render_template(
            'image_file.html',
            **merge(common, {
                'sections': sidebar_links(links),
                'revision': image_rev}))
    elif is_binary:
        return render_template(
            'text_file.html',
            **merge(common, {
                'lines': [],
                'is_binary': True,
                'sections': sidebar_links(links)}))
    else:
        # We concretize the lines into a list because we iterate over it multiple times
        lines = [doc['content'] for doc in line_docs]
        if not contents:
            # If contents are not provided, we can reconstruct them by
            # stitching the lines together.
            contents = ''.join(lines)
        offsets = build_offset_map(lines)
        tree_config = config.trees[tree]
        if is_textual_image(path) and image_rev:
            # Add a link to view textual images on revs:
            links.extend(dictify_links([
                (4,
                 'Image',
                 [('svgview', 'View', url_for('.raw_rev',
                                              tree=tree_config.name,
                                              path=path,
                                              revision=image_rev))])]))
        # Construct skimmer objects for all enabled plugins that define a
        # file_to_skim class.
        skimmers = [plugin.file_to_skim(path,
                                        contents,
                                        plugin.name,
                                        tree_config,
                                        file_doc,
                                        line_docs)
                    for plugin in tree_config.enabled_plugins
                    if plugin.file_to_skim]
        skim_links, refses, regionses, annotationses = skim_file(skimmers, len(line_docs))
        index_refs = (Ref.es_to_triple(ref, tree_config) for ref in
                      chain.from_iterable(doc.get('refs', [])
                                          for doc in line_docs))
        index_regions = (Region.es_to_triple(region) for region in
                         chain.from_iterable(doc.get('regions', [])
                                             for doc in line_docs))
        tags = finished_tags(lines,
                             chain(chain.from_iterable(refses), index_refs),
                             chain(chain.from_iterable(regionses), index_regions))
        return render_template(
            'text_file.html',
            **merge(common, {
                # Someday, it would be great to stream this and not concretize
                # the whole thing in RAM. The template will have to quit
                # looping through the whole thing 3 times.
                'lines': [(html_line(doc['content'], tags_in_line, offset),
                           doc.get('annotations', []) + skim_annotations)
                          for doc, tags_in_line, offset, skim_annotations
                              in izip(line_docs, tags_per_line(tags), offsets, annotationses)],
                'sections': sidebar_links(links + skim_links),
                'query': request.args.get('q', ''),
                'bubble': request.args.get('redirect_type')}))
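
merge isn't defined in these snippets; from the **merge(common, {...}) call sites it appears to combine dicts into one. A plausible stand-in (the real helper may differ, e.g. in collision handling):

    def merge(*dicts):
        # Hypothetical: later dicts win on key collisions, which is what
        # the call sites appear to rely on.
        merged = {}
        for d in dicts:
            merged.update(d)
        return merged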
Example #9
def _browse_file(tree,
                 path,
                 line_docs,
                 file_doc,
                 config,
                 date=None,
                 contents=None):
    """Return a rendered page displaying a source file.

    :arg string tree: name of tree on which file is found
    :arg string path: relative path from tree root of file
    :arg list line_docs: LINE documents as defined in the mapping of core.py,
        where the `content` field is dereferenced
    :arg file_doc: the FILE document as defined in core.py
    :arg config: TreeConfig object of this tree
    :arg date: a formatted string representing the generation date; defaults to now
    :arg string contents: the contents of the source file, defaults to joining
        the `content` field of all line_docs
    """
    def sidebar_links(sections):
        """Return data structure to build nav sidebar from. ::

            [('Section Name', [{'icon': ..., 'title': ..., 'href': ...}])]

        """
        # Sort by order, resolving ties by section name:
        return sorted(sections,
                      key=lambda section:
                      (section['order'], section['heading']))

    if not date:
        # Then assume that the file is generated now. Note: we can't use this
        # as the default param because that is only evaluated once, so the same
        # time would always be used.
        date = datetime.utcnow().strftime("%a, %d %b %Y %H:%M:%S +0000")

    common = _build_common_file_template(tree, path, date, config)
    links = file_doc.get('links', [])
    if is_binary_image(path):
        return render_template('image_file.html', **common)
    else:  # We don't allow browsing binary files, so this must be a text file.
        # We concretize the lines into a list because we iterate over it multiple times
        lines = [doc['content'] for doc in line_docs]
        if not contents:
            # If contents are not provided, we can reconstruct them by
            # stitching the lines together.
            contents = ''.join(lines)
        offsets = cumulative_sum(imap(len, lines))
        tree_config = config.trees[tree]
        # Construct skimmer objects for all enabled plugins that define a
        # file_to_skim class.
        skimmers = [
            plugin.file_to_skim(path, contents, plugin.name, tree_config,
                                file_doc, line_docs)
            for plugin in tree_config.enabled_plugins if plugin.file_to_skim
        ]
        skim_links, refses, regionses, annotationses = skim_file(
            skimmers, len(line_docs))
        index_refs = (Ref.es_to_triple(ref, tree_config)
                      for ref in chain.from_iterable(
                          doc.get('refs', []) for doc in line_docs))
        index_regions = (Region.es_to_triple(region)
                         for region in chain.from_iterable(
                             doc.get('regions', []) for doc in line_docs))
        tags = finished_tags(
            lines, chain(chain.from_iterable(refses), index_refs),
            chain(chain.from_iterable(regionses), index_regions))
        return render_template(
            'text_file.html',
            **merge(
                common,
                {
                    # Someday, it would be great to stream this and not concretize
                    # the whole thing in RAM. The template will have to quit
                    # looping through the whole thing 3 times.
                    'lines': [(html_line(doc['content'], tags_in_line, offset),
                               doc.get('annotations', []) + skim_annotations)
                              for doc, tags_in_line, offset, skim_annotations
                              in izip(line_docs, tags_per_line(tags), offsets,
                                      annotationses)],
                    'is_text': True,
                    'sections': sidebar_links(links + skim_links)
                }))
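
To make sidebar_links's (order, heading) tie-breaking concrete, a small worked example with illustrative data:

    sections = [{'order': 2, 'heading': 'Tools', 'items': []},
                {'order': 1, 'heading': 'VCS', 'items': []},
                {'order': 1, 'heading': 'Image', 'items': []}]
    sorted(sections, key=lambda s: (s['order'], s['heading']))
    # The order-1 tie resolves alphabetically: Image, VCS, then Tools.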
Example #10
def _browse_file(tree, path, line_docs, file_doc, config, date=None, contents=None):
    """Return a rendered page displaying a source file.

    :arg string tree: name of tree on which file is found
    :arg string path: relative path from tree root of file
    :arg list line_docs: LINE documents as defined in the mapping of core.py,
        where the `content` field is dereferenced
    :arg file_doc: the FILE document as defined in core.py
    :arg config: TreeConfig object of this tree
    :arg date: a formatted string representing the generation date; defaults to now
    :arg string contents: the contents of the source file, defaults to joining
        the `content` field of all line_docs
    """
    def sidebar_links(sections):
        """Return data structure to build nav sidebar from. ::

            [('Section Name', [{'icon': ..., 'title': ..., 'href': ...}])]

        """
        # Sort by order, resolving ties by section name:
        return sorted(sections, key=lambda section: (section['order'],
                                                     section['heading']))

    if not date:
        # Then assume that the file is generated now. Note: we can't use this
        # as the default param because that is only evaluated once, so the same
        # time would always be used.
        date = datetime.utcnow().strftime("%a, %d %b %Y %H:%M:%S +0000")

    common = _build_common_file_template(tree, path, date, config)
    links = file_doc.get('links', [])
    if is_image(path):
        return render_template(
            'image_file.html',
            **common)
    else:  # We don't allow browsing binary files, so this must be a text file.
        # We concretize the lines into a list because we iterate over it multiple times
        lines = [doc['content'] for doc in line_docs]
        if not contents:
            # If contents are not provided, we can reconstruct them by
            # stitching the lines together.
            contents = ''.join(lines)
        offsets = cumulative_sum(imap(len, lines))
        tree_config = config.trees[tree]
        # Construct skimmer objects for all enabled plugins that define a
        # file_to_skim class.
        skimmers = [plugin.file_to_skim(path,
                                        contents,
                                        plugin.name,
                                        tree_config,
                                        file_doc,
                                        line_docs)
                    for plugin in tree_config.enabled_plugins
                    if plugin.file_to_skim]
        skim_links, refses, regionses, annotationses = skim_file(skimmers, len(line_docs))
        index_refs = (Ref.es_to_triple(ref, tree_config) for ref in
                      chain.from_iterable(doc.get('refs', [])
                                          for doc in line_docs))
        index_regions = (Region.es_to_triple(region) for region in
                         chain.from_iterable(doc.get('regions', [])
                                             for doc in line_docs))
        tags = finished_tags(lines,
                             chain(chain.from_iterable(refses), index_refs),
                             chain(chain.from_iterable(regionses), index_regions))
        return render_template(
            'text_file.html',
            **merge(common, {
                # Someday, it would be great to stream this and not concretize
                # the whole thing in RAM. The template will have to quit
                # looping through the whole thing 3 times.
                'lines': [(html_line(doc['content'], tags_in_line, offset),
                           doc.get('annotations', []) + skim_annotations)
                          for doc, tags_in_line, offset, skim_annotations
                              in izip(line_docs, tags_per_line(tags), offsets, annotationses)],
                'is_text': True,
                'sections': sidebar_links(links + skim_links)}))
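
All of these examples are Python 2 (izip, imap, basestring). To adapt them to Python 3, the usual renames suffice; a small compatibility shim, not part of the original project:

    try:
        from itertools import izip, imap      # Python 2
    except ImportError:
        izip, imap = zip, map                 # Python 3

    try:
        string_types = basestring             # Python 2
    except NameError:
        string_types = str                    # Python 3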