Beispiel #1
0
def test_append_update_by_line():
    """Check that per-line (key, value) pairs get appended to per-line dicts.

    Each line's pairs should be folded into the dict for the same line: a new
    key starts a fresh list, and an existing key has the new values appended
    to its list in order.
    """
    dest = [{}, {'o': ['hai'], 'p': ['pod']}]
    updates = [
        [('q', 'bert')],
        [('o', 'hi'), ('o', 'no'), ('p', 'diddy')],
    ]
    expected = [
        {'q': ['bert']},
        {'o': ['hai', 'hi', 'no'], 'p': ['pod', 'diddy']},
    ]
    eq_(append_update_by_line(dest, updates), expected)
Beispiel #2
0
def index_file(tree, tree_indexers, path, es, index):
    """Index one file into ES: a FILE doc, plus a LINE doc per line of text.

    The tree indexers are run over the file one after another, in this
    process. Most slow trees have plenty of files to keep every processor
    busy, and shipping potentially large TreesToIndex to worker processes is
    a worrying cost: pickling, transferring, and unpickling one was measured
    at about 52MB/s on an OS X laptop.

    A file that cannot be read because it is a dangling symlink is silently
    skipped; any other IOError propagates.

    :arg tree: Object providing ``source_encoding`` and ``source_folder``
    :arg tree_indexers: Iterable of indexers, each offering
        ``file_to_index(rel_path, contents)``
    :arg path: Absolute path to the file to index
    :arg es: ES client used to build (``index_op``) and send (``bulk``) the
        documents
    :arg index: The ES index name

    """
    try:
        contents = unicode_contents(path, tree.source_encoding)
    except IOError as exc:
        # Anything other than "no such file" on a symlink is a real error.
        if exc.errno != ENOENT or not islink(path):
            raise
        # Otherwise it is merely a broken symlink (or one whose target
        # disappeared while we were working), so there is nothing to index.
        return

    # NOTE(review): nothing in this function checks that the path is valid
    # UTF-8. If elasticsearch rejects such paths, that has to be dealt with
    # elsewhere (index_folders is said to handle the same concern) -- confirm.
    rel_path = relpath(path, tree.source_folder)
    is_link = islink(path)
    # Per-line documents are produced only for textual, non-symlink files.
    index_by_line = isinstance(contents, unicode) and not is_link

    # File-wide accumulators, filled in by every interested indexer:
    file_needles = {}
    link_groups = []
    if index_by_line:
        # Per-line accumulators: one slot for each line of the file.
        lines = split_content_lines(contents)
        line_count = len(lines)
        line_needles = [{} for _ in xrange(line_count)]
        line_annotations = [[] for _ in xrange(line_count)]
        ref_groups, region_groups = [], []

    for tree_indexer in tree_indexers:
        indexed = tree_indexer.file_to_index(rel_path, contents)
        if not indexed.is_interesting():
            continue

        # Whole-file contributions:
        append_update(file_needles, indexed.needles())
        if not is_link:
            link_groups.append(indexed.links())

        if not index_by_line:
            continue

        # Line-by-line contributions:
        ref_groups.append(indexed.refs())
        region_groups.append(indexed.regions())
        append_update_by_line(line_needles, indexed.needles_by_line())
        append_by_line(line_annotations, indexed.annotations_by_line())

    def docs():
        """Generate the bulk-indexing operations for this file.

        Beware: as a side effect, every per-line needle dict is emptied once
        its document has been yielded, since nothing reads it afterward.
        """
        # First comes a doc of type FILE, which folder listings are built
        # from. Sending to ES happens in the same worker that does the
        # indexing; an external queue could sit in between, but giving up a
        # little speed buys us simple, self-throttling behavior.
        file_info = stat(path)
        folder_name, file_name = split(rel_path)
        # The scalar fields below are the ones the browse() controller
        # hard-codes; the pluggable needles (which all become arrays) are
        # merged in alongside them.
        file_doc = dict(folder=folder_name,
                        name=file_name,
                        size=file_info.st_size,
                        is_folder=False,
                        **file_needles)
        links = dictify_links(chain.from_iterable(link_groups))
        if links:
            file_doc['links'] = links
        yield es.index_op(file_doc, doc_type=FILE)

        if not index_by_line:
            return

        def tag_bucket(tag):
            """Name the bucket a tag goes in, judging by its payload type."""
            if isinstance(tag['payload'], basestring):
                return "regions"
            return "refs"

        # Then one doc for each line.
        tags_per_line = es_lines(
            finished_tags(lines,
                          chain.from_iterable(ref_groups),
                          chain.from_iterable(region_groups)))
        for line_doc, annotations, tags in izip(line_needles,
                                                line_annotations,
                                                tags_per_line):
            # Every line also carries a copy of the file-wide needles.
            line_doc.update(file_needles)

            # Refs and regions are stored under separate keys so that, at
            # request time, each can be merged on its own with whatever the
            # skimmers produce.
            tags_by_bucket = bucket(tags, tag_bucket)
            for bucket_name in ('refs', 'regions'):
                if bucket_name in tags_by_bucket:
                    line_doc[bucket_name] = tags_by_bucket[bucket_name]
            if annotations:
                line_doc['annotations'] = annotations
            yield es.index_op(line_doc)

            # line_needles still references this dict, so it would not be
            # garbage collected. It is never needed again; emptying it saves
            # a substantial amount of memory on long files.
            line_doc.clear()

    # Sending a 277K-line file as a single request makes ES time out (>60s),
    # hence the chunking. 300 docs per chunk came out best in the benchmarks
    # at https://bugzilla.mozilla.org/show_bug.cgi?id=1122685. The byte
    # ceiling keeps big docs such as images from bloating a chunk: 10000
    # derives from those 300 docs at an average of 31 chars per line.
    chunks = bulk_chunks(docs(), docs_per_chunk=300, bytes_per_chunk=10000)
    for chunk in chunks:
        es.bulk(chunk, index=index, doc_type=LINE)