Example #1
    def docs():
        """Yield documents for bulk indexing."""
        # Index a doc of type 'file' so we can build folder listings.
        # At the moment, we send to ES in the same worker that does the
        # indexing. We could interpose an external queueing system, but I'm
        # willing to potentially sacrifice a little speed here for the easy
        # management of self-throttling.
        file_info = stat(path)
        folder_name, file_name = split(rel_path)
        # Hard-code the keys that are hard-coded in the browse()
        # controller. Merge with the pluggable ones from needles:
        doc = dict(  # Some non-array fields:
            folder=folder_name,
            name=file_name,
            size=file_info.st_size,
            modified=datetime.fromtimestamp(file_info.st_mtime),
            is_folder=False,

            # And these, which all get mashed into arrays:
            **needles)
        links = [{'order': order,
                  'heading': heading,
                  'items': [{'icon': icon,
                             'title': title,
                             'href': href}
                            for icon, title, href in items]}
                 for order, heading, items in chain.from_iterable(linkses)]
        if links:
            doc['links'] = links
        yield es.index_op(doc, doc_type=FILE)

        # Index all the lines.
        if index_by_line:
            for total, annotations_for_this_line, tags in izip(
                    needles_by_line,
                    annotations_by_line,
                    es_lines(finished_tags(lines,
                                           chain.from_iterable(refses),
                                           chain.from_iterable(regionses)))):
                # Duplicate the file-wide needles into this line:
                total.update(needles)

                # We bucket tags into refs and regions for ES because later at
                # request time we want to be able to merge them individually
                # with those from skimmers.
                refs_and_regions = bucket(
                    tags,
                    lambda index_obj: ("regions"
                                       if isinstance(index_obj['payload'],
                                                     basestring)
                                       else "refs"))
                if 'refs' in refs_and_regions:
                    total['refs'] = refs_and_regions['refs']
                if 'regions' in refs_and_regions:
                    total['regions'] = refs_and_regions['regions']
                if annotations_for_this_line:
                    total['annotations'] = annotations_for_this_line
                yield es.index_op(total)
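
These snippets are Python 2 (izip comes from itertools, and basestring is
the 2.x base string type) and are nested inside a larger indexing function,
so their imports and the bucket() helper they call are not shown. Judging
purely from the call site above, a minimal sketch of such a helper might
look like the following (hypothetical; the real definition lives elsewhere
in the codebase):

    from collections import defaultdict

    def bucket(things, key):
        """Group an iterable into a dict of lists, keyed by key(thing)."""
        result = defaultdict(list)
        for thing in things:
            # Each thing lands in the list for its computed key, e.g.
            # "refs" or "regions" in the example above.
            result[key(thing)].append(thing)
        return result
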
Example #2
    def docs():
        """Yield documents for bulk indexing.

        Big Warning: docs also clears the contents of all elements of
        needles_by_line because they will no longer be used.
        """
        # Index a doc of type 'file' so we can build folder listings.
        # At the moment, we send to ES in the same worker that does the
        # indexing. We could interpose an external queueing system, but I'm
        # willing to potentially sacrifice a little speed here for the easy
        # management of self-throttling.
        file_info = stat(path)
        folder_name, file_name = split(rel_path)
        # Hard-code the keys that are hard-coded in the browse()
        # controller. Merge with the pluggable ones from needles:
        doc = dict(  # Some non-array fields:
            folder=folder_name,
            name=file_name,
            size=file_info.st_size,
            modified=datetime.fromtimestamp(file_info.st_mtime),
            is_folder=False,

            # And these, which all get mashed into arrays:
            **needles)
        links = dictify_links(chain.from_iterable(linkses))
        if links:
            doc['links'] = links
        yield es.index_op(doc, doc_type=FILE)

        # Index all the lines.
        if index_by_line:
            for total, annotations_for_this_line, tags in izip(
                    needles_by_line,
                    annotations_by_line,
                    es_lines(finished_tags(lines,
                                           chain.from_iterable(refses),
                                           chain.from_iterable(regionses)))):
                # Duplicate the file-wide needles into this line:
                total.update(needles)

                # We bucket tags into refs and regions for ES because later at
                # request time we want to be able to merge them individually
                # with those from skimmers.
                refs_and_regions = bucket(
                    tags,
                    lambda index_obj: ("regions"
                                       if isinstance(index_obj['payload'],
                                                     basestring)
                                       else "refs"))
                if 'refs' in refs_and_regions:
                    total['refs'] = refs_and_regions['refs']
                if 'regions' in refs_and_regions:
                    total['regions'] = refs_and_regions['regions']
                if annotations_for_this_line:
                    total['annotations'] = annotations_for_this_line
                yield es.index_op(total)

                # Because needles_by_line holds a reference, total is not
                # garbage collected. Since we won't use it again, we can clear
                # the contents, saving substantial memory on long files.
                total.clear()
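
The total.clear() at the end is worth a note: because needles_by_line still
holds a reference to each line's dict, the dicts themselves cannot be
garbage collected, but emptying them releases their much larger contents.
A toy illustration of the pattern, not taken from this codebase:

    # A list keeps each dict alive, so the dicts can't be collected,
    # but clear() still frees everything they contain.
    lines = [{'text': 'line %d' % n, 'payload': ['x'] * 1000}
             for n in range(3)]
    for doc in lines:
        # ... yield an index op built from doc ...
        doc.clear()   # release the contents; the list keeps the empty shell
    print(lines)      # [{}, {}, {}]
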
Example #3
    def docs():
        """Yield documents for bulk indexing."""
        # Index a doc of type 'file' so we can build folder listings.
        # At the moment, we send to ES in the same worker that does the
        # indexing. We could interpose an external queueing system, but I'm
        # willing to potentially sacrifice a little speed here for the easy
        # management of self-throttling.
        file_info = stat(path)
        folder_name, file_name = split(rel_path)
        # Hard-code the keys that are hard-coded in the browse()
        # controller. Merge with the pluggable ones from needles:
        doc = dict(  # Some non-array fields:
            folder=folder_name,
            name=file_name,
            size=file_info.st_size,
            modified=datetime.fromtimestamp(file_info.st_mtime),
            is_folder=False,

            # And these, which all get mashed into arrays:
            **needles)
        links = [{'order': order,
                  'heading': heading,
                  'items': [{'icon': icon,
                             'title': title,
                             'href': href}
                            for icon, title, href in items]}
                 for order, heading, items in chain.from_iterable(linkses)]
        if links:
            doc['links'] = links
        yield es.index_op(doc, doc_type=FILE)

        # Index all the lines. If it's an empty file (no lines), don't bother
        # ES. It hates empty dicts.
        if is_text and needles_by_line:
            for total, annotations_for_this_line, tags in izip(
                    needles_by_line,
                    annotations_by_line,
                    es_lines(finished_tags(lines,
                                           chain.from_iterable(refses),
                                           chain.from_iterable(regionses)))):
                # Duplicate the file-wide needles into this line:
                total.update(needles)

                # We bucket tags into refs and regions for ES because later at
                # request time we want to be able to merge them individually
                # with those from skimmers.
                refs_and_regions = bucket(
                    tags,
                    lambda index_obj: ("regions"
                                       if isinstance(index_obj['payload'],
                                                     basestring)
                                       else "refs"))
                if 'refs' in refs_and_regions:
                    total['refs'] = refs_and_regions['refs']
                if 'regions' in refs_and_regions:
                    total['regions'] = refs_and_regions['regions']
                if annotations_for_this_line:
                    total['annotations'] = annotations_for_this_line
                yield es.index_op(total)
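
Example #2 replaces the inline links comprehension of Examples #1 and #3
with a dictify_links() helper. Its definition is not included in these
snippets; a sketch consistent with the call site, with a body that simply
mirrors the inline version, might be:

    def dictify_links(links):
        """Turn (order, heading, items) triples into ES-ready dicts."""
        return [{'order': order,
                 'heading': heading,
                 'items': [{'icon': icon,
                            'title': title,
                            'href': href}
                           for icon, title, href in items]}
                for order, heading, items in links]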