Exemple #1
0
def update_index(session, index):
    """Add every ``Message`` row that has text to the search index.

    Rows are read through ``query.yield_per(100)``. Whenever the position of
    an indexed row is a non-zero multiple of 10000, the current writer is
    committed and a fresh one is opened.

    :param session: database session used to query ``Message``.
    :param index: index object whose ``writer()`` returns a document writer.
    """
    print("Calculating query size...")
    query = session.query(Message)#.filter(Message.sent_at > '2016-01-01').filter_by(list_id='python-dev')
    count = query.count()
    writer = index.writer()

    # Pre-seed the position so the closing progress message reports 0 instead
    # of raising UnboundLocalError when the query returns no rows.
    idx = -1
    with tqdm(total=count) as pbar:
        for idx, message in enumerate(query.yield_per(100)):
            pbar.update(1)
            if not message.text:
                # Rows without text are counted by the progress bar but not
                # indexed.
                continue

            writer.add_document(
                list_id=message.list_id,
                message_id=message.message_id,
                content=clean_message(message.text),
                author=message.author,
                sent_at=message.sent_at,
                thread_parent=message.thread_parent,
                thread_idx=message.thread_idx,
                thread_indent=message.thread_indent,
                page=message.page,
                subject=message.subject,
            )
            if idx % 10000 == 0 and idx != 0:
                # Periodic flush: commit what is buffered, then continue with
                # a new writer.
                pbar.write("Comitting at doc {}...".format(idx))
                writer.commit()
                writer = index.writer()
        pbar.write("Comitting at doc {}...".format(idx + 1))
    writer.commit()
Exemple #2
0
    def update(self, force_rebuild=False):
        """Bring the index in line with the repository contents.

        Either rebuilds the whole index or re-indexes only the uncommitted
        files. Original author's note: querying will call this automatically.

        Returns ``False`` when nothing needed indexing (already updated in
        this run, or no uncommitted files); otherwise returns ``None`` after
        the index has been written.
        """
        # One refresh per script run is enough.
        if self.index_updated:
            return False

        # An index built from another commit is rebuilt from scratch.
        based_on_current = self.index_based_on_current_commit()
        if not based_on_current:
            force_rebuild = True

        if not force_rebuild:
            # Incremental path: keep the current index.
            index_writer = self.get_index().writer()

            # Drop the previously indexed uncommitted files first.
            for stale_path in self.get_indexed_uncommitted_files():
                index_writer.delete_by_term('path', stale_path)

            # Record the current set of uncommitted files.
            uncommitted_files = lib_git.get_uncommitted_oval()
            self.set_indexed_uncommitted_files(uncommitted_files)

            # With nothing uncommitted, only the deletions are committed.
            if not uncommitted_files:
                index_writer.commit()
                return False

            documents = self.document_iterator(uncommitted_files)
            activity_description = 'Updating'
        else:
            # Full path: start from a clean index and index every document.
            index_writer = self.get_index(force_rebuild).writer()
            documents = self.document_iterator()
            activity_description = 'Rebuilding'

        # Feed every document to the writer, reporting progress as we go.
        progress_text = '{0} {1} index'.format(activity_description, self.index_name)
        counter = 0
        for counter, document in enumerate(documents, 1):
            self.status_spinner(counter, progress_text, self.item_label)
            is_deleted = 'deleted' in document and document['deleted']
            if is_deleted:
                index_writer.delete_by_term('path', document['path'])
            else:
                index_writer.add_document(**document)
        index_writer.commit()
        self.status_spinner(counter, progress_text, self.item_label, True)

        # Remember which commit the index now reflects.
        self.set_indexed_commit_hash()
        self.index_updated = True
 def _get_index(self, index_path, index_name):
     """Open the whoosh index ``index_name`` under ``index_path``.

     When no such index exists yet, the directory is created if needed and a
     new index with ``id``, ``artist``, ``title`` and ``lyrics`` fields is
     created before it is opened.
     """
     if not whoosh.index.exists_in(index_path, index_name):
         # Parenthesised single-argument form: same output on Python 2 and
         # valid syntax on Python 3.
         print('creating %s index at %s' % (index_name, index_path))
         if not os.path.exists(index_path):
             os.makedirs(index_path)
         schema = whoosh.fields.Schema(
                 id = whoosh.fields.ID(stored=True, unique=True),
                 artist = whoosh.fields.TEXT(stored=True),
                 title = whoosh.fields.TEXT(stored=True),
                 lyrics = whoosh.fields.TEXT(stored=True),
                 )
         index = whoosh.index.create_in(index_path, schema, index_name)
         index.writer().commit()
     return whoosh.index.open_dir(index_path, index_name)
Exemple #4
0
def create_index_writer(index_path):
    '''
    Create a new whoosh index at ``index_path`` and return a writer for it.

    The schema has stored ``id``, ``path``, ``artist`` and ``title`` fields;
    ``artist`` and ``title`` are analyzed with a StandardAnalyzer (no stop
    list, minimum token size 1) piped into a CharsetFilter built from
    ``accent_map``.

    :parameters:
        - index_path : str
            Directory for the index; created when it does not exist

    :returns:
        - writer : whoosh.writing.IndexWriter
            Writer of the newly created index
    '''
    if not os.path.exists(index_path):
        os.mkdir(index_path)

    base_analyzer = whoosh.analysis.StandardAnalyzer(stoplist=None, minsize=1)
    text_analyzer = base_analyzer | whoosh.analysis.CharsetFilter(accent_map)

    schema = whoosh.fields.Schema(
        id=whoosh.fields.ID(stored=True),
        path=whoosh.fields.TEXT(stored=True),
        artist=whoosh.fields.TEXT(stored=True, analyzer=text_analyzer),
        title=whoosh.fields.TEXT(stored=True, analyzer=text_analyzer),
    )

    new_index = whoosh.index.create_in(index_path, schema)
    return new_index.writer()
Exemple #5
0
  def after_commit(self, session):
    """
    Flush the changes queued in ``self.to_update`` into the whoosh indexes.

    For each queued group, the index of the group's model class is opened
    and every model's existing document is deleted by primary key. Models
    whose change type is "new" or "changed" are then added again with their
    ``__searchable__`` attributes. The queue is emptied afterwards.
    """

    for typ, queued in self.to_update.iteritems():
      model_class = queued[0][1].__class__
      index = self.index_for_model_class(model_class)
      with index.writer() as writer:
        primary_field = model_class.search_query.primary
        searchable = model_class.__searchable__

        for change_type, model in queued:
          # Delete first; inserted or updated rows are re-added below as a
          # new document.
          key_text = unicode(getattr(model, primary_field))
          writer.delete_by_term(primary_field, key_text)

          if change_type not in ("new", "changed"):
            continue

          attrs = {key: getattr(model, key) for key in searchable}
          attrs[primary_field] = key_text
          writer.add_document(**attrs)

    self.to_update = {}
def _after_flush(app, changes):
    # Receives the flushed db changes. Changes whose model class has the
    # ``__searchable__`` attribute are grouped per class name, and each group
    # is written to that model's whoosh index with a single writer:
    # updates/inserts become ``update_document`` calls, anything else a
    # delete by primary key.

    grouped = {}
    for change in changes:
        instance = change[0]
        update = change[1] in ("update", "insert")

        if hasattr(instance.__class__, __searchable__):
            grouped.setdefault(instance.__class__.__name__, []).append((update, instance))

    for model, values in grouped.items():
        sample = values[0][1]
        index = whoosh_index(app, sample.__class__)
        with index.writer() as writer:
            primary_field = sample.pure_whoosh.primary_key_name
            searchable = sample.__searchable__

            for update, v in values:
                if not update:
                    writer.delete_by_term(primary_field, unicode(getattr(v, primary_field)))
                    continue

                attrs = {}
                for key in searchable:
                    try:
                        attrs[key] = unicode(getattr(v, key))
                    except AttributeError:
                        # Re-raise with the model and field named.
                        raise AttributeError("{0} does not have {1} field {2}".format(model, __searchable__, key))

                attrs[primary_field] = unicode(getattr(v, primary_field))
                writer.update_document(**attrs)
def build_index(sa_session, toolshed_whoosh_index_dir):
    """Create a whoosh index in the given directory and fill it with repos.

    Every row yielded by ``get_repos(sa_session)`` is added as one document;
    the number of indexed repositories is printed at the end.

    :param sa_session: session handed to ``get_repos``.
    :param toolshed_whoosh_index_dir: directory backing the ``FileStorage``.
    """
    storage = FileStorage(toolshed_whoosh_index_dir)
    index = storage.create_index(schema)
    writer = index.writer()

    def to_unicode(a_basestr):
        # ``bytes`` is ``str`` on Python 2, so this decodes byte strings on
        # both major versions without relying on the ``unicode`` builtin.
        if isinstance(a_basestr, bytes):
            return a_basestr.decode('utf-8')
        return a_basestr

    repos_indexed = 0
    # ``repo_id`` rather than ``id`` so the builtin is not shadowed.
    for (repo_id, name, description, long_description, homepage_url,
         remote_repository_url, repo_owner_username, times_downloaded,
         approved, last_updated, full_last_updated) in get_repos(sa_session):

        writer.add_document(
            id=repo_id,
            name=to_unicode(name),
            description=to_unicode(description),
            long_description=to_unicode(long_description),
            homepage_url=to_unicode(homepage_url),
            remote_repository_url=to_unicode(remote_repository_url),
            repo_owner_username=to_unicode(repo_owner_username),
            times_downloaded=times_downloaded,
            approved=approved,
            last_updated=last_updated,
            full_last_updated=full_last_updated)
        repos_indexed += 1
    writer.commit()
    # Two spaces after the colon reproduce the output of the former
    # ``print "...: ", n`` statement.
    print("Number of repos indexed:  %s" % repos_indexed)
def index_one_record(record, delete=False, writer=None, index_parent=False):
    """Update or delete the whoosh document of a single record.

    :param record: model instance; its class selects the index.
    :param delete: when true the document is deleted instead of updated.
    :param writer: writer to reuse; when omitted a writer is opened here and
        committed before returning.
    :param index_parent: when true the same change is also applied to the
        index of the record's base class.
    """
    index = whoosh_index(current_app, record.__class__)
    close = False
    if not writer:
        writer = index.writer()
        close = True
    parent_writer = None
    if index_parent:
        # index parent class
        parent_writer = whoosh_index(
            current_app, record.__class__.__base__).writer()
    primary_field = record.pure_whoosh.primary_key_name
    searchable = index.schema.names()
    try:
        if not delete:
            attrs = {}
            for key in searchable:
                attrs[key] = str(getattr(record, key))
            attrs[primary_field] = str(
                getattr(record, primary_field))
            writer.update_document(**attrs)
            if parent_writer is not None:
                parent_writer.update_document(**attrs)
        else:
            writer.delete_by_term(
                primary_field, str(getattr(record, primary_field)))
            if parent_writer is not None:
                parent_writer.delete_by_term(
                    primary_field, str(getattr(record, primary_field)))
    except Exception:
        # The parent writer is owned by this call: release it on failure.
        if parent_writer is not None:
            parent_writer.cancel()
        raise
    # The parent writer is created here on every call, so it must also be
    # committed here; otherwise the parent index change is never persisted.
    if parent_writer is not None:
        parent_writer.commit()
    if close:
        writer.commit()
Exemple #9
0
def update_documentation_index():
    """Re-index every documentation page with a single writer."""
    from flask_website.docs import DocumentationPage
    index_writer = index.writer()
    for doc_page in DocumentationPage.iter_pages():
        # Remove the old entry before adding the page again.
        doc_page.remove_from_search_index(index_writer)
        doc_page.add_to_search_index(index_writer)
    index_writer.commit()
  def after_commit(self, session):
    """
    Flush the changes queued in ``self.to_update`` into the whoosh indexes.

    For each queued group, the index of the group's model class is opened.
    "deleted" models are removed by primary key, "new" models are added and
    "changed" models are updated, using their ``__searchable__`` attributes.
    The queue is emptied afterwards.
    """

    for typ, queued in self.to_update.iteritems():
      model_class = queued[0][1].__class__
      index = self.index_for_model_class(model_class)
      with index.writer() as writer:
        primary_field = model_class.search_query.primary
        searchable = model_class.__searchable__

        for change_type, model in queued:
          if change_type == "deleted":
            writer.delete_by_term(primary_field, unicode(getattr(model, primary_field)))
            continue

          # Every other change type gets its document fields collected.
          attrs = {key: getattr(model, key) for key in searchable}
          attrs[primary_field] = unicode(getattr(model, primary_field))
          if change_type == "new":
            writer.add_document(**attrs)
          elif change_type == "changed":
            writer.update_document(**attrs)

    self.to_update = {}
def _after_flush(app, changes):
    # Receives the flushed db changes. Does nothing when the app config has
    # WHOOSH_DISABLED set to True. Otherwise changes whose model class has
    # the ``__searchable__`` attribute are grouped per class name and each
    # record is passed to index_one_record with a per-model writer. Any
    # exception is logged as an error rather than propagated.
    if app.config.get('WHOOSH_DISABLED') is True:
        return

    grouped = {}
    for change in changes:
        instance = change[0]
        update = change[1] in ('update', 'insert')

        if hasattr(instance.__class__, __searchable__):
            grouped.setdefault(instance.__class__.__name__, []).append(
                (update, instance))
    if not grouped:
        return

    try:
        for model, values in list(grouped.items()):
            index = whoosh_index(app, values[0][1].__class__)
            with index.writer() as writer:
                for update, v in values:
                    base = v.__class__.__base__
                    # The base class index is updated too when the base is a
                    # declarative class that has ``__searchable__``.
                    has_parent = isinstance(base, DeclarativeMeta) and \
                        hasattr(base, '__searchable__')
                    index_one_record(
                        v, not update, writer, index_parent=has_parent)
    except Exception as ex:
        logging.error("FAIL updating index of %s msg: %s", model, str(ex))
Exemple #12
0
def create_search_index():
    """
    Return a Whoosh index of the flat spellbook, reusing a fresh one.

    If the index directory exists and was modified after the spellbook file,
    the existing index is opened. Otherwise (directory missing, or older than
    the spellbook) a new index is created and every spell name/contents pair
    from ``env['flat_spellbook']`` is written to it.
    """
    index_dir = config['search_index_path']

    if not os.path.exists(index_dir):
        os.makedirs(index_dir)
    else:
        # Reuse the on-disk index only when it is newer than the spellbook.
        idx_modified = os.path.getmtime(index_dir)
        spellbook_modified = os.path.getmtime(env['spellbook_path'])
        if spellbook_modified < idx_modified:
            return whoosh.index.open_dir(index_dir)

    schema = whoosh.fields.Schema(
        name=whoosh.fields.NGRAMWORDS(stored=True),
        contents=whoosh.fields.STORED,
    )

    index = whoosh.index.create_in(index_dir, schema)
    spell_writer = index.writer()
    for spell_name, spell_contents in env['flat_spellbook'].items():
        spell_writer.add_document(name=spell_name, contents=spell_contents)
    spell_writer.commit()

    return index
Exemple #13
0
def update_documentation_index():
    """Replace the search index entries of all documentation pages."""
    from flask_website.docs import DocumentationPage
    search_writer = index.writer()
    for current_page in DocumentationPage.iter_pages():
        # Each page is removed and then added again on the same writer.
        current_page.remove_from_search_index(search_writer)
        current_page.add_to_search_index(search_writer)
    search_writer.commit()
Exemple #14
0
def reindex_snippets():
    """Re-index every snippet returned by ``Snippet.query.all()``."""
    from flask_website.database import Snippet
    index_writer = index.writer()
    for entry in Snippet.query.all():
        # Remove the old entry before adding the snippet again.
        entry.remove_from_search_index(index_writer)
        entry.add_to_search_index(index_writer)
    index_writer.commit()
Exemple #15
0
def after_flush(app, changes):
    ''' Mirror flushed db changes into the per-model whoosh indexes.

    Changes whose model class has a ``__searchable__`` attribute are grouped
    per class name. For each group the existing document of every instance
    is deleted by primary key; instances that were updated or inserted are
    then added again with their searchable attributes.
    '''

    grouped = {}
    for change in changes:
        instance = change[0]
        update = change[1] in ('update', 'insert')

        if hasattr(instance.__class__, '__searchable__'):
            grouped.setdefault(instance.__class__.__name__, []).append((update, instance))

    for typ, values in grouped.iteritems():
        sample = values[0][1]
        index = whoosh_index(app, sample)
        with index.writer() as writer:
            primary_field = sample.search_query.primary
            searchable = sample.__searchable__

            for update, v in values:
                # Always delete; updated or inserted rows are re-added below
                # as a new document.
                key_text = unicode(getattr(v, primary_field))
                writer.delete_by_term(primary_field, key_text)

                if not update:
                    continue

                attrs = {key: getattr(v, key) for key in searchable}
                attrs[primary_field] = key_text
                writer.add_document(**attrs)
Exemple #16
0
def update_model_based_indexes(session, flush_context):
    """Called by a session event, updates the model based documents.

    New ``Indexable`` models are added, dirty ones are removed and added
    again, and deleted ones are removed from the search index.

    :param session: session exposing ``new``, ``dirty`` and ``deleted``.
    :param flush_context: unused; part of the event signature.
    """
    to_delete = []
    to_add = []
    for model in session.new:
        if isinstance(model, Indexable):
            to_add.append(model)

    for model in session.dirty:
        if isinstance(model, Indexable):
            to_delete.append(model)
            to_add.append(model)

    # Deleted models only need removing. This loop previously walked
    # ``session.dirty`` a second time, so deleted rows stayed in the index.
    for model in session.deleted:
        if isinstance(model, Indexable):
            to_delete.append(model)

    if not (to_delete or to_add):
        return

    writer = index.writer()
    for model in to_delete:
        model.remove_from_search_index(writer)
    for model in to_add:
        model.add_to_search_index(writer)
    writer.commit()
def index_one_record(record, delete=False, writer=None, index_parent=False):
    """Update or delete the whoosh document of a single record.

    :param record: model instance; its class selects the index.
    :param delete: when true the document is deleted instead of updated.
    :param writer: writer to reuse; when omitted a writer is opened here and
        committed before returning.
    :param index_parent: when true the same change is also applied to the
        index of the record's base class.
    """
    index = whoosh_index(current_app, record.__class__)
    close = False
    if not writer:
        writer = index.writer()
        close = True
    parent_writer = None
    if index_parent:
        # index parent class
        parent_writer = whoosh_index(
            current_app, record.__class__.__base__).writer()
    primary_field = record.pure_whoosh.primary_key_name
    searchable = index.schema.names()
    try:
        if not delete:
            attrs = {}
            for key in searchable:
                attrs[key] = unicode(getattr(record, key))
            attrs[primary_field] = unicode(
                getattr(record, primary_field))
            writer.update_document(**attrs)
            if parent_writer is not None:
                parent_writer.update_document(**attrs)
        else:
            writer.delete_by_term(
                primary_field, unicode(getattr(record, primary_field)))
            if parent_writer is not None:
                parent_writer.delete_by_term(
                    primary_field, unicode(getattr(record, primary_field)))
    except Exception:
        # The parent writer is owned by this call: release it on failure.
        if parent_writer is not None:
            parent_writer.cancel()
        raise
    # The parent writer is created here on every call, so it must also be
    # committed here; otherwise the parent index change is never persisted.
    if parent_writer is not None:
        parent_writer.commit()
    if close:
        writer.commit()
Exemple #18
0
def reindex_snippets():
    """Replace the search index entries of all stored snippets."""
    from flask_website.database import Snippet
    search_writer = index.writer()
    for stored_snippet in Snippet.query.all():
        # Each snippet is removed and then added again on the same writer.
        stored_snippet.remove_from_search_index(search_writer)
        stored_snippet.add_to_search_index(search_writer)
    search_writer.commit()
Exemple #19
0
    def index_objects(self, objects, index='default'):
        """Index a batch of objects in the named index.

        Objects without a document are skipped, and only the first object
        seen for each ``object_key`` is written. Any existing document with
        the same key is deleted before the new one is added.
        """
        if not objects:
            return

        target_index = self.app_state.indexes[index]
        seen_keys = set()

        with target_index.writer() as writer:
            for obj in objects:
                document = self.get_document(obj)
                if document is None:
                    continue

                key = document['object_key']
                if key in seen_keys:
                    continue

                writer.delete_by_term('object_key', key)
                try:
                    writer.add_document(**document)
                except ValueError:
                    # Log the offending document with the traceback before
                    # re-raising; the original author added this to chase an
                    # intermittent CI failure.
                    logger.exception('writer.add_document(%r)', document)
                    raise
                seen_keys.add(key)
Exemple #20
0
def update_model_based_indexes(session, flush_context):
    """Called by a session event, updates the model based documents.

    New ``Indexable`` models are added, dirty ones are removed and added
    again, and deleted ones are removed from the search index.

    :param session: session exposing ``new``, ``dirty`` and ``deleted``.
    :param flush_context: unused; part of the event signature.
    """
    to_add = [model for model in session.new if isinstance(model, Indexable)]
    to_delete = []

    for model in session.dirty:
        if isinstance(model, Indexable):
            to_delete.append(model)
            to_add.append(model)

    # Deleted models only need removing. This step previously iterated
    # ``session.dirty`` a second time, so deleted rows stayed in the index.
    to_delete.extend(
        model for model in session.deleted if isinstance(model, Indexable))

    if not (to_delete or to_add):
        return

    writer = index.writer()
    for model in to_delete:
        model.remove_from_search_index(writer)
    for model in to_add:
        model.add_to_search_index(writer)
    writer.commit()
Exemple #21
0
def scrape_profiles(index):
    """Scrape each profile type and add its profiles to ``index``.

    One writer is opened per profile type through the index's context
    manager.
    """
    for profile_type in PROFILE_TYPES:
        print(f'Processing profile type: {profile_type}')
        scraped = scrape_profiles_of_type(profile_type)
        with index.writer() as batch_writer:
            for fields in scraped:
                batch_writer.add_document(**fields)
Exemple #22
0
def create_index_writer(index_path):
    '''
    Build a new whoosh index in ``index_path`` and return its writer.

    Stored fields: ``id``, ``path``, ``artist`` and ``title``. The last two
    share an analyzer made of a StandardAnalyzer (no stop list, minimum
    token size 1) followed by a CharsetFilter over ``accent_map``.

    :parameters:
        - index_path : str
            Directory for the index; created when it does not exist

    :returns:
        - writer : whoosh.writing.IndexWriter
            Writer of the newly created index
    '''
    if not os.path.exists(index_path):
        os.mkdir(index_path)

    standard = whoosh.analysis.StandardAnalyzer(stoplist=None, minsize=1)
    name_analyzer = standard | whoosh.analysis.CharsetFilter(accent_map)

    def _name_field():
        # artist and title are configured identically.
        return whoosh.fields.TEXT(stored=True, analyzer=name_analyzer)

    schema = whoosh.fields.Schema(
        id=whoosh.fields.ID(stored=True),
        path=whoosh.fields.TEXT(stored=True),
        artist=_name_field(),
        title=_name_field(),
    )

    return whoosh.index.create_in(index_path, schema).writer()
def _after_flush(app, changes):
    # Receives the flushed db changes. Does nothing when the app config has
    # WHOOSH_DISABLED set to True. Otherwise changes whose model class has
    # the ``__searchable__`` attribute are grouped per class name and each
    # record is passed to index_one_record with a per-model writer. Any
    # exception is logged as a warning rather than propagated.
    if app.config.get('WHOOSH_DISABLED') is True:
        return
    bytype = {}  # sort changes by type so we can use per-model writer
    for change in changes:
        update = change[1] in ('update', 'insert')

        if hasattr(change[0].__class__, __searchable__):
            bytype.setdefault(change[0].__class__.__name__, []).append(
                (update, change[0]))
    if not bytype:
        return
    try:
        for model, values in bytype.items():
            index = whoosh_index(app, values[0][1].__class__)
            with index.writer() as writer:
                for update, v in values:
                    parent = v.__class__.__base__
                    # Only index into the parent when the parent is itself a
                    # searchable declarative class. Without the second check
                    # every model deriving directly from the declarative
                    # base asked for a parent index that cannot be built.
                    has_parent = isinstance(parent, DeclarativeMeta) and \
                        hasattr(parent, __searchable__)
                    index_one_record(
                        v, not update, writer, index_parent=has_parent)
    except Exception as ex:
        logging.warning("FAIL updating index of %s msg: %s" % (model, str(ex)))
Exemple #24
0
def create_index_writer(index_path):
    '''Create a new whoosh index in ``index_path`` and return its writer.

    The schema has a stored ``track_id``, analyzed text fields ``title``,
    ``artist`` and ``album``, a keyword field ``collection`` and a numeric
    field ``collection_id``.

    Input: directory in which to create the index (made if missing)

    Output: writer object of the new index
    '''
    if not os.path.exists(index_path):
        os.mkdir(index_path)

    stemmer = whoosh.analysis.StemmingAnalyzer()
    text_analyzer = stemmer | whoosh.analysis.CharsetFilter(accent_map)

    schema = whoosh.fields.Schema(
        track_id=whoosh.fields.STORED,
        title=whoosh.fields.TEXT(stored=True, analyzer=text_analyzer),
        artist=whoosh.fields.TEXT(stored=True, analyzer=text_analyzer),
        album=whoosh.fields.TEXT(stored=True, analyzer=text_analyzer),
        collection=whoosh.fields.KEYWORD(stored=True),
        collection_id=whoosh.fields.NUMERIC(stored=True),
    )

    new_index = whoosh.index.create_in(index_path, schema)
    return new_index.writer()
Exemple #25
0
def _get_writer(index):
    writer = None
    while writer is None:
        try:
            writer = index.writer()
        except whoosh.index.LockError:
            time.sleep(0.25)

    return writer
Exemple #26
0
def _get_writer(index):
    writer = None
    while writer is None:
        try:
            writer = index.writer()
        except whoosh.index.LockError:
            time.sleep(0.25)

    return writer
def add_email(index, base, email_id):
    """Fetch one email and add it to ``index`` as a single document.

    The document stores the email's URL (``base`` followed by the id), its
    content and its subject; the writer is committed immediately.
    """
    email_bytes = get_from_wikileaks_by_index(base, email_id)
    email_url = base + str(email_id)
    document = {
        'url': email_url,
        'content': analyzer.retrieve_email_content(email_bytes, email_url),
        'subject': analyzer.retrieve_subject(email_bytes),
    }
    email_writer = index.writer()
    email_writer.add_document(**document)
    email_writer.commit()
 def rebuild_index_model(self, model_class, session):
   """Re-index every row of ``model_class`` returned by ``session``.

   Each row's existing document is deleted by primary key and added again
   with the model's ``__searchable__`` attributes, all within one writer.
   """
   index = self.index_for_model_class(model_class)
   with index.writer() as writer:
     primary_field = model_class.search_query.primary
     searchable = model_class.__searchable__
     for row in session.query(model_class):
       attrs = {key: getattr(row, key) for key in searchable}
       key_text = unicode(getattr(row, primary_field))
       attrs[primary_field] = key_text
       # Delete before adding so the row is not indexed twice.
       writer.delete_by_term(primary_field, key_text)
       writer.add_document(**attrs)
Exemple #29
0
def appendtextindex(table,
                    index_or_dirname,
                    indexname=None,
                    merge=True,
                    optimize=False):
    """
    Load all rows from `table` into a Whoosh index, adding them to any existing
    data in the index.

    Keyword arguments:

    table
        A table container with the data to be loaded.
    index_or_dirname
        Either an instance of `whoosh.index.Index` or a string containing the
        directory path where the index is to be stored.
    indexname
        String containing the name of the index, if multiple indexes are stored
        in the same directory.
    merge
        Merge small segments during commit?
    optimize
        Merge all segments together?

    An index opened here from a directory name is closed before returning;
    an index passed in by the caller is left open. On failure the writer is
    cancelled and the exception is re-raised.

    """
    import whoosh.index

    # deal with polymorphic argument
    if isinstance(index_or_dirname, string_types):
        dirname = index_or_dirname
        index = whoosh.index.open_dir(dirname,
                                      indexname=indexname,
                                      readonly=False)
        needs_closing = True
    elif isinstance(index_or_dirname, whoosh.index.Index):
        index = index_or_dirname
        needs_closing = False
    else:
        raise ArgumentError('expected string or index, found %r' %
                            index_or_dirname)

    try:
        # Acquire the writer inside the outer ``try`` so that an index opened
        # above is still closed when the writer cannot be obtained.
        writer = index.writer()
        try:
            for d in dicts(table):
                writer.add_document(**d)
            writer.commit(merge=merge, optimize=optimize)
        except Exception:
            writer.cancel()
            raise

    finally:
        if needs_closing:
            index.close()
Exemple #30
0
def handle_postupdate(item_id): # pragma: no cover
    """Write the item's name and tags to the index under its id.

    The item is looked up by ``item_id`` and stored with ``update_document``
    using the id as ``upc``; the writer is committed right away.

    """
    item = icecrate.items.by_item_id(item_id)

    item_writer = index.writer()
    item_name = item.get("name")
    raw_tags = item.get("tags", "")
    tag_list = list(icecrate.tags._split_tags(raw_tags))
    item_writer.update_document(upc=item_id, name=item_name, tags=tag_list)
    item_writer.commit()
def createIndexWriter(indexPath):
    """Create a new whoosh index in ``indexPath`` and return its writer.

    The directory is created when it does not exist. The schema has stored
    ``song_id``, ``artist`` and ``title`` fields; ``artist`` and ``title``
    are analyzed with a FancyAnalyzer piped into a CharsetFilter built from
    ``accent_map``.
    """
    if not os.path.exists(indexPath):
        os.mkdir(indexPath)

    A = whoosh.analysis.FancyAnalyzer() | whoosh.analysis.CharsetFilter(accent_map)

    Schema = whoosh.fields.Schema(  song_id     = whoosh.fields.ID(stored=True),
                                    artist      = whoosh.fields.TEXT(stored=True, analyzer=A),
                                    title       = whoosh.fields.TEXT(stored=True, analyzer=A))

    index = whoosh.index.create_in(indexPath, Schema)
    # The unreachable ``pass`` that followed this return was removed.
    return index.writer()
Exemple #32
0
def appendtextindex(table, index_or_dirname, indexname=None, merge=True,
                    optimize=False):
    """
    Load all rows from `table` into a Whoosh index, adding them to any existing
    data in the index.

    Keyword arguments:

    table
        A table container with the data to be loaded.
    index_or_dirname
        Either an instance of `whoosh.index.Index` or a string containing the
        directory path where the index is to be stored.
    indexname
        String containing the name of the index, if multiple indexes are stored
        in the same directory.
    merge
        Merge small segments during commit?
    optimize
        Merge all segments together?

    An index opened here from a directory name is closed before returning;
    an index passed in by the caller is left open. On failure the writer is
    cancelled and the exception is re-raised.

    """
    import whoosh.index

    # deal with polymorphic argument
    if isinstance(index_or_dirname, string_types):
        dirname = index_or_dirname
        index = whoosh.index.open_dir(dirname, indexname=indexname,
                                      readonly=False)
        needs_closing = True
    elif isinstance(index_or_dirname, whoosh.index.Index):
        index = index_or_dirname
        needs_closing = False
    else:
        raise ArgumentError('expected string or index, found %r'
                            % index_or_dirname)

    try:
        # Acquire the writer inside the outer ``try`` so that an index opened
        # above is still closed when the writer cannot be obtained.
        writer = index.writer()
        try:
            for d in dicts(table):
                writer.add_document(**d)
            writer.commit(merge=merge, optimize=optimize)
        except Exception:
            writer.cancel()
            raise

    finally:
        if needs_closing:
            index.close()
def write_db():
    """Create the index and write one document per ``Preview`` row.

    Each document holds the row id and, under ``index_fieldname``, the row's
    description. The writer is committed once all rows are written and
    cancelled if writing fails.
    """
    index = storage.create_index(schema)
    writer = index.writer()

    try:
        # read doc from Database by using django
        for dou in Preview.objects.all():
            doc = {}
            doc['id'] = _from_python(str(dou.id))
            text = dou.description
            doc[index_fieldname] = text
            writer.update_document(**doc)
    except Exception:
        # Release the writer before propagating the error.
        writer.cancel()
        raise
    # Without a commit none of the updates above are persisted.
    writer.commit()
Exemple #34
0
def write_db(storage,schema):
    """Create an index in ``storage`` and write the ``HtmlContent`` rows.

    Rows with ``retry == 3`` or empty content are excluded. Each document
    holds the row's id, title and content. The writer is committed once all
    rows are written and cancelled if writing fails.
    """
    index = storage.create_index(schema)
    writer = index.writer()

    try:
        #for dou in DoubanMovie.objects.filter(id__lte=4000).annotate(cnt=Count('movielink')).filter(cnt__gt=0).order_by('-cnt'):
        for obj in HtmlContent.objects.filter(~Q(retry=3)).filter(~Q(content='')):
            doc = {}
            doc['id'] = _from_python(str(obj.id))
            doc['title'] = obj.title
            doc['content'] = obj.content
            writer.update_document(**doc)
    except Exception:
        # Release the writer before propagating the error.
        writer.cancel()
        raise
    # Without a commit none of the updates above are persisted.
    writer.commit()
    def reindex(self):
        """Reindex all data

        For every registered whoosheer, each instance of each of its models
        is passed, together with a writer for that whoosheer's index, to the
        whoosheer's ``update_<model>()`` function.
        """
        for wh in self.whoosheers:
            index = type(self).get_or_create_index(_get_app(self), wh)
            timeout = _get_config(self)['writer_timeout']
            with index.writer(timeout=timeout) as writer:
                for model in wh.models:
                    model_suffix = model.__name__.lower()
                    method_name = "{0}_{1}".format(UPDATE_KWD, model_suffix)
                    for item in model.query.all():
                        update_item = getattr(wh, method_name)
                        update_item(writer, item)
Exemple #36
0
def write_db(storage, schema):
    """Create an index in ``storage`` and write the ``HtmlContent`` rows.

    Rows with ``retry == 3`` or empty content are excluded. Each document
    holds the row's id, title and content. The writer is committed once all
    rows are written and cancelled if writing fails.
    """
    index = storage.create_index(schema)
    writer = index.writer()

    try:
        #for dou in DoubanMovie.objects.filter(id__lte=4000).annotate(cnt=Count('movielink')).filter(cnt__gt=0).order_by('-cnt'):
        for obj in HtmlContent.objects.filter(~Q(retry=3)).filter(~Q(content='')):
            doc = {
                'id': _from_python(str(obj.id)),
                'title': obj.title,
                'content': obj.content,
            }
            writer.update_document(**doc)
    except Exception:
        # Release the writer before propagating the error.
        writer.cancel()
        raise
    # Without a commit none of the updates above are persisted.
    writer.commit()
Exemple #37
0
def write_db():
    """Create a new index and write every ``Preview`` description into it.

    Relies on the module-level ``storage``, ``schema`` and
    ``index_fieldname``. Each ``Preview`` row becomes one document holding
    its id and, under ``index_fieldname``, its ``description``.
    """
    index = storage.create_index(schema)

    # Commit happens when the block exits normally; an error while writing
    # cancels the writer. The previous version never committed.
    with index.writer() as writer:
        for dou in Preview.objects.all():
            doc = {
                'id': _from_python(str(dou.id)),
                index_fieldname: dou.description,
            }
            writer.update_document(**doc)
Exemple #38
0
def add_snippet():
    """Index a new snippet taken from the submitted form.

    Reads ``title``, ``content``, ``tag`` and ``language`` from
    ``request.form`` (title and content are stripped), assigns a fresh UUID
    as the snippet id and writes the document to the module-level ``index``.

    Returns:
        A JSON string reporting success and the new ``snippet_id``.

    Raises:
        Exception: if the stripped title or content is empty.
    """
    form = request.form
    title = form['title'].strip()
    content = form['content'].strip()
    tag = form['tag']
    language = form['language']
    snippet_id = unicode(uuid.uuid4())

    if not title or not content:
        raise Exception("Empty title or snippet content")

    # Context-managed writer: committed on success, cancelled if the update
    # raises, so a failed request does not leave the writer open.
    with index.writer() as writer:
        writer.update_document(id=snippet_id, content=content, tag=tag, title=title, language=language)

    return '{"success": true, "message": "Snippet added successfully", "snippet_id": "%s"}' % snippet_id
    def reindex(self):
        """Reindex all data

        This method retrieves all the data from the registered models and
        calls the ``update_<model>()`` function for every instance of such
        model.
        """
        for wh in self.whoosheers:
            index = type(self).get_or_create_index(_get_app(self), wh)
            # Use the writer as a context manager so it is committed when all
            # models were processed and cancelled if any update raises; the
            # explicit ``commit()`` used before was skipped on errors, leaving
            # the writer open.
            with index.writer(timeout=_get_config(self)['writer_timeout']) as writer:
                for model in wh.models:
                    method_name = "{0}_{1}".format(UPDATE_KWD, model.__name__.lower())
                    for item in model.query.all():
                        getattr(wh, method_name)(writer, item)
def _store(session, context):
    """Mirror the session's recorded model changes into the whoosh indexes.

    The ``(object, operation)`` pairs found in ``session._model_changes`` are
    grouped by model class so one writer is opened per class; classes without
    the attribute named by ``__searchable__`` are ignored. ``update`` and
    ``insert`` operations write the searchable attributes with
    ``update_document``, ``delete`` removes the document by primary key and
    any other operation is skipped. ``context`` is not used.

    Raises:
        AttributeError: if an object lacks one of its searchable attributes.
    """
    app = session.app

    # Group changes per model class so each class gets a single writer.
    grouped = {}
    for instance, op in session._model_changes.values():
        if hasattr(instance.__class__, __searchable__):
            grouped.setdefault(instance.__class__, []).append((instance, op))

    for cls, pending in grouped.iteritems():
        index = whoosh_index(app, cls)
        with index.writer() as writer:
            pk_name = cls.pure_whoosh.primary_key_name
            entries = cls.__searchable__

            for instance, op in pending:
                if op in ('update', 'insert'):
                    document = {}
                    for entry in entries:
                        # An entry is either a bare attribute name or a
                        # (name, field) pair.
                        key, field = entry if isinstance(entry, tuple) else (entry, None)

                        try:
                            value = getattr(instance, key)
                        except AttributeError:
                            raise AttributeError('{0} does not have {1} field {2}'
                                    .format(cls.__name__, __searchable__, key))

                        # Attributes may be methods; index what they return.
                        if callable(value):
                            value = value()

                        if field is None or field.__class__ in UNICODE_TYPES:
                            value = unicode(value)

                        document[key] = value

                    document[pk_name] = unicode(getattr(instance, pk_name))
                    writer.update_document(**document)
                elif op == 'delete':
                    writer.delete_by_term(pk_name, unicode(getattr(instance, pk_name)))
def loader(index, col):
    """Write every document of a collection into ``index``.

    Each document returned by ``get_data(col)`` is stored with its ``_id``
    as ``idx``, the document itself as ``data`` and the text produced by
    ``dict_values_to_text`` as ``body``.

    Args:
        index: Index object exposing ``writer()``.
        col: Collection identifier passed to ``get_data``.
    """
    # The writer commits when the block exits normally and cancels if a
    # document fails, so an error no longer leaves the writer open.
    with index.writer() as writer:
        for doc in get_data(col):
            writer.add_document(
                idx=doc["_id"],
                data=doc,
                body=dict_values_to_text(doc),
            )
    print(f"{index} loaded successfully")
def createIndexWriter(indexPath):
    """Create a whoosh index at ``indexPath`` and return a writer for it.

    The directory is created when it does not exist. The schema stores
    ``song_id``, ``artist_id``, ``artist``, ``title`` and ``terms``; artist
    and title are analysed with a stemming analyzer piped through a charset
    filter built from ``accent_map``.
    """
    if not os.path.exists(indexPath):
        os.mkdir(indexPath)

    analysis = whoosh.analysis
    fields = whoosh.fields

    analyzer = analysis.StemmingAnalyzer() | analysis.CharsetFilter(accent_map)

    schema = fields.Schema(
        song_id=fields.ID(stored=True),
        artist_id=fields.STORED,
        artist=fields.TEXT(stored=True, field_boost=8.0, analyzer=analyzer),
        title=fields.TEXT(stored=True, field_boost=4.0, analyzer=analyzer),
        terms=fields.KEYWORD(stored=True, scorable=True, commas=True),
    )

    return whoosh.index.create_in(indexPath, schema).writer()
Exemple #43
0
    def _flush_set(self, _set):
        for instance in _set:
            mapping = instance.__class__
            if mapping not in self.mappings:
                continue

            index = self.indexes[mapping.__name__]
            primary_field = mapping.search_query.primary
            primary_value = text_type(getattr(instance, primary_field))

            with index.writer() as writer:
                attrs = {}
                writer.delete_by_term(primary_field, primary_value)
                attrs[primary_field] = primary_value
                attrs['body'] = instance.index()
                writer.add_document(**attrs)
def _after_flush(app, changes):
    """Propagate flushed model changes to the per-model whoosh indexes.

    Each entry of ``changes`` is indexed as ``[0]`` (the instance) and
    ``[1]`` (the operation). Instances whose class has the attribute named by
    ``__searchable__`` are grouped by class name and one writer is opened per
    group. ``update``/``insert`` operations upsert the searchable attributes
    converted with ``unicode``; every other operation deletes the document by
    primary key. A searchable key containing a period is read from a related
    object (``relation.field``, a single level only).

    Raises:
        AttributeError: if an instance lacks a (non-dotted) searchable field.
    """
    grouped = {}  # class name -> [(is_upsert, instance), ...]
    for change in changes:
        is_upsert = change[1] in ('update', 'insert')
        instance = change[0]

        if hasattr(instance.__class__, __searchable__):
            grouped.setdefault(instance.__class__.__name__, []).append((is_upsert, instance))

    for model, entries in grouped.iteritems():
        # Every entry of a group shares its class, so the first one is used
        # to look up the index and the schema details.
        representative = entries[0][1]
        index = whoosh_index(app, representative.__class__)
        with index.writer() as writer:
            pk_name = representative.pure_whoosh.primary_key_name
            searchable = representative.__searchable__

            for is_upsert, v in entries:
                if not is_upsert:
                    writer.delete_by_term(pk_name, unicode(getattr(v, pk_name)))
                    continue

                document = {}
                for key in searchable:
                    if '.' in key:
                        # Exactly one period is expected; more would fail
                        # the unpacking below.
                        related_model, field_name = key.split('.')
                        related = getattr(v, related_model)
                        document[key] = unicode(getattr(related, field_name))
                        continue

                    try:
                        document[key] = unicode(getattr(v, key))
                    except AttributeError:
                        raise AttributeError('{0} does not have {1} field {2}'
                                .format(model, __searchable__, key))

                document[pk_name] = unicode(getattr(v, pk_name))
                writer.update_document(**document)
Exemple #45
0
def update_snippet(snippet_id):
    """Replace the indexed document of an existing snippet.

    Reads ``title``, ``content``, ``language`` and ``tag`` from
    ``request.form`` and rewrites the document whose id is ``snippet_id`` in
    the module-level ``index``.

    Args:
        snippet_id: Id of the snippet to update.

    Returns:
        A JSON string reporting success and the ``snippet_id``.

    Raises:
        Exception: if the title or content is empty.
    """
    form = request.form
    title = form['title']
    content = form['content']
    language = form['language']
    tag = form['tag']

    if not title or not content:
        raise Exception("Empty title or content")

    if not get_snippet_by_id(snippet_id):
        abort(make_response('{"message": "The snippet you are trying to update doesn\'t exist"}', 404))

    # Context-managed writer: committed on success, cancelled if the update
    # raises, so a failed request does not leave the writer open.
    with index.writer() as writer:
        writer.update_document(id=snippet_id, content=content, tag=tag, title=title, language=language)

    # NOTE(review): the message says "added" although this is an update; it
    # is left unchanged in case clients match on it.
    return '{"success": true, "message": "Snippet added successfully", "snippet_id": "%s"}' % snippet_id
Exemple #46
0
def scrape_selections(index):
    """Collect selections from every ``.cat`` file and add them to ``index``.

    Walks ``ROSTER_DATA_DIR``, parses each file whose extension is ``.cat``
    with ``RosterFile`` and gathers what ``scrape_selections()`` returns;
    other files are reported and skipped. All gathered selections are then
    added to the index in one writer session.

    Returns:
        The list of gathered selections.
    """
    collected = []
    for root, _dirs, names in os.walk(ROSTER_DATA_DIR):
        for name in names:
            full_path = os.path.join(root, name)
            if os.path.splitext(name)[1] != '.cat':
                print(f'Ignoring file: {full_path}')
                continue
            print(f'Parsing file: {full_path}')
            collected.extend(RosterFile(full_path).scrape_selections())

    with index.writer() as index_writer:
        for selection in collected:
            index_writer.add_document(**selection)
    return collected
Exemple #47
0
def add_snippet():
    """Index a new snippet taken from the submitted form.

    Reads ``title``, ``content``, ``tag`` and ``language`` from
    ``request.form`` (title and content are stripped), assigns a fresh UUID
    as the snippet id and writes the document to the module-level ``index``.

    Returns:
        A JSON string reporting success and the new ``snippet_id``.

    Raises:
        Exception: if the stripped title or content is empty.
    """
    title = request.form['title'].strip()
    content = request.form['content'].strip()
    tag = request.form['tag']
    language = request.form['language']
    snippet_id = unicode(uuid.uuid4())

    if not title or not content:
        raise Exception("Empty title or snippet content")

    # The ``with`` block commits on normal exit and cancels the writer when
    # ``update_document`` raises, instead of leaving it open.
    with index.writer() as writer:
        writer.update_document(id=snippet_id,
                               content=content,
                               tag=tag,
                               title=title,
                               language=language)

    return '{"success": true, "message": "Snippet added successfully", "snippet_id": "%s"}' % snippet_id
Exemple #48
0
def _after_flush(app, changes):
    # Any db updates go through here. We check if any of these models have
    # ``__searchable__`` fields, indicating they need to be indexed. With these
    # we update the whoosh index for the model. If no index exists, it will be
    # created here; this could impose a penalty on the initial commit of a
    # model.

    bytype = {}  # sort changes by type so we can use per-model writer
    for change in changes:
        update = change[1] in ('update', 'insert')

        if hasattr(change[0].__class__, __searchable__):
            bytype.setdefault(change[0].__class__.__name__, []).append(
                (update, change[0]))

    # for model, values in bytype.iteritems():
    for model, values in bytype.items():
        index = whoosh_index(app, values[0][1].__class__)
        with index.writer() as writer:
            primary_field = values[0][1].pure_whoosh.primary_key_name
            searchable = values[0][1].__searchable__

            for update, v in values:
                if update:
                    attrs = {}
                    for key in searchable:
                        try:
                            # attrs[key] = unicode(getattr(v, key))
                            attrs[key] = str(getattr(v, key))
                        except AttributeError:
                            raise AttributeError(
                                '{0} does not have {1} field {2}'.format(
                                    model, __searchable__, key))

                    # attrs[primary_field] = unicode(getattr(v, primary_field))
                    attrs[primary_field] = str(getattr(v, primary_field))
                    writer.update_document(**attrs)
                else:
                    # writer.delete_by_term(primary_field, unicode(getattr(v,
                    #     primary_field)))
                    writer.delete_by_term(primary_field,
                                          str(getattr(v, primary_field)))
Exemple #49
0
  def index_objects(self, objects):
    """
    Bulk index a list of objects that are not indexed yet and that all share
    the same class.

    Does nothing when ``objects`` is empty. The class of the first object
    selects the index, the primary field and the indexed field names; one
    document per object is built with ``make_document`` and added.
    """
    if not objects:
      return

    model_class = objects[0].__class__
    assert not any(m.__class__ is not model_class for m in objects),\
      "All objects must be of the same class."

    with self.index_for_model_class(model_class).writer() as writer:
      primary_field = model_class.search_query.primary
      indexed_fields = model_class.whoosh_schema.names()

      for model in objects:
        writer.add_document(
          **self.make_document(model, indexed_fields, primary_field))
Exemple #50
0
def create_index_writer(index_path):
    '''
    Create a whoosh index with track id, artist and title fields and return
    a writer for it.

    Input:
        index_path - Path of the whoosh index to create; the directory is
                     made when it does not exist
    Output:
        writer - Writer of the newly created whoosh index
    '''
    if not os.path.exists(index_path):
        os.mkdir(index_path)

    analysis = whoosh.analysis
    fields = whoosh.fields

    # Stemming followed by the charset filter built from ``accent_map``.
    analyzer = analysis.StemmingAnalyzer() | analysis.CharsetFilter(accent_map)

    schema = fields.Schema(
        track_id=fields.ID(stored=True),
        artist=fields.TEXT(stored=True, analyzer=analyzer),
        title=fields.TEXT(stored=True, analyzer=analyzer),
    )

    return whoosh.index.create_in(index_path, schema).writer()
    def create_index(self):
        """Create the paper index and add every ``.txt`` file as a document.

        A new index with the PDF schema is created in ``self.path_index``
        (the directory is made when missing). Each ``.txt`` file found in
        ``self.path_text`` becomes one document whose ``id`` is the file name
        and whose ``textdata`` is the file's UTF-8 text. Progress is printed
        per file.
        """
        pdf_schema = Schema(id=ID(unique=True, stored=True),
                            path=ID(stored=True),
                            title=TEXT(stored=True),
                            text=TEXT,
                            textdata=TEXT(stored=True))
        if not os.path.exists(self.path_index):
            os.mkdir(self.path_index)
        index = create_in(self.path_index, pdf_schema)

        files_txt = [f for f in os.listdir(self.path_text) if f.endswith('.txt')]
        print('total paper: ', len(files_txt))

        # The writer commits when the block exits normally and cancels if
        # reading or adding a file fails.
        with index.writer() as paper_writer:
            for i, f in enumerate(files_txt):
                print('{}/{} - {}'.format(i, len(files_txt), f))
                # Close each file right after reading it; the bare
                # ``open(...).read()`` used before leaked one handle per file.
                with open(os.path.join(self.path_text, f), encoding='utf-8') as handle:
                    textdata = handle.read()
                paper_writer.add_document(id=f, textdata=textdata)
Exemple #52
0
def build_hybrid_index(index, repo, ref='HEAD'):
    """Index every page at ``ref`` together with the revisions of its path.

    Pages and their attachments are loaded through ``git_storage``. For each
    page, the page itself and one entry per revision yielded by a ``Walker``
    over the page's directory are written inside a single writer group. The
    writer is committed with ``optimize=True`` on success and cancelled on
    any error, which is then re-raised.

    Args:
        index: Index object exposing ``writer()``.
        repo: Repository exposing ``refs`` and ``object_store``.
        ref: Name of the ref to index.
    """
    from posixpath import dirname

    head = repo.refs[ref]

    def revisions_for(path):
        # Walk history from ``head``, restricted to the page's directory.
        return Walker(
            store=repo.object_store,
            include=[head],
            paths=[dirname(path)],
            follow=True,
        )

    pages_tree = git_storage.get_pages_tree(repo, ref)
    found_pages = git_storage.find_pages(repo, pages_tree)
    pages_data = git_storage.load_pages_with_attachments(repo, found_pages)

    writer = index.writer()
    try:
        for path, page, attachments in pages_data:
            with writer.group():
                write_page(repo, writer, path, page, attachments)
                for revision in revisions_for(path):
                    write_revision(repo, writer, revision.commit, path)
        writer.commit(optimize=True)
    except:
        # Deliberately catches everything: the writer must be cancelled
        # before the error propagates.
        writer.cancel()
        raise
Exemple #53
0
    def after_commit(self, session):
        """Apply the changes collected in ``self.to_update`` to the indexes.

        Does nothing unless ``self.running`` is true. For each group of
        changes one writer is opened on the index of the group's model class.
        Every changed model is first deleted from the index by primary key;
        models whose change type is ``new`` or ``changed`` are then added
        again with their indexed fields. ``self.to_update`` is reset
        afterwards. ``session`` is not used.
        """
        if not self.running:
            return

        for _kind, values in self.to_update.iteritems():
            model_class = values[0][1].__class__
            with self.index_for_model_class(model_class).writer() as writer:
                pk_name = model_class.search_query.primary
                field_names = model_class.whoosh_schema.names()

                for change_type, model in values:
                    # Always drop the old document; new and changed models
                    # are added back below.
                    writer.delete_by_term(pk_name, unicode(getattr(model, pk_name)))

                    if change_type not in ("new", "changed"):
                        continue

                    document = {}
                    for key in field_names:
                        value = getattr(model, key)
                        # Values carrying a ``name`` attribute are indexed by
                        # that name.
                        if hasattr(value, "name"):
                            value = value.name
                        if isinstance(value, (str, int)):
                            value = unicode(value)
                        document[key] = value
                    document[pk_name] = unicode(getattr(model, pk_name))
                    writer.add_document(**document)

        self.to_update = {}
Exemple #54
0
def main(library_path, music_path):
    """Index the songs under ``music_path`` and pickle their parsed values.

    Opens the whoosh index in ``library_path``, creating the directory and
    the index when the path does not exist. Every song yielded by
    ``song_walker`` is indexed with its original values; afterwards the
    values that have a parser in ``key_parsers`` are replaced by the parsed
    result (or removed when the parser returns ``None``). The resulting song
    dicts are pickled to ``<library_path>/songs``.

    Args:
        library_path: Directory holding the index and the ``songs`` pickle.
        music_path: Root of the music collection; ``~`` is expanded.
    """
    if os.path.exists(library_path):
        index = whoosh.index.open_dir(library_path)
    else:
        os.makedirs(library_path)
        index = whoosh.index.create_in(library_path, song_schema)

    songs = []
    with index.writer() as writer:
        for song in song_walker(os.path.expanduser(music_path)):
            # We index by the original text, but store parsed values in
            # our own index.
            writer.update_document(**song)

            for key, parser in key_parsers.items():
                if key in song:
                    song[key] = parser(song[key])
                    if song[key] is None:
                        del song[key]
            songs.append(song)

    # Pickle data is binary: open the file in 'wb' (text mode fails on
    # Python 3 and can corrupt data on Windows) and close it deterministically.
    with open(os.path.join(library_path, 'songs'), 'wb') as songs_file:
        pickle.dump(songs, songs_file)
def add_doc():
    """Add one document built from ``get_content('qq7')`` to the index.

    Opens the existing index from the module-level ``storage`` and adds a
    document whose id is derived from the module-level ``qq_id`` and whose
    ``index_fieldname`` field holds the fetched content.
    """
    index = storage.open_index(schema=schema)

    # The second value returned by get_content is not needed here.
    content, _names = get_content('qq7')

    doc = {
        'id': _from_python(qq_id),
        index_fieldname: content,
    }

    # The writer is opened only once the document is ready. It commits when
    # the block exits normally and cancels if ``add_document`` raises; the
    # former ``except Exception, e: raise`` did nothing but re-raise and is
    # not valid Python 3 syntax.
    with index.writer() as writer:
        writer.add_document(**doc)
Exemple #56
0
def add_doc():
    """Fetch the ``qq7`` content and add it to the index as one document.

    Uses the module-level ``storage``, ``schema``, ``qq_id`` and
    ``index_fieldname``. The document id comes from ``qq_id``; the content
    is stored under ``index_fieldname``.
    """
    index = storage.open_index(schema=schema)

    # Only the content part of get_content's result is used.
    content, _names = get_content('qq7')

    doc = {}
    doc['id'] = _from_python(qq_id)
    doc[index_fieldname] = content

    # Context-managed writer: committed on success and cancelled on error.
    # This replaces a try/except that only re-raised (written in
    # Python-2-only syntax) and left the writer open when adding failed.
    with index.writer() as writer:
        writer.add_document(**doc)
Exemple #57
0
def update_snippet(snippet_id):
    """Replace the indexed document of an existing snippet.

    Reads ``title``, ``content``, ``language`` and ``tag`` from
    ``request.form`` and rewrites the document whose id is ``snippet_id`` in
    the module-level ``index``.

    Args:
        snippet_id: Id of the snippet to update.

    Returns:
        A JSON string reporting success and the ``snippet_id``.

    Raises:
        Exception: if the title or content is empty.
    """
    title = request.form['title']
    content = request.form['content']
    language = request.form['language']
    tag = request.form['tag']

    if not title or not content:
        raise Exception("Empty title or content")

    if not get_snippet_by_id(snippet_id):
        abort(
            make_response(
                '{"message": "The snippet you are trying to update doesn\'t exist"}',
                404))

    # The ``with`` block commits on normal exit and cancels the writer when
    # ``update_document`` raises, instead of leaving it open.
    with index.writer() as writer:
        writer.update_document(id=snippet_id,
                               content=content,
                               tag=tag,
                               title=title,
                               language=language)

    # NOTE(review): the message says "added" although this is an update; it
    # is left unchanged in case clients match on it.
    return '{"success": true, "message": "Snippet added successfully", "snippet_id": "%s"}' % snippet_id
Exemple #58
0
    def update(self, force_rebuild=False):
        """ Adds/updates all items in repo to index. Note: querying will call this automatically.

        Runs at most once per instance: ``self.index_updated`` is set on the
        first call and later calls return immediately.

        With ``force_rebuild`` (also forced when the index is not based on
        the current commit) a new index is obtained and every document from
        ``self.document_iterator()`` is indexed. Otherwise only the files
        reported by ``lib_git.get_uncommitted_oval()`` are re-indexed, after
        the previously indexed uncommitted files were deleted from the index.

        Args:
            force_rebuild: Rebuild the index from scratch when true.

        Returns:
            ``False`` when the index was already updated by this instance;
            ``None`` in every other case.

        On ``lib_xml.InvalidXmlError`` the index directory is removed and the
        process exits with status 1.
        """

        # if we've already updated the index during this script run, we're done!
        if self.index_updated:
            return False

        # we only need to do this once per script lifetime
        self.index_updated = True

        # if the index is not based on the current commit, rebuild from scratch
        if not self.index_based_on_current_commit():
            force_rebuild = True

        if force_rebuild:
            # get a new clean/empty index
            index = self.get_index(force_rebuild)

            # disabled high-performance writer (https://pythonhosted.org/Whoosh/batch.html), causing thread/lock issues
            # index_writer = index.writer(procs=4, multisegment=True)
            index_writer = index.writer()

            # index all documents
            documents = self.document_iterator()
            activity_description = 'Rebuilding'

            # update indexed commit
            # NOTE(review): this is recorded before any document has been
            # written or committed below.
            self.set_indexed_commit_hash()
        else:
            # use the current index
            index = self.get_index()
            index_writer = index.writer()

            # delete uncommitted files that are in index already
            for filepath in self.get_indexed_uncommitted_files():
                index_writer.delete_by_term('path', filepath)

            # get list of uncommitted files and persist it
            uncommitted_files = lib_git.get_uncommitted_oval()
            self.set_indexed_uncommitted_files(uncommitted_files)

            # nothing to update? done!
            if not uncommitted_files:
                # commit so the deletions made above are written
                index_writer.commit()
                return

            # index only uncommitted files
            documents = self.document_iterator(uncommitted_files)
            activity_description = 'Updating'

        # add all definition files to index
        counter = 0
        try:
            for document in documents:
                counter = counter + 1
                self.status_spinner(
                    counter, '{0} {1} index'.format(activity_description,
                                                    self.index_name),
                    self.item_label)
                # documents flagged as deleted are removed by oval_id, all
                # others are added
                if 'deleted' in document and document['deleted']:
                    try:
                        index_writer.delete_by_term(
                            'oval_id', self.whoosh_escape(document['oval_id']))
                    except:
                        # NOTE(review): bare except - any failure of the
                        # delete is only reported as a debug message.
                        self.message(
                            'debug',
                            'Something was marked as needing to be deleted but it wasnt in the index'
                        )
                    #self.message('debug', 'Deleting from index:\n\t{0} '.format(self.whoosh_escape(document['oval_id'])))
                    #index_writer.delete_by_term('oval_id', self.whoosh_escape(document['oval_id']))
                    #self.message('debug', 'Deleting from index:\n\t{0} '.format(self.whoosh_escape(document['oval_id'])))
                else:
                    index_writer.add_document(**document)
                    #self.message('debug', 'Upserting to index:\n\t{0} '.format(document['path']))
        except lib_xml.InvalidXmlError as e:
            # abort: cannot build index
            self.message(
                'ERROR CANNOT BUILD INDEX',
                'Invalid xml fragment\n\tFile: {0}\n\tMessage: {1}'.format(
                    e.path, e.message))
            self.message('ERROR', 'deleting index and aborting execution')
            # the writer is committed before the index directory is removed
            index_writer.commit()
            # NOTE(review): this closes ``self.index`` rather than the local
            # ``index`` opened above - confirm they are the same object.
            self.index.close()
            shutil.rmtree(self.get_index_path())
            sys.exit(1)

        # final spinner call, with an extra ``True`` argument, then commit
        self.status_spinner(
            counter, '{0} {1} index'.format(activity_description,
                                            self.index_name), self.item_label,
            True)
        index_writer.commit()