Example #1
import json
from decimal import Decimal

from whoosh.filedb.filestore import FileStorage
from whoosh.index import FileIndex
from whoosh.writing import AsyncWriter


def incremental_index(indexdir, indexname, rowData):
    """
    Note: every incremental add creates a new segment file on disk, so the
    index grows with each call until the segments are merged.
    :param rowData: one row of data, as a dict of field name -> value
    :param indexdir: index directory
    :param indexname: index name
    :return:
    """
    storage = FileStorage(indexdir)
    ix = FileIndex(storage, indexname=indexname)

    writer = AsyncWriter(ix)

    # Pass the fields as keyword arguments instead of building a source
    # string and exec()-ing it, which breaks on quotes in the values and
    # needed pymysql.escape_string() to paper over.
    fields = {}
    for key, val in rowData.items():
        if not val:
            val = ""
        elif isinstance(val, Decimal):
            val = str(val)
        else:
            val = json.dumps(val)
        fields[key] = val
    writer.add_document(**fields)
    # e.g. writer.add_document(content="撸啊撸啊德玛西亚", ID="abc")

    writer.commit()
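This function assumes the named index already exists under indexdir. A minimal sketch of creating it first, with a hypothetical two-field schema matching the add_document call above:

import os

from whoosh.fields import ID, TEXT, Schema
from whoosh.filedb.filestore import FileStorage

# Hypothetical schema; the field names must match the keys in rowData.
schema = Schema(ID=ID(stored=True, unique=True), content=TEXT(stored=True))

if not os.path.exists("indexdir"):
    os.makedirs("indexdir")
storage = FileStorage("indexdir")
ix = storage.create_index(schema, indexname="myindex")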
Example #2
    def __init__(self, *args, **kwargs):
        self.default = kwargs.pop("default", None)
        self.parser = None
        self.fields = kwargs.pop('fields', []) + ['id']
        self.real_time = kwargs.pop('real_time', True)
        if not os.path.lexists(STORAGE_DIR):
            os.makedirs(STORAGE_DIR)
        self.storage = filestore.FileStorage(STORAGE_DIR)
        try:
            # Open the existing index; fall back to None when the storage
            # directory is empty or holds no index yet.
            self.index = FileIndex(self.storage)
        except (IndexError, EmptyIndexError):
            self.index = None
        super(WhooshManager, self).__init__(*args, **kwargs)
Example #3
    def __init__(self, corpus: Corpus, index_path: str, top_k,
                 extend_candidate_citations: bool):
        super().__init__(top_k)
        self.index_path = index_path

        storage = FileStorage(self.index_path, readonly=True)
        self._bm25_index = FileIndex(copy_to_ram(storage), schema=schema)
        self.searcher = self._bm25_index.searcher(weighting=scoring.BM25F)
        self.query_parser = MultifieldParser(
            [FieldNames.TITLE, FieldNames.ABSTRACT],
            self._bm25_index.schema,
            group=qparser.OrGroup)
        self.corpus = corpus
        self.extend_candidate_citations = extend_candidate_citations
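A query then flows through the parser and searcher built above; a minimal sketch, assuming the base class stored top_k on self:

# Parse against TITLE and ABSTRACT (OrGroup: any term may match),
# then search the RAM-backed BM25 index.
query = self.query_parser.parse("candidate citation text")
results = self.searcher.search(query, limit=self.top_k)
for hit in results:
    print(hit.score, hit.fields())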
Example #4
    def init_indexes(self):
        """Create indexes for schemas."""
        state = self.app_state

        for name, schema in self.schemas.items():
            if current_app.testing:
                storage = TestingStorage()
            else:
                index_path = (Path(state.whoosh_base) / name).absolute()
                if not index_path.exists():
                    index_path.mkdir(parents=True)
                storage = FileStorage(text_type(index_path))

            if storage.index_exists(name):
                index = FileIndex(storage, schema, name)
            else:
                index = FileIndex.create(storage, schema, name)

            state.indexes[name] = index
Example #5
def update_index(indexdir, indexname, rowData):
    """
    Note: every incremental add creates a new segment file on disk, so the
    index grows with each call until the segments are merged.
    :param indexdir: index directory
    :param indexname: index name
    :param rowData: one row of data, as a dict of field name -> value
    :return:
    """
    storage = FileStorage(indexdir)
    ix = FileIndex(storage, indexname=indexname)

    writer = AsyncWriter(ix)

    # As in Example #1, pass the fields as keyword arguments rather than
    # exec()-ing a generated source string.
    fields = {}
    for key, val in rowData.items():
        # Normalize every value (Decimal included) to a string.
        fields[key] = str(val) if val else ""
    writer.add_document(**fields)
    # e.g. writer.add_document(content="撸啊撸啊德玛西亚", ID="abc")

    # Replacement instead of appending only works if the field was declared
    # unique when the index was created; otherwise duplicate rows pile up.
    # Because "path" is marked as unique, calling update_document with path="/a"
    # will delete any existing documents where the "path" field contains "/a".
    # writer.update_document(path=u"/a", content="Replacement for the first document")
    writer.commit()
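Following the comment above, a small usage sketch of the replace-on-unique behavior, assuming the schema declared path as a unique field:

writer = AsyncWriter(ix)
# Deletes any existing document whose unique "path" field is "/a",
# then adds the new document in its place.
writer.update_document(path=u"/a", content=u"Replacement for the first document")
writer.commit()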
Example #6
    if args.valid_docs:
        data_type = 'pd'
        valid_docs = set(open(args.valid_docs).read().strip().split('\n'))

    searcher = JSearcher(JString(args.index))
    searcher.setBM25Similarity(args.k1, args.b)
    print('Initializing BM25, setting k1={} and b={}'.format(args.k1, args.b))
    if args.rm3:
        searcher.setRM3Reranker(args.fbTerms, args.fbDocs, args.originalQueryWeight)
        print('Initializing RM3, setting fbTerms={}, fbDocs={} and originalQueryWeight={}'.format(
            args.fbTerms, args.fbDocs, args.originalQueryWeight))

    schema = Schema(title=TEXT,
                    abstract=TEXT,
                    id=ID(stored=True))
    storage = FileStorage(args.whoosh_index, readonly=True)
    bm25_index = FileIndex(copy_to_ram(storage), schema=schema)
    whoosh_searcher = bm25_index.searcher(weighting=scoring.BM25F)

    with open(args.output, 'w') as fout:
        start_time = time.time()
        for line_number, line in enumerate(open(args.qid_queries)):
            query_id, query = line.strip().split('\t')
            query = update_query_with_key_terms(query, whoosh_searcher)
            # We return one more result because it is almost certain that we
            # will retrieve the document that originated the query.
            hits = searcher.search(
                JString(query.encode('utf8')), args.hits + 1)

            if line_number % 10 == 0:
                time_per_query = (time.time() - start_time) / (line_number + 1)
                print('Retrieving query {} ({:0.3f} s/query)'.format(
                    line_number, time_per_query))
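The example is truncated at this point; the loop presumably ends by writing each retrieved docid to fout. A hedged sketch of that tail, assuming an MS MARCO-style run format (qid, docid, rank per line) and the old pyserini hit API where hits[i].docid holds the document id:

            # Hypothetical continuation: write hits, skipping documents
            # filtered out by valid_docs (hence the args.hits + 1 above).
            rank = 1
            for i in range(len(hits)):
                docid = hits[i].docid
                if args.valid_docs and docid not in valid_docs:
                    continue
                fout.write('{}\t{}\t{}\n'.format(query_id, docid, rank))
                rank += 1
                if rank > args.hits:
                    break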
Example #7
    def open_index(self, indexname=_DEF_INDEX_NAME, schema=None):
        return FileIndex(self, schema=schema, indexname=indexname)
Example #8
    def create_index(self, schema, indexname=_DEF_INDEX_NAME):
        if self.readonly:
            raise ReadOnlyError

        TOC.create(self, schema, indexname)
        return FileIndex(self, schema, indexname)
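Examples #7 and #8 are the open/create pair on whoosh's FileStorage itself; caller-side they combine like this (the index directory and schema here are hypothetical):

from whoosh.fields import ID, TEXT, Schema
from whoosh.filedb.filestore import FileStorage

schema = Schema(path=ID(stored=True, unique=True), content=TEXT)
storage = FileStorage("indexdir")

# Open the index if it is already on disk, otherwise create it.
if storage.index_exists():
    ix = storage.open_index(schema=schema)
else:
    ix = storage.create_index(schema)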