コード例 #1
0
def createSearchableData(root):
    """Build a Whoosh index under ./indexdir from the text files in *root*.

    Each file is expected to contain, one per line: title, author, genre,
    link and price; the remainder of the file is the plot.
    """
    # Define the schema of the index; stored fields can be shown in results.
    schema = Schema(title=TEXT(stored=True),
                    author=TEXT(stored=True),
                    genre=KEYWORD(stored=True),
                    link=ID(stored=True),
                    path=ID(stored=True),
                    price=ID(stored=True),
                    content=TEXT(stored=True),
                    contentData=TEXT)

    # Create the indexdir directory if it does not exist yet.
    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")

    print(os.getcwd())

    # Create an index writer that adds documents following the schema.
    ix = create_in("indexdir", schema)
    writer = AsyncWriter(ix)

    # Collect the paths of all files found in the root directory.
    filepaths = [os.path.join(root, i) for i in os.listdir(root)]

    for path in filepaths:
        # 'with' guarantees the handle is closed even if a read raises
        # (the original leaked the file object on error).
        with open(path, 'r', encoding="utf-8") as fp:
            # First five lines: title, author, genre, link, price.
            fileTitle = fp.readline()
            fileAuthor = fp.readline()
            fileGenre = fp.readline()
            fileLink = fp.readline()
            filePrice = fp.readline()

            # The rest of the file holds the plot.
            filePlot = fp.read()

        # contentData holds the preprocessed (tokenized) plot.
        fileData = tokenize(filePlot)

        # Add one document per file with all required fields.
        writer.add_document(title=fileTitle,
                            path=path,
                            author=fileAuthor,
                            genre=fileGenre,
                            link=fileLink,
                            price=filePrice,
                            content=filePlot,
                            contentData=fileData)
    writer.commit()
コード例 #2
0
 def updateIndex(self):
     """Update the whoosh index for the current page (computationally heavy)."""
     page = self.parent.notesTree.currentPage()
     content = self.toPlainText()
     try:
         # AsyncWriter avoids blocking on the index lock.
         writer = AsyncWriter(self.ix)
         if METADATA_CHECKER.match(
                 content) and 'meta' in self.settings.extensions:
             # Strip the leading metadata block before indexing the body.
             body = METADATA_CHECKER.sub("", content, count=1).lstrip()
             # Run the markdown converter so Meta gets populated with tags.
             self.settings.md.reset().convert(content)
             tags = ','.join(self.settings.md.Meta.get('tags', [])).strip()
         else:
             body = content
             tags = ''
         writer.update_document(path=page,
                                title=parseTitle(content, page),
                                content=body,
                                tags=tags)
         writer.commit()
     except Exception:
         # Bug fix: the bare 'except:' also swallowed KeyboardInterrupt
         # and SystemExit; narrow to Exception.
         print("Whoosh commit failed.")
コード例 #3
0
def main():
    """Entry point: read the Wikipedia dump and build a Whoosh index."""
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(filename)s %(levelname)s: %(asctime)s: %(message)s')
    logger = logging.getLogger('main')
    logger.info('Executing indexing module')
    logger.info('Reading file')

    utilities = doc_utilities()
    utilities.read_data_set(file='data/wikipedia_text_files.csv')
    logger.info('Task1 - Number of documents = {}'.format(
        utilities.get_number_documents()))
    utilities.process_documents_for_indexing()
    # Cap the collection at one million documents.
    docs = utilities.get_collection_json()[0:1000000]

    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")
    ix = index.create_in("indexdir", MySchema)
    #writer = ix.writer()
    writer = AsyncWriter(ix)

    bar_fmt = "{l_bar}{bar} [ time left: {remaining} ]"
    with tqdm(total=len(docs), desc="Indexing documents",
              bar_format=bar_fmt) as progress:
        for entry in docs:
            writer.add_document(id=str(entry['id']),
                                title=str(entry['title']),
                                text=str(entry['text']))
            progress.update(1)
    writer.commit()
コード例 #4
0
def delete_search_unit(pk, lang):
    """Best-effort removal of a unit from the source and target indexes."""
    try:
        for idx in (get_source_index(), get_target_index(lang)):
            with AsyncWriter(idx) as writer:
                writer.delete_by_term('pk', pk)
    except IOError:
        # Index unavailable: deletion is best-effort, so just bail out.
        return
コード例 #5
0
ファイル: search.py プロジェクト: genomax/biostar-central
def index_posts(posts, ix=None, overwrite=False, add_func=add_index):
    """
    Create or update a search index of posts.

    :param posts: queryset-like collection of posts (must support .count())
    :param ix: existing index, or None to initialize one
    :param overwrite: when True the commit clears the previous segments
    :param add_func: callable(post=, writer=) that indexes one post
    """

    ix = ix or init_index()
    # The writer is asynchronous by default
    writer = AsyncWriter(ix)

    elapsed, progress = timer_func()
    total = posts.count()

    # Loop through posts and add to index.  enumerate replaces the original
    # islice(zip(count(1), posts), None), which was a no-op slice around a
    # hand-rolled enumerate.
    for step, post in enumerate(posts, start=1):
        progress(step, total=total, msg="posts indexed")
        add_func(post=post, writer=writer)

    # Commit to index
    if overwrite:
        logger.info("Overwriting the old index")
        writer.commit(mergetype=writing.CLEAR)
    else:
        logger.debug("Committing to index")
        writer.commit()

    elapsed(f"Committed {total} posts to index.")
コード例 #6
0
    def update(self, index, iterable, commit=True):
        """Index every object in *iterable* using *index*'s field preparation."""
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)

        for obj in iterable:
            prepared = index.full_prepare(obj)
            # Whoosh insists on unicode values, so coerce every field.
            for field in prepared:
                prepared[field] = self._from_python(prepared[field])
            writer.update_document(**prepared)

        if len(iterable) > 0:
            # Always commit; we run into locking issues otherwise.
            writer.commit()

            # Feed the fresh content into the spelling dictionary if enabled.
            if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
                checker = SpellChecker(self.storage)
                checker.add_field(self.index, self.content_field_name)
コード例 #7
0
    def test_search_invalid_page(self):
        """Out-of-range or malformed page numbers must fall back to page 1."""
        with AsyncWriter(index.open_index()) as writer:
            for i in range(15):
                doc = Document.objects.create(checksum=str(i),
                                              pk=i + 1,
                                              title=f"Document {i+1}",
                                              content="content")
                index.update_document(writer, doc)

        # Idiom fix: removed f-prefixes from placeholder-free literals (F541);
        # the request strings themselves are unchanged.
        first_page = self.client.get("/api/search/?query=content&page=1").data
        second_page = self.client.get(
            "/api/search/?query=content&page=2").data
        should_be_first_page_1 = self.client.get(
            "/api/search/?query=content&page=0").data
        should_be_first_page_2 = self.client.get(
            "/api/search/?query=content&page=dgfd").data
        should_be_first_page_3 = self.client.get(
            "/api/search/?query=content&page=").data
        should_be_first_page_4 = self.client.get(
            "/api/search/?query=content&page=-7868").data

        self.assertDictEqual(first_page, should_be_first_page_1)
        self.assertDictEqual(first_page, should_be_first_page_2)
        self.assertDictEqual(first_page, should_be_first_page_3)
        self.assertDictEqual(first_page, should_be_first_page_4)
        # Page 2 must genuinely differ from page 1.
        self.assertNotEqual(len(first_page['results']),
                            len(second_page['results']))
コード例 #8
0
    def saveToWhoosh(self, df, dataset_id, overwrite=False):
        """Index a dataset's documents with Whoosh to enable full-text search.

        :param df: unused here; kept for interface compatibility
        :param dataset_id: dataset whose documents are indexed
        :param overwrite: rebuild the index directory from scratch
        """
        if not os.path.exists(self.whoosh_root):
            os.mkdir(self.whoosh_root)
        ws_path = os.path.join(self.whoosh_root, dataset_id)
        if not os.path.exists(ws_path):
            os.mkdir(ws_path)
            logMsg(
                str(os.path.abspath(ws_path)) +
                ' does not exist, create it to store whoosh index')
            # A brand-new directory has no index yet, so force creation.
            overwrite = True
        elif overwrite:
            # Rebuild from scratch: drop the old index directory first.
            shutil.rmtree(ws_path)
            os.mkdir(ws_path)
        schema = Schema(DOC_ID=NUMERIC(stored=True), TEXT=TEXT)
        ix = create_in(ws_path, schema) if overwrite else open_dir(ws_path)
        writer = AsyncWriter(ix)

        with self.workflow.dao.create_session() as session:
            doc_iter = session.query(Document).filter(
                Document.DATASET_ID == dataset_id)
            for doc in doc_iter:
                writer.add_document(DOC_ID=doc.DOC_ID, TEXT=doc.TEXT)
            writer.commit()
        # Dead trailing 'pass' removed.
コード例 #9
0
ファイル: mikitree.py プロジェクト: gitter-badger/mikidown
    def delPage(self, item):
        """Recursively delete a page: children first, then attachments,
        index entries, the page file and any now-empty directories."""
        # Depth-first: remove all child pages before the page itself.
        index = item.childCount()
        while index > 0:
            index = index - 1
            self.dirname = item.child(index).text(0)
            self.delPage(item.child(index))

        # Remove the attachment folder and every file inside it.
        attDir = self.itemToAttachmentDir(item)
        for info in QDir(attDir).entryInfoList():
            QDir().remove(info.absoluteFilePath())
        QDir().rmdir(attDir)

        pagePath = self.itemToPage(item)
        self.ix = open_dir(self.settings.indexdir)
        query = QueryParser('path', self.ix.schema).parse(pagePath)
        #writer = self.ix.writer()
        writer = AsyncWriter(self.ix)
        n = writer.delete_by_query(query)
        # n = writer.delete_by_term('path', pagePath)
        writer.commit()
        #self.ix.close()
        b = QDir(self.notePath).remove(self.pageToFile(pagePath))
        parent = item.parent()
        if parent is not None:
            # Bug fix: itemToPage(parent) was previously called before the
            # None check, which broke deletion of top-level pages.
            parentPage = self.itemToPage(parent)
            index = parent.indexOfChild(item)
            parent.takeChild(index)
            if parent.childCount() == 0:  # if no child, dir not needed
                QDir(self.notePath).rmdir(parentPage)
        else:
            index = self.indexOfTopLevelItem(item)
            self.takeTopLevelItem(index)
        QDir(self.notePath).rmdir(pagePath)
コード例 #10
0
def handle_document(document_id):
    """Re-parse a document, refresh its archive copy and the search index."""
    document = Document.objects.get(id=document_id)
    parser_class = get_parser_class_for_mime_type(document.mime_type)
    parser = parser_class(logging_group=uuid.uuid4())

    try:
        parser.parse(document.source_path, document.mime_type)

        archive_path = parser.get_archive_path()
        if archive_path:
            with transaction.atomic():
                with open(archive_path, 'rb') as f:
                    checksum = hashlib.md5(f.read()).hexdigest()
                # Persist first so that, should the file move fail, the
                # database transaction is rolled back.  save() is bypassed
                # because it triggers the file-handling logic, and the file
                # is not in place yet.
                Document.objects.filter(pk=document.pk).update(
                    archive_checksum=checksum, content=parser.get_text())
                with FileLock(settings.MEDIA_LOCK):
                    create_source_path_directory(document.archive_path)
                    shutil.move(archive_path, document.archive_path)

        with AsyncWriter(index.open_index()) as writer:
            index.update_document(writer, document)

    except Exception as e:
        logger.error(f"Error while parsing document {document}: {str(e)}")
    finally:
        parser.cleanup()
コード例 #11
0
    def update(self, index, iterable, commit=True):
        """Index objects from *iterable*; per-object failures are logged
        (or re-raised when ``silently_fail`` is False)."""
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)

        for obj in iterable:
            doc = index.full_prepare(obj)

            # Really make sure it's unicode, because Whoosh won't have it any
            # other way.
            for key in doc:
                doc[key] = self._from_python(doc[key])

            try:
                writer.update_document(**doc)
            except Exception as e:  # fixed Python-2 'except Exception, e' syntax
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message.
                # Bug fix: exception instances have no __name__; use the
                # class name (matches the sibling implementation).
                self.log.error(u"%s while preparing object for update" %
                               e.__class__.__name__,
                               exc_info=True,
                               extra={
                                   "data": {
                                       "index": index,
                                       "object": get_identifier(obj)
                                   }
                               })
def creating_searching_ranking(selected_analyzer, name_of_file,
                               scoring_function, path):
    """Create an index at *path* (fields: id, title, content) and fill it
    from the CSV file *name_of_file*.

    The header row is skipped; columns 1-3 of each row are id, title, body.
    *scoring_function* is unused here and kept for interface compatibility.
    """
    # Schema with fields id, title and content.
    schema = Schema(id=ID(stored=True),
                    title=TEXT(stored=False, analyzer=selected_analyzer),
                    content=TEXT(stored=False, analyzer=selected_analyzer))
    # create_in already returns a usable index; the original redundantly
    # re-opened the same directory with index.open_dir right afterwards.
    ix = create_in(path, schema)
    writer = AsyncWriter(ix)  # writer adds content to the fields

    # 'with' closes the CSV file even if a row raises mid-read.
    with open(name_of_file, "r", encoding='latin1') as in_file:
        csv_reader = csv.reader(in_file, delimiter=',')
        next(csv_reader)  # skip header: first line names the fields

        for record in csv_reader:
            doc_id = record[1]   # renamed to avoid shadowing builtin 'id'
            title = record[2]
            content = record[3]
            # NOTE(review): the 'title' field is never populated; title text
            # is folded into 'content' — this matches the original behavior.
            writer.add_document(id=doc_id, content=title + ' ' + content)

    writer.commit()
コード例 #13
0
def get_writer(index_name):
    """Return an AsyncWriter for the named index, logging its identity."""
    writer = AsyncWriter(get(index_name))
    logger.debug("created index_writer for \"{}\" {}"
                 .format(index_name, id(writer)))
    return writer
コード例 #14
0
ファイル: test_api.py プロジェクト: zjean/paperless-ng
    def test_search_more_like(self):
        """more_like must return similar documents, best match first,
        excluding the queried document itself."""
        d1 = Document.objects.create(
            title="invoice",
            content="the thing i bought at a shop and paid with bank account",
            checksum="A",
            pk=1)
        d2 = Document.objects.create(title="bank statement 1",
                                     content="things i paid for in august",
                                     pk=2,
                                     checksum="B")
        d3 = Document.objects.create(title="bank statement 3",
                                     content="things i paid for in september",
                                     pk=3,
                                     checksum="C")
        with AsyncWriter(index.open_index()) as writer:
            for doc in (d1, d2, d3):
                index.update_document(writer, doc)

        response = self.client.get(f"/api/search/?more_like={d2.id}")
        self.assertEqual(response.status_code, 200)

        hits = response.data['results']
        self.assertEqual(len(hits), 2)
        # d3 ("statement") is most similar to d2; d2 itself is excluded.
        self.assertEqual(hits[0]['id'], d3.id)
        self.assertEqual(hits[1]['id'], d1.id)
コード例 #15
0
 def update(self, x, who=None):
     """Index the incoming item's text and pass it downstream."""
     text = str(x)
     writer = AsyncWriter(self.index, delay=0.2)
     writer.add_document(content=text)
     writer.commit()
     return self._emit(text)
コード例 #16
0
 def build_index_writer(self, ix):
     """Index the Java sources through an AsyncWriter; report index errors."""
     try:
         writer = AsyncWriter(ix)
         self.build_src_index(writer, "java")
         writer.commit()
     except IndexingError as ie:
         # Bug fix: Python-2 print statement -> print() function call.
         print(ie.message + "index Error!!!")
コード例 #17
0
 def whoosh_index(self):
     """Rebuild the whoosh index for every page in the notes tree."""
     # Iterate over every item in the tree widget.
     it = QtWidgets.QTreeWidgetItemIterator(
         self.notesTree, QtWidgets.QTreeWidgetItemIterator.All)
     print("Starting complete indexing.")
     #writer = self.ix.writer()
     writer = AsyncWriter(self.ix)
     while it.value():
         treeItem = it.value()
         name = self.notesTree.itemToPage(treeItem)
         # Page file path, normalised to forward slashes.
         path = os.path.join(self.notesTree.pageToFile(name)).replace(os.sep, '/')
         print(path)
         fileobj = open(path, 'r', encoding='utf-8')
         content = fileobj.read()
         fileobj.close()
         if METADATA_CHECKER.match(content) and 'meta' in self.settings.extensions:
             # Page has a metadata block: strip it from the indexed body and
             # run the markdown converter so Meta gets populated with tags.
             no_metadata_content = METADATA_CHECKER.sub("", content, count=1).lstrip()
             self.settings.md.reset().convert(content)
             writer.update_document(
                 path=name, title=parseTitle(content, name), content=no_metadata_content,
                 tags=','.join(self.settings.md.Meta.get('tags', [])).strip())
         else:
             # NOTE(review): this branch uses add_document while the one above
             # uses update_document — confirm duplicates cannot occur here.
             writer.add_document(path=name, title=parseTitle(content, name), content=content, tags='')
        
         it += 1
     writer.commit()
     print("Finished completely reindexing.")
コード例 #18
0
def incremental_index(indexdir, indexname, rowData):
    """Add one row to an existing index.

    Note: every add produces a new segment file, so disk usage grows over
    time (the original comment warned about this too).

    :param rowData: mapping of field name -> value for a single row
    :param indexdir: directory holding the index storage
    :param indexname: name of the index inside the storage
    :return: None
    """
    storage = FileStorage(indexdir)
    ix = FileIndex(storage, indexname=indexname)

    writer = AsyncWriter(ix)

    # Bug fix: the original assembled Python source text and exec()'d it,
    # which was a code-injection risk and broke on values containing quotes;
    # it also had to SQL-escape values just to survive the string splicing.
    # Passing keyword arguments directly removes both problems.
    fields = {}
    for key in rowData:
        val = rowData[key]
        if not val:
            val = ""
        elif isinstance(val, (Decimal, )):
            val = str(val)
        else:
            val = str(val)
        fields[key] = val
    writer.add_document(**fields)

    writer.commit()
コード例 #19
0
ファイル: search.py プロジェクト: jvalleroy/weblate
def update_index_unit(unit):
    """Add a single unit to the source index and, if translated, the target index."""
    # Offloaded mode: queue the update for background processing instead.
    if settings.OFFLOAD_INDEXING:
        add_index_update(unit.id, False, unit.translation.language.code)
        return

    # Source side.
    with AsyncWriter(get_source_index()) as writer:
        update_source_unit_index(writer, unit)

    # Target side, only when a translation exists.
    if unit.target:
        target_index = get_target_index(unit.translation.language.code)
        with AsyncWriter(target_index) as writer:
            update_target_unit_index(writer, unit)
コード例 #20
0
def index_app(path):
    """Index a single app zip file, always closing the index afterwards."""
    print("started indexing file " + path)
    ix = get_index()
    try:
        writer = AsyncWriter(ix)
        process_appzip(writer, path)
        writer.commit()
    finally:
        # Bug fix: the original leaked the index if process_appzip or
        # commit raised; close it unconditionally.
        ix.close()
    print("indexing completed " + path)
コード例 #21
0
def index_reindex(progress_bar_disable=False):
    """Recreate the search index and re-add every document."""
    ix = index.open_index(recreate=True)

    with AsyncWriter(ix) as writer:
        for doc in tqdm.tqdm(Document.objects.all(),
                             disable=progress_bar_disable):
            index.update_document(writer, doc)
コード例 #22
0
 def add_item(self, item):
     """Write (or replace) a single item's document in the model index."""
     document = self._create_document(self.model, item)
     writer = AsyncWriter(self.model_index, writerargs=self._writer_args())
     writer.update_document(**document)
     writer.commit()
     self._close_model_index()
コード例 #23
0
ファイル: tasks.py プロジェクト: ybotmallah/paperless-ng
def index_reindex(progress_bar_disable=False):
    """Recreate the search index from scratch and re-add all documents.

    :param progress_bar_disable: suppress the tqdm progress bar.  New
        keyword with a default, so existing zero-argument callers are
        unaffected; matches the sibling implementation elsewhere.
    """
    documents = Document.objects.all()

    ix = index.open_index(recreate=True)

    with AsyncWriter(ix) as writer:
        for document in tqdm.tqdm(documents, disable=progress_bar_disable):
            index.update_document(writer, document)
コード例 #24
0
 def reindex(self, instance: Model):
     """Update an entry in the index. Non-blocking.

     :param instance: instance of ``self.model`` that needs reindexing
         (because it changed or was added)
     """
     # Removed the duplicate docstring that followed: it was a stray
     # no-op string-literal statement, not documentation.
     with AsyncWriter(self.index) as writer:
         writer.update_document(**self._extract_search_fields(instance))
コード例 #25
0
ファイル: search.py プロジェクト: sunner/weblate
 def delete_search_unit(self, pk, lang):
     """Best-effort removal of a unit from the source and target indexes."""
     try:
         for idx in (self.get_source_index(), self.get_target_index(lang)):
             with AsyncWriter(idx) as writer:
                 writer.delete_by_term('pk', pk)
     except IOError:
         # Index missing or unreadable: silently skip.
         return
コード例 #26
0
    def update(self, index, iterable, commit=True):
        """Index each object in *iterable* via *index*'s field preparation.

        Failures on individual objects are logged (or re-raised when
        ``silently_fail`` is False) and the writer is re-created so the
        remaining objects can still be indexed.
        """
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)

        for obj in iterable:
            doc = index.full_prepare(obj)

            # Really make sure it's unicode, because Whoosh won't have it any
            # other way.
            for key in doc:
                doc[key] = self._from_python(doc[key])

            # Document boosts aren't supported in Whoosh 2.5.0+.
            if 'boost' in doc:
                del doc['boost']

            try:
                writer.update_document(**doc)
            except Exception as e:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error(u"%s while preparing object for update" %
                               e.__class__.__name__,
                               exc_info=True,
                               extra={
                                   "data": {
                                       "index": index,
                                       "object": get_identifier(obj)
                                   }
                               })

                # reset the writer so there is no 'start_doc' error from the
                # previous failed update attempt
                writer = AsyncWriter(self.index)

        if len(iterable) > 0:
            # For now, commit no matter what, as we run into locking issues otherwise.
            writer.commit()
コード例 #27
0
ファイル: shed_index.py プロジェクト: msauria/galaxy
def build_index(whoosh_index_dir, file_path, hgweb_config_dir, dburi, **kwargs):
    """
    Build two search indexes simultaneously
    One is for repositories and the other for tools.

    Returns a tuple with number of repos and tools that were indexed.
    """
    model = ts_mapping.init(file_path, dburi, engine_options={}, create_tables=False)
    sa_session = model.session
    repo_index, tool_index = _get_or_create_index(whoosh_index_dir)

    # Separate async writers for the two indexes; both committed at the end.
    repo_index_writer = AsyncWriter(repo_index)
    tool_index_writer = AsyncWriter(tool_index)
    repos_indexed = 0
    tools_indexed = 0

    execution_timer = ExecutionTimer()
    with repo_index.searcher() as searcher:
        for repo in get_repos(sa_session, file_path, hgweb_config_dir, **kwargs):
            tools_list = repo.pop('tools_list')
            repo_id = repo['id']
            # Check whether this repo is already present in the index.
            indexed_document = searcher.document(id=repo_id)
            if indexed_document:
                if indexed_document['full_last_updated'] == repo.get('full_last_updated'):
                    # We're done, since we sorted repos by update time
                    break
                else:
                    # Got an update, delete the previous document
                    repo_index_writer.delete_by_term('id', repo_id)

            repo_index_writer.add_document(**repo)

            #  Tools get their own index
            for tool in tools_list:
                tool_index_writer.add_document(**tool)
                tools_indexed += 1

            repos_indexed += 1

    # Commit both writers only after the full pass completes.
    tool_index_writer.commit()
    repo_index_writer.commit()

    log.info("Indexed repos: %s, tools: %s", repos_indexed, tools_indexed)
    log.info("Toolbox index finished %s", execution_timer)
    return repos_indexed, tools_indexed
コード例 #28
0
ファイル: whoosh.py プロジェクト: Polyconseil/dokang
 def delete_documents(self, doc_set, paths):
     """Delete documents from the index."""
     writer = AsyncWriter(open_dir(self.index_path))
     # Match documents in the given set whose path is any of *paths*.
     path_terms = [Term('path', p) for p in paths]
     writer.delete_by_query(And([Term('set', doc_set), Or(path_terms)]))
     writer.commit()
コード例 #29
0
def add_spam(post):
    """Add a single post to the spam index."""
    writer = AsyncWriter(init_spam_index())
    add_post_to_index(post=post, writer=writer)
    writer.commit()
    logger.info("Added spam to index.")
    return
コード例 #30
0
    def update(self, index, document, **options):
        """Adapt *document* for *index* and upsert it into whoosh storage."""
        resolved = base._resolve_index(index)

        ix = self._storage.open_index(indexname=resolved.get_name())
        writer = AsyncWriter(ix)

        writer.update_document(**resolved.adapt_document(document))
        writer.commit()