Code Example #1
File: whoosh.py Project: pombredanne/dokang
 def delete_documents(self, doc_set, paths):
     """Delete documents from the index."""
     index = open_dir(self.index_path)
     writer = AsyncWriter(index)
     query = And([
         Term('set', doc_set),
         Or([Term('path', path) for path in paths])
     ])
     writer.delete_by_query(query)
     writer.commit()
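A note on the pattern shared by the examples on this page: AsyncWriter wraps index.writer(), and if the index is locked by another writer it buffers the operations and applies them from a background thread once the lock is released. A minimal sketch of the common flow (the index path and field names are hypothetical, not taken from any project above):

from whoosh.index import open_dir
from whoosh.writing import AsyncWriter

ix = open_dir("indexdir")  # an existing index; "indexdir" is a hypothetical path
writer = AsyncWriter(ix)
writer.add_document(path=u"/a", content=u"hello world")  # assumed schema fields
writer.commit()  # still required; AsyncWriter defers the lock wait, not the commit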
Code Example #2
def add():
    d = request.get_json(force=True)
    url = d.get("url")
    content = d.get("content")
    if not url or not content:
        return jsonify({"status": "missing parameters"})
    if urlparse.urlparse(url).netloc.startswith("localhost"):
        return jsonify({"status": "ignored"})
    ix = get_index()
    writer = AsyncWriter(ix)
    soup = BeautifulSoup(content, "html.parser")  # specify a parser explicitly to avoid bs4's warning
    # kill all script and style elements
    for script in soup(["script", "style"]):
        script.extract()    # rip it out
    # get text
    text = soup.get_text()
    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)

    writer.update_document(title=d.get("title", "Untitled"),
        url=url,
        content=text,
        modified=datetime.datetime.now())
    writer.commit()
    return jsonify({"status": "ok"})
Code Example #3
File: whoosh_backend.py Project: stden/bonushouse
    def update(self, index, iterable, commit=True):
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)

        for obj in iterable:
            doc = index.full_prepare(obj)

            # Really make sure it's unicode, because Whoosh won't have it any
            # other way.
            for key in doc:
                doc[key] = self._from_python(doc[key])

            try:
                writer.update_document(**doc)
            except Exception as e:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error(u"%s while preparing object for update" % e, exc_info=True, extra={
                    "data": {
                        "index": index,
                        "object": get_identifier(obj)
                    }
                })
Code Example #4
File: server.py Project: DSrcl/stunning-octo-waffle
def store_page(user, url):
    writer = AsyncWriter(idx)
    resp = requests.get(url)
    content = parse(resp.content)
    now = datetime.now()
    writer.add_document(ts=now, user=unicode(user), url=unicode(url), content=content)
    writer.commit()
Code Example #5
 def update(self, index, iterable, commit=True):
     if not self.setup_complete:
         self.setup()
     
     self.index = self.index.refresh()
     writer = AsyncWriter(self.index)
     
     for obj in iterable:
         doc = index.full_prepare(obj)
         
         # Really make sure it's unicode, because Whoosh won't have it any
         # other way.
         for key in doc:
             doc[key] = self._from_python(doc[key])
         
         writer.update_document(**doc)
     
     if len(iterable) > 0:
         # For now, commit no matter what, as we run into locking issues otherwise.
         writer.commit()
         
         # If spelling support is desired, add to the dictionary.
         if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
             sp = SpellChecker(self.storage)
             sp.add_field(self.index, self.content_field_name)
Code Example #6
def incremental_index(t, l, c, dirname):
    id = (Searcher().getcount() + 1)
    ix = index.open_dir(dirname)
    # The set of all paths in the index
    #with ix.searcher() as searcher:

    indexed_feeds = set()

    with ix.searcher() as searcher:
        writer = AsyncWriter(ix)

        # Loop over the stored fields in the index
        for fields in searcher.all_stored_fields():
            indexed_feed = fields['title']
            indexed_feeds.add(indexed_feed)

        # Loop over the files in the filesystem
        # Assume we have a function that gathers the filenames of the
        # documents to be indexed
        if t not in indexed_feeds:
            # This is either a file that's changed, or a new file
            # that wasn't indexed before. So index it!
            wooshDocuments(id, writer, t, l, c)

        writer.commit()
        return id
Code Example #7
File: mikitree.py Project: kekh/mikidown
    def delPage(self, item):

        index = item.childCount()
        while index > 0:
            index = index - 1
            self.dirname = item.child(index).text(0)
            self.delPage(item.child(index))

        # remove attachment folder
        attDir = self.itemToAttachmentDir(item)
        for info in QtCore.QDir(attDir).entryInfoList():
            QtCore.QDir().remove(info.absoluteFilePath())
        QtCore.QDir().rmdir(attDir)

        pagePath = self.itemToPage(item)
        self.ix = open_dir(self.settings.indexdir)
        query = QueryParser("path", self.ix.schema).parse(pagePath)
        # writer = self.ix.writer()
        writer = AsyncWriter(self.ix)
        n = writer.delete_by_query(query)
        # n = writer.delete_by_term('path', pagePath)
        writer.commit()
        # self.ix.close()
        b = QtCore.QDir(self.notePath).remove(self.pageToFile(pagePath))
        parent = item.parent()
        if parent is not None:
            parentPage = self.itemToPage(parent)  # resolve the parent page only when a parent exists
            index = parent.indexOfChild(item)
            parent.takeChild(index)
            if parent.childCount() == 0:  # if no child, dir not needed
                QtCore.QDir(self.notePath).rmdir(parentPage)
        else:
            index = self.indexOfTopLevelItem(item)
            self.takeTopLevelItem(index)
        QtCore.QDir(self.notePath).rmdir(pagePath)
Code Example #8
File: base.py Project: niwinz/needlestack
    def update(self, index, document, **options):
        index = base._resolve_index(index)

        ix = self._storage.open_index(indexname=index.get_name())
        writer = AsyncWriter(ix)

        adapted_document = index.adapt_document(document)
        writer.update_document(**adapted_document)
        writer.commit()
Code Example #9
File: base.py Project: niwinz/needlestack
    def update_bulk(self, index, documents):
        index = base._resolve_index(index)

        ix = self._storage.open_index(indexname=index.get_name())
        writer = AsyncWriter(ix)

        adapted_documents = (index.adapt_document(doc)
                                for doc in documents)
        for doc in adapted_documents:
            writer.update_document(**doc)

        writer.commit()
Code Example #10
 def addLink(self, url, title, summary, txt):
     
     titleb = title + " "
     title10 = titleb * 10  # repeat the title 10x, presumably to weight it more heavily in the index
     sumario = summary + " "
     sumario2 = sumario * 2  # repeat the summary 2x for the same reason
     text = title10 + sumario2 + " " + txt
     
     ix = open_dir(self.indexDir, indexname='MAIN', readonly=False)
     writer = AsyncWriter(ix)
     writer.add_document(id=url, content=unicode(text)) 
     writer.commit()
     ix.close()
Code Example #11
File: tasks.py Project: BiaoLiu/videoSpider
def whoosh_task(ids, pool_number, ix, model_class):
    session = sqla['session']

    writer = AsyncWriter(ix)
    for id_ in ids:
        obj = session.query(model_class).filter_by(id=id_).one()
        if obj.title is None or obj.summary is None:
            continue

        writer.add_document(
            title=obj.title,
            summary=obj.summary
        )

    writer.commit()
Code Example #12
File: mikiwindow.py Project: OSUser/mikidown
 def whoosh_index(self):
     it = QTreeWidgetItemIterator(
         self.notesTree, QTreeWidgetItemIterator.All)
     print("Starting complete indexing.")
     #writer = self.ix.writer()
     writer = AsyncWriter(self.ix)
     while it.value():
         treeItem = it.value()
         name = self.notesTree.itemToPage(treeItem)
         path = os.path.join(self.notesTree.pageToFile(name)).replace(os.sep, '/')
         print(path)
         fileobj = open(path, 'r', encoding='utf-8')
         content = fileobj.read()
         fileobj.close()
         if METADATA_CHECKER.match(content) and 'meta' in self.settings.extensions:
             no_metadata_content = METADATA_CHECKER.sub("", content, count=1).lstrip()
             self.settings.md.reset().convert(content)
             writer.update_document(
                 path=name, title=parseTitle(content, name), content=no_metadata_content,
                 tags=','.join(self.settings.md.Meta.get('tags', [])).strip())
         else:
             writer.add_document(path=name, title=parseTitle(content, name), content=content, tags='')
        
         it += 1
     writer.commit()
     print("Finished completely reindexing.")
Code Example #13
File: whoosh.py Project: pombredanne/dokang
 def index_documents(self, documents):
     """Add or update documents in the index."""
     index = open_dir(self.index_path)
     writer = AsyncWriter(index)
     needs_commit = False
     for document in documents:
         needs_commit = True
         writer.update_document(
             uid=':'.join((document['set'], document['path'])),
             path=document['path'],
             set=document['set'],
             hash=document['hash'],
             title=document['title'],
             content=document['content'],
             kind=document['kind'],
         )
     if needs_commit:
         writer.commit()
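update_document(), as used here, depends on the schema: Whoosh deletes any existing document whose values in fields declared unique=True match the incoming document, then adds the new version. That is presumably why the dokang schema keys documents on uid. A sketch of a compatible schema (field types are assumptions, not taken from the project):

from whoosh.fields import Schema, ID, TEXT

schema = Schema(
    uid=ID(unique=True, stored=True),  # update_document() matches on unique fields like this one
    path=ID(stored=True),
    set=ID(stored=True),
    hash=ID(stored=True),
    title=TEXT(stored=True),
    content=TEXT,
    kind=ID(stored=True),
)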
Code Example #14
File: service.py Project: abilian/abilian-core
    def clear(self):
        """Remove all content from indexes, and unregister all classes.

        After clear() the service is stopped. It must be started again
        to create new indexes and register classes.
        """
        logger.info("Resetting indexes")
        state = self.app_state

        for _name, idx in state.indexes.items():
            writer = AsyncWriter(idx)
            writer.commit(merge=True, optimize=True, mergetype=CLEAR)

        state.indexes.clear()
        state.indexed_classes.clear()
        state.indexed_fqcn.clear()
        self.clear_update_queue()

        if self.running:
            self.stop()
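The mergetype=CLEAR argument (importable from whoosh.writing) makes the commit drop every existing segment instead of merging them, which empties the index in a single step; that is what lets clear() reset each index here. A standalone sketch, assuming idx is an open Whoosh index:

from whoosh.writing import AsyncWriter, CLEAR

writer = AsyncWriter(idx)
writer.commit(mergetype=CLEAR)  # discards all indexed documents; the schema and directory remain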
Code Example #15
    def createIndex(self):
        print "    Whoosh Loading from SQL "      
        created = self.createIndexDirIfNotExist()
        if not created:
            #already exists
            return
        
        conn = sqlite3.connect(self.dbName)
        c = conn.cursor()
        c.execute('''SELECT * FROM newsStorage where ARTICLE <> "" ''')
        feeds = c.fetchall()
        conn.close()
        
        linkN = 1
        schema = Schema(id = TEXT(stored = True), content=TEXT)
        ix = create_in(self.indexDir, schema, indexname='MAIN')
        writer = AsyncWriter(ix)

        for feed in feeds:
            
            # Discard links without a title
            if feed[3] is None:
                #print "is Null"
                continue
            
            index = feed[0]
            # print "    Whoosh Loaded Titulo " + str(linkN) + ":" + feed[3]
            linkN += 1
            
            titolo = feed[3] + " "
            titolo10 = titolo * 10  # repeat the title 10x, presumably to weight it more heavily
            sumario = feed[4] + " "
            sumario2 = sumario * 2
            text = titolo10 + sumario2 + " " + feed[5]
            
            writer.add_document(id=index, content=unicode(text))
            
            
        writer.commit()
        ix.close()   
        print "    Done Loading from SQL"
Code Example #16
File: whoosh_cn_backend.py Project: caitinggui/Blog
    def update(self, index, iterable, commit=True):
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)

        for obj in iterable:
            try:
                doc = index.full_prepare(obj)
            except SkipDocument:
                self.log.debug(u"Indexing for object `%s` skipped", obj)
            else:
                # Really make sure it's unicode, because Whoosh won't have it any
                # other way.
                for key in doc:
                    doc[key] = self._from_python(doc[key])

                # Document boosts aren't supported in Whoosh 2.5.0+.
                if 'boost' in doc:
                    del doc['boost']

                try:
                    writer.update_document(**doc)
                except Exception as e:
                    if not self.silently_fail:
                        raise

                    # We'll log the object identifier but won't include the actual object
                    # to avoid the possibility of that generating encoding errors while
                    # processing the log message:
                    self.log.error(u"%s while preparing object for update" % e.__class__.__name__, exc_info=True, extra={
                        "data": {
                            "index": index,
                            "object": get_identifier(obj)
                        }
                    })

        if len(iterable) > 0:
            # For now, commit no matter what, as we run into locking issues otherwise.
            writer.commit()
Code Example #17
File: mikitree.py Project: albfan/mikidown
    def newPageCore(self, item, newPageName):
        pagePath = os.path.join(self.notePath, self.itemToPage(item)).replace(os.sep, '/')
        if not newPageName:
            dialog = LineEditDialog(pagePath, self)
            if dialog.exec_():
                newPageName = dialog.editor.text()
        if newPageName:
            if hasattr(item, 'text'):
                pagePath = os.path.join(self.notePath,
                                        pagePath + '/').replace(os.sep, '/')
            if not QDir(pagePath).exists():
                QDir(self.notePath).mkdir(pagePath)
            fileName = pagePath + newPageName + self.settings.fileExt
            fh = QFile(fileName)
            fh.open(QIODevice.WriteOnly)
            savestream = QTextStream(fh)
            savestream << '# ' + newPageName + '\n'
            savestream << 'Created ' + str(datetime.date.today()) + '\n\n'
            fh.close()
            QTreeWidgetItem(item, [newPageName])
            newItem = self.pageToItem(pagePath + newPageName)
            self.sortItems(0, Qt.AscendingOrder)
            self.setCurrentItem(newItem)
            if hasattr(item, 'text'):
                self.expandItem(item)

            # create attachment folder if not exist
            attDir = self.itemToAttachmentDir(newItem)
            if not QDir(attDir).exists():
                QDir().mkpath(attDir)

            # TODO improvement needed, can be reused somehow
            fileobj = open(fileName, 'r')
            content = fileobj.read()
            fileobj.close()
            self.ix = open_dir(self.settings.indexdir)
            #writer = self.ix.writer()
            writer = AsyncWriter(self.ix)
            writer.add_document(path=pagePath+newPageName, content=content)
            writer.commit()
Code Example #18
 def update(self, index, iterable, commit=True):
     if not self.setup_complete:
         self.setup()
     
     self.index = self.index.refresh()
     writer = AsyncWriter(self.index)
     
     for obj in iterable:
         doc = index.full_prepare(obj)
         
         # Really make sure it's unicode, because Whoosh won't have it any
         # other way.
         for key in doc:
             doc[key] = self._from_python(doc[key])
         
         try:
             writer.update_document(**doc)
         except Exception, e:
             if not self.silently_fail:
                 raise
             
             self.log.error("Failed to add documents to Whoosh: %s", e)
Code Example #19
File: mikiedit.py Project: albfan/mikidown
 def updateIndex(self):
     ''' Update the whoosh index, which costs significant computing resources '''
     page = self.parent.notesTree.currentPage()
     content = self.toPlainText()
     try:
         #writer = self.ix.writer()
         writer = AsyncWriter(self.ix)
         if METADATA_CHECKER.match(content) and 'meta' in self.settings.extensions:
             no_metadata_content = METADATA_CHECKER.sub("", content, count=1).lstrip()
             self.settings.md.reset().convert(content)
             writer.update_document(
                 path=page, title=parseTitle(content, page), content=no_metadata_content,
                 tags=','.join(self.settings.md.Meta.get('tags', [])).strip())
             writer.commit()
         else:
             writer.update_document(
                 path=page, title=parseTitle(content, page), content=content, tags='')
             writer.commit()
     except Exception:
         print("Whoosh commit failed.")
Code Example #20
    def update(self, index, iterable, commit=True):
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)

        for obj in iterable:
            try:
                doc = index.full_prepare(obj)
            except SkipDocument:
                self.log.debug(u"Indexing for object `%s` skipped", obj)
            else:
                # Really make sure it's unicode, because Whoosh won't have it any
                # other way.
                for key in doc:
                    doc[key] = self._from_python(doc[key])

                # Document boosts aren't supported in Whoosh 2.5.0+.
                if 'boost' in doc:
                    del doc['boost']

                try:
                    writer.update_document(**doc)
                except Exception as e:
                    if not self.silently_fail:
                        raise

                    # We'll log the object identifier but won't include the actual object
                    # to avoid the possibility of that generating encoding errors while
                    # processing the log message:
                    self.log.error(u"%s while preparing object for update" %
                                   e.__class__.__name__,
                                   exc_info=True,
                                   extra={
                                       "data": {
                                           "index": index,
                                           "object": get_identifier(obj)
                                       }
                                   })

        if len(iterable) > 0:
            # For now, commit no matter what, as we run into locking issues otherwise.
            writer.commit()
Code Example #21
def handle_document(document_id):
    document = Document.objects.get(id=document_id)

    mime_type = document.mime_type

    parser_class = get_parser_class_for_mime_type(mime_type)

    parser = parser_class(logging_group=uuid.uuid4())

    try:
        parser.parse(document.source_path, mime_type)

        if parser.get_archive_path():
            with transaction.atomic():
                with open(parser.get_archive_path(), 'rb') as f:
                    checksum = hashlib.md5(f.read()).hexdigest()
                # i'm going to save first so that in case the file move
                # fails, the database is rolled back.
                # we also don't use save() since that triggers the filehandling
                # logic, and we don't want that yet (file not yet in place)
                Document.objects.filter(pk=document.pk).update(
                    archive_checksum=checksum,
                    content=parser.get_text()
                )
                with FileLock(settings.MEDIA_LOCK):
                    create_source_path_directory(document.archive_path)
                    shutil.move(parser.get_archive_path(),
                                document.archive_path)

        with AsyncWriter(index.open_index()) as writer:
            index.update_document(writer, document)

    except Exception as e:
        logger.error(f"Error while parsing document {document}: {str(e)}")
    finally:
        parser.cleanup()
Code Example #22
class SearchPipeline(object):
    cleanup = False

    def open_spider(self, spider):
        """ When opening spider, open or create index. """

        index_dir = os.path.expanduser('~/.sitesearcher/index')
        if not os.path.exists(index_dir):
            os.makedirs(index_dir)

        self.indexname = spider.allowed_domains[0]
        if index.exists_in(index_dir, indexname=self.indexname):
            self.index = index.open_dir(index_dir, indexname=self.indexname)
        else:
            self.index = index.create_in(
                index_dir,
                indexname=self.indexname,
                schema=schema,
            )
        self.writer = AsyncWriter(self.index)

    def process_item(self, item, spider):
        """ Add crawled item to index.

        Add items using ``update_document`` to delete any previously indexed
        versions and avoid duplicates
        """

        self.writer.update_document(
            url=item.get('url'), content=item.get('content'))
        return item  # return the item so any later pipelines still receive it

    def close_spider(self, spider):
        """ Close index writer on closing of spider an clean up.

        On closing, delete any previously indexed items that have not been
        updated in this crawl, as these are obviously no longer reachable sites.
        """

        with self.index.searcher() as searcher:
            for page in searcher.all_stored_fields():
                if page['url'] not in spider.state['update_list']:
                    self.writer.delete_by_term('url', page['url'])
        self.writer.commit()
Code Example #23
def creating_searching_ranking(selected_analyzer, name_of_file,
                               scoring_function, path):
    """
    Method that creates schema and stores index file based on the retrieved 'csv_test.csv' file  
    input:  
        selected_analyzer - selected text analyzer from the whoosh library
        name_of_file - name of .csv file stored from dataframe variable 'files_text'
        scoring_function - selected scoring function from the whoosh library
        path - path where index files are stored
    """
    # creating Schema with fields id, title and content
    schema = Schema(id=ID(stored=True),
                    title=TEXT(stored=False, analyzer=selected_analyzer),
                    content=TEXT(stored=False, analyzer=selected_analyzer))
    directory_containing_the_index = path
    ix = create_in(directory_containing_the_index, schema)  # creating the index based on the schema in the directory at 'path'
    ix = index.open_dir(directory_containing_the_index)  # opening the index file
    writer = AsyncWriter(ix)  # writer will be used to add content to the fields

    ALL_DOCUMENTS_file_name = name_of_file  # path to the file
    in_file = open(ALL_DOCUMENTS_file_name, "r", encoding='latin1')
    csv_reader = csv.reader(in_file, delimiter=',')  # reading the file
    next(csv_reader)  # skip the header: the first line contains the name of each field
    for record in csv_reader:  # for each row in the 'csv_test' file
        id = record[1]  # read id
        title = record[2]  # read title
        content = record[3]  # read body
        writer.add_document(id=id, content=title + ' ' + content)

    writer.commit()
    in_file.close()  # finished writing to the index file
Code Example #24
    def delete(self, name, purge=True):
        """
        Delete a document by its name. name is actually a hash. If purge is true, file is also
        removed from the boxes.
        """
        # Grab a writer on the index
        writer = AsyncWriter(self.index)

        # Delete and commit from index
        writer.delete_by_term(u'hash', name)
        writer.commit()

        # Delete the document from the boxes if we want to purge them
        if not purge:
            return

        # We need to remove the doc if the box is writable
        for box in self.boxes:
            if box.haskey(name) and not box.readonly:
                del (box[name])
Code Example #25
File: mikitree.py Project: rockray/mikidown
    def newPageCore(self, item, newPageName):
        pagePath = os.path.join(self.notePath,
                                self.itemToPage(item)).replace(os.sep, '/')
        if not newPageName:
            dialog = LineEditDialog(pagePath, self)
            if dialog.exec_():
                newPageName = dialog.editor.text()
        if newPageName:
            if hasattr(item, 'text'):
                pagePath = os.path.join(self.notePath,
                                        pagePath + '/').replace(os.sep, '/')
            if not QDir(pagePath).exists():
                QDir(self.notePath).mkdir(pagePath)
            fileName = pagePath + newPageName + self.settings.fileExt
            fh = QFile(fileName)
            fh.open(QIODevice.WriteOnly)
            savestream = QTextStream(fh)
            savestream << '# ' + newPageName + '\n'
            savestream << 'Created ' + str(datetime.date.today()) + '\n\n'
            fh.close()
            QTreeWidgetItem(item, [newPageName])
            newItem = self.pageToItem(pagePath + newPageName)
            self.sortItems(0, Qt.AscendingOrder)
            self.setCurrentItem(newItem)
            if hasattr(item, 'text'):
                self.expandItem(item)

            # create attachment folder if not exist
            attDir = self.itemToAttachmentDir(newItem)
            if not QDir(attDir).exists():
                QDir().mkpath(attDir)

            # TODO improvement needed, can be reused somehow
            fileobj = open(fileName, 'r')
            content = fileobj.read()
            fileobj.close()
            self.ix = open_dir(self.settings.indexdir)
            #writer = self.ix.writer()
            writer = AsyncWriter(self.ix)
            writer.add_document(path=pagePath + newPageName, content=content)
            writer.commit()
Code Example #26
    def createIndex(self):
        print "    Whoosh Loading from SQL "
        created = self.createIndexDirIfNotExist()
        if not created:
            #already exists
            return

        conn = sqlite3.connect(self.dbName)
        c = conn.cursor()
        c.execute('''SELECT * FROM newsStorage where ARTICLE <> "" ''')
        feeds = c.fetchall()
        conn.close()

        linkN = 1
        schema = Schema(id=TEXT(stored=True), content=TEXT)
        ix = create_in(self.indexDir, schema, indexname='MAIN')
        writer = AsyncWriter(ix)

        for feed in feeds:

            # Discard links without a title
            if feed[3] is None:
                #print "is Null"
                continue

            index = feed[0]
            # print "    Whoosh Loaded Titulo " + str(linkN) + ":" + feed[3]
            linkN += 1

            titolo = feed[3] + " "
            titolo10 = titolo * 10  # repeat the title 10x, presumably to weight it more heavily
            sumario = feed[4] + " "
            sumario2 = sumario * 2
            text = titolo10 + sumario2 + " " + feed[5]

            writer.add_document(id=index, content=unicode(text))

        writer.commit()
        ix.close()
        print "    Done Loading from SQL"
Code Example #27
    def update(self, index, iterable, commit=True):
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        write = AsyncWriter(self.index)

        for obj in iterable:
            try:
                doc = index.full_prepare(obj)
            except SkipDocument:
                self.log.debug(u"Indexing for object '%s' skipped", obj)
            else:
                for key in doc:
                    doc[key] = self._from_python(doc[key])

                if 'boost' in doc:
                    del doc['boost']

                try:
                    write.update_document(**doc)
                except Exception as e:
                    if not self.silently_fail:
                        raise

                    self.log.error(u"%s while preparing object for update" %
                                   e.__class__.__name__,
                                   exc_info=True,
                                   extra={
                                       'data': {
                                           'index': index,
                                           'object': get_identifier(obj)
                                       }
                                   })

        if len(iterable) > 0:
            write.commit()
Code Example #28
File: backend.py Project: tjwalch/wagtail-whoosh
    def add_items(self, model, objs):
        for obj in objs:
            obj._body_ = self.prepare_body(obj)

        self._delete_parent_model_data(model, objs)

        index = self.backend.index.refresh()
        writer = AsyncWriter(index)

        for obj in objs:
            doc = {
                ID: get_identifier(obj),
                DJANGO_CT: get_model_ct(obj),
                DJANGO_ID: force_text(obj.pk),
                'text': force_text(obj._body_),
            }

            try:
                writer.update_document(**doc)
            except Exception as e:
                raise e

        if len(objs) > 0:
            writer.commit()
Code Example #29
custom_stops=['rt','ht','mt','@','#','!',':',';',',','.',"'s","?","\\n",'http','https',"n't","&","\\",'...','-','"']
stops=list(set(default_stops+custom_stops))

#Set up schema fields
my_schema = Schema(id = ID(unique=True, stored=True),
                    text = TEXT(stored=True),
                    contains_retweet= BOOLEAN(stored=True),
                    screen_name = TEXT(stored=True),
                    keyword=KEYWORD(stored=True),
                    created=DATETIME(stored=True)
                    )


#Create index and AsyncWriter object
index = create_in("tweetindex", my_schema)
writer = AsyncWriter(index)

if __name__=='__main__':
    #Load raw data
    with open("WC2015_headers.csv",'rb') as to_load:
        data=csv.DictReader(to_load)
        for row in data:
            #Extract required information from date to create python datetime object
            date=row['created_at'][:19]+' '+row['created_at'][-4:]
            
            #Clean text and parse into keywords
            text=row['text'].replace('\\','')
            keywords=[word for word in word_tokenize(text) if word not in stops]
            
            #Check for Retweets
            rt=False
Code Example #30
File: index.py Project: magne4000/ftpvista
class Index(object):
    def __init__(self, directory, persist):
        self.log = logging.getLogger("ftpvista.index")

        self._persist = persist
        if not os.path.exists(directory):
            self.log.info("Creating the index in %s" % directory)
            os.mkdir(directory)
            self._idx = index.create_in(directory, schema=self.get_schema())
        else:
            self.log.info("Opening the index in %s" % directory)
            self._idx = index.open_dir(directory)

        self._searcher = self._idx.searcher()
        self._writer = None
        self.open_writer()
        self._last_optimization = None

    def open_writer(self):
        # self._writer = BufferedWriter(self._idx, 120, 4000)
        self._writer = AsyncWriter(self._idx)

    def get_schema(self):
        analyzer = StemmingAnalyzer("([a-zA-Z0-9])+")
        my_analyzer = analyzer | CharsetFilter(accent_map)
        return Schema(
            server_id=ID(stored=True),
            has_id=ID(),
            path=TEXT(analyzer=my_analyzer, stored=True),
            name=TEXT(analyzer=my_analyzer, stored=True),
            ext=TEXT(analyzer=my_analyzer, stored=True),
            size=ID(stored=True),
            mtime=ID(stored=True, sortable=True),
            audio_album=TEXT(analyzer=my_analyzer, stored=True),
            audio_artist=TEXT(analyzer=my_analyzer, stored=True),
            audio_title=TEXT(analyzer=my_analyzer, stored=True),
            audio_track=ID(stored=True),
            audio_year=ID(stored=True),
        )

    def delete_all_docs(self, server):
        self.open_writer()
        self._writer.delete_by_term("server_id", str(server.get_server_id()))
        self._writer.commit()
        self.log.info("All documents of server %s deleted" % server.get_ip_addr())

    def incremental_server_update(self, server_id, current_files):
        """Prepares to incrementaly update the documents for the given server.

        server_id      -- Id of the server to update.
        current_files  -- a list of (path, size, mtime) tuples for each files
                          currently on the server.

        Delete all the outdated files from the index and returns a list
        of files needing to be reindexed.
        """

        def delete_doc(writer, serverid, path):
            writer.delete_by_query(Term("server_id", str(serverid)) & Term("path", path))

        # Build a {path => (size, mtime)} mapping for quick lookups
        to_index = {}
        for path, size, mtime in current_files:
            to_index[path] = (size, mtime)

        results = self._searcher.documents(server_id=str(server_id))
        if results:
            for fields in results:
                indexed_path = fields["path"]

                if indexed_path not in to_index:
                    # This file was deleted from the server since it was indexed
                    delete_doc(self._writer, server_id, indexed_path)
                    self.log.debug("%s has been removed" % indexed_path)
                else:
                    size, mtime = to_index[indexed_path]
                    try:
                        if mtime > datetime.strptime(fields["mtime"], "%Y-%m-%d %H:%M:%S"):
                            # This file has been modified since it was indexed
                            delete_doc(self._writer, server_id, indexed_path)
                        else:
                            # up to date, no need to reindex
                            del to_index[indexed_path]
                    except ValueError:
                        delete_doc(self._writer, server_id, indexed_path)

        # return the remaining files
        return [(path, xsize, xmtime) for (path, (xsize, xmtime)) in to_index.items()]

    def add_document(
        self, server_id, name, path, size, mtime, audio_album=None, audio_artist=None, audio_title=None, audio_year=None
    ):
        """Add a document with the specified fields in the index.

        Changes need to be commited.

        """

        # passing the optional arguments is quite a mess
        # let's build a dict for that purpose

        _, ext = os.path.splitext(name)
        ext = ext.lstrip(".")

        kwargs = {
            "server_id": server_id,
            "name": name,
            "ext": ext,
            "path": path,
            "size": size,
            "mtime": mtime,
            "has_id": "a",
        }

        # Add the optional args
        if audio_album is not None:
            kwargs["audio_album"] = audio_album

        if audio_artist is not None:
            kwargs["audio_artist"] = audio_artist

        if audio_title is not None:
            kwargs["audio_title"] = audio_title

        if audio_year is not None:
            kwargs["audio_year"] = audio_year

        try:
            self._writer.add_document(**kwargs)
        except IndexingError:
            self.open_writer()
            self._writer.add_document(**kwargs)

    def commit(self, optimize=False):
        """ Commit the changes in the index and optimize it """
        self.log.info(" -- Begin of Commit -- ")
        try:
            self._writer.commit(optimize=optimize)
        except IndexingError:
            self.open_writer()
            self._writer.commit(optimize=optimize)
        self.log.info("Index commited")

        self._searcher = self._idx.searcher()
        self.log.info(" -- End of Commit -- ")

    def close(self):
        """Close the index."""
        self.log.info(" -- Closing writer and index -- ")
        # self._writer.close()
        self._idx.close()
Code Example #31
 def add(self, note):
     writer = AsyncWriter(self.index)
     writer.add_document(note_id=note.id, notebook_id=note.notebook_id, title=note.title, snippet=note.snippet)
     writer.commit()
Code Example #32
File: __init__.py Project: sw00/pixelated-user-agent
 def index_mail(self, mail):
     with AsyncWriter(self._index) as writer:
         self._index_mail(writer, mail)
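AsyncWriter inherits IndexWriter's context-manager protocol, so the with block above commits automatically on a clean exit and cancels the pending writes if an exception escapes; no explicit commit() is needed. A minimal sketch, assuming ix is an open Whoosh index with these fields:

with AsyncWriter(ix) as writer:
    writer.update_document(id=u"42", text=u"hello")
# leaving the block calls writer.commit(); an exception triggers writer.cancel()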
Code Example #33
 def add_to_index(self, item_id, text):
     from whoosh.writing import AsyncWriter
     writer = AsyncWriter(self.ix)
     writer.update_document(id=item_id, text=text.lower())
     writer.commit()
Code Example #34
File: z_whoosh.py Project: ybenitezf/nstock
 def remove(self, item_id):
     from whoosh.writing import AsyncWriter
     writer = AsyncWriter(self.ix)
     writer.delete_by_term('id', item_id)
     writer.commit()
Code Example #35
 def get_writer(self):
     return AsyncWriter(self.index)
Code Example #36
def insert_docs(docs):
    ix = open_dir(whoosh_index)
    writer = AsyncWriter(ix)
    for doc in docs:
        writer.add_document(**doc)
    writer.commit()
Code Example #37
File: feeds.py Project: SimonCoopey/bottle-fever
                            'html'   : html,
                            'url'    : entry.link,
                            'tags'   : get_entry_tags(entry),
                            'when'   : when})
        
        if not len(entries):
            return

        log.debug("%s - %d entries in %fs" % (netloc, len(entries),time.time()-now))
        now = time.time()
        
        
        records = 0
        now = time.time()
        ix = open_dir(settings.index)
        writer = AsyncWriter(ix)

        for entry in entries:
            try:
                item = Item.get(guid = entry['guid'])
            except Item.DoesNotExist:
                item = Item.create(**entry)
            records += 1

            if len(entry['html']):
                soup = BeautifulSoup(entry['html'], settings.fetcher.parser)
                plaintext = ''.join(soup.find_all(text=True))
                writer.add_document(
                    id = item.id,
                    guid = unicode(item.guid),
                    title = entry['title'],
Code Example #38
    def load_all_dset_metadata(self, dsetname, create_index=False):
        """
            Loads into memory the metadata of a dataset. The metadata is read from a CSV file, which should
            have at least two columns:
             - filename: Paths to the images in the dataset, relative to the image data folder. For backward
                         compatibility '#filename' is also accepted
             - file_attributes: JSON string containing information about the file. The most important file
                                attributes are 'caption' and 'keywords'. The 'caption' field should be a short
                                string which will be used as the caption of the image in result lists. The
                                'keywords' field must contain a comma-separated list of keywords. Each keyword
                                can be used as the source for a search.
            If create_index is True, it builds a search index with the 'keywords' in the file_attributes.
            Arguments:
                dsetname: String corresponding to the dataset within the list of supported
                          datasets.
                create_index: Boolean indicating whether or not to build a search index
                              with the metadata
        """
        metaindex = None
        t = time.time()
        try:
            for afile in os.listdir(os.path.join(self.metadata_dir, dsetname)):
                if afile.endswith(".csv"):
                    metadata_file = os.path.join(self.metadata_dir, dsetname,
                                                 afile)
                    print('Found metadata file at', metadata_file)
                    if create_index:
                        metaindex = open_dir(self.index_dir)
                    with open(metadata_file, 'r') as fin:
                        reader = csv.DictReader(fin)
                        for row in reader:
                            id_field = None
                            if 'filename' in row.keys():
                                id_field = 'filename'
                            elif '#filename' in row.keys():
                                id_field = '#filename'
                            if id_field and 'file_attributes' in row.keys():
                                filename = row[id_field]
                                try:
                                    self.fname2meta[dsetname][
                                        filename] = json.loads(
                                            row['file_attributes'])
                                except Exception:
                                    self.fname2meta[dsetname][filename] = None
                                metadata = self.fname2meta[dsetname][filename]
                                keyword_list = None
                                if metadata and 'keywords' in metadata.keys():
                                    keyword_list = metadata['keywords']
                                if keyword_list and create_index:
                                    keyword_list_splitted = keyword_list.split(
                                        ',')
                                    writer = AsyncWriter(metaindex)
                                    for key in keyword_list_splitted:
                                        key = key.strip()
                                        # delete previous entry if found
                                        query = QueryParser(
                                            'key', metaindex.schema).parse(key)
                                        writer.delete_by_query(
                                            query, metaindex.searcher())
                                        # add document
                                        writer.add_document(
                                            key=str(key),
                                            dataset=str(dsetname))
                                    writer.commit()
                                if keyword_list:  # we would like to do this, even if the index is not created
                                    # register link keyword-file
                                    keyword_list_splitted = keyword_list.split(
                                        ',')
                                    for key in keyword_list_splitted:
                                        key = key.strip()
                                        if key in self.keyword2fname[
                                                dsetname].keys():
                                            self.keyword2fname[dsetname][
                                                key].append(filename)
                                        else:
                                            self.keyword2fname[dsetname][
                                                key] = [filename]
                            else:
                                raise Exception(
                                    '"filename" and/or "file_attributes" columns not found in '
                                    + afile +
                                    ' (are you missing the column names?). Metadata will not be available!'
                                )

                        print('Finished loading metadata for %s in %s' %
                              (dsetname, str(time.time() - t)))
                        self.is_all_metadata_loaded = True
                    break
        except Exception as e:
            print("load_all_dset_metadata Exception:" + str(e) + '\n')
Code Example #39
 def index_mail(self, mail):
     if mail is not None:
         with AsyncWriter(self._index) as writer:
             self._index_mail(writer, mail)
Code Example #40
 def remove(self, instance: Model):
     """Remove an entry from the index. Non-blocking.
     :param instance: instance of ``self.model`` to be removed from the index
     """
     with AsyncWriter(self.index) as writer:
         writer.delete_by_term(self.pk_name, getattr(instance, self.pk_name))
Code Example #41
        i = 0
        line = docs.readline()
        pbar = tqdm(total=3_213_835)
        while line != "":
            _, url, _, _ = line.split("\t")
            writer.update_document(url_text=url.replace(".", " "))
            line = docs.readline()
            i += 1
            pbar.update(1)
    writer.commit()
    exit(0)

ix.writer().commit(mergetype=writing.CLEAR)

print(f"Loading documents from {args.data}")
writers = [AsyncWriter(ix) for _ in range(args.threads)]
with open(args.data, "r", encoding="utf-8") as docs:
    i = 0
    line = docs.readline()
    pbar = tqdm(
        total=args.num_docs if args.num_docs is not None else 3_213_835)
    while line != "" and (args.num_docs is None or i < args.num_docs):
        docid, url, title, body = line.split("\t")
        writers[i % args.threads].add_document(docid=docid,
                                               url=url,
                                               title=title,
                                               body=body)
        line = docs.readline()
        i += 1
        pbar.update(1)
pbar.set_description("Committing...")
Code Example #42
File: database.py Project: gamecip/citetool-editor
 def add_to_fts(cls, content, title=None, id=None, source_hash=None, tags=None):
     ix = open_dir(LOCAL_FTS_INDEX)
     writer = AsyncWriter(ix)
     writer.add_document(content=content, title=title, id=id, source_hash=source_hash, tags=tags)
     writer.commit()
Code Example #43
def index_update(index, items):
    """
    :param:index: index name
    :param:items: list of (operation, full class name, primary key, data) tuples.
    """
    index_name = index
    index = service.app_state.indexes[index_name]
    adapted = service.adapted

    session = safe_session()
    updated = set()
    writer = AsyncWriter(index)
    try:
        for op, cls_name, pk, data in items:
            if pk is None:
                continue

            # always delete. Whoosh manual says that 'update' is actually delete + add
            # operation
            object_key = "{}:{}".format(cls_name, pk)
            writer.delete_by_term("object_key", object_key)

            adapter = adapted.get(cls_name)
            if not adapter:
                # FIXME: log to sentry?
                continue

            if object_key in updated:
                # don't add twice the same document in same transaction. The writer will
                # not delete previous records, ending in duplicate records for same
                # document.
                continue

            if op in ("new", "changed"):
                with session.begin(nested=True):
                    obj = adapter.retrieve(pk, _session=session, **data)

                if obj is None:
                    # deleted after task queued, but before task run
                    continue

                document = service.get_document(obj, adapter)
                try:
                    writer.add_document(**document)
                except ValueError:
                    # logger is here to give us more info in order to catch a weird bug
                    # that happens regularly on CI but is not reliably
                    # reproducible.
                    logger.error("writer.add_document(%r)", document, exc_info=True)
                    raise
                updated.add(object_key)
    except BaseException:
        writer.cancel()
        raise

    session.close()
    writer.commit()
    try:
        # async thread: wait for its termination
        writer.join()
    except RuntimeError:
        # happens when actual writer was already available: asyncwriter didn't need
        # to start a thread
        pass
Code Example #44
 def insert(self, link, title, document):
     writer = AsyncWriter(self.ix)
     writer.add_document(link=link, title=title, document=document + title)
     writer.commit()
Code Example #45
File: z_whoosh.py Project: ybenitezf/nstock
 def add_to_index(self, item_id, text):
     from whoosh.writing import AsyncWriter
     writer = AsyncWriter(self.ix)
     writer.update_document(id=item_id, text=text.lower())
     writer.commit()
Code Example #46
File: fulltext.py Project: wrestcody/Bookie
def get_writer():
    global WIX
    writer = AsyncWriter(WIX)
    return writer
Code Example #47
    t2 = time()
    print 'Years', '-'.join(years)

    print 'Restarting database...'
    #	db=create_engine(connection_string,pool_size=proc_num)
    engine = db.connect()
    metadata = MetaData(engine)
    inspector = inspect(engine)
    #os.system('mongod -f /etc/mongodb.conf --shutdown')
    #os.system('mongod -f /etc/mongodb.conf &')
    print 'Done!'
    print

    #client=MongoClient()
    #db=client['pubmed']
    writer = AsyncWriter(index)

    for year in years:
        #collection=db[year]
        #total=collection.count()
        table_name = 'pubmed_sent_' + year
        table = Table(table_name, metadata, autoload=True)
        statement = table.count()
        total = engine.execute(statement).fetchone()[0]
        print 'Doc count', str(total)
        print

        t = time()
        num = 0
        #for post in collection.find():
        statement = table.select()
Code Example #48
File: tasks.py Project: ybotmallah/paperless-ng
def index_optimize():
    ix = index.open_index()
    writer = AsyncWriter(ix)
    writer.commit(optimize=True)
Code Example #49
 def optimize(self):
     writer = AsyncWriter(self.index)
     writer.commit(optimize=True)
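Committing with optimize=True and no pending changes, as these last two examples do, is a pure maintenance step: it merges all of the index's segments into one, trading a one-off rewrite for faster searches afterwards. A sketch, assuming ix is an open Whoosh index:

from whoosh.writing import AsyncWriter

writer = AsyncWriter(ix)
writer.commit(optimize=True)  # merges every segment into one; adds or removes no documents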
Code Example #50
def build_index(whoosh_index_dir, file_path, hgweb_config_dir, dburi,
                **kwargs):
    """
    Build two search indexes simultaneously
    One is for repositories and the other for tools.

    Returns a tuple with number of repos and tools that were indexed.
    """
    model = ts_mapping.init(file_path,
                            dburi,
                            engine_options={},
                            create_tables=False)
    sa_session = model.context.current
    repo_index, tool_index = _get_or_create_index(whoosh_index_dir)

    repo_index_writer = AsyncWriter(repo_index)
    tool_index_writer = AsyncWriter(tool_index)
    repos_indexed = 0
    tools_indexed = 0

    execution_timer = ExecutionTimer()
    with repo_index.searcher() as searcher:
        for repo in get_repos(sa_session, file_path, hgweb_config_dir,
                              **kwargs):
            tools_list = repo.pop('tools_list')
            repo_id = repo['id']
            indexed_document = searcher.document(id=repo_id)
            if indexed_document:
                if indexed_document['full_last_updated'] == repo.get(
                        'full_last_updated'):
                    # We're done, since we sorted repos by update time
                    break
                else:
                    # Got an update, delete the previous document
                    repo_index_writer.delete_by_term('id', repo_id)

            repo_index_writer.add_document(**repo)

            #  Tools get their own index
            for tool in tools_list:
                tool_index_writer.add_document(**tool)
                tools_indexed += 1

            repos_indexed += 1

    tool_index_writer.commit()
    repo_index_writer.commit()

    log.info("Indexed repos: %s, tools: %s", repos_indexed, tools_indexed)
    log.info("Toolbox index finished %s", execution_timer)
    return repos_indexed, tools_indexed
Code Example #51
File: __init__.py Project: sw00/pixelated-user-agent
 def remove_from_index(self, mail_id):
     with AsyncWriter(self._index) as writer:
         writer.delete_by_term('ident', mail_id)
Code Example #52
 def get_writer(indexname=None, schema=None):
     return AsyncWriter(Index.get_index(indexname=indexname, schema=schema))
Code Example #53
 def search(self, query_string, notebook_id=None):
     with AsyncWriter(self.index).searcher() as searcher:
         query_parser = MultifieldParser(["title", "snippet"], schema=self.index.schema).parse(query_string)
         notebook_filter = query.Term("notebook_id", notebook_id) if notebook_id else None
         results = searcher.search(query_parser, filter=notebook_filter, limit=None)
         return [res['note_id'] for res in results]
Code Example #54
def get_writer(ix):
    writer = AsyncWriter(ix)
    # writer = ix.writer()
    return writer
Code Example #55
File: index.py Project: magne4000/ftpvista
 def open_writer(self):
     # self._writer = BufferedWriter(self._idx, 120, 4000)
     self._writer = AsyncWriter(self._idx)
Code Example #56
File: handler.py Project: tridentstream/mediaserver
 def get_writer(self, parent_path):
     logger.debug(f"Getting index writer for path:{parent_path}")
     return WhooshPathIndexer(AsyncWriter(self.ix), parent_path)
Code Example #57
File: service.py Project: abilian/abilian-core
def index_update(index, items):
    """
    :param:index: index name
    :param:items: list of (operation, full class name, primary key, data) tuples.
    """
    index_name = index
    index = service.app_state.indexes[index_name]
    adapted = service.adapted

    session = safe_session()
    updated = set()
    writer = AsyncWriter(index)
    try:
        for op, cls_name, pk, data in items:
            if pk is None:
                continue

            # always delete. Whoosh manual says that 'update' is actually delete + add
            # operation
            object_key = f"{cls_name}:{pk}"
            writer.delete_by_term("object_key", object_key)

            adapter = adapted.get(cls_name)
            if not adapter:
                # FIXME: log to sentry?
                continue

            if object_key in updated:
                # don't add twice the same document in same transaction. The writer will
                # not delete previous records, ending in duplicate records for same
                # document.
                continue

            if op in ("new", "changed"):
                with session.begin(nested=True):
                    obj = adapter.retrieve(pk, _session=session, **data)

                if obj is None:
                    # deleted after task queued, but before task run
                    continue

                document = service.get_document(obj, adapter)
                try:
                    writer.add_document(**document)
                except ValueError:
                    # logger is here to give us more info in order to catch a weird bug
                    # that happens regularly on CI but is not reliably
                    # reproducible.
                    logger.error("writer.add_document(%r)", document, exc_info=True)
                    raise
                updated.add(object_key)
    except Exception:
        writer.cancel()
        raise

    session.close()
    writer.commit()
    try:
        # async thread: wait for its termination
        writer.join()
    except RuntimeError:
        # happens when actual writer was already available: asyncwriter didn't need
        # to start a thread
        pass
Code Example #58
File: mikitree.py Project: kekh/mikidown
    def newPageCore(self, item, newPageName, useTemplate=False, templateTitle=None, templateBody=None):
        pagePath = os.path.join(self.notePath, self.itemToPage(item)).replace(os.sep, "/")
        if not newPageName:
            if useTemplate:
                dialog = mikitemplate.PickTemplateDialog(pagePath, self.settings, parent=self)
                if dialog.exec_():
                    curTitleIdx = dialog.titleTemplates.currentIndex()
                    curBodyIdx = dialog.bodyTemplates.currentIndex()
                    dtnow = datetime.datetime.now()
                    if curTitleIdx > -1:
                        titleItem = dialog.titleTemplates.model().item(curTitleIdx)
                        titleItemContent = titleItem.data(TTPL_COL_DATA)
                        titleItemType = titleItem.data(TTPL_COL_EXTRA_DATA)
                        titleParameter = dialog.titleTemplateParameter.text()
                        newPageName = mikitemplate.makeTemplateTitle(
                            titleItemType, titleItemContent, dtnow=dtnow, userinput=titleParameter
                        )
                    if curBodyIdx > -1:
                        bodyItemIdx = dialog.bodyTemplates.rootModelIndex().child(curBodyIdx, 0)
                        bodyFPath = dialog.bodyTemplates.model().filePath(bodyItemIdx)
                    else:
                        bodyFPath = None
            else:
                dialog = LineEditDialog(pagePath, self)
                if dialog.exec_():
                    newPageName = dialog.editor.text()

        prevparitem = None

        if newPageName:
            if hasattr(item, "text"):
                pagePath = os.path.join(self.notePath, pagePath + "/").replace(os.sep, "/")
            if not QtCore.QDir(pagePath).exists():
                QtCore.QDir(self.notePath).mkdir(pagePath)

            if not QtCore.QDir(os.path.dirname(newPageName)).exists():
                curdirname = os.path.dirname(newPageName)
                needed_parents = []
                while curdirname != "":
                    needed_parents.append(curdirname)
                    curdirname = os.path.dirname(curdirname)

                # create the needed hierarchy in reverse order
                for i, needed_parent in enumerate(needed_parents[::-1]):
                    paritem = self.pageToItem(needed_parent)
                    if paritem is None:
                        if i == 0:
                            self.newPageCore(item, os.path.basename(needed_parent))
                        else:
                            self.newPageCore(prevparitem, os.path.basename(needed_parent))
                        QtCore.QDir(pagePath).mkdir(needed_parent)
                    elif not QtCore.QDir(os.path.join(self.notePath, needed_parent).replace(os.sep, "/")).exists():
                        QtCore.QDir(pagePath).mkdir(needed_parent)
                    if paritem is not None:
                        prevparitem = paritem
                    else:
                        prevparitem = self.pageToItem(needed_parent)

            fileName = pagePath + newPageName + self.settings.fileExt
            fh = QtCore.QFile(fileName)
            fh.open(QtCore.QIODevice.WriteOnly)

            savestream = QtCore.QTextStream(fh)
            if useTemplate and bodyFPath is not None:
                with open(bodyFPath, "r", encoding="utf-8") as templatef:
                    savestream << mikitemplate.makeTemplateBody(
                        os.path.basename(newPageName),
                        dtnow=dtnow,
                        dt_in_body_txt=self.tr("Created {}"),
                        body=templatef.read(),
                    )
            else:
                savestream << mikitemplate.makeDefaultBody(os.path.basename(newPageName), self.tr("Created {}"))
            fh.close()
            if prevparitem is not None:
                QtWidgets.QTreeWidgetItem(prevparitem, [os.path.basename(newPageName)])
            else:
                QtWidgets.QTreeWidgetItem(item, [os.path.basename(newPageName)])
            newItem = self.pageToItem(pagePath + newPageName)
            self.sortItems(0, Qt.AscendingOrder)
            self.setCurrentItem(newItem)
            if hasattr(item, "text"):
                self.expandItem(item)

            # create attachment folder if not exist
            attDir = self.itemToAttachmentDir(newItem)
            if not QtCore.QDir(attDir).exists():
                QtCore.QDir().mkpath(attDir)

            # TODO improvement needed, can be reused somehow
            with open(fileName, "r") as fileobj:
                content = fileobj.read()

            self.ix = open_dir(self.settings.indexdir)
            # writer = self.ix.writer()
            writer = AsyncWriter(self.ix)
            writer.add_document(path=pagePath + newPageName, content=content)
            writer.commit()
Code Example #59
File: indexing.py Project: denedios/moin-2.0
        with writer as writer:
            writer.update_document(**doc)  # update, because store_revision() may give us an existing revid
        doc = backend_to_index(meta, content, self.schemas[LATEST_REVS], self.wikiname, backend_name)
        if async:
            writer = AsyncWriter(self.ix[LATEST_REVS])
        else:
            writer = self.ix[LATEST_REVS].writer()
        with writer as writer:
            writer.update_document(**doc)

    def remove_revision(self, revid, async=True):
        """
        Remove a single revision from indexes.
        """
        if async:
            writer = AsyncWriter(self.ix[ALL_REVS])
        else:
            writer = self.ix[ALL_REVS].writer()
        with writer as writer:
            writer.delete_by_term(REVID, revid)
        if async:
            writer = AsyncWriter(self.ix[LATEST_REVS])
        else:
            writer = self.ix[LATEST_REVS].writer()
        with writer as writer:
            # find out itemid related to the revid we want to remove:
            with self.ix[LATEST_REVS].searcher() as searcher:
                docnum_remove = searcher.document_number(revid=revid)
                if docnum_remove is not None:
                    itemid = searcher.stored_fields(docnum_remove)[ITEMID]
            if docnum_remove is not None:
Code Example #60
File: feeds.py Project: djdarkbeat/bottle-fever
                'url': entry.link,
                'tags': get_entry_tags(entry),
                'when': when
            })

        if not len(entries):
            return

        log.debug("%s - %d entries in %fs" %
                  (netloc, len(entries), time.time() - now))
        now = time.time()

        records = 0
        now = time.time()
        ix = open_dir(settings.index)
        writer = AsyncWriter(ix)

        for entry in entries:
            try:
                item = Item.get(guid=entry['guid'])
            except Item.DoesNotExist:
                item = Item.create(**entry)
            records += 1

            if len(entry['html']):
                soup = BeautifulSoup(entry['html'], settings.fetcher.parser)
                plaintext = ''.join(soup.find_all(text=True))
                writer.add_document(id=item.id,
                                    guid=unicode(item.guid),
                                    title=entry['title'],
                                    text=plaintext,