Example #1
 def delete_documents(self, doc_set, paths):
     """Delete documents from the index."""
     index = open_dir(self.index_path)
     writer = AsyncWriter(index)
     query = And([
         Term('set', doc_set),
         Or([Term('path', path) for path in paths])
     ])
     writer.delete_by_query(query)
     writer.commit()
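Note: most of the excerpts on this page omit their imports. A minimal, self-contained sketch of the delete-by-query pattern above, with the imports made explicit (the index_dir argument is an assumption, not part of the original example):

from whoosh.index import open_dir
from whoosh.query import And, Or, Term
from whoosh.writing import AsyncWriter

def delete_documents(index_dir, doc_set, paths):
    # Open the existing index and delete matching documents; AsyncWriter
    # falls back to a background thread if the index lock is currently held.
    index = open_dir(index_dir)
    writer = AsyncWriter(index)
    query = And([
        Term('set', doc_set),
        Or([Term('path', path) for path in paths]),
    ])
    writer.delete_by_query(query)
    writer.commit()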
Example #2
def add():
    d = request.get_json(force=True)
    url = d.get("url")
    content = d.get("content")
    if not url or not content:
        return jsonify({"status": "missing parameters"})
    if urlparse.urlparse(url).netloc.startswith("localhost"):
        return jsonify({"status": "ignored"})
    ix = get_index()
    writer = AsyncWriter(ix)
    soup = BeautifulSoup(content)
    # kill all script and style elements
    for script in soup(["script", "style"]):
        script.extract()    # rip it out
    # get text
    text = soup.get_text()
    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)

    writer.update_document(title=d.get("title", "Untitled"),
        url=url,
        content=text,
        modified=datetime.datetime.now())
    writer.commit()
    return jsonify({"status": "ok"})
Example #3
    def update(self, index, iterable, commit=True):
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)

        for obj in iterable:
            doc = index.full_prepare(obj)

            # Really make sure it's unicode, because Whoosh won't have it any
            # other way.
            for key in doc:
                doc[key] = self._from_python(doc[key])

            try:
                writer.update_document(**doc)
            except Exception as e:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error(u"%s while preparing object for update" % e, exc_info=True, extra={
                    "data": {
                        "index": index,
                        "object": get_identifier(obj)
                    }
                })
Example #4
def store_page(user, url):
    writer = AsyncWriter(idx)
    resp = requests.get(url)
    content = parse(resp.content)
    now = datetime.now()
    writer.add_document(ts=now, user=unicode(user), url=unicode(url), content=content)
    writer.commit()
Example #5
 def update(self, index, iterable, commit=True):
     if not self.setup_complete:
         self.setup()
     
     self.index = self.index.refresh()
     writer = AsyncWriter(self.index)
     
     for obj in iterable:
         doc = index.full_prepare(obj)
         
         # Really make sure it's unicode, because Whoosh won't have it any
         # other way.
         for key in doc:
             doc[key] = self._from_python(doc[key])
         
         writer.update_document(**doc)
     
     if len(iterable) > 0:
         # For now, commit no matter what, as we run into locking issues otherwise.
         writer.commit()
         
         # If spelling support is desired, add to the dictionary.
         if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
             sp = SpellChecker(self.storage)
             sp.add_field(self.index, self.content_field_name)
Example #6
def incremental_index(t, l, c, dirname):
    id = (Searcher().getcount() + 1)
    ix = index.open_dir(dirname)
    # The set of all paths in the index
    #with ix.searcher() as searcher:

    indexed_feeds = set()

    with ix.searcher() as searcher:
      writer = AsyncWriter(ix)

      # Loop over the stored fields in the index
      for fields in searcher.all_stored_fields():
        indexed_feed = fields['title']
        indexed_feeds.add(indexed_feed)

      # Loop over the files in the filesystem
      # Assume we have a function that gathers the filenames of the
      # documents to be indexed
      if t not in indexed_feeds:
          # This is either a file that's changed, or a new file
          # that wasn't indexed before. So index it!
          wooshDocuments(id, writer, t, l, c)

      writer.commit()
      return id
Example #7
    def delPage(self, item):

        index = item.childCount()
        while index > 0:
            index = index - 1
            self.dirname = item.child(index).text(0)
            self.delPage(item.child(index))

        # remove attachment folder
        attDir = self.itemToAttachmentDir(item)
        for info in QtCore.QDir(attDir).entryInfoList():
            QtCore.QDir().remove(info.absoluteFilePath())
        QtCore.QDir().rmdir(attDir)

        pagePath = self.itemToPage(item)
        self.ix = open_dir(self.settings.indexdir)
        query = QueryParser("path", self.ix.schema).parse(pagePath)
        # writer = self.ix.writer()
        writer = AsyncWriter(self.ix)
        n = writer.delete_by_query(query)
        # n = writer.delete_by_term('path', pagePath)
        writer.commit()
        # self.ix.close()
        b = QtCore.QDir(self.notePath).remove(self.pageToFile(pagePath))
        parent = item.parent()
        parentPage = self.itemToPage(parent)
        if parent is not None:
            index = parent.indexOfChild(item)
            parent.takeChild(index)
            if parent.childCount() == 0:  # if no child, dir not needed
                QtCore.QDir(self.notePath).rmdir(parentPage)
        else:
            index = self.indexOfTopLevelItem(item)
            self.takeTopLevelItem(index)
        QtCore.QDir(self.notePath).rmdir(pagePath)
Example #8
    def update(self, index, document, **options):
        index = base._resolve_index(index)

        ix = self._storage.open_index(indexname=index.get_name())
        writer = AsyncWriter(ix)

        adapted_document = index.adapt_document(document)
        writer.update_document(**adapted_document)
        writer.commit()
Example #9
    def update_bulk(self, index, documents):
        index = base._resolve_index(index)

        ix = self._storage.open_index(indexname=index.get_name())
        writer = AsyncWriter(ix)

        adapted_documents = (index.adapt_document(doc)
                                for doc in documents)
        for doc in adapted_documents:
            writer.update_document(**doc)

        writer.commit()
Example #10
 def addLink(self, url, title, summary, txt):
     
     titleb = title + " "
     title10 = titleb + titleb + titleb + titleb + titleb + titleb + titleb + titleb + titleb + titleb
     sumario = summary + " "
     sumario2 = sumario + sumario
     text = title10 + sumario2 + " " + txt
     
     ix = open_dir(self.indexDir, indexname='MAIN', readonly=False)
     writer = AsyncWriter(ix)
     writer.add_document(id=url, content=unicode(text)) 
     writer.commit()
     ix.close()
Example #11
def whoosh_task(ids, pool_number, ix, model_class):
    session = sqla['session']

    writer = AsyncWriter(ix)
    for id_ in ids:
        obj = session.query(model_class).filter_by(id=id_).one()
        if obj.title is None or obj.summary is None:
            continue

        writer.add_document(
            title=obj.title,
            summary=obj.summary
        )

    writer.commit()
Example #12
 def whoosh_index(self):
     it = QTreeWidgetItemIterator(
         self.notesTree, QTreeWidgetItemIterator.All)
     print("Starting complete indexing.")
     #writer = self.ix.writer()
     writer = AsyncWriter(self.ix)
     while it.value():
         treeItem = it.value()
         name = self.notesTree.itemToPage(treeItem)
         path = os.path.join(self.notesTree.pageToFile(name)).replace(os.sep, '/')
         print(path)
         fileobj = open(path, 'r', encoding='utf-8')
         content = fileobj.read()
         fileobj.close()
         if METADATA_CHECKER.match(content) and 'meta' in self.settings.extensions:
             no_metadata_content = METADATA_CHECKER.sub("", content, count=1).lstrip()
             self.settings.md.reset().convert(content)
             writer.update_document(
                 path=name, title=parseTitle(content, name), content=no_metadata_content,
                 tags=','.join(self.settings.md.Meta.get('tags', [])).strip())
         else:
             writer.add_document(path=name, title=parseTitle(content, name), content=content, tags='')
        
         it += 1
     writer.commit()
     print("Finished completely reindexing.")
Example #13
 def index_documents(self, documents):
     """Add or update documents in the index."""
     index = open_dir(self.index_path)
     writer = AsyncWriter(index)
     needs_commit = False
     for document in documents:
         needs_commit = True
         writer.update_document(
             uid=':'.join((document['set'], document['path'])),
             path=document['path'],
             set=document['set'],
             hash=document['hash'],
             title=document['title'],
             content=document['content'],
             kind=document['kind'],
         )
     if needs_commit:
         writer.commit()
Example #14
    def clear(self):
        """Remove all content from indexes, and unregister all classes.

        After clear() the service is stopped. It must be started again
        to create new indexes and register classes.
        """
        logger.info("Resetting indexes")
        state = self.app_state

        for _name, idx in state.indexes.items():
            writer = AsyncWriter(idx)
            writer.commit(merge=True, optimize=True, mergetype=CLEAR)

        state.indexes.clear()
        state.indexed_classes.clear()
        state.indexed_fqcn.clear()
        self.clear_update_queue()

        if self.running:
            self.stop()
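The CLEAR mergetype used above comes from whoosh.writing; committing with it makes the writer discard every existing segment instead of merging them, leaving the index empty. A minimal sketch of the same idea applied to a single index (the ix argument is an assumption):

from whoosh.writing import AsyncWriter, CLEAR

def wipe_index(ix):
    # Committing with mergetype=CLEAR drops all existing segments,
    # so the index ends up empty after the commit.
    writer = AsyncWriter(ix)
    writer.commit(mergetype=CLEAR)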
Example #15
    def createIndex(self):
        print "    Whoosh Loading from SQL "      
        created = self.createIndexDirIfNotExist()
        if not created:
            #already exists
            return
        
        conn = sqlite3.connect(self.dbName)
        c = conn.cursor()
        c.execute('''SELECT * FROM newsStorage where ARTICLE <> "" ''')
        feeds = c.fetchall()
        conn.close()
        
        linkN = 1
        schema = Schema(id = TEXT(stored = True), content=TEXT)
        ix = create_in(self.indexDir, schema, indexname='MAIN')
        writer = AsyncWriter(ix)

        for feed in feeds:
            
            # Discard links without a title
            if( isinstance(feed[3], type(None))):
                #print "is Null"
                continue
            
            index = feed[0]
            # print "    Whoosh Loaded Titulo " + str(linkN) + ":" + feed[3]
            linkN += 1
            
            titolo = feed[3] + " "
            titolo10 = titolo + titolo + titolo + titolo + titolo + titolo + titolo + titolo + titolo + titolo
            sumario = feed[4] + " "
            sumario2 = sumario + sumario
            text = titolo10 + sumario2 + " " +feed[5]
            
            writer.add_document(id=index, content=unicode(text))
            
            
        writer.commit()
        ix.close()   
        print "    Done Loading from SQL"
Example #16
    def update(self, index, iterable, commit=True):
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)

        for obj in iterable:
            try:
                doc = index.full_prepare(obj)
            except SkipDocument:
                self.log.debug(u"Indexing for object `%s` skipped", obj)
            else:
                # Really make sure it's unicode, because Whoosh won't have it any
                # other way.
                for key in doc:
                    doc[key] = self._from_python(doc[key])

                # Document boosts aren't supported in Whoosh 2.5.0+.
                if 'boost' in doc:
                    del doc['boost']

                try:
                    writer.update_document(**doc)
                except Exception as e:
                    if not self.silently_fail:
                        raise

                    # We'll log the object identifier but won't include the actual object
                    # to avoid the possibility of that generating encoding errors while
                    # processing the log message:
                    self.log.error(u"%s while preparing object for update" % e.__class__.__name__, exc_info=True, extra={
                        "data": {
                            "index": index,
                            "object": get_identifier(obj)
                        }
                    })

        if len(iterable) > 0:
            # For now, commit no matter what, as we run into locking issues otherwise.
            writer.commit()
Example #17
    def newPageCore(self, item, newPageName):
        pagePath = os.path.join(self.notePath, self.itemToPage(item)).replace(os.sep, '/')
        if not newPageName:
            dialog = LineEditDialog(pagePath, self)
            if dialog.exec_():
                newPageName = dialog.editor.text()
        if newPageName:
            if hasattr(item, 'text'):
                pagePath = os.path.join(self.notePath,
                                        pagePath + '/').replace(os.sep, '/')
            if not QDir(pagePath).exists():
                QDir(self.notePath).mkdir(pagePath)
            fileName = pagePath + newPageName + self.settings.fileExt
            fh = QFile(fileName)
            fh.open(QIODevice.WriteOnly)
            savestream = QTextStream(fh)
            savestream << '# ' + newPageName + '\n'
            savestream << 'Created ' + str(datetime.date.today()) + '\n\n'
            fh.close()
            QTreeWidgetItem(item, [newPageName])
            newItem = self.pageToItem(pagePath + newPageName)
            self.sortItems(0, Qt.AscendingOrder)
            self.setCurrentItem(newItem)
            if hasattr(item, 'text'):
                self.expandItem(item)

            # create attachment folder if not exist
            attDir = self.itemToAttachmentDir(newItem)
            if not QDir(attDir).exists():
                QDir().mkpath(attDir)

            # TODO improvement needed, can be reused somehow
            fileobj = open(fileName, 'r')
            content = fileobj.read()
            fileobj.close()
            self.ix = open_dir(self.settings.indexdir)
            #writer = self.ix.writer()
            writer = AsyncWriter(self.ix)
            writer.add_document(path=pagePath+newPageName, content=content)
            writer.commit()
Example #18
 def update(self, index, iterable, commit=True):
     if not self.setup_complete:
         self.setup()
     
     self.index = self.index.refresh()
     writer = AsyncWriter(self.index)
     
     for obj in iterable:
         doc = index.full_prepare(obj)
         
         # Really make sure it's unicode, because Whoosh won't have it any
         # other way.
         for key in doc:
             doc[key] = self._from_python(doc[key])
         
         try:
             writer.update_document(**doc)
         except Exception as e:
             if not self.silently_fail:
                 raise
             
             self.log.error("Failed to add documents to Whoosh: %s", e)
Example #19
 def updateIndex(self):
     ''' Update the Whoosh index, which costs a lot of computing resources '''
     page = self.parent.notesTree.currentPage()
     content = self.toPlainText()        
     try:
         #writer = self.ix.writer()
         writer = AsyncWriter(self.ix)
         if METADATA_CHECKER.match(content) and 'meta' in self.settings.extensions:
             no_metadata_content = METADATA_CHECKER.sub("", content, count=1).lstrip()
             self.settings.md.reset().convert(content)
             writer.update_document(
                 path=page, title=parseTitle(content, page), content=no_metadata_content,
                 tags=','.join(self.settings.md.Meta.get('tags', [])).strip())
             writer.commit()
         else:
             writer.update_document(
                 path=page, title=parseTitle(content, page), content=content, tags='')
             writer.commit()
     except:
         print("Whoosh commit failed.")
Example #20
    def update(self, index, iterable, commit=True):
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)

        for obj in iterable:
            try:
                doc = index.full_prepare(obj)
            except SkipDocument:
                self.log.debug(u"Indexing for object `%s` skipped", obj)
            else:
                # Really make sure it's unicode, because Whoosh won't have it any
                # other way.
                for key in doc:
                    doc[key] = self._from_python(doc[key])

                # Document boosts aren't supported in Whoosh 2.5.0+.
                if 'boost' in doc:
                    del doc['boost']

                try:
                    writer.update_document(**doc)
                except Exception as e:
                    if not self.silently_fail:
                        raise

                    # We'll log the object identifier but won't include the actual object
                    # to avoid the possibility of that generating encoding errors while
                    # processing the log message:
                    self.log.error(u"%s while preparing object for update" %
                                   e.__class__.__name__,
                                   exc_info=True,
                                   extra={
                                       "data": {
                                           "index": index,
                                           "object": get_identifier(obj)
                                       }
                                   })

        if len(iterable) > 0:
            # For now, commit no matter what, as we run into locking issues otherwise.
            writer.commit()
Example #21
def handle_document(document_id):
    document = Document.objects.get(id=document_id)

    mime_type = document.mime_type

    parser_class = get_parser_class_for_mime_type(mime_type)

    parser = parser_class(logging_group=uuid.uuid4())

    try:
        parser.parse(document.source_path, mime_type)

        if parser.get_archive_path():
            with transaction.atomic():
                with open(parser.get_archive_path(), 'rb') as f:
                    checksum = hashlib.md5(f.read()).hexdigest()
                # i'm going to save first so that in case the file move
                # fails, the database is rolled back.
                # we also don't use save() since that triggers the filehandling
                # logic, and we don't want that yet (file not yet in place)
                Document.objects.filter(pk=document.pk).update(
                    archive_checksum=checksum,
                    content=parser.get_text()
                )
                with FileLock(settings.MEDIA_LOCK):
                    create_source_path_directory(document.archive_path)
                    shutil.move(parser.get_archive_path(),
                                document.archive_path)

        with AsyncWriter(index.open_index()) as writer:
            index.update_document(writer, document)

    except Exception as e:
        logger.error(f"Error while parsing document {document}: {str(e)}")
    finally:
        parser.cleanup()
Example #22
class SearchPipeline(object):
    cleanup = False

    def open_spider(self, spider):
        """ When opening spider, open or create index. """

        index_dir = os.path.expanduser('~/.sitesearcher/index')
        if not os.path.exists(index_dir):
            os.makedirs(index_dir)

        self.indexname = spider.allowed_domains[0]
        if index.exists_in(index_dir, indexname=self.indexname):
            self.index = index.open_dir(index_dir, indexname=self.indexname)
        else:
            self.index = index.create_in(
                index_dir,
                indexname=self.indexname,
                schema=schema,
            )
        self.writer = AsyncWriter(self.index)

    def process_item(self, item, spider):
        """ Add crawled item to index.

        Add items using ``update_document`` to delete any previously indexed
        versions and avoid duplicates
        """

        self.writer.update_document(
            url=item.get('url'), content=item.get('content'))

    def close_spider(self, spider):
        """ Close index writer on closing of spider an clean up.

        On closing, delete any previously indexed items that have not been
        updated in this crawl, as these are obviously no longer reachable sites.
        """

        with self.index.searcher() as searcher:
            for page in searcher.all_stored_fields():
                if page['url'] not in spider.state['update_list']:
                    self.writer.delete_by_term('url', page['url'])
        self.writer.commit()
Example #23
def creating_searching_ranking(selected_analyzer, name_of_file,
                               scoring_function, path):
    """
    Method that creates the schema and stores the index file based on the retrieved 'csv_test.csv' file
    input:
        selected_analyzer - selected text analyzer from the whoosh library
        name_of_file - name of .csv file stored from dataframe variable 'files_text'
        scoring_function - selected scoring function from the whoosh library
        path - path where index files are stored
    """
    #creating Schema with fields id, title and content
    schema = Schema(id=ID(stored=True),\
    title=TEXT(stored=False, analyzer=selected_analyzer),
    content=TEXT(stored=False, analyzer=selected_analyzer))
    directory_containing_the_index = path
    ix = create_in(
        directory_containing_the_index, schema
    )  #creating index based on schema in the directory where the 'path' is
    directory_containing_the_index = path
    ix = index.open_dir(
        directory_containing_the_index)  #opening the index file
    writer = AsyncWriter(ix)  #writer will be used to add content to the fields

    #num_added_records_so_far=0
    ALL_DOCUMENTS_file_name = name_of_file  #path to the file
    in_file = open(ALL_DOCUMENTS_file_name, "r", encoding='latin1')
    csv_reader = csv.reader(in_file, delimiter=',')  #reading the file
    next(csv_reader)  # skip the header: the first line contains the name of each field
    #num_added_records_so_far = 0
    for record in csv_reader:  #for each row in the 'csv_test' file
        id = record[1]  #read id
        title = record[2]  #read title
        content = record[3]  #read body
        writer.add_document(id=id, content=title + ' ' + content)

#num_added_records_so_far +=1
#if (num_added_records_so_far%1000 == 0):
#    print(" num_added_records_so_far= " + str(num_added_records_so_far))

    writer.commit()
    in_file.close()  # close the input file
Example #24
    def delete(self, name, purge=True):
        """
        Delete a document by its name. The name is actually a hash. If purge is true, the file is
        also removed from the boxes.
        """
        # Grab a writer on the index
        writer = AsyncWriter(self.index)

        # Delete and commit from index
        writer.delete_by_term(u'hash', name)
        writer.commit()

        # Delete the document from the boxes if we want to purge them
        if not purge:
            return

        # We need to remove the doc if the box is writable
        for box in self.boxes:
            if box.haskey(name) and not box.readonly:
                del (box[name])
Example #25
    def newPageCore(self, item, newPageName):
        pagePath = os.path.join(self.notePath,
                                self.itemToPage(item)).replace(os.sep, '/')
        if not newPageName:
            dialog = LineEditDialog(pagePath, self)
            if dialog.exec_():
                newPageName = dialog.editor.text()
        if newPageName:
            if hasattr(item, 'text'):
                pagePath = os.path.join(self.notePath,
                                        pagePath + '/').replace(os.sep, '/')
            if not QDir(pagePath).exists():
                QDir(self.notePath).mkdir(pagePath)
            fileName = pagePath + newPageName + self.settings.fileExt
            fh = QFile(fileName)
            fh.open(QIODevice.WriteOnly)
            savestream = QTextStream(fh)
            savestream << '# ' + newPageName + '\n'
            savestream << 'Created ' + str(datetime.date.today()) + '\n\n'
            fh.close()
            QTreeWidgetItem(item, [newPageName])
            newItem = self.pageToItem(pagePath + newPageName)
            self.sortItems(0, Qt.AscendingOrder)
            self.setCurrentItem(newItem)
            if hasattr(item, 'text'):
                self.expandItem(item)

            # create attachment folder if not exist
            attDir = self.itemToAttachmentDir(newItem)
            if not QDir(attDir).exists():
                QDir().mkpath(attDir)

            # TODO improvement needed, can be reused somehow
            fileobj = open(fileName, 'r')
            content = fileobj.read()
            fileobj.close()
            self.ix = open_dir(self.settings.indexdir)
            #writer = self.ix.writer()
            writer = AsyncWriter(self.ix)
            writer.add_document(path=pagePath + newPageName, content=content)
            writer.commit()
Example #26
    def createIndex(self):
        print "    Whoosh Loading from SQL "
        created = self.createIndexDirIfNotExist()
        if not created:
            #already exists
            return

        conn = sqlite3.connect(self.dbName)
        c = conn.cursor()
        c.execute('''SELECT * FROM newsStorage where ARTICLE <> "" ''')
        feeds = c.fetchall()
        conn.close()

        linkN = 1
        schema = Schema(id=TEXT(stored=True), content=TEXT)
        ix = create_in(self.indexDir, schema, indexname='MAIN')
        writer = AsyncWriter(ix)

        for feed in feeds:

            # Discard links without a title
            if (isinstance(feed[3], type(None))):
                #print "is Null"
                continue

            index = feed[0]
            # print "    Whoosh Loaded Titulo " + str(linkN) + ":" + feed[3]
            linkN += 1

            titolo = feed[3] + " "
            titolo10 = titolo + titolo + titolo + titolo + titolo + titolo + titolo + titolo + titolo + titolo
            sumario = feed[4] + " "
            sumario2 = sumario + sumario
            text = titolo10 + sumario2 + " " + feed[5]

            writer.add_document(id=index, content=unicode(text))

        writer.commit()
        ix.close()
        print "    Done Loading from SQL"
Example #27
    def update(self, index, iterable, commit=True):
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        write = AsyncWriter(self.index)

        for obj in iterable:
            try:
                doc = index.full_prepare(obj)
            except SkipDocument:
                self.log.debug(u"Indexing for object '%s' skipped", obj)
            else:
                for key in doc:
                    doc[key] = self._from_python(doc[key])

                if 'boost' in doc:
                    del doc['boost']

                try:
                    write.update_document(**doc)
                except Exception as e:
                    if not self.silently_fail:
                        raise

                    self.log.error(u"%s while preparing object for update" %
                                   e.__class__.__name__,
                                   exc_info=True,
                                   extra={
                                       'data': {
                                           'index': index,
                                           'object': get_identifier(obj)
                                       }
                                   })

        if len(iterable) > 0:
            write.commit()
Example #28
    def add_items(self, model, objs):
        for obj in objs:
            obj._body_ = self.prepare_body(obj)

        self._delete_parent_model_data(model, objs)

        index = self.backend.index.refresh()
        writer = AsyncWriter(index)

        for obj in objs:
            doc = {
                ID: get_identifier(obj),
                DJANGO_CT: get_model_ct(obj),
                DJANGO_ID: force_text(obj.pk),
                'text': force_text(obj._body_),
            }

            try:
                writer.update_document(**doc)
            except Exception as e:
                raise e

        if len(objs) > 0:
            writer.commit()
Example #29
custom_stops=['rt','ht','mt','@','#','!',':',';',',','.',"'s","?","\\n",'http','https',"n't","&","\\",'...','-','"']
stops=list(set(default_stops+custom_stops))

#Set up schema fields
my_schema = Schema(id = ID(unique=True, stored=True),
                    text = TEXT(stored=True),
                    contains_retweet= BOOLEAN(stored=True),
                    screen_name = TEXT(stored=True),
                    keyword=KEYWORD(stored=True),
                    created=DATETIME(stored=True)
                    )


#Create index and AsyncWriter object
index = create_in("tweetindex", my_schema)
writer = AsyncWriter(index)

if __name__=='__main__':
    #Load raw data
    with open("WC2015_headers.csv",'rb') as to_load:
        data=csv.DictReader(to_load)
        for row in data:
            #Extract required information from date to create python datetime object
            date=row['created_at'][:19]+' '+row['created_at'][-4:]
            
            #Clean text and parse into keywords
            text=row['text'].replace('\\','')
            keywords=[word for word in word_tokenize(text) if word not in stops]
            
            #Check for Retweets
            rt=False
Example #30
class Index(object):
    def __init__(self, directory, persist):
        self.log = logging.getLogger("ftpvista.index")

        self._persist = persist
        if not os.path.exists(directory):
            self.log.info("Creating the index in %s" % directory)
            os.mkdir(directory)
            self._idx = index.create_in(directory, schema=self.get_schema())
        else:
            self.log.info("Opening the index in %s" % directory)
            self._idx = index.open_dir(directory)

        self._searcher = self._idx.searcher()
        self._writer = None
        self.open_writer()
        self._last_optimization = None

    def open_writer(self):
        # self._writer = BufferedWriter(self._idx, 120, 4000)
        self._writer = AsyncWriter(self._idx)

    def get_schema(self):
        analyzer = StemmingAnalyzer("([a-zA-Z0-9])+")
        my_analyzer = analyzer | CharsetFilter(accent_map)
        return Schema(
            server_id=ID(stored=True),
            has_id=ID(),
            path=TEXT(analyzer=my_analyzer, stored=True),
            name=TEXT(analyzer=my_analyzer, stored=True),
            ext=TEXT(analyzer=my_analyzer, stored=True),
            size=ID(stored=True),
            mtime=ID(stored=True, sortable=True),
            audio_album=TEXT(analyzer=my_analyzer, stored=True),
            audio_artist=TEXT(analyzer=my_analyzer, stored=True),
            audio_title=TEXT(analyzer=my_analyzer, stored=True),
            audio_track=ID(stored=True),
            audio_year=ID(stored=True),
        )

    def delete_all_docs(self, server):
        self.open_writer()
        self._writer.delete_by_term("server_id", str(server.get_server_id()))
        self._writer.commit()
        self.log.info("All documents of server %s deleted" % server.get_ip_addr())

    def incremental_server_update(self, server_id, current_files):
        """Prepares to incrementaly update the documents for the given server.

        server_id      -- Id of the server to update.
        current_files  -- a list of (path, size, mtime) tuples for each files
                          currently on the server.

        Delete all the outdated files from the index and returns a list
        of files needing to be reindexed.
        """

        def delete_doc(writer, serverid, path):
            writer.delete_by_query(Term("server_id", str(serverid)) & Term("path", path))

        # Build a {path => (size, mtime)} mapping for quick lookups
        to_index = {}
        for path, size, mtime in current_files:
            to_index[path] = (size, mtime)

        results = self._searcher.documents(server_id=str(server_id))
        if results:
            for fields in results:
                indexed_path = fields["path"]

                if indexed_path not in to_index:
                    # This file was deleted from the server since it was indexed
                    delete_doc(self._writer, server_id, indexed_path)
                    self.log.debug("%s has been removed" % indexed_path)
                else:
                    size, mtime = to_index[indexed_path]
                    try:
                        if mtime > datetime.strptime(fields["mtime"], "%Y-%m-%d %H:%M:%S"):
                            # This file has been modified since it was indexed
                            delete_doc(self._writer, server_id, indexed_path)
                        else:
                            # up to date, no need to reindex
                            del to_index[indexed_path]
                    except ValueError:
                        delete_doc(self._writer, server_id, indexed_path)

        # return the remaining files
        return [(path, xsize, xmtime) for (path, (xsize, xmtime)) in to_index.items()]

    def add_document(
        self, server_id, name, path, size, mtime, audio_album=None, audio_artist=None, audio_title=None, audio_year=None
    ):
        """Add a document with the specified fields in the index.

        Changes need to be committed.

        """

        # passing the optional arguments is quite a mess
        # let's build a dict for that purpose

        _, ext = os.path.splitext(name)
        ext = ext.lstrip(".")

        kwargs = {
            "server_id": server_id,
            "name": name,
            "ext": ext,
            "path": path,
            "size": size,
            "mtime": mtime,
            "has_id": "a",
        }

        # Add the optional args
        if audio_album is not None:
            kwargs["audio_album"] = audio_album

        if audio_artist is not None:
            kwargs["audio_artist"] = audio_artist

        if audio_title is not None:
            kwargs["audio_title"] = audio_title

        if audio_year is not None:
            kwargs["audio_year"] = audio_year

        try:
            self._writer.add_document(**kwargs)
        except IndexingError:
            self.open_writer()
            self._writer.add_document(**kwargs)

    def commit(self, optimize=False):
        """ Commit the changes in the index and optimize it """
        self.log.info(" -- Begin of Commit -- ")
        try:
            self._writer.commit(optimize=optimize)
        except IndexingError:
            self.open_writer()
            self._writer.commit(optimize=optimize)
        self.log.info("Index commited")

        self._searcher = self._idx.searcher()
        self.log.info(" -- End of Commit -- ")

    def close(self):
        """ Close the index """
        self.log.info(" -- Closing writer and index -- ")
        # self._writer.close()
        self._idx.close()
Example #31
 def add(self, note):
     writer = AsyncWriter(self.index)
     writer.add_document(note_id=note.id, notebook_id=note.notebook_id, title=note.title, snippet=note.snippet)
     writer.commit()
Example #32
 def index_mail(self, mail):
     with AsyncWriter(self._index) as writer:
         self._index_mail(writer, mail)
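As this example shows, AsyncWriter also works as a context manager: the writer commits automatically when the with-block exits normally and cancels the pending changes if an exception escapes. A small sketch under assumed field names (ident, subject, body):

from whoosh.writing import AsyncWriter

def index_mail(ix, ident, subject, body):
    # Commit happens on normal exit from the with-block; an exception
    # raised inside the block cancels the pending changes instead.
    with AsyncWriter(ix) as writer:
        writer.update_document(ident=ident, subject=subject, body=body)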
Example #33
 def add_to_index(self, item_id, text):
     from whoosh.writing import AsyncWriter
     writer = AsyncWriter(self.ix)
     writer.update_document(id=item_id, text=text.lower())
     writer.commit()
Example #34
 def remove(self, item_id):
     from whoosh.writing import AsyncWriter
     writer = AsyncWriter(self.ix)
     writer.delete_by_term('id', item_id)
     writer.commit()
Example #35
 def get_writer(self):
     return AsyncWriter(self.index)
Example #36
def insert_docs(docs):
    ix = open_dir(whoosh_index)
    writer = AsyncWriter(ix)
    for doc in docs:
        writer.add_document(**doc)
    writer.commit()
Example #37
                            'html'   : html,
                            'url'    : entry.link,
                            'tags'   : get_entry_tags(entry),
                            'when'   : when})
        
        if not len(entries):
            return

        log.debug("%s - %d entries in %fs" % (netloc, len(entries),time.time()-now))
        now = time.time()
        
        
        records = 0
        now = time.time()
        ix = open_dir(settings.index)
        writer = AsyncWriter(ix)

        for entry in entries:
            try:
                item = Item.get(guid = entry['guid'])
            except Item.DoesNotExist:
                item = Item.create(**entry)
            records += 1

            if len(entry['html']):
                soup = BeautifulSoup(entry['html'], settings.fetcher.parser)
                plaintext = ''.join(soup.find_all(text=True))
                writer.add_document(
                    id = item.id,
                    guid = unicode(item.guid),
                    title = entry['title'],
Example #38
    def load_all_dset_metadata(self, dsetname, create_index=False):
        """
            Loads into memory the metadata of a dataset. The metadata is read from a CSV file, which should
            have at least two columns:
             - filename: Paths to the images in the dataset, relative to the image data folder. For backward
                         compatibility '#filename' is also accepted
             - file_attributes: JSON string containing information about the file. The most important file
                                attributes are 'caption' and 'keywords'. The 'caption' field should be a short
                                string which will be used as the caption of the image in result lists. The
                                'keywords' field must contain a comma-separated list of keywords. Each keyword
                                can be used as the source for a search.
            If create_index is True, it builds a search index with the 'keywords' in the file_attributes.
            Arguments:
                dsetname: String corresponding to the dataset within the list of supported
                          datasets.
                create_index: Boolean indicating whether or not to build a search index
                              with the metadata
        """
        metaindex = None
        t = time.time()
        try:
            for afile in os.listdir(os.path.join(self.metadata_dir, dsetname)):
                if afile.endswith(".csv"):
                    metadata_file = os.path.join(self.metadata_dir, dsetname,
                                                 afile)
                    print('Found metadata file at', metadata_file)
                    if create_index:
                        metaindex = open_dir(self.index_dir)
                    with open(metadata_file, 'r') as fin:
                        reader = csv.DictReader(fin)
                        for row in reader:
                            id_field = None
                            if 'filename' in row.keys():
                                id_field = 'filename'
                            elif '#filename' in row.keys():
                                id_field = '#filename'
                            if id_field and 'file_attributes' in row.keys():
                                filename = row[id_field]
                                try:
                                    self.fname2meta[dsetname][
                                        filename] = json.loads(
                                            row['file_attributes'])
                                except:
                                    self.fname2meta[dsetname][filename] = None
                                metadata = self.fname2meta[dsetname][filename]
                                keyword_list = None
                                if metadata and 'keywords' in metadata.keys():
                                    keyword_list = metadata['keywords']
                                if keyword_list and create_index:
                                    keyword_list_splitted = keyword_list.split(
                                        ',')
                                    writer = AsyncWriter(metaindex)
                                    for key in keyword_list_splitted:
                                        key = key.strip()
                                        # delete previous entry if found
                                        query = QueryParser(
                                            'key', metaindex.schema).parse(key)
                                        writer.delete_by_query(
                                            query, metaindex.searcher())
                                        # add document
                                        writer.add_document(
                                            key=str(key),
                                            dataset=str(dsetname))
                                    writer.commit()
                                if keyword_list:  # we would like to do this, even if the index is not created
                                    # register link keyword-file
                                    keyword_list_splitted = keyword_list.split(
                                        ',')
                                    for key in keyword_list_splitted:
                                        key = key.strip()
                                        if key in self.keyword2fname[
                                                dsetname].keys():
                                            self.keyword2fname[dsetname][
                                                key].append(filename)
                                        else:
                                            self.keyword2fname[dsetname][
                                                key] = [filename]
                            else:
                                raise Exception(
                                    '"filename" and/or "file_attributes" columns not found in '
                                    + afile +
                                    ' (are you missing the column names?). Metadata will not be available!.'
                                )

                        print('Finished loading metadata for %s in %s' %
                              (dsetname, str(time.time() - t)))
                        self.is_all_metadata_loaded = True
                    break
        except Exception as e:
            print("load_all_dset_metadata Exception:" + str(e) + '\n')
Example #39
 def index_mail(self, mail):
     if mail is not None:
         with AsyncWriter(self._index) as writer:
             self._index_mail(writer, mail)
Example #40
 def remove(self, instance: Model):
     """Remove an entry from the index. Non-blocking.
     :param instance: instance of ``self.model`` to be removed from the index
     """
     with AsyncWriter(self.index) as writer:
         writer.delete_by_term(self.pk_name, getattr(instance, self.pk_name))
Example #41
        i = 0
        line = docs.readline()
        pbar = tqdm(total=3_213_835)
        while line != "":
            _, url, _, _ = line.split("\t")
            writer.update_document(url_text=url.replace(".", " "))
            line = docs.readline()
            i += 1
            pbar.update(1)
    writer.commit()
    exit(0)

ix.writer().commit(mergetype=writing.CLEAR)

print(f"Loading documents from {args.data}")
writers = [AsyncWriter(ix) for _ in range(args.threads)]
with open(args.data, "r", encoding="utf-8") as docs:
    i = 0
    line = docs.readline()
    pbar = tqdm(
        total=args.num_docs if args.num_docs is not None else 3_213_835)
    while line != "" and (args.num_docs is None or i < args.num_docs):
        docid, url, title, body = line.split("\t")
        writers[i % args.threads].add_document(docid=docid,
                                               url=url,
                                               title=title,
                                               body=body)
        line = docs.readline()
        i += 1
        pbar.update(1)
pbar.set_description("Committing...")
Example #42
 def add_to_fts(cls, content, title=None, id=None, source_hash=None, tags=None):
     ix = open_dir(LOCAL_FTS_INDEX)
     writer = AsyncWriter(ix)
     writer.add_document(content=content, title=title, id=id, source_hash=source_hash, tags=tags)
     writer.commit()
Example #43
def index_update(index, items):
    """
    :param:index: index name
    :param:items: list of (operation, full class name, primary key, data) tuples.
    """
    index_name = index
    index = service.app_state.indexes[index_name]
    adapted = service.adapted

    session = safe_session()
    updated = set()
    writer = AsyncWriter(index)
    try:
        for op, cls_name, pk, data in items:
            if pk is None:
                continue

            # always delete. Whoosh manual says that 'update' is actually delete + add
            # operation
            object_key = "{}:{}".format(cls_name, pk)
            writer.delete_by_term("object_key", object_key)

            adapter = adapted.get(cls_name)
            if not adapter:
                # FIXME: log to sentry?
                continue

            if object_key in updated:
                # don't add the same document twice in the same transaction. The writer
                # will not delete previous records, ending up with duplicate records for
                # the same document.
                continue

            if op in ("new", "changed"):
                with session.begin(nested=True):
                    obj = adapter.retrieve(pk, _session=session, **data)

                if obj is None:
                    # deleted after task queued, but before task run
                    continue

                document = service.get_document(obj, adapter)
                try:
                    writer.add_document(**document)
                except ValueError:
                    # logger is here to give us more info in order to catch a weird bug
                    # that happens regularly on CI but is not reliably
                    # reproducible.
                    logger.error("writer.add_document(%r)", document, exc_info=True)
                    raise
                updated.add(object_key)
    except BaseException:
        writer.cancel()
        raise

    session.close()
    writer.commit()
    try:
        # async thread: wait for its termination
        writer.join()
    except RuntimeError:
        # happens when actual writer was already available: asyncwriter didn't need
        # to start a thread
        pass
Example #44
 def insert(self, link, title, document):
     writer = AsyncWriter(self.ix)
     writer.add_document(link=link,title=title, document=document + title)
     writer.commit()
Example #45
 def add_to_index(self, item_id, text):
     from whoosh.writing import AsyncWriter
     writer = AsyncWriter(self.ix)
     writer.update_document(id=item_id, text=text.lower())
     writer.commit()
Example #46
def get_writer():
    global WIX
    writer = AsyncWriter(WIX)
    return writer
Example #47
    t2 = time()
    print 'Years', '-'.join(years)

    print 'Restarting database...'
    #	db=create_engine(connection_string,pool_size=proc_num)
    engine = db.connect()
    metadata = MetaData(engine)
    inspector = inspect(engine)
    #os.system('mongod -f /etc/mongodb.conf --shutdown')
    #os.system('mongod -f /etc/mongodb.conf &')
    print 'Done!'
    print

    #client=MongoClient()
    #db=client['pubmed']
    writer = AsyncWriter(index)

    for year in years:
        #collection=db[year]
        #total=collection.count()
        table_name = 'pubmed_sent_' + year
        table = Table(table_name, metadata, autoload=True)
        statement = table.count()
        total = engine.execute(statement).fetchone()[0]
        print 'Doc count', str(total)
        print

        t = time()
        num = 0
        #for post in collection.find():
        statement = table.select()
Example #48
def index_optimize():
    ix = index.open_index()
    writer = AsyncWriter(ix)
    writer.commit(optimize=True)
Example #49
 def optimize(self):
     writer = AsyncWriter(self.index)
     writer.commit(optimize=True)
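Examples 48 and 49 are the same idiom: a writer that adds nothing and commits with optimize=True, which merges all existing segments into one (typically making searches on a fragmented index faster). A minimal sketch (the ix argument is an assumption):

from whoosh.writing import AsyncWriter

def optimize_index(ix):
    # optimize=True merges all existing segments into a single segment
    # as part of the commit.
    writer = AsyncWriter(ix)
    writer.commit(optimize=True)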
Example #50
def build_index(whoosh_index_dir, file_path, hgweb_config_dir, dburi,
                **kwargs):
    """
    Build two search indexes simultaneously
    One is for repositories and the other for tools.

    Returns a tuple with number of repos and tools that were indexed.
    """
    model = ts_mapping.init(file_path,
                            dburi,
                            engine_options={},
                            create_tables=False)
    sa_session = model.context.current
    repo_index, tool_index = _get_or_create_index(whoosh_index_dir)

    repo_index_writer = AsyncWriter(repo_index)
    tool_index_writer = AsyncWriter(tool_index)
    repos_indexed = 0
    tools_indexed = 0

    execution_timer = ExecutionTimer()
    with repo_index.searcher() as searcher:
        for repo in get_repos(sa_session, file_path, hgweb_config_dir,
                              **kwargs):
            tools_list = repo.pop('tools_list')
            repo_id = repo['id']
            indexed_document = searcher.document(id=repo_id)
            if indexed_document:
                if indexed_document['full_last_updated'] == repo.get(
                        'full_last_updated'):
                    # We're done, since we sorted repos by update time
                    break
                else:
                    # Got an update, delete the previous document
                    repo_index_writer.delete_by_term('id', repo_id)

            repo_index_writer.add_document(**repo)

            #  Tools get their own index
            for tool in tools_list:
                tool_index_writer.add_document(**tool)
                tools_indexed += 1

            repos_indexed += 1

    tool_index_writer.commit()
    repo_index_writer.commit()

    log.info("Indexed repos: %s, tools: %s", repos_indexed, tools_indexed)
    log.info("Toolbox index finished %s", execution_timer)
    return repos_indexed, tools_indexed
Example #51
 def remove_from_index(self, mail_id):
     with AsyncWriter(self._index) as writer:
         writer.delete_by_term('ident', mail_id)
Example #52
 def get_writer(indexname=None, schema=None):
     return AsyncWriter(Index.get_index(indexname=indexname, schema=schema))
Example #53
 def search(self, query_string, notebook_id=None):
     with AsyncWriter(self.index).searcher() as searcher:
         query_parser = MultifieldParser(["title", "snippet"], schema=self.index.schema).parse(query_string)
         notebook_filter = query.Term("notebook_id", notebook_id) if notebook_id else None
         results = searcher.search(query_parser, filter=notebook_filter, limit=None)
         return [res['note_id'] for res in results]
Example #54
def get_writer(ix):
    writer = AsyncWriter(ix)
    # writer = ix.writer()
    return writer
Example #55
 def open_writer(self):
     # self._writer = BufferedWriter(self._idx, 120, 4000)
     self._writer = AsyncWriter(self._idx)
Example #56
 def get_writer(self, parent_path):
     logger.debug(f"Getting index writer for path:{parent_path}")
     return WhooshPathIndexer(AsyncWriter(self.ix), parent_path)
Example #57
def index_update(index, items):
    """
    :param:index: index name
    :param:items: list of (operation, full class name, primary key, data) tuples.
    """
    index_name = index
    index = service.app_state.indexes[index_name]
    adapted = service.adapted

    session = safe_session()
    updated = set()
    writer = AsyncWriter(index)
    try:
        for op, cls_name, pk, data in items:
            if pk is None:
                continue

            # always delete. Whoosh manual says that 'update' is actually delete + add
            # operation
            object_key = f"{cls_name}:{pk}"
            writer.delete_by_term("object_key", object_key)

            adapter = adapted.get(cls_name)
            if not adapter:
                # FIXME: log to sentry?
                continue

            if object_key in updated:
                # don't add the same document twice in the same transaction. The writer
                # will not delete previous records, ending up with duplicate records for
                # the same document.
                continue

            if op in ("new", "changed"):
                with session.begin(nested=True):
                    obj = adapter.retrieve(pk, _session=session, **data)

                if obj is None:
                    # deleted after task queued, but before task run
                    continue

                document = service.get_document(obj, adapter)
                try:
                    writer.add_document(**document)
                except ValueError:
                    # logger is here to give us more info in order to catch a weird bug
                    # that happens regularly on CI but is not reliably
                    # reproducible.
                    logger.error("writer.add_document(%r)", document, exc_info=True)
                    raise
                updated.add(object_key)
    except Exception:
        writer.cancel()
        raise

    session.close()
    writer.commit()
    try:
        # async thread: wait for its termination
        writer.join()
    except RuntimeError:
        # happens when actual writer was already available: asyncwriter didn't need
        # to start a thread
        pass
Example #58
    def newPageCore(self, item, newPageName, useTemplate=False, templateTitle=None, templateBody=None):
        pagePath = os.path.join(self.notePath, self.itemToPage(item)).replace(os.sep, "/")
        if not newPageName:
            if useTemplate:
                dialog = mikitemplate.PickTemplateDialog(pagePath, self.settings, parent=self)
                if dialog.exec_():
                    curTitleIdx = dialog.titleTemplates.currentIndex()
                    curBodyIdx = dialog.bodyTemplates.currentIndex()
                    dtnow = datetime.datetime.now()
                    if curTitleIdx > -1:
                        titleItem = dialog.titleTemplates.model().item(curTitleIdx)
                        titleItemContent = titleItem.data(TTPL_COL_DATA)
                        titleItemType = titleItem.data(TTPL_COL_EXTRA_DATA)
                        titleParameter = dialog.titleTemplateParameter.text()
                        newPageName = mikitemplate.makeTemplateTitle(
                            titleItemType, titleItemContent, dtnow=dtnow, userinput=titleParameter
                        )
                    if curBodyIdx > -1:
                        bodyItemIdx = dialog.bodyTemplates.rootModelIndex().child(curBodyIdx, 0)
                        bodyFPath = dialog.bodyTemplates.model().filePath(bodyItemIdx)
                    else:
                        bodyFPath = None
            else:
                dialog = LineEditDialog(pagePath, self)
                if dialog.exec_():
                    newPageName = dialog.editor.text()

        prevparitem = None

        if newPageName:
            if hasattr(item, "text"):
                pagePath = os.path.join(self.notePath, pagePath + "/").replace(os.sep, "/")
            if not QtCore.QDir(pagePath).exists():
                QtCore.QDir(self.notePath).mkdir(pagePath)

            if not QtCore.QDir(os.path.dirname(newPageName)).exists():
                curdirname = os.path.dirname(newPageName)
                needed_parents = []
                while curdirname != "":
                    needed_parents.append(curdirname)
                    curdirname = os.path.dirname(curdirname)

                # create the needed hierarchy in reverse order
                for i, needed_parent in enumerate(needed_parents[::-1]):
                    paritem = self.pageToItem(needed_parent)
                    if paritem is None:
                        if i == 0:
                            self.newPageCore(item, os.path.basename(needed_parent))
                        else:
                            self.newPageCore(prevparitem, os.path.basename(needed_parent))
                        QtCore.QDir(pagePath).mkdir(needed_parent)
                    elif not QtCore.QDir(os.path.join(self.notePath, needed_parent).replace(os.sep, "/")).exists():
                        QtCore.QDir(pagePath).mkdir(needed_parent)
                    if paritem is not None:
                        prevparitem = paritem
                    else:
                        prevparitem = self.pageToItem(needed_parent)

            fileName = pagePath + newPageName + self.settings.fileExt
            fh = QtCore.QFile(fileName)
            fh.open(QtCore.QIODevice.WriteOnly)

            savestream = QtCore.QTextStream(fh)
            if useTemplate and bodyFPath is not None:
                with open(bodyFPath, "r", encoding="utf-8") as templatef:
                    savestream << mikitemplate.makeTemplateBody(
                        os.path.basename(newPageName),
                        dtnow=dtnow,
                        dt_in_body_txt=self.tr("Created {}"),
                        body=templatef.read(),
                    )
            else:
                savestream << mikitemplate.makeDefaultBody(os.path.basename(newPageName), self.tr("Created {}"))
            fh.close()
            if prevparitem is not None:
                QtWidgets.QTreeWidgetItem(prevparitem, [os.path.basename(newPageName)])
            else:
                QtWidgets.QTreeWidgetItem(item, [os.path.basename(newPageName)])
            newItem = self.pageToItem(pagePath + newPageName)
            self.sortItems(0, Qt.AscendingOrder)
            self.setCurrentItem(newItem)
            if hasattr(item, "text"):
                self.expandItem(item)

            # create attachment folder if not exist
            attDir = self.itemToAttachmentDir(newItem)
            if not QtCore.QDir(attDir).exists():
                QtCore.QDir().mkpath(attDir)

            # TODO improvement needed, can be reused somehow
            with open(fileName, "r") as fileobj:
                content = fileobj.read()

            self.ix = open_dir(self.settings.indexdir)
            # writer = self.ix.writer()
            writer = AsyncWriter(self.ix)
            writer.add_document(path=pagePath + newPageName, content=content)
            writer.commit()
Example #59
        with writer as writer:
            writer.update_document(**doc)  # update, because store_revision() may give us an existing revid
        doc = backend_to_index(meta, content, self.schemas[LATEST_REVS], self.wikiname, backend_name)
        if async:
            writer = AsyncWriter(self.ix[LATEST_REVS])
        else:
            writer = self.ix[LATEST_REVS].writer()
        with writer as writer:
            writer.update_document(**doc)

    def remove_revision(self, revid, async=True):
        """
        Remove a single revision from indexes.
        """
        if async:
            writer = AsyncWriter(self.ix[ALL_REVS])
        else:
            writer = self.ix[ALL_REVS].writer()
        with writer as writer:
            writer.delete_by_term(REVID, revid)
        if async:
            writer = AsyncWriter(self.ix[LATEST_REVS])
        else:
            writer = self.ix[LATEST_REVS].writer()
        with writer as writer:
            # find out itemid related to the revid we want to remove:
            with self.ix[LATEST_REVS].searcher() as searcher:
                docnum_remove = searcher.document_number(revid=revid)
                if docnum_remove is not None:
                    itemid = searcher.stored_fields(docnum_remove)[ITEMID]
            if docnum_remove is not None:
Example #60
                'url': entry.link,
                'tags': get_entry_tags(entry),
                'when': when
            })

        if not len(entries):
            return

        log.debug("%s - %d entries in %fs" %
                  (netloc, len(entries), time.time() - now))
        now = time.time()

        records = 0
        now = time.time()
        ix = open_dir(settings.index)
        writer = AsyncWriter(ix)

        for entry in entries:
            try:
                item = Item.get(guid=entry['guid'])
            except Item.DoesNotExist:
                item = Item.create(**entry)
            records += 1

            if len(entry['html']):
                soup = BeautifulSoup(entry['html'], settings.fetcher.parser)
                plaintext = ''.join(soup.find_all(text=True))
                writer.add_document(id=item.id,
                                    guid=unicode(item.guid),
                                    title=entry['title'],
                                    text=plaintext,