Beispiel #1
0
    def indexDocument(self, filename, title, text, fileSize, lastModifiedOn,
                      content_hash, mime_type, state, file_state):
        """Insert or update the database rows describing one indexed file.

        Updates the file_info and documents tables directly and delegates
        the document_score / word bookkeeping to ``_updateScores``.

        The whole operation is committed on success.  If any step raises,
        the transaction is rolled back and the exception is re-raised; the
        cursor is closed in both cases.

        :param filename: lookup key in file_info; also passed to
                         ``_createDocument`` when a new Document is needed
        :param title: passed to ``_createDocument``
        :param text: passed to ``_createDocument`` and to ``_updateScores``
        :param fileSize: passed to ``_createDocument``
        :param lastModifiedOn: stored as the FileInfo ``file_time`` and used
                               for the Document ``publication_time``
        :param content_hash: value compared with / stored as ``document_id``
        :param mime_type: accepted but not used by this implementation
        :param state: stored on the FileInfo and on the Document
        :param file_state: stored on the FileInfo
        """
        # XXX Decide if we can compute the content_hash and mime_type
        # ourselves or if the indexer should do it and pass the values as
        # an argument
        cursor = self._cnx.cursor()
        try:
            # nested try/finally + try/except keeps the form already used
            # elsewhere in this code base
            try:
                # insert or update in table file_info
                fileinfo = FileInfo.selectWhere(cursor, file_name=filename)
                if fileinfo:
                    fileinfo = fileinfo[0]
                    fileinfo.file_time = lastModifiedOn
                    fileinfo.state = state
                    fileinfo.file_state = file_state
                    doc = Document.selectWhere(
                        cursor, db_document_id=fileinfo.db_document_id)
                    if not doc or doc[0].document_id != content_hash:
                        # no document was found or a document with another
                        # content: in both cases, we create a new Document
                        # in database (we don't want to modify the existing
                        # one, because it can be shared by several files)
                        doc = self._createDocument(cursor, content_hash,
                                                   title, text, fileSize,
                                                   lastModifiedOn, filename,
                                                   state)
                        fileinfo.db_document_id = doc.db_document_id
                    else:
                        # document has not changed
                        doc = doc[0]

                    fileinfo.commit(cursor, update=True)

                else:
                    # file unknown
                    # try to find a Document with same hash value
                    doc = Document.selectWhere(cursor,
                                               document_id=content_hash)
                    if doc:
                        doc = doc[0]
                        doc.state = state
                        # never move the publication time backwards
                        doc.publication_time = max(doc.publication_time,
                                                   lastModifiedOn)
                        doc.commit(cursor, update=True)
                    else:
                        doc = self._createDocument(cursor, content_hash,
                                                   title, text, fileSize,
                                                   lastModifiedOn, filename,
                                                   state)
                        # re-select the freshly inserted row
                        doc = Document.selectWhere(
                            cursor, document_id=content_hash)[0]

                    fileinfo = FileInfo(db_document_id=doc.db_document_id,
                                        file_name=filename,
                                        file_time=lastModifiedOn,
                                        state=state,
                                        file_state=file_state)
                    fileinfo.commit(cursor, update=False)

                self._updateScores(cursor, doc.db_document_id, text)
            finally:
                # release the cursor whether or not indexing succeeded
                cursor.close()
            self._cnx.commit()
        except:
            # undo partial writes, then let the caller see the error
            self._cnx.rollback()
            raise
Beispiel #2
0
 def _createDocument(self, cursor, content_hash, title, text, fileSize,
                     lastModifiedOn, filename, state):
     """Insert a new Document row and return it as read back from the
     database.

     The row is created with zeroed ``download_count`` and ``matching``
     values and ``indexed`` set to ``'1'``; it is then re-selected by its
     ``document_id`` (the content hash) and the first match is returned.
     """
     attributes = dict(document_id=content_hash,
                       title=title,
                       text=text,
                       size=fileSize,
                       publication_time=lastModifiedOn,
                       download_count=0.,
                       url=filename,
                       matching=0.,
                       indexed='1',
                       state=state)
     # update=False: this is an insertion, not a modification
     Document(**attributes).commit(cursor, update=False)
     stored = Document.selectWhere(cursor, document_id=content_hash)
     return stored[0]
Beispiel #3
0
 def _createDocument(self, cursor, content_hash, title, text, fileSize,
                     lastModifiedOn, filename, state):
     """Store a brand new Document and hand back the stored row.

     ``download_count`` and ``matching`` start at ``0.`` and ``indexed``
     at ``'1'``.  After the insert, the row is fetched again through
     ``Document.selectWhere`` using the content hash as ``document_id``
     and the first result is returned.
     """
     fields = dict(document_id=content_hash,
                   title=title,
                   text=text,
                   size=fileSize,
                   publication_time=lastModifiedOn,
                   download_count=0.,
                   url=filename,
                   matching=0.,
                   indexed='1',
                   state=state)
     new_document = Document(**fields)
     # update=False requests an insert rather than an update
     new_document.commit(cursor, update=False)
     matches = Document.selectWhere(cursor, document_id=content_hash)
     return matches[0]
Beispiel #4
0
    def _createDocument(self, cursor, futureDoc):
        """Insert a Document row built from `futureDoc` and return the row
        as read back from the database.

        The stored text is cut to at most ``MAX_STORED_SIZE`` characters.
        ``download_count`` and ``matching`` start at ``0.`` and ``indexed``
        at ``'1'``.
        """
        attributes = dict(document_id=futureDoc.content_hash,
                          title=futureDoc.title,
                          text=futureDoc.text[:MAX_STORED_SIZE],
                          size=futureDoc.fileSize,
                          publication_time=futureDoc.lastModificationTime,
                          download_count=0.,
                          url=futureDoc.filename,
                          mime_type=futureDoc.mime_type,
                          matching=0.,
                          indexed='1',
                          state=futureDoc.state)
        # update=False: this is an insertion, not a modification
        Document(**attributes).commit(cursor, update=False)
        stored = Document.selectWhere(cursor,
                                      document_id=futureDoc.content_hash)
        return stored[0]
Beispiel #5
0
    def _createDocument(self, cursor, futureDoc):
        """Store a new Document described by `futureDoc` and return the
        stored row.

        Only the first ``MAX_STORED_SIZE`` characters of the text are kept.
        After the insert the row is fetched again by ``document_id`` (the
        content hash of `futureDoc`) and the first result is returned.
        """
        fields = dict(document_id=futureDoc.content_hash,
                      title=futureDoc.title,
                      text=futureDoc.text[:MAX_STORED_SIZE],
                      size=futureDoc.fileSize,
                      publication_time=futureDoc.lastModificationTime,
                      download_count=0.,
                      url=futureDoc.filename,
                      mime_type=futureDoc.mime_type,
                      matching=0.,
                      indexed='1',
                      state=futureDoc.state)
        new_document = Document(**fields)
        # update=False requests an insert rather than an update
        new_document.commit(cursor, update=False)
        matches = Document.selectWhere(cursor,
                                       document_id=futureDoc.content_hash)
        return matches[0]
Beispiel #6
0
 def getDocumentCount(self):
     """Return the result of ``Document.getDocumentCount`` for this
     database connection.

     The cursor is obtained *before* entering the ``try`` block: if the
     connection cannot provide one, that error propagates unchanged
     instead of being masked by a failure on the unbound ``cursor`` name
     in the ``finally`` clause.
     """
     cursor = self._cnx.cursor()
     try:
         return Document.getDocumentCount(cursor)
     finally:
         cursor.close()
Beispiel #7
0
 def getDocumentCount(self):
     """Return whatever ``Document.getDocumentCount`` computes using a
     cursor on this object's connection.

     The cursor is created outside the ``try`` block so that a failure to
     create it is reported as-is; the ``finally`` clause then only runs
     when there really is a cursor to close.
     """
     cursor = self._cnx.cursor()
     try:
         return Document.getDocumentCount(cursor)
     finally:
         cursor.close()
Beispiel #8
0
 def findDocuments(self, query):
     """Find all indexed documents matching the query.

     The query string is normalized and split into words with
     ``WORDS_RGX``; the words are first reported to
     ``_updateQueryStatistics`` and then handed to
     ``Document.selectContaining``, whose result is returned.

     :param query: the query text
     """
     words = WORDS_RGX.findall(normalizeText(query))
     self._updateQueryStatistics(words)
     # acquire the cursor outside the try block: if this fails there is
     # nothing to close and the original error must not be masked
     cursor = self._cnx.cursor()
     try:
         return Document.selectContaining(cursor, words)
     finally:
         cursor.close()
Beispiel #9
0
 def findDocuments(self, query):
     """Find all indexed documents matching the query.

     Words are extracted from the normalized query with ``WORDS_RGX``,
     passed to ``_updateQueryStatistics`` and then used as the argument
     of ``Document.selectContaining``.  The value returned by that call
     is returned unchanged.

     :param query: the query text
     """
     words = WORDS_RGX.findall(normalizeText(query))
     self._updateQueryStatistics(words)
     # the cursor is created before the try block so that the finally
     # clause never runs without a cursor to close
     cursor = self._cnx.cursor()
     try:
         return Document.selectContaining(cursor, words)
     finally:
         cursor.close()
Beispiel #10
0
 def findDocuments(self, query):
     """Find all indexed documents matching the query.

     :param query: object providing ``words``, ``filetype`` and ``offset``

     The words are extracted from the normalized unicode form of
     ``query.words`` and reported to ``_updateQueryStatistics``.  The
     search itself is done by ``Document.selectContaining``, which also
     receives the query's file type and offset and this object's
     ``searchInPrivate`` attribute.
     """
     # TODO: order results using document_scores information
     words = WORDS_RGX.findall(normalizeText(unicode(query.words)))
     self._updateQueryStatistics(words)
     # acquire the cursor outside the try block: if this fails there is
     # nothing to close and the original error must not be masked
     cursor = self._cnx.cursor()
     try:
         return Document.selectContaining(cursor, words, query.filetype,
                                          query.offset, self.searchInPrivate)
     finally:
         cursor.close()
Beispiel #11
0
 def notifyDownload(self, db_document_id, query):
     """Record a download of a document and return the document's url.

     :param db_document_id: value used to select the Document row
     :param query: query text; it is normalized and split into words
                   which are passed to ``_updateDownloadStatistics``
     :return: ``doc.url``, or ``''`` when an IndexError is raised (for
              instance when no Document matches `db_document_id`)
     """
     words = WORDS_RGX.findall(normalizeText(query))
     try:
         # create the cursor before the inner try block so that the
         # finally clause only runs when there is a cursor to close
         cursor = self._cnx.cursor()
         try:
             doc = Document.selectWhere(cursor, db_document_id=db_document_id)[0]
         finally:
             cursor.close()
         # NOTE(review): an IndexError raised inside
         # _updateDownloadStatistics is also turned into '' by the
         # handler below -- confirm this is intended
         self._updateDownloadStatistics(doc, words)
         return doc.url
     except IndexError:
         return ''
Beispiel #12
0
 def notifyDownload(self, db_document_id, query):
     """Update the download statistics of a document and return its url.

     :param db_document_id: value used to select the Document row
     :param query: query text, normalized and split into the words given
                   to ``_updateDownloadStatistics``
     :return: ``doc.url``, or ``''`` if an IndexError occurs (e.g. the
              selection by `db_document_id` yields no row)
     """
     words = WORDS_RGX.findall(normalizeText(query))
     try:
         # the cursor is obtained outside the inner try block, so a
         # failure to obtain it is not masked by the finally clause
         cursor = self._cnx.cursor()
         try:
             doc = Document.selectWhere(cursor, db_document_id=db_document_id)[0]
         finally:
             cursor.close()
         # NOTE(review): the IndexError handler below also covers this
         # call -- confirm that swallowing its IndexErrors is wanted
         self._updateDownloadStatistics(doc, words)
         return doc.url
     except IndexError:
         return ''
Beispiel #13
0
 def notifyDownload(self, document_id, words):
     """Record a download of a document and return the document's url.

     :param document_id: value used to select the Document row
     :param words: passed unchanged to ``_updateDownloadStatistics``
     :return: ``doc.url``
     :raise IndexError: when no Document matches `document_id`
     """
     # parenthesised form prints the same text under Python 2 and 3
     print("Querier notifyDownloads %s with %s" % (document_id, words))
     # acquire the cursor outside the try block: if this fails there is
     # nothing to close and the original error must not be masked
     cursor = self._cnx.cursor()
     try:
         doc = Document.selectWhere(cursor, document_id=document_id)[0]
     finally:
         cursor.close()
     self._updateDownloadStatistics(doc, words)
     return doc.url
Beispiel #14
0
 def notifyDownload(self, document_id, words):
     """Update the download statistics of a document and return its url.

     :param document_id: value used to select the Document row
     :param words: handed as-is to ``_updateDownloadStatistics``
     :return: ``doc.url``
     :raise IndexError: if the selection by `document_id` yields no row
     """
     # single parenthesised argument: identical output on Python 2 and 3
     print("Querier notifyDownloads %s with %s" % (document_id, words))
     # the cursor is created before the try block so that the finally
     # clause never runs without a cursor to close
     cursor = self._cnx.cursor()
     try:
         doc = Document.selectWhere(cursor, document_id=document_id)[0]
     finally:
         cursor.close()
     self._updateDownloadStatistics(doc, words)
     return doc.url
Beispiel #15
0
    def indexDocument(self, filename, title, text, fileSize, lastModifiedOn,
                      content_hash, mime_type, state, file_state):
        """Insert or update the database rows describing one indexed file.

        Works on the file_info and documents tables directly; the
        document_score / word bookkeeping is delegated to
        ``_updateScores``.

        On success the transaction is committed.  On any error it is
        rolled back and the exception is re-raised.  The cursor is closed
        in both cases.

        :param filename: lookup key in file_info; also passed to
                         ``_createDocument`` when a new Document is needed
        :param title: passed to ``_createDocument``
        :param text: passed to ``_createDocument`` and to ``_updateScores``
        :param fileSize: passed to ``_createDocument``
        :param lastModifiedOn: stored as the FileInfo ``file_time`` and used
                               for the Document ``publication_time``
        :param content_hash: value compared with / stored as ``document_id``
        :param mime_type: accepted but not used by this implementation
        :param state: stored on the FileInfo and on the Document
        :param file_state: stored on the FileInfo
        """
        # XXX Decide if we can compute the content_hash and mime_type
        # ourselves or if the indexer should do it and pass the values as
        # an argument
        cursor = self._cnx.cursor()
        try:
            # nested try/finally + try/except keeps the form already used
            # elsewhere in this code base
            try:
                # insert or update in table file_info
                fileinfo = FileInfo.selectWhere(cursor,
                                                file_name=filename)
                if fileinfo:
                    fileinfo = fileinfo[0]
                    fileinfo.file_time = lastModifiedOn
                    fileinfo.state = state
                    fileinfo.file_state = file_state
                    doc = Document.selectWhere(
                        cursor, db_document_id=fileinfo.db_document_id)
                    if not doc or doc[0].document_id != content_hash:
                        # no document was found or a document with another
                        # content: in both cases, we create a new Document
                        # in database (we don't want to modify the existing
                        # one, because it can be shared by several files)
                        doc = self._createDocument(cursor,
                                                   content_hash,
                                                   title,
                                                   text,
                                                   fileSize,
                                                   lastModifiedOn,
                                                   filename,
                                                   state)
                        fileinfo.db_document_id = doc.db_document_id
                    else:
                        # document has not changed
                        doc = doc[0]

                    fileinfo.commit(cursor, update=True)

                else:
                    # file unknown
                    # try to find a Document with same hash value
                    doc = Document.selectWhere(cursor,
                                               document_id=content_hash)
                    if doc:
                        doc = doc[0]
                        doc.state = state
                        # never move the publication time backwards
                        doc.publication_time = max(doc.publication_time,
                                                   lastModifiedOn)
                        doc.commit(cursor, update=True)
                    else:
                        doc = self._createDocument(cursor,
                                                   content_hash,
                                                   title,
                                                   text,
                                                   fileSize,
                                                   lastModifiedOn,
                                                   filename,
                                                   state)
                        # re-select the freshly inserted row
                        doc = Document.selectWhere(
                            cursor, document_id=content_hash)[0]

                    fileinfo = FileInfo(db_document_id=doc.db_document_id,
                                        file_name=filename,
                                        file_time=lastModifiedOn,
                                        state=state,
                                        file_state=file_state)
                    fileinfo.commit(cursor, update=False)

                self._updateScores(cursor, doc.db_document_id, text)
            finally:
                # release the cursor whether or not indexing succeeded
                cursor.close()
            self._cnx.commit()
        except:
            # undo partial writes, then let the caller see the error
            self._cnx.rollback()
            raise
Beispiel #16
0
 def testAddMatch(self):
     """addMatch() must record the id of the matched document in
     ``documents_ids``."""
     doc = Document(document_id='0' * 40)
     self.query.addMatch(doc.__dict__)
     # assertTrue replaces the deprecated failUnless alias
     self.assertTrue('0' * 40 in self.query.documents_ids)
Beispiel #17
0
    def indexDocument(self, nodeId, futureDoc):
        """Insert or update the database rows describing one indexed file
        provided by a node.

        Updates the file_info and documents tables directly, delegates the
        document_score / word bookkeeping to ``_updateScores``, then
        refreshes the provider's ``last_providing_time`` and the node's
        ``last_seen_time``.

        The whole operation is committed on success.  If any step raises,
        the transaction is rolled back and the exception is re-raised; the
        cursor is closed in both cases.

        :param nodeId: ``node_id`` used to select the DocumentProvider and
                       Node rows
        :param futureDoc: object providing ``filename``, ``title``,
                          ``text``, ``content_hash``, ``state``,
                          ``file_state`` and ``lastModificationTime``.
                          Its ``text`` attribute is modified in place (the
                          title is prepended).
        :raise IndexError: when no Node row matches `nodeId`
        """
        # XXX Decide if we can compute the content_hash and mime_type
        # ourselves or if the indexer should do it and
        # pass the values as an argument
        cursor = self._cnx.cursor()
        try:
            # nested try/finally + try/except keeps the form already used
            # elsewhere in this code base
            try:
                # insert or update in table file_info
                fileinfo = FileInfo.selectWhere(cursor,
                                                file_name=futureDoc.filename)
                # insert title into text to be able to find documents
                # according to their title (e.g: searching 'foo' should
                # find 'foo.pdf')
                # NOTE(review): this mutates the caller's object, so
                # indexing the same futureDoc twice prepends the title
                # twice -- confirm callers never reuse it
                futureDoc.text = '%s %s' % (futureDoc.title, futureDoc.text)
                if fileinfo:
                    fileinfo = fileinfo[0]
                    fileinfo.file_time = futureDoc.lastModificationTime
                    fileinfo.state = futureDoc.state
                    fileinfo.file_state = futureDoc.file_state
                    doc = Document.selectWhere(
                        cursor, db_document_id=fileinfo.db_document_id)
                    if not doc or doc[0].document_id != futureDoc.content_hash:
                        # no document was found or a document with another
                        # content: in both cases, we create a new Document
                        # in database (we don't want to modify the existing
                        # one, because it can be shared by several files)
                        doc = self._createDocument(cursor, futureDoc)
                        fileinfo.db_document_id = doc.db_document_id
                    else:
                        # document has not changed
                        doc = doc[0]
                        if doc.state != futureDoc.state:
                            doc.state = futureDoc.state
                            doc.commit(cursor, update=True)

                    fileinfo.commit(cursor, update=True)

                else:
                    # file unknown
                    # try to find a Document with same hash value
                    doc = Document.selectWhere(
                        cursor, document_id=futureDoc.content_hash)
                    if doc:
                        doc = doc[0]
                        doc.state = futureDoc.state
                        # never move the publication time backwards
                        doc.publication_time = max(
                            doc.publication_time,
                            futureDoc.lastModificationTime)
                        doc.commit(cursor, update=True)
                    else:
                        doc = self._createDocument(cursor, futureDoc)
                        # re-select the freshly inserted row
                        doc = Document.selectWhere(
                            cursor, document_id=futureDoc.content_hash)[0]

                    fileinfo = FileInfo(
                        db_document_id=doc.db_document_id,
                        file_name=futureDoc.filename,
                        file_time=futureDoc.lastModificationTime,
                        state=futureDoc.state,
                        file_state=futureDoc.file_state)
                    fileinfo.commit(cursor, update=False)

                self._updateScores(cursor, doc.db_document_id, futureDoc.text)
                provider = DocumentProvider.selectOrInsertWhere(
                    cursor,
                    db_document_id=doc.db_document_id,
                    node_id=nodeId)[0]
                provider.last_providing_time = int(time.time())
                provider.commit(cursor, update=True)
                node = Node.selectWhere(cursor, node_id=nodeId)[0]
                node.last_seen_time = int(time.time())
                node.commit(cursor, update=True)
            finally:
                # release the cursor whether or not indexing succeeded
                cursor.close()
            self._cnx.commit()
        except:
            # undo partial writes, then let the caller see the error
            self._cnx.rollback()
            raise
Beispiel #18
0
 def testIsKnown(self):
     """isKnown() must be true for a document previously given to
     addMatch() and false for a document with another id."""
     doc = Document(document_id='0' * 40)
     self.query.addMatch(doc.__dict__)
     # assertTrue / assertFalse replace the deprecated failUnless /
     # failIf aliases
     self.assertTrue(self.query.isKnown(doc.__dict__))
     self.assertFalse(self.query.isKnown(
         Document(document_id='1' * 40).__dict__))
Beispiel #19
0
 def indexDocument(self, nodeId, futureDoc):
     """Inserts or update information in table documents,
     file_info, document_score and word

     The transaction is committed on success.  On any error it is rolled
     back and the exception is re-raised; the cursor is closed in both
     cases.

     :type nodeId: node_id or None if working locally
     :param futureDoc: object providing ``filename``, ``title``, ``text``,
                       ``content_hash``, ``state``, ``file_state`` and
                       ``lastModificationTime``.  Its ``text`` attribute
                       is modified in place (the title is prepended).
     :raise ValueError: when `nodeId` is not None and no Node row
                        matches it
     """
     # XXX Decide if we can compute the content_hash and mime_type
     # ourselves or if the indexer should do it and
     # pass the values as an argument
     cursor = self._cnx.cursor()
     try:
         # the inner try/finally closes the cursor on every path, so the
         # error branches no longer have to do it by hand
         try:
             # insert or update in table file_info
             fileinfo = FileInfo.selectWhere(cursor,
                                             file_name=futureDoc.filename)
             # insert title into text to be able to find documents
             # according to their title (e.g: searching 'foo' should find
             # 'foo.pdf')
             # NOTE(review): this mutates the caller's object, so indexing
             # the same futureDoc twice prepends the title twice --
             # confirm callers never reuse it
             futureDoc.text = '%s %s' % (futureDoc.title, futureDoc.text)
             if fileinfo:
                 fileinfo = fileinfo[0]
                 fileinfo.file_time = futureDoc.lastModificationTime
                 fileinfo.state = futureDoc.state
                 fileinfo.file_state = futureDoc.file_state
                 doc = Document.selectWhere(
                     cursor, db_document_id=fileinfo.db_document_id)
                 if not doc or doc[0].document_id != futureDoc.content_hash:
                     # no document was found or a document with another
                     # content: in both cases, we create a new Document in
                     # database (we don't want to modify the existing one,
                     # because it can be shared by several files)
                     doc = self._createDocument(cursor, futureDoc)
                     fileinfo.db_document_id = doc.db_document_id
                 else:
                     # document has not changed
                     doc = doc[0]
                     if doc.state != futureDoc.state:
                         doc.state = futureDoc.state
                         doc.commit(cursor, update=True)
                 fileinfo.commit(cursor, update=True)
             else:
                 # file unknown
                 # try to find a Document with same hash value
                 doc = Document.selectWhere(
                     cursor, document_id=futureDoc.content_hash)
                 if doc:
                     doc = doc[0]
                     doc.state = futureDoc.state
                     # never move the publication time backwards
                     doc.publication_time = max(
                         doc.publication_time,
                         futureDoc.lastModificationTime)
                     doc.commit(cursor, update=True)
                 else:
                     doc = self._createDocument(cursor, futureDoc)
                     # re-select the freshly inserted row
                     doc = Document.selectWhere(
                         cursor, document_id=futureDoc.content_hash)[0]
                 fileinfo = FileInfo(
                     db_document_id=doc.db_document_id,
                     file_name=futureDoc.filename,
                     file_time=futureDoc.lastModificationTime,
                     state=futureDoc.state,
                     file_state=futureDoc.file_state)
                 fileinfo.commit(cursor, update=False)
             self._updateScores(cursor, doc.db_document_id, futureDoc.text)
             # update last seen time only if not working locally
             if nodeId is not None:
                 provider = DocumentProvider.selectOrInsertWhere(
                     cursor,
                     db_document_id=doc.db_document_id,
                     node_id=nodeId)[0]
                 provider.last_providing_time = int(time.time())
                 provider.commit(cursor, update=True)
                 nodes = Node.selectWhere(cursor, node_id=nodeId)
                 if not nodes:
                     # the handlers below take care of closing the cursor
                     # and rolling back; the 1-tuple keeps the formatting
                     # safe even if nodeId is itself a tuple
                     raise ValueError(
                         'provider %s is not registered in our database !'
                         % (nodeId,))
                 node = nodes[0]
                 node.last_seen_time = int(time.time())
                 node.commit(cursor, update=True)
         finally:
             cursor.close()
         self._cnx.commit()
     except:
         # undo partial writes, then let the caller see the error
         self._cnx.rollback()
         raise