Beispiel #1
0
 def _updateScores(self, cursor, db_document_id, text):
     """Bring the document_score rows of one document in line with `text`.

     Every distinct word matched by WORDS_RGX in the normalized text is
     handled once, using the start offset of its first match:

     * if a score row for the word already exists, its position is
       updated (and committed) only when it differs;
     * otherwise the word is registered through _ensureWordInDatabase
       and a new DocumentScore row with zeroed counters is inserted.

     `cursor` is the database cursor handed to every commit.
     """
     known_scores = self._getScoresDict(cursor, db_document_id)
     seen_words = set()
     for found in WORDS_RGX.finditer(normalizeText(text)):
         token = found.group(0)
         # Only the first occurrence of a word in the document counts.
         if token in seen_words:
             continue
         seen_words.add(token)
         offset = found.start()

         if token not in known_scores:
             # New word for this document: make sure the Word table has
             # it, then insert a fresh score row.
             self._ensureWordInDatabase(cursor, token)
             fresh_score = DocumentScore(db_document_id=db_document_id,
                                         word=token,
                                         position=offset,
                                         download_count=0.,
                                         relevance=0.,
                                         popularity=0.)
             fresh_score.commit(cursor, update=False)
             continue

         # Existing row: write back only when the position has moved.
         if known_scores[token].position != offset:
             known_scores[token].position = offset
             known_scores[token].commit(cursor, update=True)
Beispiel #2
0
 def _updateScores(self, cursor, db_document_id, text):
     """Insert or update the document_score rows for the words of `text`.

     The text is normalized and scanned with WORDS_RGX. For each word,
     only its first match is considered; the match start offset is
     stored as the word position. Existing rows are re-committed only
     when the stored position differs, and unknown words get a new
     DocumentScore row (after _ensureWordInDatabase has been called for
     them) whose download_count, relevance and popularity are 0.
     """
     stored_by_word = self._getScoresDict(cursor, db_document_id)
     already_handled = set()
     normalized = normalizeText(text)
     for hit in WORDS_RGX.finditer(normalized):
         term = hit.group(0)
         first_occurrence = term not in already_handled
         if not first_occurrence:
             # later occurrences of the same word are ignored
             continue
         already_handled.add(term)
         start = hit.start()
         if term in stored_by_word:
             if stored_by_word[term].position != start:
                 stored_by_word[term].position = start
                 stored_by_word[term].commit(cursor, update=True)
         else:
             # the Word table needs a row before the score row is added
             self._ensureWordInDatabase(cursor, term)
             new_row = DocumentScore(db_document_id=db_document_id,
                                     word=term,
                                     position=start,
                                     download_count=0.,
                                     relevance=0.,
                                     popularity=0.)
             new_row.commit(cursor, update=False)
Beispiel #3
0
    def _updateDownloadStatistics(self, document, words):
        """Record one download of `document` that was found through `words`.

        Steps, all on a single cursor obtained from self._cnx:

        1. increment document.download_count (negative values are
           treated as 0 first) and commit the document;
        2. for every word, fetch-or-create its DocumentScore row for this
           document and its Word row (first element of the
           selectOrInsertWhere result);
        3. add an equal share, 1 / len(words), of the download to every
           Word row and to every DocumentScore row;
        4. recompute each score's popularity (score downloads / word
           downloads) and relevance (score downloads / document
           downloads), each reduced by hoeffding_deviation() of the
           denominator, and commit the score.

        The cursor is always closed; the connection is committed only
        when every step succeeded.

        All shares and ratios are computed with float division so that
        they do not silently floor to 0 under Python 2 integer division.
        """
        cursor = self._cnx.cursor()
        try:
            document.download_count = max(0, document.download_count) + 1
            document.commit(cursor, update=True)
            db_document_id = document.db_document_id
            scores = {}
            wordInfo = {}
            for word in words:
                scores[word] = DocumentScore.selectOrInsertWhere(
                    cursor, db_document_id=db_document_id, word=word)[0]
                wordInfo[word] = Word.selectOrInsertWhere(
                    cursor, word=word)[0]

            for winfo in wordInfo.values():
                # 1.0 forces true division: with `1 / len(words)` a
                # multi-word query would add 0 under Python 2.
                winfo.download_count += 1.0 / len(words)
                winfo.commit(cursor, update=True)

            for word, score in scores.items():
                score.download_count = (max(0, score.download_count)
                                        + 1.0 / len(words))
                winfo_downloads = wordInfo[word].download_count

                score.popularity = (float(score.download_count)
                                    / winfo_downloads)
                score.popularity -= hoeffding_deviation(winfo_downloads)

                score.relevance = (float(score.download_count)
                                   / document.download_count)
                score.relevance -= hoeffding_deviation(
                    document.download_count)

                score.commit(cursor, update=True)
        finally:
            # Release the cursor even when a commit or lookup fails.
            cursor.close()
        self._cnx.commit()
Beispiel #4
0
    def _updateDownloadStatistics(self, document, words):
        """Record one download of `document` that was found through `words`.

        Steps, all on a single cursor obtained from self._cnx:

        1. increment document.download_count (negative values are
           treated as 0 first) and commit the document;
        2. for every word, fetch-or-create its DocumentScore row for this
           document and its Word row (first element of the
           selectOrInsertWhere result);
        3. add an equal share, 1.0 / len(words), of the download to every
           Word row and to every DocumentScore row;
        4. recompute each score's popularity (score downloads / word
           downloads) and relevance (score downloads / document
           downloads), each reduced by hoeffding_deviation() of the
           denominator and then floored at 1e-6, and commit the score.

        The cursor is always closed; the connection is committed only
        when every step succeeded.
        """
        cursor = self._cnx.cursor()
        try:
            document.download_count = max(0, document.download_count) + 1
            document.commit(cursor, update=True)
            db_document_id = document.db_document_id
            scores = {}
            wordInfo = {}
            for word in words:
                scores[word] = DocumentScore.selectOrInsertWhere(
                    cursor, db_document_id=db_document_id, word=word)[0]
                wordInfo[word] = Word.selectOrInsertWhere(
                    cursor, word=word)[0]

            for winfo in wordInfo.values():
                # Same float share as the per-document score below; the
                # integer form `1 / len(words)` adds 0 for multi-word
                # queries under Python 2 and can leave the popularity
                # denominator at 0.
                winfo.download_count += 1.0 / len(words)
                winfo.commit(cursor, update=True)

            for word, score in scores.items():
                score.download_count = (max(0, score.download_count)
                                        + 1.0 / len(words))
                winfo_downloads = wordInfo[word].download_count

                score.popularity = (float(score.download_count)
                                    / winfo_downloads)
                score.popularity -= hoeffding_deviation(winfo_downloads)
                # keep the stored value strictly positive
                score.popularity = max(1e-6, score.popularity)

                score.relevance = (float(score.download_count)
                                   / document.download_count)
                score.relevance -= hoeffding_deviation(
                    document.download_count)
                # keep the stored value strictly positive
                score.relevance = max(1e-6, score.relevance)

                score.commit(cursor, update=True)
        finally:
            # Release the cursor even when a commit or lookup fails.
            cursor.close()
        self._cnx.commit()
Beispiel #5
0
 def _getScoresDict(self, cursor, db_document_id):
     """Return the DocumentScore rows of a document, keyed by word.

     The rows come from DocumentScore.selectWhere() for the given
     `db_document_id`. The result sequence is consumed from its end, so
     if several rows share a word the one that came first in the result
     is the one kept in the mapping.
     """
     pending = DocumentScore.selectWhere(cursor,
                                         db_document_id=db_document_id)
     by_word = {}
     # Drain the result from the tail until nothing is left.
     while pending:
         entry = pending.pop()
         by_word[entry.word] = entry
     return by_word
Beispiel #6
0
 def _getScoresDict(self, cursor, db_document_id):
     """Build a {word: DocumentScore} mapping for one document.

     Rows are selected with DocumentScore.selectWhere() on
     `db_document_id` and popped off the end of the result one at a
     time; a later pop for the same word overwrites the earlier entry.
     """
     rows = DocumentScore.selectWhere(cursor, db_document_id=db_document_id)
     scores_by_word = {}
     # pop() empties `rows` as the mapping is filled
     while rows:
         row = rows.pop()
         scores_by_word[row.word] = row
     return scores_by_word