Example #1
 def _updateScores(self, cursor, db_document_id, text):
     # insert or update rows in the document_score table
     db_scores = self._getScoresDict(cursor, db_document_id)
     doc_scores = {}
     # We update the document_score table only for the first
     # occurrence of the word in the document
     for match in WORDS_RGX.finditer(normalizeText(text)):
         word = match.group(0)
         if word in doc_scores:
             continue
         doc_scores[word] = 0
         position = match.start()
         if word in db_scores:
             if db_scores[word].position != position:
                 db_scores[word].position = position
                 db_scores[word].commit(cursor, update=True)
         else:
             # insert a row in the Word table if required
             self._ensureWordInDatabase(cursor, word)
             db_score = DocumentScore(db_document_id=db_document_id,
                                      word=word,
                                      position=position,
                                      download_count=0.,
                                      relevance=0.,
                                      popularity=0.)
             db_score.commit(cursor, update=False)
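The loop above records only the first occurrence of each word because doc_scores acts as a seen-set. WORDS_RGX and normalizeText are not defined anywhere in this listing; the standalone sketch below reproduces the same scan under the assumption that WORDS_RGX is a plain \w+ word pattern.

 import re

 # Assumption: the project's WORDS_RGX is not shown here; \w+ stands in.
 WORDS_RGX = re.compile(r'\w+', re.UNICODE)

 def first_positions(text):
     positions = {}
     for match in WORDS_RGX.finditer(text):
         word = match.group(0)
         if word not in positions:      # only the first occurrence counts
             positions[word] = match.start()
     return positions

 print(first_positions(u'hello world hello'))
 # first-occurrence offsets: hello at 0, world at 6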
Example #2
 def _updateScores(self, cursor, db_document_id, text):
     # insert or update rows in the document_score table
     db_scores = self._getScoresDict(cursor, db_document_id)
     doc_scores = {}
     # We update the document_score table only for the first
     # occurrence of the word in the document
     for match in WORDS_RGX.finditer(normalizeText(text)):
         word = match.group(0)
         if word in doc_scores:
             continue
         doc_scores[word] = 0
         position = match.start()
         if word in db_scores:
             if db_scores[word].position != position:
                 db_scores[word].position = position
                 db_scores[word].commit(cursor, update=True)
         else:
             # insert a row in the Word table if required
             self._ensureWordInDatabase(cursor, word)
             db_score = DocumentScore(db_document_id=db_document_id,
                                      word=word,
                                      position=position,
                                      download_count=0.,
                                      relevance=0.,
                                      popularity=0.)
             db_score.commit(cursor, update=False)
Example #3
 def testParseHtmlFileWithEncoding(self):
     filename = join(DATADIR, 'encoded.html')
     title, text, links, offset = self.parser.parseFile(filename, 'encoded.html', 'iso-8859-1')
     self.assertEquals(title, 'maille Maay')
     self.assertEquals(normalizeText(text),
                       'hello ete world this is a link and this is another link')
     self.assertEquals(links, ['something.com', 'somethingelse.com'])
Example #4
    def _selectContainingQuery(cls, words):
        words = [normalizeText(unicode(w))
                 for w in words
                 if WORD_MIN_LEN <= len(w) <= WORD_MAX_LEN]
        if not words:
            return '', []

        # Question: what is the HAVING clause supposed to do?
        # Answer: we select all documents containing at least one of the
        # words we are looking for, group the rows by document identifier,
        # and keep only the identifiers that appear once for each word:
        # exactly the documents containing every word.
        query = ("SELECT D.db_document_id, "
                        "D.document_id, "
                        "D.title, "
                        "D.size, "
                        "D.text, "
                        "D.url, "
                        "D.mime_type "
                 "FROM documents D, document_scores DS "
                 "WHERE DS.db_document_id=D.db_document_id "
                     "AND DS.word IN (%s) "
                   "GROUP BY DS.db_document_id "
                   "HAVING count(DS.db_document_id) = %%s" % \
                   (', '.join(['%s'] * len(words))))

        return query, words + [len(words)]
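To make the GROUP BY/HAVING trick concrete: a document containing all N search words contributes N rows to document_scores, so keeping only the groups whose row count equals N selects exactly the documents matching every word. The runnable sketch below demonstrates this; sqlite3 and its ? placeholder style are assumptions, since the project's actual database backend is not shown in this listing.

 import sqlite3

 # Toy document_scores table: document 1 contains both words,
 # document 2 contains only one of them.
 cnx = sqlite3.connect(':memory:')
 cur = cnx.cursor()
 cur.execute('CREATE TABLE document_scores (db_document_id INTEGER, word TEXT)')
 cur.executemany('INSERT INTO document_scores VALUES (?, ?)',
                 [(1, u'hello'), (1, u'world'), (2, u'hello')])
 words = [u'hello', u'world']
 cur.execute('SELECT db_document_id FROM document_scores '
             'WHERE word IN (%s) '
             'GROUP BY db_document_id '
             'HAVING count(db_document_id) = ?'
             % ', '.join(['?'] * len(words)),
             words + [len(words)])
 print(cur.fetchall())   # [(1,)] -- only document 1 matches every word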
Example #5
 def testParseRaw(self):
     html = '<body>%s</body>' % RAW_TEXT
     title, text, links, offset = self.parser.parseString(html)
     # parseString() should return an empty title when none is available in the HTML
     self.assertEquals(title, '')
     self.assertEquals(normalizeText(text), RAW_TEXT.replace(u'é', 'e'))
     self.assertEquals(links, [])
Example #6
 def testParseSimpleHtml(self):
     title, text, links, offset = self.parser.parseString(SIMPLE_HTML)
     self.assertEquals(title, 'maille Maay')
     self.assertEquals(
         normalizeText(text),
         'hello ete world this is a link and this is another link')
     self.assertEquals(links, ['something.com', 'somethingelse.com'])
Example #7
    def _selectContainingQuery(cls, words):
        words = [
            normalizeText(unicode(w)) for w in words
            if WORD_MIN_LEN <= len(w) <= WORD_MAX_LEN
        ]
        if not words:
            return '', []

        # Question: what is the HAVING clause supposed to do?
        # Answer: we select all documents containing at least one of the
        # words we are looking for, group the rows by document identifier,
        # and keep only the identifiers that appear once for each word:
        # exactly the documents containing every word.
        query = ("SELECT D.db_document_id, "
                        "D.document_id, "
                        "D.title, "
                        "D.size, "
                        "D.text, "
                        "D.url, "
                        "D.mime_type "
                 "FROM documents D, document_scores DS "
                 "WHERE DS.db_document_id=D.db_document_id "
                     "AND DS.word IN (%s) "
                   "GROUP BY DS.db_document_id "
                   "HAVING count(DS.db_document_id) = %%s" % \
                   (', '.join(['%s'] * len(words))))

        return query, words + [len(words)]
Example #8
 def render_prevset_url(self, context, data):
     words = WORDS_RGX.findall(
         normalizeText(unicode(context.arg('words'), 'utf-8')))
     offset = int(context.arg('offset', 0))
     if offset:
         offset -= 15
     return 'search?words=%s&offset=%s' % ('+'.join(words), offset)
Example #9
 def testParseRaw(self):
     html = '<body>%s</body>' % RAW_TEXT
     title, text, links, offset = self.parser.parseString(html)
     # parseString() should return an empty title when none is available in the HTML
     self.assertEquals(title, '')
     self.assertEquals(normalizeText(text),
                       RAW_TEXT.replace(u'é', 'e'))
     self.assertEquals(links, [])
Example #10
 def testTitleGuess(self):
     """Make sure the title falls back to the filename when we parse a
        text file or when no title can be found
     """
     title, text, links, offset = self.parser.parseFile(join(DATADIR, "notitle.html"), 'notitle.html')
     self.assertEquals(title, 'notitle.html')
     self.assertEquals(normalizeText(text), "maille maay")
     self.assertEquals(links, [])
Example #11
 def findDocuments(self, query):
     """Find all indexed documents matching the query"""
     words = WORDS_RGX.findall(normalizeText(query))
     self._updateQueryStatistics(words)
     cursor = self._cnx.cursor()
     try:
         return Document.selectContaining(cursor, words)
     finally:
         cursor.close()
Example #12
 def findDocuments(self, query):
     """Find all indexed documents matching the query"""
     words = WORDS_RGX.findall(normalizeText(query))
     self._updateQueryStatistics(words)
     cursor = self._cnx.cursor()
     try:
         return Document.selectContaining(cursor, words)
     finally:
         cursor.close()
Example #13
 def testParseHtmlFileWithEncoding(self):
     filename = join(DATADIR, 'encoded.html')
     title, text, links, offset = self.parser.parseFile(
         filename, 'encoded.html', 'iso-8859-1')
     self.assertEquals(title, 'maille Maay')
     self.assertEquals(
         normalizeText(text),
         'hello ete world this is a link and this is another link')
     self.assertEquals(links, ['something.com', 'somethingelse.com'])
Example #14
 def testTitleGuess(self):
     """Make sure the title falls back to the filename when we parse a
        text file or when no title can be found
     """
     title, text, links, offset = self.parser.parseFile(
         join(DATADIR, "notitle.html"), 'notitle.html')
     self.assertEquals(title, 'notitle.html')
     self.assertEquals(normalizeText(text), "maille maay")
     self.assertEquals(links, [])
Example #15
 def findDocuments(self, query):
     """Find all indexed documents matching the query"""
     # TODO: order results using document_scores information
     words = WORDS_RGX.findall(normalizeText(unicode(query.words)))
     self._updateQueryStatistics(words)
     cursor = self._cnx.cursor()
     try:
         return Document.selectContaining(cursor, words, query.filetype,
                                          query.offset, self.searchInPrivate)
     finally:
         cursor.close()
Example #16
 def notifyDownload(self, db_document_id, query):
     words = WORDS_RGX.findall(normalizeText(query))
     try:
         cursor = self._cnx.cursor()
         try:
             doc = Document.selectWhere(cursor, db_document_id=db_document_id)[0]
         finally:
             cursor.close()
         self._updateDownloadStatistics(doc, words)
         return doc.url
     except IndexError:
         return ''
Example #17
 def notifyDownload(self, db_document_id, query):
     words = WORDS_RGX.findall(normalizeText(query))
     try:
         cursor = self._cnx.cursor()
         try:
             doc = Document.selectWhere(cursor, db_document_id=db_document_id)[0]
         finally:
             cursor.close()
         self._updateDownloadStatistics(doc, words)
         return doc.url
     except IndexError:
         return ''
Example #18
 def testTitleGuess(self): #XXX: complete this with PDF/PS files before commit time !!!
     """Make sure the title falls back to the filename when we parse a
        text file or when no title can be found
     """
     title, text, links, offset = self.parser.parseFile(join(DATADIR, 'latin1.txt'), 'latin1.txt', 'ISO-8859-1')
     self.assertEquals(title, 'latin1.txt')
     self.assertEquals(normalizeText(text), "c'est l'ete")
     self.assertEquals(links, [])
     # Now, PS file
     title, text, links, offset = self.parser.parseFile(join(DATADIR, 'utf8.ps'), 'utf8.ps', 'UTF-8')
     self.assertEquals(title, 'utf8.ps')
     self.assertEquals(links, [])
     # The PDF (yes, it's important to test this too)
     title, text, links, offset = self.parser.parseFile(join(DATADIR, 'utf8.pdf'), 'utf8.pdf', 'UTF-8')
     self.assertEquals(title, 'utf8.pdf')
     self.assertEquals(links, [])
Example #19
 def _selectContainingQuery(cls,
                            words,
                            mimetype=None,
                            offset=0,
                            allowPrivate=False):
     words = [
         normalizeText(unicode(w)) for w in words
         if WORD_MIN_LEN <= len(w) <= WORD_MAX_LEN
     ]
     # XXX mimetype handling is a HACK. It needs to be integrated
     #     nicely in order to handle any kind of restrictions easily
     if mimetype is not None:
         restriction = " AND D.mime_type=%s "
         restrictionParams = [unicode(mimetype)]
     else:
         restriction = ""
         restrictionParams = []
     if not allowPrivate:
         restriction += " AND D.state!=%s "
         restrictionParams.append(cls.PRIVATE_STATE)
     # Question: what is the HAVING clause supposed to do?
     # Answer: we select all documents containing one of the words
     # that we are looking for, group them by their identifier, and
     # only keep those identifiers which appeared once for each word
     # we were looking for.
     # XXX: LIMIT clause should be optional
     query = ("SELECT D.db_document_id, "
                     "D.document_id, "
                     "D.title, "
                     "D.size, "
                     "D.text, "
                     "D.url, "
                     "D.mime_type, "
                     "D.publication_time "
              "FROM documents D, document_scores DS "
              "WHERE DS.db_document_id=D.db_document_id "
              "AND DS.word IN (%s) "
              " %s "
              "GROUP BY DS.db_document_id "
              "HAVING count(DS.db_document_id) = %%s "
              "ORDER BY D.publication_time DESC "
              "LIMIT 15 OFFSET %s" % \
              (', '.join(['%s'] * len(words)), restriction, offset))
     return query, words + restrictionParams + [len(words)]
Example #20
 def testTitleGuess(
         self
 ):  #XXX: complete this with PDF/PS files before commit time !!!
     """Make sure the title falls back to the filename when we parse a
        text file or when no title can be found
     """
     title, text, links, offset = self.parser.parseFile(
         join(DATADIR, 'latin1.txt'), 'latin1.txt', 'ISO-8859-1')
     self.assertEquals(title, 'latin1.txt')
     self.assertEquals(normalizeText(text), "c'est l'ete")
     self.assertEquals(links, [])
     # Now, PS file
     title, text, links, offset = self.parser.parseFile(
         join(DATADIR, 'utf8.ps'), 'utf8.ps', 'UTF-8')
     self.assertEquals(title, 'utf8.ps')
     self.assertEquals(links, [])
     # The PDF (yes, it's important to test this too)
     title, text, links, offset = self.parser.parseFile(
         join(DATADIR, 'utf8.pdf'), 'utf8.pdf', 'UTF-8')
     self.assertEquals(title, 'utf8.pdf')
     self.assertEquals(links, [])
Example #21
 def _selectContainingQuery(cls, words, mimetype=None, offset=0, allowPrivate=False):
     words = [normalizeText(unicode(w)) for w in words if WORD_MIN_LEN <= len(w) <= WORD_MAX_LEN]
     # XXX mimetype handling is a HACK. It needs to be integrated
     #     nicely in order to handle any kind of restrictions easily
     if mimetype is not None:
         restriction = " AND D.mime_type=%s "
         restrictionParams = [unicode(mimetype)]
     else:
         restriction = ""
         restrictionParams = []
     if not allowPrivate:
         restriction += " AND D.state!=%s "
         restrictionParams.append(cls.PRIVATE_STATE)
     # Question: what is the HAVING clause supposed to do?
     # Answer: we select all documents containing one of the words
     # that we are looking for, group them by their identifier, and
     # only keep those identifiers which appeared once for each word
     # we were looking for.
     # XXX: LIMIT clause should be optional
     query = (
         "SELECT D.db_document_id, "
         "D.document_id, "
         "D.title, "
         "D.size, "
         "D.text, "
         "D.url, "
         "D.mime_type, "
         "D.publication_time "
         "FROM documents D, document_scores DS "
         "WHERE DS.db_document_id=D.db_document_id "
         "AND DS.word IN (%s) "
         " %s "
         "GROUP BY DS.db_document_id "
         "HAVING count(DS.db_document_id) = %%s "
         "ORDER BY D.publication_time DESC "
         "LIMIT 15 OFFSET %s" % (", ".join(["%s"] * len(words)), restriction, offset)
     )
     return query, words + restrictionParams + [len(words)]
Example #22
 def testNormalizeText(self):
     text = u"À Paris,\t\x02l'été \nsera   chaud"
     norm = normalizeText(text)
     self.assertEquals(u"a paris, l'ete sera chaud", norm)
     self.assertEquals(unicode, type(norm))
Example #23
 def testNormalizeText(self):
     text = u"À Paris,\t\x02l'été \nsera   chaud"
     norm = normalizeText(text)
     self.assertEquals(u"a paris, l'ete sera chaud", norm)
     self.assertEquals(unicode, type(norm))
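Judging from the two tests above, normalizeText lowercases its input, strips accents, and collapses runs of whitespace and control characters into single spaces. The reimplementation below is an inference from those tests, not the project's actual code.

 import re
 import unicodedata

 def normalize_text_sketch(text):
     # lowercase, then decompose accented characters and drop the marks
     text = unicodedata.normalize('NFKD', text.lower())
     text = u''.join(c for c in text if not unicodedata.combining(c))
     # collapse whitespace and control characters into single spaces
     return re.sub(u'[\\s\\x00-\\x1f]+', u' ', text).strip()

 print(normalize_text_sketch(u"\xc0 Paris,\t\x02l'\xe9t\xe9 \nsera   chaud"))
 # -> a paris, l'ete sera chaud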
Example #24
 def testParseSimpleHtml(self):
     title, text, links, offset = self.parser.parseString(SIMPLE_HTML)
     self.assertEquals(title, 'maille Maay')
     self.assertEquals(normalizeText(text),
                       'hello ete world this is a link and this is another link')
     self.assertEquals(links, ['something.com', 'somethingelse.com'])
Example #25
 def render_nextset_url(self, context, data):
     words = WORDS_RGX.findall(normalizeText(unicode(context.arg('words'), 'utf-8')))
     offset = int(context.arg('offset', 0)) + 15
     return 'search?words=%s&offset=%s' % ('+'.join(words), offset)
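Both pagination renderers (this one and render_prevset_url in Example #8) step the offset in units of 15, matching the LIMIT 15 OFFSET clause built by _selectContainingQuery in Examples #19 and #21. Below is a simplified, runnable sketch of that round trip; the nevow context plumbing is elided, and the max() clamp is an addition not present in the original.

 PAGE_SIZE = 15   # must stay in sync with the LIMIT 15 in the SQL query

 def next_url(words, offset):
     return 'search?words=%s&offset=%s' % ('+'.join(words), offset + PAGE_SIZE)

 def prev_url(words, offset):
     # clamp at zero so the first page never gets a negative offset
     return 'search?words=%s&offset=%s' % ('+'.join(words), max(0, offset - PAGE_SIZE))

 print(next_url(['hello', 'world'], 0))    # search?words=hello+world&offset=15
 print(prev_url(['hello', 'world'], 15))   # search?words=hello+world&offset=0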