예제 #1
0
 def test_get_terms(self):
     """Check the terms ``IndexProcessor.get_terms()`` extracts from names.

     Every case pairs an input name with the list of terms expected for
     it.  The returned terms must match that list element for element,
     in any order.
     """
     test_data = (
         (u'Arachne', [u'arachne']),
         (u'arachne1.0', [u'arachne', u'1.0']),
         (u'Yasser González Fernández',
          [u'yasser', u'gonzález', u'gonzalez', u'fernández',
           u'fernandez']),
         (u'Python-3.0rc1.tar.bz2', [u'python', u'3.0', u'1', u'tar',
                                     u'2']),
         (u'07. (Let me be your) Teddy bear.mp3',
          [u'let', u'your', u'teddy', u'bear', u'3']),
         (u'dive_into_python.zip', [u'dive', u'into', u'python', u'zip']),
         (u'AFewCamelCasedWords',
          [u'afewcamelcasedwords', u'few', u'camel', u'cased', u'words']),
         (u'It should ignore this: ! # &.', [u'should', u'ignore',
                                             u'this']),
         (u'Please, please me', [u'please']),
         (u'/Books/Programming/Python/dive_into_python',
          [u'books', u'programming', u'python', u'dive', u'into']),
         (u'/Music/The Beatles/Meet The Beatles!',
          [u'music', u'the', u'beatles', u'meet']),
         (u'The C Programming Language',
          [u'the', u'c', u'programming', u'language']),
     )
     for basename, right_terms in test_data:
         # Consume a copy so the expected data above is left untouched.
         pending_terms = list(right_terms)
         for term in IndexProcessor.get_terms(basename):
             self.assertTrue(
                 term in pending_terms,
                 u'unexpected term %r for %r' % (term, basename))
             pending_terms.remove(term)
         # Comparing against [] makes a failure list the missing terms.
         self.assertEqual(pending_terms, [])
예제 #2
0
    def test_get_terms(self):
        """Check the terms ``IndexProcessor.get_terms()`` extracts from names.

        Every case pairs an input name with the list of terms expected for
        it.  The returned terms must match that list element for element,
        in any order.
        """
        test_data = (
            (u'Arachne',
             [u'arachne']),

            (u'arachne1.0',
             [u'arachne', u'1.0']),

            (u'Yasser González Fernández',
             [u'yasser', u'gonzález', u'gonzalez', u'fernández', u'fernandez']),

            (u'Python-3.0rc1.tar.bz2',
             [u'python', u'3.0', u'1', u'tar', u'2']),

            (u'07. (Let me be your) Teddy bear.mp3',
             [u'let', u'your', u'teddy', u'bear', u'3']),

            (u'dive_into_python.zip',
             [u'dive', u'into', u'python', u'zip']),

            (u'AFewCamelCasedWords',
             [u'afewcamelcasedwords', u'few', u'camel', u'cased', u'words']),

            (u'It should ignore this: ! # &.',
             [u'should', u'ignore', u'this']),

            (u'Please, please me',
             [u'please']),

            (u'/Books/Programming/Python/dive_into_python',
             [u'books', u'programming', u'python', u'dive', u'into']),

            (u'/Music/The Beatles/Meet The Beatles!',
             [u'music', u'the', u'beatles', u'meet']),

            (u'The C Programming Language',
             [u'the', u'c', u'programming', u'language']),
        )
        for basename, right_terms in test_data:
            # Tick terms off a copy; the expected lists stay intact.
            unmatched = list(right_terms)
            for term in IndexProcessor.get_terms(basename):
                self.assertTrue(
                    term in unmatched,
                    u'unexpected term %r for %r' % (term, basename))
                unmatched.remove(term)
            # Whatever is left was expected but never produced; comparing
            # with [] shows those terms in the failure message.
            self.assertEqual(unmatched, [])
예제 #3
0
 def test_get_terms(self):
     """Check the terms ``IndexProcessor.get_terms()`` extracts from names.

     Every case pairs an input name with the list of terms expected for
     it.  The returned terms must match that list element for element,
     in any order.
     """
     test_data = (
         (u"Arachne", [u"arachne"]),
         (u"arachne1.0", [u"arachne", u"1.0"]),
         (u"Yasser González Fernández", [u"yasser", u"gonzález", u"gonzalez", u"fernández", u"fernandez"]),
         (u"Python-3.0rc1.tar.bz2", [u"python", u"3.0", u"1", u"tar", u"2"]),
         (u"07. (Let me be your) Teddy bear.mp3", [u"let", u"your", u"teddy", u"bear", u"3"]),
         (u"dive_into_python.zip", [u"dive", u"into", u"python", u"zip"]),
         (u"AFewCamelCasedWords", [u"afewcamelcasedwords", u"few", u"camel", u"cased", u"words"]),
         (u"It should ignore this: ! # &.", [u"should", u"ignore", u"this"]),
         (u"Please, please me", [u"please"]),
         (u"/Books/Programming/Python/dive_into_python", [u"books", u"programming", u"python", u"dive", u"into"]),
         (u"/Music/The Beatles/Meet The Beatles!", [u"music", u"the", u"beatles", u"meet"]),
         (u"The C Programming Language", [u"the", u"c", u"programming", u"language"]),
     )
     for basename, right_terms in test_data:
         # Remove matched terms from a copy, not from the expected data.
         expected_left = list(right_terms)
         for term in IndexProcessor.get_terms(basename):
             self.assertTrue(
                 term in expected_left,
                 u"unexpected term %r for %r" % (term, basename))
             expected_left.remove(term)
         # Any leftover is a missing term; [] comparison reports it.
         self.assertEqual(expected_left, [])
예제 #4
0
 def _parse_query(self, query, site_ids, filetype):
     """Parse the query string and return a Xapian query.

     The query string is split on whitespace.  Words starting with ``+``
     are required, words starting with ``-`` are excluded and all other
     words are optional.  Every word is tokenized with
     ``IndexProcessor.get_terms()``.

     :param query: Query string to parse.
     :param site_ids: Site identifiers to restrict the search to.  Each
         one is concatenated to ``IndexProcessor.SITE_ID_PREFIX``.  A
         false value disables the site filter.
     :param filetype: ``self.SEARCH_FILE`` or ``self.SEARCH_DIRECTORY`` to
         restrict the results to files or directories.  Any other value
         disables the filetype filter.
     :return: A ``xapian.Query``.  It is the empty ``xapian.Query()`` when
         the query string yields no terms at all.
     """
     # Sort the terms of every word into required, excluded and optional.
     plus_terms = set()
     minus_terms = set()
     normal_terms = set()
     for query_term in query.split():
         if query_term.startswith('+'):
             bucket, query_term = plus_terms, query_term[1:]
         elif query_term.startswith('-'):
             bucket, query_term = minus_terms, query_term[1:]
         else:
             bucket = normal_terms
         # A lone '+' or '-' leaves nothing to tokenize.
         if query_term:
             bucket.update(IndexProcessor.get_terms(query_term))

     def prefixed(prefix, terms):
         # Return the terms with the given index prefix prepended.
         return [prefix + term for term in terms]

     def scaled(prefix, factor):
         # OR the optional terms under a prefix and scale their weight.
         subquery = xapian.Query(xapian.Query.OP_OR,
                                 prefixed(prefix, normal_terms))
         return xapian.Query(xapian.Query.OP_SCALE_WEIGHT, subquery, factor)

     # Required terms: all of them must match the basename.
     plus_query = None
     if plus_terms:
         plus_query = xapian.Query(
             xapian.Query.OP_AND,
             prefixed(IndexProcessor.BASENAME_PREFIX, plus_terms))
     # Excluded terms: any of them matching the basename is enough.
     minus_query = None
     if minus_terms:
         minus_query = xapian.Query(
             xapian.Query.OP_OR,
             prefixed(IndexProcessor.BASENAME_PREFIX, minus_terms))
     # Optional terms: searched in content, basename and dirname, scaled
     # by 20, 10 and 2 respectively, plus their stemmed forms.
     normal_query = None
     stemmed_query = None
     if normal_terms:
         content_query = scaled(IndexProcessor.CONTENT_PREFIX, 20)
         basename_query = scaled(IndexProcessor.BASENAME_PREFIX, 10)
         dirname_query = scaled(IndexProcessor.DIRNAME_PREFIX, 2)
         normal_query = xapian.Query(xapian.Query.OP_OR, basename_query,
                                     dirname_query)
         normal_query = xapian.Query(xapian.Query.OP_OR, normal_query,
                                     content_query)
         # Stem every optional term with every configured stemmer.
         stemmed_terms = set()
         for term in normal_terms:
             for stemmer in self._stemmers:
                 stemmed_terms.add(stemmer(term).decode('utf-8'))
         if stemmed_terms:
             stemmed_query = xapian.Query(
                 xapian.Query.OP_OR,
                 prefixed(IndexProcessor.STEM_PREFIX, stemmed_terms))
     # Build the query for the given filetype.
     if filetype == self.SEARCH_FILE:
         filetype_query = xapian.Query(IndexProcessor.IS_DIR_PREFIX
                                       + IndexProcessor.FALSE_VALUE)
     elif filetype == self.SEARCH_DIRECTORY:
         filetype_query = xapian.Query(IndexProcessor.IS_DIR_PREFIX
                                       + IndexProcessor.TRUE_VALUE)
     else:
         filetype_query = None
     # Build the query for the sites.
     site_ids_query = None
     if site_ids:
         site_ids_query = xapian.Query(
             xapian.Query.OP_OR,
             prefixed(IndexProcessor.SITE_ID_PREFIX, site_ids))
     # Build the final query from the sub-queries.
     final_query = plus_query
     if normal_query is not None:
         # There is no stemmed query when no stemmer produced a term
         # (e.g. no stemmers configured); None must not be handed to
         # xapian.Query as a sub-query.
         if stemmed_query is not None:
             common_query = xapian.Query(xapian.Query.OP_OR,
                                         normal_query, stemmed_query)
         else:
             common_query = normal_query
         if final_query is not None:
             # Required terms decide the matches; optional ones only
             # contribute to the weight.
             final_query = xapian.Query(xapian.Query.OP_AND_MAYBE,
                                        final_query, common_query)
         else:
             final_query = common_query
     if minus_query is not None:
         if final_query is None:
             # Only excluded terms were given: subtract them from the
             # query built from the empty term.
             final_query = xapian.Query('')
         final_query = xapian.Query(xapian.Query.OP_AND_NOT,
                                    final_query, minus_query)
     # Query without terms? Return a query that generates an empty MSet.
     if final_query is None:
         return xapian.Query()
     # Apply filters for site and filetype.
     if site_ids_query is not None:
         final_query = xapian.Query(xapian.Query.OP_FILTER,
                                    final_query, site_ids_query)
     if filetype_query is not None:
         final_query = xapian.Query(xapian.Query.OP_FILTER,
                                    final_query, filetype_query)
     return final_query