def test_get_terms(self):
    """Check that IndexProcessor.get_terms() returns exactly the expected terms.

    Each case pairs an input name with the list of terms expected for it.
    Every returned term must match one expected term that has not been
    matched yet, and no expected term may be left over, so both unexpected
    (or repeated) terms and missing terms make the test fail.
    """
    test_data = (
        (u'Arachne',
         [u'arachne']),
        (u'arachne1.0',
         [u'arachne', u'1.0']),
        (u'Yasser González Fernández',
         [u'yasser', u'gonzález', u'gonzalez', u'fernández', u'fernandez']),
        (u'Python-3.0rc1.tar.bz2',
         [u'python', u'3.0', u'1', u'tar', u'2']),
        (u'07. (Let me be your) Teddy bear.mp3',
         [u'let', u'your', u'teddy', u'bear', u'3']),
        (u'dive_into_python.zip',
         [u'dive', u'into', u'python', u'zip']),
        (u'AFewCamelCasedWords',
         [u'afewcamelcasedwords', u'few', u'camel', u'cased', u'words']),
        (u'It should ignore this: ! # &.',
         [u'should', u'ignore', u'this']),
        (u'Please, please me',
         [u'please']),
        (u'/Books/Programming/Python/dive_into_python',
         [u'books', u'programming', u'python', u'dive', u'into']),
        (u'/Music/The Beatles/Meet The Beatles!',
         [u'music', u'the', u'beatles', u'meet']),
        (u'The C Programming Language',
         [u'the', u'c', u'programming', u'language']),
    )
    for basename, right_terms in test_data:
        # Tick terms off a copy so the expectations above stay untouched.
        remaining = list(right_terms)
        for term in IndexProcessor.get_terms(basename):
            # assertIn reports both the term and the candidates on failure.
            self.assertIn(term, remaining)
            remaining.remove(term)
        # Anything still listed here was expected but never returned.
        self.assertEqual(remaining, [])
def test_get_terms(self):
    """Check that IndexProcessor.get_terms() returns exactly the expected terms.

    Each case pairs an input name with the list of terms expected for it.
    Every returned term must match one expected term that has not been
    matched yet, and no expected term may be left over, so both unexpected
    (or repeated) terms and missing terms make the test fail.
    """
    test_data = (
        (u"Arachne", [u"arachne"]),
        (u"arachne1.0", [u"arachne", u"1.0"]),
        (
            u"Yasser González Fernández",
            [u"yasser", u"gonzález", u"gonzalez", u"fernández", u"fernandez"],
        ),
        (u"Python-3.0rc1.tar.bz2", [u"python", u"3.0", u"1", u"tar", u"2"]),
        (
            u"07. (Let me be your) Teddy bear.mp3",
            [u"let", u"your", u"teddy", u"bear", u"3"],
        ),
        (u"dive_into_python.zip", [u"dive", u"into", u"python", u"zip"]),
        (
            u"AFewCamelCasedWords",
            [u"afewcamelcasedwords", u"few", u"camel", u"cased", u"words"],
        ),
        (u"It should ignore this: ! # &.", [u"should", u"ignore", u"this"]),
        (u"Please, please me", [u"please"]),
        (
            u"/Books/Programming/Python/dive_into_python",
            [u"books", u"programming", u"python", u"dive", u"into"],
        ),
        (
            u"/Music/The Beatles/Meet The Beatles!",
            [u"music", u"the", u"beatles", u"meet"],
        ),
        (
            u"The C Programming Language",
            [u"the", u"c", u"programming", u"language"],
        ),
    )
    for basename, right_terms in test_data:
        # Tick terms off a copy so the expectations above stay untouched.
        remaining = list(right_terms)
        for term in IndexProcessor.get_terms(basename):
            # assertIn reports both the term and the candidates on failure.
            self.assertIn(term, remaining)
            remaining.remove(term)
        # Anything still listed here was expected but never returned.
        self.assertEqual(remaining, [])
def _parse_query(self, query, site_ids, filetype):
    """Parse the query string and return a Xapian query.

    The query string is split on whitespace and every word is turned into
    terms with `IndexProcessor.get_terms()`:

    - Terms of words starting with ``+`` are looked up with the basename
      prefix and combined with ``OP_AND``.
    - Terms of words starting with ``-`` are looked up with the basename
      prefix, combined with ``OP_OR`` and subtracted from the result with
      ``OP_AND_NOT``.
    - Terms of all other words are looked up with the content, basename and
      directory name prefixes (weights scaled by 20, 10 and 2 respectively)
      and, after passing through every callable in `self._stemmers`, with
      the stem prefix.

    `site_ids` is a collection of site id strings; when it is not empty the
    result is filtered to those sites.  `filetype` restricts the result to
    files (`self.SEARCH_FILE`) or directories (`self.SEARCH_DIRECTORY`); any
    other value applies no file type filter.

    When the query string yields no terms at all, an empty `xapian.Query()`
    is returned and no filters are applied.
    """
    Query = xapian.Query

    # Parse the query string, sorting the terms of each word into a bucket
    # according to the word's leading sign.
    plus_terms = set()
    minus_terms = set()
    normal_terms = set()
    for word in query.split():
        if word.startswith('+'):
            bucket, word = plus_terms, word[1:]
        elif word.startswith('-'):
            bucket, word = minus_terms, word[1:]
        else:
            bucket = normal_terms
        # A lone '+' or '-' leaves nothing to look up.
        if word:
            bucket.update(IndexProcessor.get_terms(word))

    def prefixed(prefix, terms):
        # Return the terms with the given index prefix prepended.
        return [prefix + term for term in terms]

    def weighted_any(prefix, weight):
        # Match any normal term under `prefix`, scaling its weight.
        any_term = Query(Query.OP_OR, prefixed(prefix, normal_terms))
        return Query(Query.OP_SCALE_WEIGHT, any_term, weight)

    # Build the queries for plus, minus and normal terms.
    plus_query = None
    if plus_terms:
        plus_query = Query(
            Query.OP_AND,
            prefixed(IndexProcessor.BASENAME_PREFIX, plus_terms))

    minus_query = None
    if minus_terms:
        minus_query = Query(
            Query.OP_OR,
            prefixed(IndexProcessor.BASENAME_PREFIX, minus_terms))

    normal_query = None
    if normal_terms:
        content_query = weighted_any(IndexProcessor.CONTENT_PREFIX, 20)
        basename_query = weighted_any(IndexProcessor.BASENAME_PREFIX, 10)
        dirname_query = weighted_any(IndexProcessor.DIRNAME_PREFIX, 2)
        normal_query = Query(Query.OP_OR, basename_query, dirname_query)
        normal_query = Query(Query.OP_OR, normal_query, content_query)

    # Stem normal terms.
    stemmed_terms = set()
    for term in normal_terms:
        for stemmer in self._stemmers:
            stemmed_terms.add(stemmer(term).decode('utf-8'))

    # Build the query for the stemmed terms.  It stays None when there are
    # no normal terms or no stemmers.
    stemmed_query = None
    if stemmed_terms:
        stemmed_query = Query(
            Query.OP_OR,
            prefixed(IndexProcessor.STEM_PREFIX, stemmed_terms))

    # Build the query for the given filetype.
    if filetype == self.SEARCH_FILE:
        filetype_query = Query(
            IndexProcessor.IS_DIR_PREFIX + IndexProcessor.FALSE_VALUE)
    elif filetype == self.SEARCH_DIRECTORY:
        filetype_query = Query(
            IndexProcessor.IS_DIR_PREFIX + IndexProcessor.TRUE_VALUE)
    else:
        filetype_query = None

    # Build the query for the sites.
    site_ids_query = None
    if site_ids:
        site_ids_query = Query(
            Query.OP_OR,
            prefixed(IndexProcessor.SITE_ID_PREFIX, site_ids))

    # Build the final query from the sub-queries.  The sub-queries are
    # either None or a query built from at least one term, so they are
    # compared against None explicitly instead of relying on truthiness.
    final_query = plus_query

    if normal_query is not None:
        if stemmed_query is None:
            # Never hand None to xapian.Query as a sub-query.
            common_query = normal_query
        else:
            common_query = Query(Query.OP_OR, normal_query, stemmed_query)
        if final_query is not None:
            final_query = Query(
                Query.OP_AND_MAYBE, final_query, common_query)
        else:
            final_query = common_query

    if minus_query is not None:
        if final_query is None:
            # Only minus terms were given: subtract them from Query(''),
            # which Xapian treats as matching every document.
            final_query = Query('')
        final_query = Query(Query.OP_AND_NOT, final_query, minus_query)

    # Query without terms? Return a query that generates an empty MSet.
    if final_query is None:
        return Query()

    # Apply filters for site and filetype.
    if site_ids_query is not None:
        final_query = Query(Query.OP_FILTER, final_query, site_ids_query)
    if filetype_query is not None:
        final_query = Query(Query.OP_FILTER, final_query, filetype_query)
    return final_query