def _get_best_description_location(self, descriptions):
    '''Return the first description that matches one term in the query,
    and the location of the first match in its plain text.

    ``descriptions`` is iterated in order; each item must provide
    ``get_description_plain_text()``. The text is searched with the
    regular expression built by ``get_regexp_from_terms`` from
    ``self._get_query_terms(True)``, case-insensitively and with the
    Unicode flag set.

    Returns a ``(description, location)`` pair, where ``location`` is the
    start offset of the first match. Returns ``(None, 0)`` when no regular
    expression could be built from the terms or when no description
    matches.

    TODO: prefer a description and location which contains multiple terms
    next to each other.
    '''
    from digipal.utils import get_regexp_from_terms

    terms = self._get_query_terms(True)
    re_terms = get_regexp_from_terms(terms)
    if not re_terms:
        return None, 0

    # The pattern does not depend on the description: compile it once
    # instead of rebuilding the pattern string on every iteration.
    pattern = re.compile(u'(?ui)' + re_terms)

    for adesc in descriptions:
        m = pattern.search(adesc.get_description_plain_text())
        if m:
            # First matching description wins.
            return adesc, m.start()

    return None, 0
def tag_terms(value, terms=None):
    '''Wrap all occurrences of the terms found in value with a
    <span class="found-term">. Highlight terms in html.

    ``value`` is the HTML (or plain text) to highlight. ``terms`` is an
    array of words; when it is empty or None the value is returned with
    only its combining marks removed.

    Matching is done on an accent-stripped copy of ``value`` while the
    spans are inserted at the same offsets in ``value`` itself. Only text
    content is matched (text following a ``>`` with no ``<`` in between),
    so tag names and attributes are left alone.

    Returns the highlighted string.

    TODO: this should really be done by Whoosh instead of manually here to
    deal properly with special cases, e.g. 'G*' should highlight g and the
    rest of the word. Here we have a simple implementation that looks for
    exact and complete matches only.
    TODO: other issue is highlight of non field values, e.g. (G.) added at
    the end of each description or headings.
    '''
    from digipal.utils import (get_regexp_from_terms, remove_combining_marks,
                               remove_accents)

    # Not nice but we have to do this for the matching below to work.
    # We lose *some* of the accents, e.g. u'r\u0305'.
    value = remove_combining_marks(value)
    if not terms:
        return value

    # NOTE(review): the offsets found in value_no_accent are reused on
    # value, so remove_accents() is assumed to preserve the string length
    # - confirm against digipal.utils.
    value_no_accent = remove_accents(value)

    # The pattern below only matches text that follows a '>'. Prefix both
    # strings with a sentinel '>' so that text located before the first tag
    # (or a value without any tag) is highlighted too. The sentinel is
    # stripped before returning.
    value = u'>' + value
    value_no_accent = u'>' + value_no_accent

    span_open = u'<span class="found-term">'
    span_close = u'</span>'
    span_len = len(span_open) + len(span_close)

    for re_term in get_regexp_from_terms(terms, True):
        pattern = re.compile(u'(?iu)(>[^<]*?)(' + re_term + u')')
        # Position 1 is the first character after the sentinel.
        pos = 1
        while True:
            # pos - 1 because we want to include the last '>' we have
            # inserted in the previous iteration (or the sentinel).
            # Without this we might miss occurrences.
            m = pattern.search(value_no_accent, pos - 1)
            if m is None:
                break

            start, end = m.span(2)
            if start == end:
                # A zero-length match would be found again right after the
                # inserted span, growing the string forever.
                break

            # Insert the span at the same offsets in both strings so they
            # stay aligned for the next search.
            value = (value[:start] + span_open + value[start:end] +
                     span_close + value[end:])
            value_no_accent = (value_no_accent[:start] + span_open +
                               value_no_accent[start:end] + span_close +
                               value_no_accent[end:])

            # Resume right after the closing tag we have just inserted.
            pos = end + span_len

    # Drop the sentinel.
    return value[1:]
def tag_terms(value, terms=None):
    '''Wrap all occurrences of the terms found in value with a
    <span class="found-term">. Highlight terms in html.

    ``value`` is the HTML (or plain text) to highlight. ``terms`` is an
    array of words; when it is empty or None the value is returned with
    only its combining marks removed.

    Matching is done on an accent-stripped copy of ``value`` while the
    spans are inserted at the same offsets in ``value`` itself. Only text
    content is matched (text following a ``>`` with no ``<`` in between),
    so tag names and attributes are left alone.

    Returns the highlighted string.

    TODO: this should really be done by Whoosh instead of manually here to
    deal properly with special cases, e.g. 'G*' should highlight g and the
    rest of the word. Here we have a simple implementation that looks for
    exact and complete matches only.
    TODO: other issue is highlight of non field values, e.g. (G.) added at
    the end of each description or headings.
    '''
    from digipal.utils import (get_regexp_from_terms, remove_combining_marks,
                               remove_accents)

    # Not nice but we have to do this for the matching below to work.
    # We lose *some* of the accents, e.g. u'r\u0305'.
    value = remove_combining_marks(value)
    if not terms:
        return value

    # NOTE(review): the offsets found in value_no_accent are reused on
    # value, so remove_accents() is assumed to preserve the string length
    # - confirm against digipal.utils.
    value_no_accent = remove_accents(value)

    # The pattern below only matches text that follows a '>'. Prefix both
    # strings with a sentinel '>' so that text located before the first tag
    # (or a value without any tag) is highlighted too. The sentinel is
    # stripped before returning.
    value = u'>' + value
    value_no_accent = u'>' + value_no_accent

    span_open = u'<span class="found-term">'
    span_close = u'</span>'
    span_len = len(span_open) + len(span_close)

    for re_term in get_regexp_from_terms(terms, True):
        pattern = re.compile(u'(?iu)(>[^<]*?)(' + re_term + u')')
        # Position 1 is the first character after the sentinel.
        pos = 1
        while True:
            # pos - 1 because we want to include the last '>' we have
            # inserted in the previous iteration (or the sentinel).
            # Without this we might miss occurrences.
            m = pattern.search(value_no_accent, pos - 1)
            if m is None:
                break

            start, end = m.span(2)
            if start == end:
                # A zero-length match would be found again right after the
                # inserted span, growing the string forever.
                break

            # Insert the span at the same offsets in both strings so they
            # stay aligned for the next search.
            value = (value[:start] + span_open + value[start:end] +
                     span_close + value[end:])
            value_no_accent = (value_no_accent[:start] + span_open +
                               value_no_accent[start:end] + span_close +
                               value_no_accent[end:])

            # Resume right after the closing tag we have just inserted.
            pos = end + span_len

    # Drop the sentinel.
    return value[1:]