Python get_regexp_from_terms Exemples, digipal.utils.get_regexp_from_terms Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : search_manuscripts.py Projet : MCadeStewart/digipal

    def _get_best_description_location(self, descriptions):
        '''
            Returns the first description that matches one term in the query
            and the location of the first match in its plain text.

            descriptions: iterable of objects exposing
                get_description_plain_text().

            Returns a pair (desc, location):
                desc is the first matching description, or None when there
                    is no usable term regexp or nothing matches;
                location is the start offset of the match in that
                    description's plain text (0 when desc is None).

            TODO: prefer a description and location which contains multiple terms
                next to each other.
        '''
        # Function-scope import kept from the original.
        # NOTE(review): presumably there to avoid a circular import - confirm.
        from digipal.utils import get_regexp_from_terms

        re_terms = get_regexp_from_terms(self._get_query_terms(True))
        if not re_terms:
            return None, 0

        # u'' instead of ur'': the ur prefix is a syntax error on Python 3,
        # and the two are equivalent here (no backslash in the literal).
        # Compiled once, outside the loop, since it does not depend on the
        # description being searched.
        pattern = re.compile(u'(?ui)' + re_terms)

        for adesc in descriptions:
            m = pattern.search(adesc.get_description_plain_text())
            if m:
                return adesc, m.start()

        return None, 0

Exemple #2

0

Afficher le fichier

Fichier : search_manuscripts.py Projet : suzypiat/digipal

    def _get_best_description_location(self, descriptions):
        '''
            Returns the first description that matches one term in the query
            and the location of the first match in its plain text.

            descriptions: iterable of objects exposing
                get_description_plain_text().

            Returns a pair (desc, location):
                desc is the first matching description, or None when there
                    is no usable term regexp or nothing matches;
                location is the start offset of the match in that
                    description's plain text (0 when desc is None).

            TODO: prefer a description and location which contains multiple terms
                next to each other.
        '''
        # Function-scope import kept from the original.
        # NOTE(review): presumably there to avoid a circular import - confirm.
        from digipal.utils import get_regexp_from_terms

        re_terms = get_regexp_from_terms(self._get_query_terms(True))
        if not re_terms:
            return None, 0

        # u'' instead of ur'': the ur prefix is a syntax error on Python 3,
        # and the two are equivalent here (no backslash in the literal).
        # Compiled once, outside the loop, since it does not depend on the
        # description being searched.
        pattern = re.compile(u'(?ui)' + re_terms)

        for adesc in descriptions:
            m = pattern.search(adesc.get_description_plain_text())
            if m:
                return adesc, m.start()

        return None, 0

Exemple #3

0

Afficher le fichier

Fichier : html_escape.py Projet : suzypiat/digipal

def tag_terms(value, terms=None):
    '''Wrap all occurrences of the terms found in value with a <span class="found-term">.
        Highlight terms in html.

        value: the html string to highlight.
        terms: an array of words. When empty or None nothing is highlighted.

        Returns value after remove_combining_marks(), with every match
        wrapped in <span class="found-term">...</span>.

        A term is only matched when it follows a '>' with no '<' in between,
        i.e. in the text that comes after a tag.
        NOTE(review): as a consequence, text located before the first '>'
        of value is never highlighted - confirm this is intended.
    '''
    from digipal.utils import get_regexp_from_terms, remove_combining_marks, remove_accents

    # not nice but we have to do this for the matching below to work
    # we lose *some* of the accents, e.g. u'r\u0305'
    value = remove_combining_marks(value)
    # Matching is done on the unaccented copy and the resulting offsets are
    # applied to value as well.
    # NOTE(review): this relies on remove_accents() preserving the length
    # of the string - confirm against digipal.utils.
    value_no_accent = remove_accents(value)

    if not terms:
        return value

    # Surround the occurrences of those terms in the value with a span (class="found-term")
    # TODO: this should really be done by Whoosh instead of manually here to deal properly with special cases.
    # e.g. 'G*' should highlight g and the rest of the word
    # Here we have a simple implementation that looks for exact and complete matches only.
    # TODO: other issue is highlight of non field values, e.g. (G.) added at the end each description
    #         or headings.
    span = u'%s<span class="found-term">%s</span>'

    for re_term in get_regexp_from_terms(terms, True):
        # u'' instead of ur'': the ur prefix is a syntax error on Python 3,
        # and the two are equivalent here (no backslash in the literals).
        pattern = re.compile(u'(?iu)(>[^<]*?)(' + re_term + u')')
        pos = 1
        while True:
            # pos-1 because we want to include the last > we've inserted in the previous loop.
            # without this we might miss occurrences
            m = pattern.search(value_no_accent, pos - 1)
            if not m:
                break
            if m.start(2) == m.end(2):
                # The term matched an empty string: wrapping it would insert
                # an empty span and the next search would match again right
                # after it, forever. Give up on this term instead.
                break

            start, end = m.start(0), m.end(0)

            # Same wrapping applied to both strings, at the same offsets,
            # so that they stay aligned for the next search.
            tagged = span % (
                value[m.start(1):m.end(1)], value[m.start(2):m.end(2)])
            value = value[:start] + tagged + value[end:]

            tagged_no_accent = span % (m.group(1), m.group(2))
            value_no_accent = (
                value_no_accent[:start] + tagged_no_accent + value_no_accent[end:])

            # Resume right after the closing </span> we have just inserted.
            pos = start + len(tagged_no_accent)

    return value

Exemple #4

0

Afficher le fichier

Fichier : html_escape.py Projet : kcl-ddh/digipal

def tag_terms(value, terms=None):
    '''Wrap all occurrences of the terms found in value with a <span class="found-term">.
        Highlight terms in html.

        value: the html string to highlight.
        terms: an array of words. When empty or None nothing is highlighted.

        Returns value after remove_combining_marks(), with every match
        wrapped in <span class="found-term">...</span>.

        A term is only matched when it follows a '>' with no '<' in between,
        i.e. in the text that comes after a tag.
        NOTE(review): as a consequence, text located before the first '>'
        of value is never highlighted - confirm this is intended.
    '''
    from digipal.utils import get_regexp_from_terms, remove_combining_marks, remove_accents

    # not nice but we have to do this for the matching below to work
    # we lose *some* of the accents, e.g. u'r\u0305'
    value = remove_combining_marks(value)
    # Matching is done on the unaccented copy and the resulting offsets are
    # applied to value as well.
    # NOTE(review): this relies on remove_accents() preserving the length
    # of the string - confirm against digipal.utils.
    value_no_accent = remove_accents(value)

    if not terms:
        return value

    # Surround the occurrences of those terms in the value with a span (class="found-term")
    # TODO: this should really be done by Whoosh instead of manually here to deal properly with special cases.
    # e.g. 'G*' should highlight g and the rest of the word
    # Here we have a simple implementation that looks for exact and complete matches only.
    # TODO: other issue is highlight of non field values, e.g. (G.) added at the end each description
    #         or headings.
    span = u'%s<span class="found-term">%s</span>'

    for re_term in get_regexp_from_terms(terms, True):
        # u'' instead of ur'': the ur prefix is a syntax error on Python 3,
        # and the two are equivalent here (no backslash in the literals).
        pattern = re.compile(u'(?iu)(>[^<]*?)(' + re_term + u')')
        pos = 1
        while True:
            # pos-1 because we want to include the last > we've inserted in the previous loop.
            # without this we might miss occurrences
            m = pattern.search(value_no_accent, pos - 1)
            if not m:
                break
            if m.start(2) == m.end(2):
                # The term matched an empty string: wrapping it would insert
                # an empty span and the next search would match again right
                # after it, forever. Give up on this term instead.
                break

            start, end = m.start(0), m.end(0)

            # Same wrapping applied to both strings, at the same offsets,
            # so that they stay aligned for the next search.
            tagged = span % (
                value[m.start(1):m.end(1)], value[m.start(2):m.end(2)])
            value = value[:start] + tagged + value[end:]

            tagged_no_accent = span % (m.group(1), m.group(2))
            value_no_accent = (
                value_no_accent[:start] + tagged_no_accent + value_no_accent[end:])

            # Resume right after the closing </span> we have just inserted.
            pos = start + len(tagged_no_accent)

    return value