def process_hits(self, filter_nb):
        """
        Consume every pending SearchHit and fold it into the search statistics.

        For each hit, in order:
        - if its query is in config.HISTORY_BLACKLIST, the hit is skipped and
          left as is (it is neither archived nor deleted);
        - if it has fewer than `filter_nb` results, it is only archived;
        - otherwise the SearchQuery matching its query is created or updated
          (result count, search counters and weight), then the hit is archived.

        "Archived" means: a SearchHitHistoric row is saved with the hit's
        query, nb_results and date, and the SearchHit itself is deleted.

        `filter_nb` is the minimal number of results a hit needs to be counted
        in the statistics.
        """
        for hit in SearchHit.objects.all():
            query = hit.query

            # blacklist: ignore the hit entirely
            if query in config.HISTORY_BLACKLIST:
                continue

            # Hits with too few results skip the statistics update and go
            # straight to the archive step below.
            below_threshold = hit.nb_results < filter_nb

            if not below_threshold:
                # manual get_or_create: a new SearchQuery needs its derived
                # fields and counters set before it is saved for the first time
                try:
                    search_query = SearchQuery.objects.get(query=query)
                    created = False
                except SearchQuery.DoesNotExist:
                    search_query = SearchQuery(query=query)
                    created = True

                # if it's a new one, initialize it
                if created:
                    search_query.phonex = phonex(query)

                    # the '_' char causes a buggy clean_query, so strip it
                    # before lemmatizing
                    stripped_query = query.replace('_', '')
                    lems = lemmatize(stripped_query.split())

                    # drop empty lemmas and rebuild a single string
                    clean_query = ' '.join([lem for lem in lems if lem])
                    clean_phonex = phonex(clean_query)

                    search_query.clean_query = clean_query
                    search_query.clean_phonex = clean_phonex

                    search_query.nb_total_search = 0
                    search_query.pondered_search_nb = 0
                    search_query.nb_recent_search = 0

                # the latest hit defines the known number of results
                search_query.nb_results = hit.nb_results
                search_query.nb_total_search += 1
                search_query.pondered_search_nb += 1
                search_query.nb_recent_search += 1

                search_query.weight = (
                    search_query.pondered_search_nb * config.HISTORY_BETA +
                    search_query.nb_results * config.HISTORY_GAMMA)
                search_query.save()

            # Archive the hit; this happens only after the statistics (if any)
            # have been saved.
            SearchHitHistoric(query=hit.query,
                              nb_results=hit.nb_results,
                              date=hit.date).save()
            hit.delete()
Esempio n. 2
0
    def process_hits(self, filter_nb):
        """
        Consume every pending SearchHit and fold it into the search statistics.

        For each hit, in order:
        - if its query is in config.HISTORY_BLACKLIST, the hit is skipped and
          left as is (it is neither archived nor deleted);
        - if it has fewer than `filter_nb` results, it is only archived;
        - otherwise the SearchQuery matching its query is created or updated
          (result count, search counters and weight), then the hit is archived.

        "Archived" means: a SearchHitHistoric row is saved with the hit's
        query, nb_results and date, and the SearchHit itself is deleted.

        `filter_nb` is the minimal number of results a hit needs to be counted
        in the statistics.
        """
        for hit in SearchHit.objects.all():
            query = hit.query

            # blacklist: ignore the hit entirely
            if query in config.HISTORY_BLACKLIST:
                continue

            # Hits with too few results skip the statistics update and go
            # straight to the archive step below.
            below_threshold = hit.nb_results < filter_nb

            if not below_threshold:
                # manual get_or_create: a new SearchQuery needs its derived
                # fields and counters set before it is saved for the first time
                try:
                    search_query = SearchQuery.objects.get(query=query)
                    created = False
                except SearchQuery.DoesNotExist:
                    search_query = SearchQuery(query=query)
                    created = True

                # if it's a new one, initialize it
                if created:
                    search_query.phonex = phonex(query)

                    # the '_' char causes a buggy clean_query, so strip it
                    # before lemmatizing
                    stripped_query = query.replace('_', '')
                    lems = lemmatize(stripped_query.split())

                    # drop empty lemmas and rebuild a single string
                    clean_query = ' '.join([lem for lem in lems if lem])
                    clean_phonex = phonex(clean_query)

                    search_query.clean_query = clean_query
                    search_query.clean_phonex = clean_phonex

                    search_query.nb_total_search = 0
                    search_query.pondered_search_nb = 0
                    search_query.nb_recent_search = 0

                # the latest hit defines the known number of results
                search_query.nb_results = hit.nb_results
                search_query.nb_total_search += 1
                search_query.pondered_search_nb += 1
                search_query.nb_recent_search += 1

                search_query.weight = (
                    search_query.pondered_search_nb * config.HISTORY_BETA +
                    search_query.nb_results * config.HISTORY_GAMMA)
                search_query.save()

            # Archive the hit; this happens only after the statistics (if any)
            # have been saved.
            SearchHitHistoric(query=hit.query,
                              nb_results=hit.nb_results,
                              date=hit.date).save()
            hit.delete()
Esempio n. 3
0
def highlight(text, words, index=None):
    """
    Give the position of words in a text, cleaning everything as sesql does
    That can be used to highlight the words, for example

    text  -- the text to search in; an empty or None text gives []
    words -- the words to look for; they are lemmatized with `index`
    index -- the index used to lemmatize and marshall; when None, the
             primary index (fieldmap.primary) is used

    Returns a list of (begin, end, i) tuples: `begin` and `end` are offsets
    into the marshalled text delimiting a matched word, and `i` is the
    position of the matching lemma in the lemmatized `words`.

    Raises ValueError when no index is given and there is no primary one.
    """
    if not text:
        return []

    if index is None:
        index = fieldmap.primary

    if index is None:
        # Call form of raise: valid on both Python 2 and Python 3
        raise ValueError("Not index given and no primary one")

    letters = set(string.ascii_letters)

    # Lemmatize the words
    lems = lemmatize(words, index)

    # Marshall everything
    text = index.marshall(text, use_cleanup=False)

    # Measure the text only once it is marshalled: every offset below indexes
    # the marshalled string, so the bound has to come from it.
    size = len(text)

    # First pass: find the lemmatized words inside the text
    found = []
    foundwords = set()
    for i, lem in enumerate(lems):
        if not lem:
            continue
        wordsize = len(lem)
        pos = 0
        while True:
            begin = text.find(lem, pos)
            if begin < 0:
                break
            end = begin + wordsize

            # A letter right before the match means we are in the middle of
            # another word: skip this occurrence
            if begin and text[begin - 1] in letters:
                pos = end
                continue

            # Extend the match up to the end of the word
            while end < size and text[end] in letters:
                end += 1

            found.append((begin, end, i))
            foundwords.add(text[begin:end])
            pos = end

    # Nothing matched: no need for the second lemmatization pass
    if not found:
        return []

    # Lemmatize all found words, each distinct word only once
    foundwords = list(foundwords)
    foundlems = dict(zip(foundwords, lemmatize(foundwords, index)))

    # Second pass: keep a match only when the full word found in the text
    # lemmatizes to the lemma that was searched for
    return [(begin, end, i) for begin, end, i in found
            if foundlems[text[begin:end]] == lems[i]]