Example #1
# assumed imports: `jf` is the jellyfish library (newer releases expose the
# function as jaro_winkler_similarity); `utils` and `config` are project-local
# modules providing tokenize(), position_similarity(), weighted_average(),
# and the weight constants used below
import jellyfish as jf

import config
import utils


def similarity(query, string):
    """
    Calculate the match for the given `query` and `string`.

    The match is calculated by applying the Jaro-Winkler similarity to each
    pair of tokens in the matrix (`query` x `string`), taking into account
    the difference between the tokens' positions within the two strings.

    Arguments:
        query (str): search query
        string (str): string to test against

    Returns:
        float: normalized value indicating the probability of match, where
        0 means completely dissimilar and 1 means equal.
    """

    # tokenize both lowercased strings, dropping noise (e.g. tokens short
    # enough to be trimmed out)
    query = utils.tokenize(query.lower())
    string = utils.tokenize(string.lower())

    # if one of the two strings is falsy (no content, or was passed with items
    # short enough to be trimmed out), return 0 here to avoid ZeroDivisionError
    # later on while processing.
    if len(query) == 0 or len(string) == 0:
        return 0

    shortest, longest = sorted((query, string), key=len)

    # matrix of tuples for each segment of both query and string
    matrix = [(s1, s2) for s1 in longest for s2 in shortest]

    matches = {}
    for string1, string2 in matrix:
        # get the Jaro-Winkler similarity between the two tokens
        match = jf.jaro_winkler(string1, string2)
        # calculate the distance factor for the position of the segments
        # on their respective lists
        positional = utils.position_similarity(
            string1, string2, longest, shortest)

        # pair them up and append to the matches dictionary
        match = (match, positional)
        matches.setdefault(string1, []).append(match)

    # keep the highest value for each token, then apply the word-distance
    # factor; the key uses the Jaro-Winkler value so max() picks the best match
    matches = [max(m, key=lambda x: x[0]) for m in matches.values()]
    _weights = (config.MATCH_WEIGHT, config.DIST_WEIGHT)
    matches = [utils.weighted_average((m, d), _weights) for m, d in matches]

    # take the mean of all the weighted matches and multiply it by the highest
    # match found, boosting results that contain at least one strong match.
    mean_match = (sum(matches) / len(matches)) * max(matches)
    return mean_match
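A minimal usage sketch (illustrative only; it assumes `utils`, `config`, and jellyfish are importable as above and is not part of the original project):

score = similarity('monty python', 'Monty Python and the Holy Grail')
print(score)  # a float in [0, 1]; higher means a more probable match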
Example #2
# assumed imports for this Django view; tokenize(), WikiPage, WikiList, and
# WikiCategory are project-local
from django.http import HttpResponseRedirect
from django.shortcuts import render


def search_for(request):
    # redirect home when no query was submitted
    if 'q' not in request.GET:
        return HttpResponseRedirect('/')

    keywords = tokenize(request.GET['q'])
    if not keywords:
        return HttpResponseRedirect('/')

    pages = WikiPage.objects

    selected_lists = []
    # filter by the requested lists, if any
    if request.POST.get('lists'):
        selected_lists = request.POST.get('lists').split(',')
        pages = pages.filter(lists__url_name__in=selected_lists)

    selected_categories = []
    # filter by the requested categories, if any
    if request.POST.get('categories'):
        selected_categories = request.POST.get('categories').split(',')
        pages = pages.filter(categories__url_name__in=selected_categories)

    # filter out pages that do not match all the keywords
    for keyword in keywords:
        pages = pages.filter(pagekeyword__keyword=keyword)

    # raw subquery computing a relevance weight: the sum of the matched
    # keywords' counts for each page
    pages = pages.extra(select={
        'weight': 'SELECT SUM(count) '
                  'FROM search_pagekeyword '
                  'WHERE search_pagekeyword.page_id = '
                  'wikipage_wikipage.url_name AND '
                  'keyword IN %s'},
        select_params=(tuple(keywords),),
    ).order_by('-weight')

    selected_lists_objects = WikiList.objects.filter(
        url_name__in=selected_lists)
    selected_cates_objects = WikiCategory.objects.filter(
        url_name__in=selected_categories)

    context = {
        'pages': pages.all(),
        'lists': WikiList.objects.order_by('title').all(),
        'selected_lists': selected_lists_objects,
        'categories': WikiCategory.objects.order_by('title').all(),
        'selected_cates': selected_cates_objects,
        'keyword': request.GET['q'],
    }

    template = 'search/results.html'
    # request.is_ajax() was removed in Django 4.0; newer code checks the
    # X-Requested-With header instead
    if request.is_ajax():
        template = 'search/results_items.html'

    return render(request, template, context)
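For context, a minimal sketch of how this view could be wired into a URLconf; the module path, URL pattern, and name are assumptions, not from the original project:

from django.urls import path

from . import views

urlpatterns = [
    path('search/', views.search_for, name='search'),
]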
Example #3
# assumed imports: bs4 and the standard library; tokenize(), WikiPage, and
# PageKeyword are project-local
import logging
from collections import Counter

from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)


def parse_page(url_name):
    logger.info('Parsing %s', url_name)

    page = WikiPage.objects.get(url_name=url_name)

    # remove noisy info on the page
    soup = BeautifulSoup(page.body, 'html.parser')
    for class_name in ('reflist', 'citation',
                       'navbox', 'noprint',
                       'reference', 'vertical-navbox'):
        for item in soup.find_all(class_=class_name):
            item.extract()
    site_sub = soup.find(id='siteSub')
    if site_sub is not None:  # not every page carries the siteSub node
        site_sub.extract()
    # re-parse the cleaned markup to rebuild the tree after the extractions
    soup = BeautifulSoup(str(soup), 'html.parser')

    # count and save
    tokens = tokenize(soup.get_text())
    counts, total = Counter(tokens), len(tokens)
    title_tokens = tokenize(page.title)
    t_counts, t_total = Counter(title_tokens), len(title_tokens)

    for keyword in counts:
        key = PageKeyword()
        key.keyword = keyword

        # Term Frequency: n / sum + tn / t_sum
        key.count = counts[keyword] / total
        if keyword in t_counts:
            key.count += t_counts[keyword] / t_total

        key.page = page
        key.save()

    # keywords that appear only in the title get just the title frequency
    for keyword in t_counts:
        if keyword in counts:
            continue

        key = PageKeyword()
        key.keyword = keyword
        key.count = t_counts[keyword] / t_total
        key.page = page
        key.save()
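A self-contained sketch of the term-frequency scoring above, with a trivial whitespace tokenizer standing in for the project's tokenize() (purely illustrative; assumes non-empty body and title):

from collections import Counter


def term_frequencies(body, title):
    # body term frequency n / total, plus title term frequency tn / t_total
    # when the keyword also occurs in the title, as in parse_page() above
    tokens = body.lower().split()
    counts, total = Counter(tokens), len(tokens)
    t_tokens = title.lower().split()
    t_counts, t_total = Counter(t_tokens), len(t_tokens)
    return {kw: counts.get(kw, 0) / total + t_counts.get(kw, 0) / t_total
            for kw in set(counts) | set(t_counts)}


print(term_frequencies('search ranks wiki pages', 'wiki search'))
# {'search': 0.75, 'ranks': 0.25, 'wiki': 0.75, 'pages': 0.25}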