Ejemplo n.º 1
0
def sort_and_rank_records(recids, so=None, rm=None, p=''):
    """Return the record IDs of ``recids`` as an ordered list.

    :param recids: collection of record IDs exposing a ``tolist()`` method.
    :param so: sort-order flag; any truthy value yields the IDs in reversed
        order and takes precedence over ``rm``.
    :param rm: rank method code handed to ``rank_records``; only consulted
        when ``so`` is falsy.
    :param p: search pattern; it is split on whitespace before being passed
        to ``rank_records``.
    :return: list of record IDs. When ranking is requested and produces
        results, the ranked IDs are returned in reversed order; when ranking
        produces nothing, the IDs are returned in their original order;
        in every other case the original order is reversed.
    """
    output = recids.tolist()
    if rm and not so:
        # Function-scope import, kept exactly where the original had it.
        from invenio.legacy.bibrank.record_sorter import rank_records
        ranked = rank_records(rm, 0, output, p.split())
        if ranked[0]:
            # First element of the result holds the ranked record IDs.
            output = ranked[0]
            output.reverse()
        else:
            # Nothing was ranked: fall back to the unranked IDs. ``output``
            # is already a plain list here (lists have no ``tolist``), so
            # rebuild it from ``recids`` instead of calling ``tolist`` on it.
            output = recids.tolist()
    else:
        output.reverse()
    return output
def find_similar_videos(recid, collection="Videos", threshold=75, maximum=3, shuffle=True):
    """Return a list of records ranked as similar to ``recid``.

    The records of ``collection`` are ranked with the ``'wrd'`` method against
    the query ``recid:<recid>``; every record whose rank is at least
    ``threshold`` is kept.

    :param recid: record to find similar records for.
    :param collection: collection name searched for candidates.
    :param threshold: minimum rank a candidate needs to be kept.
    :param maximum: upper bound on the number of records returned.
    :param shuffle: when true, return a random sample of the kept records;
        otherwise return the first ``maximum`` of them.
    :return: list of at most ``maximum`` record IDs.
    """
    candidates = intbitset(perform_request_search(cc=collection))
    ranking = rank_records('wrd', 0, candidates, ['recid:' + str(recid)])
    # Example shape of ``ranking``: ([6, 7], [81, 100], '(', ')', '')
    # -- element 0 holds the record IDs, element 1 the matching ranks.
    similar = [
        ranking[0][position]
        for position, score in enumerate(ranking[1])
        if score >= threshold
    ]
    if not shuffle:
        return similar[:maximum]
    # random.sample cannot draw more items than are available.
    sample_size = len(similar) if maximum > len(similar) else maximum
    return random.sample(similar, sample_size)
Ejemplo n.º 3
0
def find_similar_videos(recid,
                        collection="Videos",
                        threshold=75,
                        maximum=3,
                        shuffle=True):
    """Return a list of records ranked as similar to ``recid``.

    Candidates are all records found in ``collection``. They are ranked with
    the ``'wrd'`` method against the query ``recid:<recid>`` and only those
    with a rank of at least ``threshold`` are retained.

    :param recid: record to find similar records for.
    :param collection: collection name searched for candidates.
    :param threshold: minimum rank a candidate needs to be retained.
    :param maximum: upper bound on the number of records returned.
    :param shuffle: when true, a random sample of the retained records is
        returned; otherwise the first ``maximum`` retained records.
    :return: list of at most ``maximum`` record IDs.
    """
    pool = intbitset(perform_request_search(cc=collection))
    query = ['recid:' + str(recid)]
    ranking = rank_records('wrd', 0, pool, query)
    # Example result: ([6, 7], [81, 100], '(', ')', '')
    hits = [ranking[0][idx]
            for idx, score in enumerate(ranking[1])
            if score >= threshold]
    if shuffle:
        # Cap the sample size at the number of hits available.
        return random.sample(hits, min(maximum, len(hits)))
    return hits[:maximum]
Ejemplo n.º 4
0
def sort_records_bibsort(recIDs,
                         sort_method,
                         sort_field='',
                         sort_order='d',
                         rg=None,
                         jrec=1,
                         sort_or_rank='s',
                         sorting_methods=SORTING_METHODS):
    """Order the list based on a sorting method using the BibSortDataCacher.

    :param recIDs: record IDs to order.
    :param sort_method: name of the method; used as key into
        ``sorting_methods`` and ``CACHE_SORTED_DATA``.
    :param sort_field: forwarded to ``sort_records_bibxxx`` on fallback.
    :param sort_order: ``'d'`` orders descending, anything else ascending.
    :param rg: forwarded to ``get_interval_for_records_to_sort`` and
        ``slice_records`` (presumably the number of records per page --
        confirm against those helpers).
    :param jrec: forwarded alongside ``rg``; a falsy value is replaced by 1.
    :param sort_or_rank: ``'r'`` selects ranking mode, anything else sorting.
    :param sorting_methods: known sorting methods; a falsy value falls back
        to ``SORTING_METHODS``.
    :return: in sorting mode the sliced list of record IDs; in ranking mode a
        tuple ``(record_ids, scores)`` where a record without cached value
        scores 0. When ``sort_method`` is unknown or its buckets are
        incomplete, the result of ``rank_records`` (ranking mode) or
        ``sort_records_bibxxx`` (sorting mode) is returned unchanged.
    """
    sorting_methods = sorting_methods or SORTING_METHODS

    if not jrec:
        jrec = 1

    # sanity check
    if sort_method not in sorting_methods:
        if sort_or_rank == 'r':
            return rank_records(rank_method_code=sort_method,
                                rank_limit_relevance=0,
                                hitset=recIDs)
        else:
            return sort_records_bibxxx(recIDs, None, sort_field, sort_order,
                                       '', rg, jrec)

    # we should return sorted records up to irec_max(exclusive)
    dummy, irec_max = get_interval_for_records_to_sort(len(recIDs), jrec, rg)
    solution = intbitset()
    input_recids = intbitset(recIDs)
    CACHE_SORTED_DATA[sort_method].recreate_cache_if_needed()
    sort_cache = CACHE_SORTED_DATA[sort_method].cache
    # Materialise the keys as a private list: dict views (Python 3) have no
    # ``reverse`` method, and a copy guarantees the cache is never mutated.
    # NOTE(review): the loop below relies on these keys being in ascending
    # bucket order -- confirm how ``bucket_data`` is populated.
    bucket_numbers = list(sort_cache['bucket_data'].keys())
    # check if all buckets have been constructed
    if len(bucket_numbers) != cfg['CFG_BIBSORT_BUCKETS']:
        if sort_or_rank == 'r':
            return rank_records(rank_method_code=sort_method,
                                rank_limit_relevance=0,
                                hitset=recIDs)
        else:
            return sort_records_bibxxx(recIDs,
                                       None,
                                       sort_field,
                                       sort_order,
                                       rg=rg,
                                       jrec=jrec)

    if sort_order == 'd':
        bucket_numbers.reverse()
    # Collect matching records bucket by bucket and stop as soon as enough
    # records have been gathered to cover the requested interval.
    for bucket_no in bucket_numbers:
        solution.union_update(input_recids
                              & sort_cache['bucket_data'][bucket_no])
        if len(solution) >= irec_max:
            break

    dict_solution = {}
    missing_records = intbitset()
    for recid in solution:
        try:
            dict_solution[recid] = sort_cache['data_dict_ordered'][recid]
        except KeyError:
            # recid is in buckets, but not in the bsrMETHODDATA,
            # maybe because the value has been deleted, but the change has not
            # yet been propagated to the buckets
            missing_records.add(recid)

    # check if there are recids that are not in any bucket -> to be added at
    # the end/top, ordered by insertion date
    if len(solution) < irec_max:
        # some records have not been yet inserted in the bibsort structures
        # or, some records have no value for the sort_method
        missing_records += input_recids - solution

    reverse = sort_order == 'd'

    if sort_method.strip().lower() == cfg['CFG_BIBSORT_DEFAULT_FIELD'] and \
            reverse:
        # If we want to sort the records on their insertion date, add the
        # missing records at the top.
        solution = sorted(missing_records, reverse=True) + \
            sorted(dict_solution, key=dict_solution.__getitem__, reverse=True)
    else:
        # Otherwise the missing records go last, in ascending recid order.
        solution = sorted(dict_solution,
                          key=dict_solution.__getitem__,
                          reverse=reverse) + sorted(missing_records)

    # Only keep records, we are going to display
    solution = slice_records(solution, jrec, rg)

    if sort_or_rank == 'r':
        # We need the recids, with their ranking score
        return solution, [dict_solution.get(record, 0) for record in solution]
    else:
        return solution
Ejemplo n.º 5
0
def sort_records_bibsort(recIDs, sort_method, sort_field='', sort_order='d',
                         rg=None, jrec=1, sort_or_rank='s',
                         sorting_methods=SORTING_METHODS):
    """Order the list based on a sorting method using the BibSortDataCacher.

    :param recIDs: record IDs to order.
    :param sort_method: name of the method; used as key into
        ``sorting_methods`` and ``CACHE_SORTED_DATA``.
    :param sort_field: forwarded to ``sort_records_bibxxx`` on fallback.
    :param sort_order: ``'d'`` orders descending, anything else ascending.
    :param rg: forwarded to ``get_interval_for_records_to_sort`` and
        ``slice_records`` (presumably the number of records per page --
        confirm against those helpers).
    :param jrec: forwarded alongside ``rg``; a falsy value is replaced by 1.
    :param sort_or_rank: ``'r'`` selects ranking mode, anything else sorting.
    :param sorting_methods: known sorting methods; a falsy value falls back
        to ``SORTING_METHODS``.
    :return: in sorting mode the sliced list of record IDs; in ranking mode a
        tuple ``(record_ids, scores)`` where a record without cached value
        scores 0. When ``sort_method`` is unknown or its buckets are
        incomplete, the result of ``rank_records`` (ranking mode) or
        ``sort_records_bibxxx`` (sorting mode) is returned unchanged.
    """
    sorting_methods = sorting_methods or SORTING_METHODS

    if not jrec:
        jrec = 1

    # sanity check
    if sort_method not in sorting_methods:
        if sort_or_rank == 'r':
            return rank_records(rank_method_code=sort_method,
                                rank_limit_relevance=0,
                                hitset=recIDs)
        else:
            return sort_records_bibxxx(recIDs, None, sort_field, sort_order,
                                       '', rg, jrec)

    # we should return sorted records up to irec_max(exclusive)
    dummy, irec_max = get_interval_for_records_to_sort(len(recIDs), jrec, rg)
    solution = intbitset()
    input_recids = intbitset(recIDs)
    CACHE_SORTED_DATA[sort_method].recreate_cache_if_needed()
    sort_cache = CACHE_SORTED_DATA[sort_method].cache
    # Materialise the keys as a private list: dict views (Python 3) have no
    # ``reverse`` method, and a copy guarantees the cache is never mutated.
    # NOTE(review): the loop below relies on these keys being in ascending
    # bucket order -- confirm how ``bucket_data`` is populated.
    bucket_numbers = list(sort_cache['bucket_data'].keys())
    # check if all buckets have been constructed
    if len(bucket_numbers) != cfg['CFG_BIBSORT_BUCKETS']:
        if sort_or_rank == 'r':
            return rank_records(rank_method_code=sort_method,
                                rank_limit_relevance=0, hitset=recIDs)
        else:
            return sort_records_bibxxx(recIDs, None, sort_field,
                                       sort_order, rg=rg, jrec=jrec)

    if sort_order == 'd':
        bucket_numbers.reverse()
    # Collect matching records bucket by bucket and stop as soon as enough
    # records have been gathered to cover the requested interval.
    for bucket_no in bucket_numbers:
        solution.union_update(
            input_recids & sort_cache['bucket_data'][bucket_no]
        )
        if len(solution) >= irec_max:
            break

    dict_solution = {}
    missing_records = intbitset()
    for recid in solution:
        try:
            dict_solution[recid] = sort_cache['data_dict_ordered'][recid]
        except KeyError:
            # recid is in buckets, but not in the bsrMETHODDATA,
            # maybe because the value has been deleted, but the change has not
            # yet been propagated to the buckets
            missing_records.add(recid)

    # check if there are recids that are not in any bucket -> to be added at
    # the end/top, ordered by insertion date
    if len(solution) < irec_max:
        # some records have not been yet inserted in the bibsort structures
        # or, some records have no value for the sort_method
        missing_records += input_recids - solution

    reverse = sort_order == 'd'

    if sort_method.strip().lower() == cfg['CFG_BIBSORT_DEFAULT_FIELD'] and \
            reverse:
        # If we want to sort the records on their insertion date, add the
        # missing records at the top.
        solution = sorted(missing_records, reverse=True) + \
            sorted(dict_solution, key=dict_solution.__getitem__, reverse=True)
    else:
        # Otherwise the missing records go last, in ascending recid order.
        solution = sorted(dict_solution, key=dict_solution.__getitem__,
                          reverse=reverse) + sorted(missing_records)

    # Only keep records, we are going to display
    solution = slice_records(solution, jrec, rg)

    if sort_or_rank == 'r':
        # We need the recids, with their ranking score
        return solution, [dict_solution.get(record, 0) for record in solution]
    else:
        return solution