Ejemplo n.º 1
0
        def fill():
            """Build and return the citation/self-citation cache dictionaries.

            Weights are read from the marshal-serialized ``cache`` entry when
            present, otherwise loaded from the database via ``fromDB``.
            """
            alldicts = {}
            from invenio.legacy.bibrank.tag_based_indexer import fromDB

            # Citation weights: prefer the cached serialized copy.
            serialized_weights = cache.get("citations_weights")
            if serialized_weights:
                weights = deserialize_via_marshal(serialized_weights)
            else:
                weights = fromDB("citation")

            alldicts["citations_weights"] = weights
            # for cited:M->N queries, it is interesting to cache also
            # some preprocessed citationdict:
            alldicts["citations_keys"] = intbitset(weights.keys())

            # Citation counts
            # (recid, count) pairs, most cited first.
            alldicts["citations_counts"] = [t for t in iteritems(weights)]
            alldicts["citations_counts"].sort(key=itemgetter(1), reverse=True)

            # Self-cites
            serialized_weights = cache.get("selfcites_weights")
            if serialized_weights:
                selfcites = deserialize_via_marshal(serialized_weights)
            else:
                selfcites = fromDB("selfcites")
            # Citation counts with self-citations subtracted, per record.
            selfcites_weights = {}
            for recid, counts in alldicts["citations_counts"]:
                selfcites_weights[recid] = counts - selfcites.get(recid, 0)
            alldicts["selfcites_weights"] = selfcites_weights
            # NOTE(review): the .get() default never fires — every recid in
            # citations_counts was just inserted into selfcites_weights.
            alldicts["selfcites_counts"] = [
                (recid, selfcites_weights.get(recid, cites)) for recid, cites in alldicts["citations_counts"]
            ]
            alldicts["selfcites_counts"].sort(key=itemgetter(1), reverse=True)

            return alldicts
Ejemplo n.º 2
0
        def fill():
            """Assemble the dictionary of citation caches.

            Each weights structure comes from the serialized cache entry when
            available, otherwise straight from the database.
            """
            from invenio.legacy.bibrank.tag_based_indexer import fromDB

            def _load(cache_key, db_name):
                # Cached marshal blob wins; fall back to the DB table.
                blob = cache.get(cache_key)
                if blob:
                    return deserialize_via_marshal(blob)
                return fromDB(db_name)

            result = {}
            weights = _load('citations_weights', 'citation')
            result['citations_weights'] = weights
            # for cited:M->N queries it pays off to also cache a
            # preprocessed citation dictionary:
            result['citations_keys'] = intbitset(weights.keys())

            # (recid, citation count) pairs, most cited first.
            counts = sorted(iteritems(weights), key=itemgetter(1), reverse=True)
            result['citations_counts'] = counts

            # Subtract self-citations from the raw citation counts.
            selfcites = _load('selfcites_weights', 'selfcites')
            selfcites_weights = {}
            for recid, cites in counts:
                selfcites_weights[recid] = cites - selfcites.get(recid, 0)
            result['selfcites_weights'] = selfcites_weights

            self_counts = [(recid, selfcites_weights.get(recid, cites))
                           for recid, cites in counts]
            self_counts.sort(key=itemgetter(1), reverse=True)
            result['selfcites_counts'] = self_counts

            return result
Ejemplo n.º 3
0
def rank_method_stat(rank_method_code, reclist, lwords):
    """Return an HTML string with statistics about the searchresult.

    rank_method_code - name field from rnkMETHOD
    reclist - a list of sorted and ranked records
    lwords - the (term, table) pairs from the query

    Shows the 20 highest-ranked records (taken from the tail of
    ``reclist``) with per-term hit info, followed by the score
    distribution over the whole result list.
    """
    voutput = ""
    # Inspect at most the 20 best records; reclist is sorted ascending,
    # so the best ones sit at the end.
    j = min(len(reclist), 20)

    voutput += "<br />Rank statistics:<br />"
    for i in range(1, j + 1):
        voutput += "%s,Recid:%s,Score:%s<br />" % (i, reclist[len(reclist) - i][0], reclist[len(reclist) - i][1])
        for (term, table) in lwords:
            # The table name cannot be bound as an SQL parameter, hence the
            # string interpolation; ``term`` itself stays parameterized.
            term_recs = run_sql("""SELECT hitlist FROM %s WHERE term=%%s""" % table, (term,))
            if term_recs:
                term_recs = deserialize_via_marshal(term_recs[0][0])
                if reclist[len(reclist) - i][0] in term_recs:
                    voutput += "%s-%s / " % (term, term_recs[reclist[len(reclist) - i][0]])
        voutput += "<br />"

    voutput += "<br />Score variation:<br />"
    count = {}
    for i in range(0, len(reclist)):
        count[reclist[i][1]] = count.get(reclist[i][1], 0) + 1
    # Emit histogram entries for scores 100 down to 0.
    for i in range(100, -1, -1):
        if i in count:
            voutput += "%s-%s<br />" % (i, count[i])
    # BUGFIX: the assembled statistics were previously discarded — the
    # function built ``voutput`` locally and never returned it.
    return voutput
Ejemplo n.º 4
0
def rank_method_stat(rank_method_code, reclist, lwords):
    """Shows some statistics about the searchresult.
    rank_method_code - name field from rnkMETHOD
    reclist - a list of sorted and ranked records
    lwords - the words in the query"""

    voutput = ""
    # Look at the 20 highest-ranked records (they live at the tail of
    # reclist), or all of them when there are fewer than 20.
    top = min(20, len(reclist))
    total = len(reclist)

    voutput += "<br />Rank statistics:<br />"
    for rank in range(1, top + 1):
        recid = reclist[total - rank][0]
        score = reclist[total - rank][1]
        voutput += "%s,Recid:%s,Score:%s<br />" % (rank, recid, score)
        for (term, table) in lwords:
            # Table names cannot be SQL parameters, so the table is
            # interpolated while the term stays parameterized.
            rows = run_sql(
                """SELECT hitlist FROM %s WHERE term=%%s""" % table, (term, ))
            if rows:
                hitlist = deserialize_via_marshal(rows[0][0])
                if recid in hitlist:
                    voutput += "%s-%s / " % (term, hitlist[recid])
        voutput += "<br />"

    voutput += "<br />Score variation:<br />"
    # Histogram of scores over the whole result list.
    count = {}
    for entry in reclist:
        count[entry[1]] = count.get(entry[1], 0) + 1
    for score in range(100, -1, -1):
        if score in count:
            voutput += "%s-%s<br />" % (score, count[score])
    # NOTE(review): voutput is assembled but never returned nor stored;
    # presumably it was meant to update a module-level global — confirm.
Ejemplo n.º 5
0
def fromDB(rank_method_code):
    """Return the stored relevance data for a rank method.

    rank_method_code - name field from "rnkMETHOD"

    Returns the marshal-deserialized relevance dictionary, or {} when no
    data row exists for the method.
    """
    # Renamed from ``id`` to avoid shadowing the builtin.
    method_id = run_sql("""SELECT id from "rnkMETHOD" where name=%s""", (rank_method_code, ))
    res = run_sql("""SELECT relevance_data FROM "rnkMETHODDATA" WHERE "id_rnkMETHOD"=%s""", (method_id[0][0], ))
    if res:
        return deserialize_via_marshal(res[0][0])
    else:
        return {}
Ejemplo n.º 6
0
def fromDB(rank_method_code):
    """Get the data for a rank method"""
    # Resolve the method name to its numeric id, then fetch the
    # serialized relevance data attached to it.
    method_rows = run_sql("SELECT id from rnkMETHOD where name=%s",
                          (rank_method_code, ))
    data_rows = run_sql(
        "SELECT relevance_data FROM rnkMETHODDATA WHERE id_rnkMETHOD=%s",
        (method_rows[0][0], ))
    if not data_rows:
        return {}
    return deserialize_via_marshal(data_rows[0][0])
Ejemplo n.º 7
0
def get_uid_based_on_pref(prefname, prefvalue):
    """Return the UID of a user whose preference ``prefname`` equals
    ``prefvalue``, or None when no such user exists.

    When several users match, the last matching row wins (original
    behavior, preserved).
    """
    prefs = run_sql("SELECT id, settings FROM user WHERE settings is not NULL")
    the_uid = None
    for pref in prefs:
        try:
            settings = deserialize_via_marshal(pref[1])
            if (prefname in settings) and (settings[prefname] == prefvalue):
                the_uid = pref[0]
        except Exception:
            # BUGFIX: was a bare ``except:`` which also swallowed
            # SystemExit/KeyboardInterrupt.  Keep the best-effort skip of
            # rows with corrupt serialized settings, but only for real
            # errors.
            pass
    return the_uid
Ejemplo n.º 8
0
def get_uid_based_on_pref(prefname, prefvalue):
    """Return the UID of a user whose preference ``prefname`` equals
    ``prefvalue``, or None when no such user exists.

    When several users match, the last matching row wins (original
    behavior, preserved).
    """
    prefs = run_sql("""SELECT id, settings FROM "user" WHERE settings is not NULL""")
    the_uid = None
    for pref in prefs:
        try:
            settings = deserialize_via_marshal(pref[1])
            if (prefname in settings) and (settings[prefname] == prefvalue):
                the_uid = pref[0]
        except Exception:
            # BUGFIX: was a bare ``except:`` which also swallowed
            # SystemExit/KeyboardInterrupt; only swallow real errors from
            # corrupt serialized settings.
            pass
    return the_uid
def do_upgrade():
    """do upgrade."""
    # Rename the legacy 'c_cfg-file' argument to 'c_stylesheet' in every
    # serialized oaiHARVEST arguments blob that still carries it.
    rows = run_sql(
        "SELECT id, arguments FROM oaiHARVEST", with_dict=True)
    for row in rows:
        blob = row['arguments']
        if not blob:
            continue
        arguments = deserialize_via_marshal(blob)
        if "c_cfg-file" not in arguments:
            continue
        arguments['c_stylesheet'] = arguments.pop('c_cfg-file')
        run_sql("UPDATE oaiHARVEST set arguments=%s WHERE id=%s",
                (serialize_via_marshal(arguments), row['id']))
def do_upgrade():
    """do upgrade."""
    rows_to_change = run_sql("SELECT id, arguments FROM oaiHARVEST",
                             with_dict=True)
    # Move away from old columns
    for row in rows_to_change:
        if row['arguments']:
            arguments = deserialize_via_marshal(row['arguments'])
            # Rename the legacy 'c_cfg-file' key to 'c_stylesheet' and
            # write the re-serialized blob back.
            if "c_cfg-file" in arguments:
                arguments['c_stylesheet'] = arguments['c_cfg-file']
                del arguments['c_cfg-file']
                run_sql("UPDATE oaiHARVEST set arguments=%s WHERE id=%s",
                        (serialize_via_marshal(arguments), row['id']))
Ejemplo n.º 11
0
def get_data_for_definition_rnk(method_name, rnk_name):
    '''Returns the dictionary with data for method_name ranking method.

    Returns {} when no data is available or a database error occurs.
    '''
    try:
        res = run_sql('SELECT d.relevance_data \
                          from "rnkMETHODDATA" d, "rnkMETHOD" r WHERE \
                          d."id_rnkMETHOD" = r.id AND \
                          r.name = %s', (rnk_name, ))
        if res and res[0]:
            write_message('Data extracted from table rnkMETHODDATA for sorting method %s' \
                          %method_name, verbose=5)
            return deserialize_via_marshal(res[0][0])
        # BUGFIX: previously fell through and returned None when the query
        # yielded no rows; return {} for consistency with the error path.
        return {}
    except Error as err:
        # BUGFIX: typo "errror occured" corrected in the emitted message.
        write_message("No data could be found for sorting method %s. " \
                      "The following error occurred: [%s]" \
                      %(method_name, err), stream=sys.stderr)
        return {}
Ejemplo n.º 12
0
def del_recids(rank_method_code, range_rec):
    """Delete some records from the rank method.

    rank_method_code - name field from "rnkMETHOD"
    range_rec - iterable of (start, end) recid ranges to remove
    """
    # Renamed from ``id`` to avoid shadowing the builtin.
    method_id = run_sql("""SELECT id from "rnkMETHOD" where name=%s""", (rank_method_code, ))
    res = run_sql("""SELECT relevance_data FROM "rnkMETHODDATA" WHERE "id_rnkMETHOD"=%s""", (method_id[0][0], ))
    if res:
        rec_dict = deserialize_via_marshal(res[0][0])
        write_message("Old size: %s" % len(rec_dict))
        for (recids, recide) in range_rec:
            # NOTE(review): the upper bound ``recide`` is exclusive here,
            # so the record with id ``recide`` itself is kept — confirm
            # callers expect that.
            for i in range(int(recids), int(recide)):
                if i in rec_dict:
                    del rec_dict[i]
        write_message("New size: %s" % len(rec_dict))
        begin_date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        intoDB(rec_dict, begin_date, rank_method_code)
    else:
        write_message("Create before deleting!")
Ejemplo n.º 13
0
def get_data_for_definition_rnk(method_name, rnk_name):
    '''Returns the dictionary with data for method_name ranking method'''
    try:
        # Join rnkMETHODDATA with rnkMETHOD to fetch the serialized
        # relevance data by method name.
        res = run_sql(
            'SELECT d.relevance_data \
                          from rnkMETHODDATA d, rnkMETHOD r WHERE \
                          d.id_rnkMETHOD = r.id AND \
                          r.name = %s', (rnk_name, ))
        if res and res[0]:
            write_message('Data extracted from table rnkMETHODDATA for sorting method %s' \
                          %method_name, verbose=5)
            return deserialize_via_marshal(res[0][0])
        # NOTE(review): falls through to an implicit None when the query
        # returns no rows, while the error path below returns {} — confirm
        # callers handle both.
    except Error as err:
        # NOTE(review): "errror occured" is a typo in the emitted message.
        write_message("No data could be found for sorting method %s. " \
                      "The following errror occured: [%s]" \
                      %(method_name, err), stream=sys.stderr)
        return {}
Ejemplo n.º 14
0
def del_recids(rank_method_code, range_rec):
    """Delete some records from the rank method"""
    # Look up the method id, then its serialized relevance dictionary.
    id = run_sql("SELECT id from rnkMETHOD where name=%s",
                 (rank_method_code, ))
    res = run_sql(
        "SELECT relevance_data FROM rnkMETHODDATA WHERE id_rnkMETHOD=%s",
        (id[0][0], ))
    if res:
        rec_dict = deserialize_via_marshal(res[0][0])
        write_message("Old size: %s" % len(rec_dict))
        for (recids, recide) in range_rec:
            # NOTE(review): the upper bound is exclusive — the record with
            # id ``recide`` itself is kept; confirm that is intended.
            for i in range(int(recids), int(recide)):
                if i in rec_dict:
                    del rec_dict[i]
        write_message("New size: %s" % len(rec_dict))
        begin_date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        # Persist the shrunk dictionary back under the method name.
        intoDB(rec_dict, begin_date, rank_method_code)
    else:
        write_message("Create before deleting!")
def do_upgrade():
    """do upgrade."""
    # Inspect the current table definition to decide which migration
    # steps are still pending (makes the upgrade re-runnable).
    create_statement = run_sql('SHOW CREATE TABLE oaiHARVEST')[0][1]
    if '`arguments` text' in create_statement:
        # marshal blobs are binary; keep them in a blob column.
        run_sql("ALTER TABLE oaiHARVEST CHANGE arguments arguments blob")
    # translate old values
    if '`bibconvertcfgfile`' in create_statement:
        rows_to_change = run_sql(
            """SELECT id, bibconvertcfgfile, bibfilterprogram, arguments
               FROM oaiHARVEST""", with_dict=True)
        # Move away from old columns
        for row in rows_to_change:
            if row['arguments']:
                arguments = deserialize_via_marshal(row['arguments'])
            else:
                arguments = {}
            # Fold the two legacy columns into the serialized arguments.
            arguments['c_cfg-file'] = row['bibconvertcfgfile']
            arguments['f_filter-file'] = row['bibfilterprogram']
            run_sql("UPDATE oaiHARVEST set arguments=%s WHERE id=%s",
                    (serialize_via_marshal(arguments), row['id']))
        run_sql("ALTER TABLE oaiHARVEST DROP COLUMN bibconvertcfgfile")
        run_sql("ALTER TABLE oaiHARVEST DROP COLUMN bibfilterprogram")
def do_upgrade():
    """do upgrade."""
    # Inspect the live table definition so the upgrade can be re-run
    # safely: each step only fires if still needed.
    create_statement = run_sql('SHOW CREATE TABLE oaiHARVEST')[0][1]
    if '`arguments` text' in create_statement:
        # marshal output is binary, so the column must be a blob.
        run_sql("ALTER TABLE oaiHARVEST CHANGE arguments arguments blob")
    # translate old values
    if '`bibconvertcfgfile`' in create_statement:
        rows_to_change = run_sql(
            """SELECT id, bibconvertcfgfile, bibfilterprogram, arguments
               FROM oaiHARVEST""",
            with_dict=True)
        # Move away from old columns
        for row in rows_to_change:
            if row['arguments']:
                arguments = deserialize_via_marshal(row['arguments'])
            else:
                arguments = {}
            # Merge the legacy columns into the serialized arguments dict.
            arguments['c_cfg-file'] = row['bibconvertcfgfile']
            arguments['f_filter-file'] = row['bibfilterprogram']
            run_sql("UPDATE oaiHARVEST set arguments=%s WHERE id=%s",
                    (serialize_via_marshal(arguments), row['id']))
        run_sql("ALTER TABLE oaiHARVEST DROP COLUMN bibconvertcfgfile")
        run_sql("ALTER TABLE oaiHARVEST DROP COLUMN bibfilterprogram")
Ejemplo n.º 17
0
def word_similarity(rank_method_code, lwords, hitset, rank_limit_relevance,
                    verbose, methods):
    """Ranking a records containing specified words and returns a sorted list.
    input:
    rank_method_code - the code of the method, from the name field in rnkMETHOD
    lwords - a list of words from the query
    hitset - a list of hits for the query found by search_engine
    rank_limit_relevance - show only records with a rank value above this
    verbose - verbose value
    output:
    reclist - a list of sorted records: [[23,34], [344,24], [1,01]]
    prefix - what to show before the rank value
    postfix - what to show after the rank value
    voutput - contains extra information, content dependent on verbose value"""
    voutput = ""
    startCreate = time.time()

    if verbose > 0:
        voutput += "<br />Running rank method: %s, using word_frequency function in bibrank_record_sorter<br />" % rank_method_code

    lwords_old = lwords
    lwords = []
    #Check terms, remove non alphanumeric characters. Use both unstemmed and stemmed version of all terms.
    # NOTE(review): the condition below parses as
    # (not stopwords == "True") or (stopwords and not is_stopword(term))
    # — confirm that precedence matches the intended stopword filter.
    for i in range(0, len(lwords_old)):
        term = string.lower(lwords_old[i])
        if not methods[rank_method_code]["stopwords"] == "True" or methods[
                rank_method_code]["stopwords"] and not is_stopword(term):
            lwords.append((term, methods[rank_method_code]["rnkWORD_table"]))
            terms = string.split(
                string.lower(
                    re.sub(
                        methods[rank_method_code]
                        ["chars_alphanumericseparators"], ' ', term)))
            for term in terms:
                if "stemmer" in methods[rank_method_code]:  # stem word
                    term = stem(string.replace(term, ' ', ''),
                                methods[rank_method_code]["stemmer"])
                if lwords_old[
                        i] != term:  #add if stemmed word is different than original word
                    lwords.append(
                        (term, methods[rank_method_code]["rnkWORD_table"]))

    # NOTE(review): lrecIDs_remove is initialized but never used in this
    # function.
    (recdict, rec_termcount, lrecIDs_remove) = ({}, {}, {})
    #For each term, if accepted, get a list of the records using the term
    #calculate then relevance for each term before sorting the list of records
    for (term, table) in lwords:
        # Table name is interpolated (cannot be a bound parameter); the
        # term itself stays parameterized.
        term_recs = run_sql(
            """SELECT term, hitlist FROM %s WHERE term=%%s""" %
            methods[rank_method_code]["rnkWORD_table"], (term, ))
        if term_recs:  #if term exists in database, use for ranking
            term_recs = deserialize_via_marshal(term_recs[0][1])
            # "Gi" holds the term's global weight entry in the
            # deserialized structure — presumably an IDF-like component;
            # confirm against the word indexer.
            (recdict, rec_termcount) = calculate_record_relevance(
                (term, int(term_recs["Gi"][1])),
                term_recs,
                hitset,
                recdict,
                rec_termcount,
                verbose,
                quick=None)
            del term_recs

    if len(recdict) == 0 or (len(lwords) == 1 and lwords[0] == ""):
        return (
            None,
            "Records not ranked. The query is not detailed enough, or not enough records found, for ranking to be possible.",
            "", voutput)
    else:  #sort if we got something to sort
        (reclist, hitset) = sort_record_relevance(recdict, rec_termcount,
                                                  hitset, rank_limit_relevance,
                                                  verbose)

    #Add any documents not ranked to the end of the list
    if hitset:
        lrecIDs = list(hitset)  #using 2-3mb
        reclist = zip(lrecIDs, [0] * len(lrecIDs)) + reclist  #using 6mb

    if verbose > 0:
        voutput += "<br />Current number of recIDs: %s<br />" % (
            methods[rank_method_code]["col_size"])
        voutput += "Number of terms: %s<br />" % run_sql(
            "SELECT count(id) FROM %s" %
            methods[rank_method_code]["rnkWORD_table"])[0][0]
        voutput += "Terms: %s<br />" % lwords
        voutput += "Prepare and pre calculate time: %s<br />" % (
            str(time.time() - startCreate))
        voutput += "Total time used: %s<br />" % (str(time.time() -
                                                      startCreate))
        voutput += str(reclist) + "<br />"
        rank_method_stat(rank_method_code, reclist, lwords)
    return (reclist, methods[rank_method_code]["prefix"],
            methods[rank_method_code]["postfix"], voutput)
Ejemplo n.º 18
0
def rank_by_method(rank_method_code, lwords, hitset, rank_limit_relevance,
                   verbose):
    """Ranking of records based on predetermined values.
    input:
    rank_method_code - the code of the method, from the name field in rnkMETHOD, used to get predetermined values from
    rnkMETHODDATA
    lwords - a list of words from the query
    hitset - a list of hits for the query found by search_engine
    rank_limit_relevance - show only records with a rank value above this
    verbose - verbose value
    output:
    reclist - a list of sorted records, with unsorted added to the end: [[23,34], [344,24], [1,01]]
    prefix - what to show before the rank value
    postfix - what to show after the rank value
    voutput - contains extra information, content dependent on verbose value"""

    voutput = ""
    rnkdict = run_sql(
        "SELECT relevance_data FROM rnkMETHODDATA,rnkMETHOD where rnkMETHOD.id=id_rnkMETHOD and rnkMETHOD.name=%s",
        (rank_method_code, ))

    if not rnkdict:
        return (None, "Warning: Could not load ranking data for method %s." %
                rank_method_code, "", voutput)

    max_recid = 0
    res = run_sql("SELECT max(id) FROM bibrec")
    if res and res[0][0]:
        max_recid = int(res[0][0])

    lwords_hitset = None
    # Collect explicit recid:N / recid:N->M restrictions from the query
    # terms; should arguably be done in search_engine itself.
    for j in range(0, len(lwords)):
        if lwords[j] and lwords[j][:6] == "recid:":
            if not lwords_hitset:
                lwords_hitset = intbitset()
            lword = lwords[j][6:]
            if lword.find("->") > -1:
                # Span query, e.g. recid:10->20.
                lword = lword.split("->")
                # NOTE(review): asymmetric bounds (>= max_recid vs.
                # >= max_recid + 1) kept as-is — confirm intended.
                if int(lword[0]) >= max_recid or int(
                        lword[1]) >= max_recid + 1:
                    return (None,
                            "Warning: Given record IDs are out of range.", "",
                            voutput)
                for i in range(int(lword[0]), int(lword[1])):
                    lwords_hitset.add(int(i))
            elif int(lword) < max_recid + 1:
                # BUGFIX: ``lword`` is a string; comparing it to an int
                # made this check always fail (str/int comparison), so
                # every single-recid query was rejected as out of range.
                lwords_hitset.add(int(lword))
            else:
                return (None, "Warning: Given record IDs are out of range.",
                        "", voutput)

    rnkdict = deserialize_via_marshal(rnkdict[0][0])
    if verbose > 0:
        voutput += "<br />Running rank method: %s, using rank_by_method function in bibrank_record_sorter<br />" % rank_method_code
        voutput += "Ranking data loaded, size of structure: %s<br />" % len(
            rnkdict)
    lrecIDs = list(hitset)

    if verbose > 0:
        voutput += "Number of records to rank: %s<br />" % len(lrecIDs)
    reclist = []
    reclist_addend = []

    if not lwords_hitset:  #rank all docs, can this be speed up using something else than for loop?
        for recID in lrecIDs:
            if recID in rnkdict:
                reclist.append((recID, rnkdict[recID]))
                del rnkdict[recID]
            else:
                reclist_addend.append((recID, 0))
    else:  #rank docs in hitset, can this be speed up using something else than for loop?
        for recID in lwords_hitset:
            if recID in rnkdict and recID in hitset:
                reclist.append((recID, rnkdict[recID]))
                del rnkdict[recID]
            elif recID in hitset:
                reclist_addend.append((recID, 0))

    if verbose > 0:
        voutput += "Number of records ranked: %s<br />" % len(reclist)
        voutput += "Number of records not ranked: %s<br />" % len(
            reclist_addend)

    # Ascending sort by score; key-based sort replaces the Python 2
    # cmp-style sort (removed in Python 3) with identical ordering.
    reclist.sort(key=lambda x: x[1])
    return (reclist_addend + reclist, METHODS[rank_method_code]["prefix"],
            METHODS[rank_method_code]["postfix"], voutput)
Ejemplo n.º 19
0
def search_unit_in_idxpairs(p, f, m, wl=0):
    """Search for pair 'p' in idxPAIR table for field 'f' and return hitset.

    p - search pattern (may contain '*' truncation or an 'a->b' span)
    f - logical field whose idxPAIR table is queried
    m - matching mode, forwarded to the phrase-search fallback
    wl - wildcard query limit; 0 disables the limit
    """
    from invenio.modules.indexer.tokenizers.BibIndexDefaultTokenizer import BibIndexDefaultTokenizer

    # flag for knowing if the query limit has been reached
    limit_reached = False
    # flag to know when it makes sense to try to do exact matching
    do_exact_search = True
    # BUGFIX: start from None instead of an empty intbitset.  The result
    # accumulation below assigns the first hitlist when result_set is None
    # and intersects afterwards; intersecting into an initially empty set
    # always produced an empty result.
    result_set = None
    # determine the idxPAIR table to read from
    index = IdxINDEX.get_from_field(f)
    if index is None:
        return intbitset()
    model = index.pairf
    column = model.term
    stemming_language = index.stemming_language
    pairs_tokenizer = BibIndexDefaultTokenizer(stemming_language)

    conditions = []

    if p.startswith("%") and p.endswith("%"):
        p = p[1:-1]
    original_pattern = p
    # we now use '*' as the truncation character
    p = p.replace("*", "%")
    # is it a span query?
    ps = p.split("->", 1)
    if len(ps) == 2 and not (ps[0].endswith(" ") or ps[1].startswith(" ")):
        # so we are dealing with a span query
        pairs_left = pairs_tokenizer.tokenize_for_pairs(ps[0])
        pairs_right = pairs_tokenizer.tokenize_for_pairs(ps[1])
        if not pairs_left or not pairs_right:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        elif len(pairs_left) != len(pairs_right):
            # it is kind of hard to know what the user actually wanted
            # we have to do: foo bar baz -> qux xyz, so let's swith to phrase
            return search_unit_in_idxphrases(original_pattern, f, m, wl)
        elif len(pairs_left) > 1 and len(pairs_right) > 1 and pairs_left[:-1] != pairs_right[:-1]:
            # again we have something like: foo bar baz -> abc xyz qux
            # so we'd better switch to phrase
            return search_unit_in_idxphrases(original_pattern, f, m, wl)
        else:
            # finally, we can treat the search using idxPairs
            # at this step we have either: foo bar -> abc xyz
            # or foo bar abc -> foo bar xyz
            conditions.append((column.between(pairs_left[-1], pairs_right[-1]), True))
            # which should be equal with pairs_right[:-1]
            for pair in pairs_left[:-1]:
                conditions.append((column == pair, False))
        do_exact_search = False  # no exact search for span queries
    elif p.find("%") > -1:
        # tokenizing p will remove the '%', so we have to make sure it stays
        replacement = "xxxxxxxxxx"
        # hopefuly this will not clash with anything in the future
        p = p.replace("%", replacement)
        pairs = pairs_tokenizer.tokenize_for_pairs(p)
        if not pairs:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        for pair in pairs:
            if pair.find(replacement) > -1:
                # we replace back the % sign
                pair = pair.replace(replacement, "%")
                conditions.append((column.like(pair), True))
            else:
                conditions.append((column == pair, False))
        do_exact_search = False
    else:
        # normal query
        pairs = pairs_tokenizer.tokenize_for_pairs(p)
        if not pairs:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        for pair in pairs:
            conditions.append((column == pair, False))

    for condition, use_query_limit in conditions:
        query = model.query.filter(condition)
        if use_query_limit and wl > 0:
            query = query.limit(wl)
        res = query.values(model.term, model.hitlist)
        limit_reached |= use_query_limit and wl > 0 and len(res) == wl
        if not res:
            return intbitset()
        for pair, hitlist in res:
            hitset_idxpairs = intbitset(hitlist)
            if result_set is None:
                result_set = hitset_idxpairs
            else:
                result_set.intersection_update(hitset_idxpairs)
    # check to see if the query limit was reached
    if limit_reached:
        # raise an exception, so we can print a nice message to the user
        raise InvenioWebSearchWildcardLimitError(result_set)

    # check if we need to eliminate the false positives
    if cfg["CFG_WEBSEARCH_IDXPAIRS_EXACT_SEARCH"] and do_exact_search:
        # we need to eliminate the false positives
        model = IdxINDEX.idxPHRASER(f)
        not_exact_search = intbitset()
        for recid in result_set:
            res = model.query.filter(model.id_bibrec == recid).value(model.termlist)
            if res:
                termlist = deserialize_via_marshal(res)
                if not [term for term in termlist if term.lower().find(p.lower()) > -1]:
                    not_exact_search.add(recid)
            else:
                not_exact_search.add(recid)
        # remove the recs that are false positives from the final result
        result_set.difference_update(not_exact_search)
    # a None result (no conditions matched) falls back to an empty hitset
    return result_set or intbitset()
Ejemplo n.º 20
0
def update_bibsort_tables(recids, method, update_timestamp = True):
    """Updates the data structures for sorting method: method
    for the records in recids.

    recids - intbitset of record ids to refresh
    method - name field from "bsrMETHOD"
    update_timestamp - whether to touch the method's last-updated stamp

    Returns True on success, False on any failure."""

    res = run_sql("""SELECT id, definition, washer
                  from "bsrMETHOD" where name = %s""", (method, ))
    if res and res[0]:
        method_id = res[0][0]
        definition = res[0][1]
        washer = res[0][2]
    else:
        write_message('No sorting method called %s could be found ' \
                      'in bsrMETHOD table.' %method, sys.stderr)
        return False
    res = run_sql("""SELECT data_dict, data_dict_ordered, data_list_sorted
                  FROM "bsrMETHODDATA" where "id_bsrMETHOD" = %s""", (method_id, ))
    if res and res[0]:
        # Only data_dict is deserialized now; the ordered structures are
        # deserialized later, and only if there is actual work to do.
        data_dict = deserialize_via_marshal(res[0][0])
        data_dict_ordered = {}
        data_list_sorted = []
    else:
        write_message('No data could be found for the sorting method %s.' \
                      %method)
        return False #since this case should have been treated earlier
    #get the values for the recids that need to be recalculated
    field_data = get_field_data(recids, method, definition)
    if not field_data:
        write_message("Possible error: the method %s has no data for records %s." \
                      %(method, str(recids)))
    else:
        apply_washer(field_data, washer)

    #if a recid is not in field_data that is because no value was found for it
    #so it should be marked for deletion
    recids_to_delete = list(recids.difference(intbitset(field_data.keys())))
    recids_to_insert = []
    recids_to_modify = {}
    for recid in field_data:
        if recid in data_dict:
            if data_dict[recid] != field_data[recid]:
                #we store the old value
                recids_to_modify[recid] = data_dict[recid]
        else: # recid is new, and needs to be inserted
            recids_to_insert.append(recid)

    #remove the recids that were not previously in bibsort
    recids_to_delete = [recid for recid in recids_to_delete if recid in data_dict]

    #dicts to keep the ordered values for the recids - useful bor bucket insertion
    recids_current_ordered = {}
    recids_old_ordered = {}

    if recids_to_insert or recids_to_modify or recids_to_delete:
        # Deserialize the two large ordered structures only now that we
        # know changes must be applied.
        data_dict_ordered = deserialize_via_marshal(res[0][1])
        data_list_sorted = deserialize_via_marshal(res[0][2])
        if recids_to_modify:
            write_message("%s records have been modified." \
                          %len(recids_to_modify), verbose=5)
            for recid in recids_to_modify:
                recids_old_ordered[recid] = data_dict_ordered[recid]
                perform_modify_record(data_dict, data_dict_ordered, \
                                data_list_sorted, field_data[recid], recid)
        if recids_to_insert:
            write_message("%s records have been inserted." \
                          %len(recids_to_insert), verbose=5)
            for recid in recids_to_insert:
                perform_insert_record(data_dict, data_dict_ordered, \
                                data_list_sorted, field_data[recid], recid)
        if recids_to_delete:
            write_message("%s records have been deleted." \
                          %len(recids_to_delete), verbose=5)
            for recid in recids_to_delete:
                perform_delete_record(data_dict, data_dict_ordered, data_list_sorted, recid)

        # Snapshot the new ordered values for bucket maintenance below.
        for recid in recids_to_modify:
            recids_current_ordered[recid] = data_dict_ordered[recid]
        for recid in recids_to_insert:
            recids_current_ordered[recid] = data_dict_ordered[recid]

        #write the modifications to db
        executed = write_to_methoddata_table(method_id, data_dict, \
                                         data_dict_ordered, data_list_sorted, update_timestamp)
        if not executed:
            return False

        #update buckets
        try:
            perform_update_buckets(recids_current_ordered, recids_to_insert, recids_old_ordered, method_id, update_timestamp)
        except Error as err:
            write_message("[%s] The bucket data for method %s has not been updated" \
                          %(method, err), sys.stderr)
            return False
    return True
Ejemplo n.º 21
0
def search_unit_in_idxpairs(p, f, m, wl=0):
    """Search for pair 'p' in idxPAIR table for field 'f' and return hitset.

    p - the search pattern; may use '*' truncation or an 'a->b' span
    f - the logical field whose idxPAIR table is queried
    m - matching mode (kept for interface compatibility)
    wl - wildcard query limit; 0 disables the limit
    Raises InvenioWebSearchWildcardLimitError when a wildcard query hits 'wl'.
    """
    from invenio.modules.indexer.tokenizers.BibIndexDefaultTokenizer import (
        BibIndexDefaultTokenizer)
    # flag for knowing if the query limit has been reached
    limit_reached = False
    # flag to know when it makes sense to try to do exact matching
    do_exact_search = True
    # BUGFIX: start from None instead of an empty intbitset.  With an empty
    # set the 'result_set is None' branch below was dead code and the first
    # intersection_update() always produced an empty result.
    result_set = None
    # determine the idxPAIR table to read from
    index = IdxINDEX.get_from_field(f)
    if index is None:
        return intbitset()
    model = index.pairf
    column = model.term
    stemming_language = index.stemming_language
    pairs_tokenizer = BibIndexDefaultTokenizer(stemming_language)

    # each entry is (SQLAlchemy condition, whether the wildcard limit applies)
    conditions = []

    if p.startswith("%") and p.endswith("%"):
        p = p[1:-1]
    original_pattern = p
    # we now use '*' as the truncation character
    p = p.replace('*', '%')
    # is it a span query?
    ps = p.split("->", 1)
    if len(ps) == 2 and not (ps[0].endswith(' ') or ps[1].startswith(' ')):
        # so we are dealing with a span query
        pairs_left = pairs_tokenizer.tokenize_for_pairs(ps[0])
        pairs_right = pairs_tokenizer.tokenize_for_pairs(ps[1])
        if not pairs_left or not pairs_right:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        elif len(pairs_left) != len(pairs_right):
            # it is kind of hard to know what the user actually wanted
            # we have to do: foo bar baz -> qux xyz, so let's swith to phrase
            return search_unit_in_idxphrases(original_pattern, f, m, wl)
        elif len(pairs_left) > 1 and \
                len(pairs_right) > 1 and \
                pairs_left[:-1] != pairs_right[:-1]:
            # again we have something like: foo bar baz -> abc xyz qux
            # so we'd better switch to phrase
            return search_unit_in_idxphrases(original_pattern, f, m, wl)
        else:
            # finally, we can treat the search using idxPairs
            # at this step we have either: foo bar -> abc xyz
            # or foo bar abc -> foo bar xyz
            conditions.append((column.between(pairs_left[-1],
                                              pairs_right[-1]), True))
            # which should be equal with pairs_right[:-1]
            for pair in pairs_left[:-1]:
                conditions.append((column == pair, False))
        do_exact_search = False  # no exact search for span queries
    elif p.find('%') > -1:
        # tokenizing p will remove the '%', so we have to make sure it stays
        replacement = 'xxxxxxxxxx'
        # hopefuly this will not clash with anything in the future
        p = p.replace('%', replacement)
        pairs = pairs_tokenizer.tokenize_for_pairs(p)
        if not pairs:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        for pair in pairs:
            if pair.find(replacement) > -1:
                # we replace back the % sign
                pair = pair.replace(replacement, '%')
                conditions.append((column.like(pair), True))
            else:
                conditions.append((column == pair, False))
        do_exact_search = False
    else:
        # normal query
        pairs = pairs_tokenizer.tokenize_for_pairs(p)
        if not pairs:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        for pair in pairs:
            conditions.append((column == pair, False))

    for condition, use_query_limit in conditions:
        query = model.query.filter(condition)
        if use_query_limit and wl > 0:
            query = query.limit(wl)
        res = query.values(model.term, model.hitlist)
        limit_reached |= use_query_limit and wl > 0 and len(res) == wl
        if not res:
            return intbitset()
        for pair, hitlist in res:
            hitset_idxpairs = intbitset(hitlist)
            if result_set is None:
                # first hitlist seen: seed the result instead of intersecting
                result_set = hitset_idxpairs
            else:
                result_set.intersection_update(hitset_idxpairs)

    # no condition produced a hitlist (e.g. empty tokenization): empty result
    if result_set is None:
        result_set = intbitset()

    # check to see if the query limit was reached
    if limit_reached:
        # raise an exception, so we can print a nice message to the user
        raise InvenioWebSearchWildcardLimitError(result_set)

    # check if we need to eliminate the false positives
    if cfg['CFG_WEBSEARCH_IDXPAIRS_EXACT_SEARCH'] and do_exact_search:
        # we need to eliminate the false positives
        model = IdxINDEX.idxPHRASER(f)
        not_exact_search = intbitset()
        for recid in result_set:
            res = model.query.filter(model.id_bibrec == recid).value(
                model.termlist)
            if res:
                termlist = deserialize_via_marshal(res)
                if not [
                        term for term in termlist
                        if term.lower().find(p.lower()) > -1
                ]:
                    not_exact_search.add(recid)
            else:
                not_exact_search.add(recid)
        # remove the recs that are false positives from the final result
        result_set.difference_update(not_exact_search)
    return result_set or intbitset()
Ejemplo n.º 22
0
 def ordered(self):
     """Return the ordered data dict, deserialized from its marshal form."""
     raw = self.data_dict_ordered
     return deserialize_via_marshal(raw)
Ejemplo n.º 23
0
def find_similar(rank_method_code, recID, hitset, rank_limit_relevance, verbose, methods):
    """Finding terms to use for calculating similarity. Terms are taken from the recid given, returns a list of recids's and relevance,
    input:
    rank_method_code - the code of the method, from the name field in rnkMETHOD
    recID - records to use for find similar
    hitset - a list of hits for the query found by search_engine
    rank_limit_relevance - show only records with a rank value above this
    verbose - verbose value
    output:
    reclist - a list of sorted records: [[23,34], [344,24], [1,01]]
    prefix - what to show before the rank value
    postfix - what to show after the rank value
    voutput - contains extra information, content dependent on verbose value"""

    startCreate = time.time()
    global voutput
    voutput = ""

    if verbose > 0:
        voutput += (
            "<br />Running rank method: %s, using find_similar/word_frequency in bibrank_record_sorter<br />"
            % rank_method_code
        )
    # the caller-supplied threshold is overridden by the method's default
    rank_limit_relevance = methods[rank_method_code]["default_min_relevance"]

    try:
        recID = int(recID)
    except (ValueError, TypeError):
        # narrowed from a blanket 'except Exception': only the int()
        # conversion can fail here
        return (None, "Warning: Error in record ID, please check that a number is given.", "", voutput)

    rec_terms = run_sql(
        """SELECT termlist FROM %sR WHERE id_bibrec=%%s""" % methods[rank_method_code]["rnkWORD_table"][:-1], (recID,)
    )
    if not rec_terms:
        return (None, "Warning: Requested record does not seem to exist.", "", voutput)
    rec_terms = deserialize_via_marshal(rec_terms[0][0])

    # Get all documents using terms from the selected documents
    if len(rec_terms) == 0:
        return (None, "Warning: Record specified has no content indexed for use with this method.", "", voutput)
    else:
        # list() keeps the repr identical on Python 2 and 3 (a bare
        # dict.keys() view would repr as "dict_keys([...])" on py3); the
        # [1:-1] slice strips the brackets to build the SQL IN(...) list.
        # NOTE(review): terms come from the index table itself, not user
        # input, but this is still string-built SQL -- confirm acceptable.
        terms = "%s" % list(rec_terms.keys())
        terms_recs = dict(
            run_sql(
                """SELECT term, hitlist FROM %s WHERE term IN (%s)"""
                % (methods[rank_method_code]["rnkWORD_table"], terms[1 : len(terms) - 1])
            )
        )

    tf_values = {}
    # Calculate all term frequencies
    for (term, tf) in iteritems(rec_terms):
        if len(term) >= methods[rank_method_code]["min_word_length"] and term in terms_recs and tf[1] != 0:
            tf_values[term] = int((1 + math.log(tf[0])) * tf[1])  # calculate term weigth
    # sort terms by descending weight; the key-based sort replaces the
    # Python-2-only cmp-style list.sort() and also works on py3 dict views
    tf_values = sorted(tf_values.items(), key=lambda item: item[1], reverse=True)

    lwords = []
    stime = time.time()
    (recdict, rec_termcount) = ({}, {})

    for (t, tf) in tf_values:  # t=term, tf=term frequency
        term_recs = deserialize_via_marshal(terms_recs[t])
        # use the term if few terms exist overall, or if it is common enough
        # to be meaningful yet not so common that it matches everything
        if len(tf_values) <= methods[rank_method_code]["max_nr_words_lower"] or (
            len(term_recs) >= methods[rank_method_code]["min_nr_words_docs"]
            and (
                (
                    (float(len(term_recs)) / float(methods[rank_method_code]["col_size"]))
                    <= methods[rank_method_code]["max_word_occurence"]
                )
                and (
                    (float(len(term_recs)) / float(methods[rank_method_code]["col_size"]))
                    >= methods[rank_method_code]["min_word_occurence"]
                )
            )
        ):  # too complicated...something must be done
            lwords.append((t, methods[rank_method_code]["rnkWORD_table"]))  # list of terms used
            (recdict, rec_termcount) = calculate_record_relevance_findsimilar(
                (t, round(tf, 4)), term_recs, hitset, recdict, rec_termcount, verbose, "true"
            )  # true tells the function to not calculate all unimportant terms
        if len(tf_values) > methods[rank_method_code]["max_nr_words_lower"] and (
            len(lwords) == methods[rank_method_code]["max_nr_words_upper"] or tf < 0
        ):
            break

    if len(recdict) == 0 or len(lwords) == 0:
        return (None, "Could not find similar documents for this query.", "", voutput)
    else:  # sort if we got something to sort
        (reclist, hitset) = sort_record_relevance_findsimilar(
            recdict, rec_termcount, hitset, rank_limit_relevance, verbose
        )

    if verbose > 0:
        voutput += (
            "<br />Number of terms: %s<br />"
            % run_sql("SELECT count(id) FROM %s" % methods[rank_method_code]["rnkWORD_table"])[0][0]
        )
        voutput += "Number of terms to use for query: %s<br />" % len(lwords)
        voutput += "Terms: %s<br />" % lwords
        voutput += "Current number of recIDs: %s<br />" % (methods[rank_method_code]["col_size"])
        voutput += "Prepare time: %s<br />" % (str(time.time() - startCreate))
        voutput += "Total time used: %s<br />" % (str(time.time() - startCreate))
        rank_method_stat(rank_method_code, reclist, lwords)

    return (reclist, methods[rank_method_code]["prefix"], methods[rank_method_code]["postfix"], voutput)
Ejemplo n.º 24
0
def word_similarity(rank_method_code, lwords, hitset, rank_limit_relevance, verbose, methods):
    """Ranking a records containing specified words and returns a sorted list.
    input:
    rank_method_code - the code of the method, from the name field in rnkMETHOD
    lwords - a list of words from the query
    hitset - a list of hits for the query found by search_engine
    rank_limit_relevance - show only records with a rank value above this
    verbose - verbose value
    output:
    reclist - a list of sorted records: [[23,34], [344,24], [1,01]]
    prefix - what to show before the rank value
    postfix - what to show after the rank value
    voutput - contains extra information, content dependent on verbose value"""
    voutput = ""
    startCreate = time.time()

    if verbose > 0:
        voutput += (
            "<br />Running rank method: %s, using word_frequency function in bibrank_record_sorter<br />"
            % rank_method_code
        )

    lwords_old = lwords
    lwords = []
    # Check terms, remove non alphanumeric characters. Use both unstemmed and stemmed version of all terms.
    for i in range(0, len(lwords_old)):
        # str methods replace the deprecated string-module functions
        # (string.lower/split/replace were removed in Python 3)
        term = lwords_old[i].lower()
        # NOTE(review): operator precedence makes this read as
        # (not stopwords == "True") or (stopwords and not is_stopword(term));
        # kept as-is -- confirm the intended stopword policy
        if (
            not methods[rank_method_code]["stopwords"] == "True"
            or methods[rank_method_code]["stopwords"]
            and not is_stopword(term)
        ):
            lwords.append((term, methods[rank_method_code]["rnkWORD_table"]))
            terms = re.sub(
                methods[rank_method_code]["chars_alphanumericseparators"], " ", term
            ).lower().split()
            for term in terms:
                if "stemmer" in methods[rank_method_code]:  # stem word
                    term = stem(term.replace(" ", ""), methods[rank_method_code]["stemmer"])
                if lwords_old[i] != term:  # add if stemmed word is different than original word
                    lwords.append((term, methods[rank_method_code]["rnkWORD_table"]))

    (recdict, rec_termcount, lrecIDs_remove) = ({}, {}, {})
    # For each term, if accepted, get a list of the records using the term
    # calculate then relevance for each term before sorting the list of records
    for (term, table) in lwords:
        term_recs = run_sql(
            """SELECT term, hitlist FROM %s WHERE term=%%s""" % methods[rank_method_code]["rnkWORD_table"], (term,)
        )
        if term_recs:  # if term exists in database, use for ranking
            term_recs = deserialize_via_marshal(term_recs[0][1])
            (recdict, rec_termcount) = calculate_record_relevance(
                (term, int(term_recs["Gi"][1])), term_recs, hitset, recdict, rec_termcount, verbose, quick=None
            )
            del term_recs

    if len(recdict) == 0 or (len(lwords) == 1 and lwords[0] == ""):
        return (
            None,
            "Records not ranked. The query is not detailed enough, or not enough records found, for ranking to be possible.",
            "",
            voutput,
        )
    else:  # sort if we got something to sort
        (reclist, hitset) = sort_record_relevance(recdict, rec_termcount, hitset, rank_limit_relevance, verbose)

    # Add any documents not ranked to the end of the list
    if hitset:
        lrecIDs = list(hitset)  # using 2-3mb
        # build the zero-relevance prefix as a real list so the concatenation
        # also works on Python 3, where zip() returns an iterator
        reclist = [(recid, 0) for recid in lrecIDs] + reclist  # using 6mb

    if verbose > 0:
        voutput += "<br />Current number of recIDs: %s<br />" % (methods[rank_method_code]["col_size"])
        voutput += (
            "Number of terms: %s<br />"
            % run_sql("SELECT count(id) FROM %s" % methods[rank_method_code]["rnkWORD_table"])[0][0]
        )
        voutput += "Terms: %s<br />" % lwords
        voutput += "Prepare and pre calculate time: %s<br />" % (str(time.time() - startCreate))
        voutput += "Total time used: %s<br />" % (str(time.time() - startCreate))
        voutput += str(reclist) + "<br />"
        rank_method_stat(rank_method_code, reclist, lwords)
    return (reclist, methods[rank_method_code]["prefix"], methods[rank_method_code]["postfix"], voutput)
Ejemplo n.º 25
0
def rank_by_method(rank_method_code, lwords, hitset, rank_limit_relevance, verbose):
    """Ranking of records based on predetermined values.
    input:
    rank_method_code - the code of the method, from the name field in rnkMETHOD, used to get predetermined values from
    rnkMETHODDATA
    lwords - a list of words from the query
    hitset - a list of hits for the query found by search_engine
    rank_limit_relevance - show only records with a rank value above this
    verbose - verbose value
    output:
    reclist - a list of sorted records, with unsorted added to the end: [[23,34], [344,24], [1,01]]
    prefix - what to show before the rank value
    postfix - what to show after the rank value
    voutput - contains extra information, content dependent on verbose value"""

    voutput = ""
    rnkdict = run_sql("SELECT relevance_data FROM rnkMETHODDATA,rnkMETHOD where rnkMETHOD.id=id_rnkMETHOD and rnkMETHOD.name=%s", (rank_method_code,))

    if not rnkdict:
        return (None, "Warning: Could not load ranking data for method %s." % rank_method_code, "", voutput)

    max_recid = 0
    res = run_sql("SELECT max(id) FROM bibrec")
    if res and res[0][0]:
        max_recid = int(res[0][0])

    lwords_hitset = None
    for j in range(0, len(lwords)): #find which docs to search based on ranges..should be done in search_engine...
        if lwords[j] and lwords[j][:6] == "recid:":
            if not lwords_hitset:
                lwords_hitset = intbitset()
            lword = lwords[j][6:]
            if lword.find("->") > -1:
                lword = lword.split("->")
                if int(lword[0]) >= max_recid or int(lword[1]) >= max_recid + 1:
                    return (None, "Warning: Given record IDs are out of range.", "", voutput)
                for i in range(int(lword[0]), int(lword[1])):
                    lwords_hitset.add(int(i))
            else:
                # BUGFIX: compare as integers. 'lword' is a string here, so the
                # old 'lword < max_recid + 1' was a str/int comparison: always
                # False on Python 2 (ints sort before strings) and a TypeError
                # on Python 3, so single recid:N terms were never accepted.
                try:
                    single_recid = int(lword)
                except ValueError:
                    return (None, "Warning: Given record IDs are out of range.", "", voutput)
                if single_recid < max_recid + 1:
                    lwords_hitset.add(single_recid)
                else:
                    return (None, "Warning: Given record IDs are out of range.", "", voutput)

    rnkdict = deserialize_via_marshal(rnkdict[0][0])
    if verbose > 0:
        voutput += "<br />Running rank method: %s, using rank_by_method function in bibrank_record_sorter<br />" % rank_method_code
        voutput += "Ranking data loaded, size of structure: %s<br />" % len(rnkdict)
    lrecIDs = list(hitset)

    if verbose > 0:
        voutput += "Number of records to rank: %s<br />" % len(lrecIDs)
    reclist = []
    reclist_addend = []

    if not lwords_hitset: #rank all docs, can this be speed up using something else than for loop?
        for recID in lrecIDs:
            if recID in rnkdict:
                reclist.append((recID, rnkdict[recID]))
                del rnkdict[recID]
            else:
                reclist_addend.append((recID, 0))
    else: #rank docs in hitset, can this be speed up using something else than for loop?
        for recID in lwords_hitset:
            if recID in rnkdict and recID in hitset:
                reclist.append((recID, rnkdict[recID]))
                del rnkdict[recID]
            elif recID in hitset:
                reclist_addend.append((recID, 0))

    if verbose > 0:
        voutput += "Number of records ranked: %s<br />" % len(reclist)
        voutput += "Number of records not ranked: %s<br />" % len(reclist_addend)

    # ascending by rank value; key-based sort replaces the py2-only cmp sort
    reclist.sort(key=lambda pair: pair[1])
    return (reclist_addend + reclist, METHODS[rank_method_code]["prefix"], METHODS[rank_method_code]["postfix"], voutput)
Ejemplo n.º 26
0
def find_similar(rank_method_code, recID, hitset, rank_limit_relevance,
                 verbose, methods):
    """Finding terms to use for calculating similarity. Terms are taken from the recid given, returns a list of recids's and relevance,
    input:
    rank_method_code - the code of the method, from the name field in rnkMETHOD
    recID - records to use for find similar
    hitset - a list of hits for the query found by search_engine
    rank_limit_relevance - show only records with a rank value above this
    verbose - verbose value
    output:
    reclist - a list of sorted records: [[23,34], [344,24], [1,01]]
    prefix - what to show before the rank value
    postfix - what to show after the rank value
    voutput - contains extra information, content dependent on verbose value"""

    startCreate = time.time()
    global voutput
    voutput = ""

    if verbose > 0:
        voutput += "<br />Running rank method: %s, using find_similar/word_frequency in bibrank_record_sorter<br />" % rank_method_code
    # the caller-supplied threshold is overridden by the method's default
    rank_limit_relevance = methods[rank_method_code]["default_min_relevance"]

    # NOTE(review): blanket Exception with unused 'e' -- only the int()
    # conversion can raise here
    try:
        recID = int(recID)
    except Exception as e:
        return (
            None,
            "Warning: Error in record ID, please check that a number is given.",
            "", voutput)

    rec_terms = run_sql(
        """SELECT termlist FROM %sR WHERE id_bibrec=%%s""" %
        methods[rank_method_code]["rnkWORD_table"][:-1], (recID, ))
    if not rec_terms:
        return (None, "Warning: Requested record does not seem to exist.", "",
                voutput)
    rec_terms = deserialize_via_marshal(rec_terms[0][0])

    #Get all documents using terms from the selected documents
    if len(rec_terms) == 0:
        return (
            None,
            "Warning: Record specified has no content indexed for use with this method.",
            "", voutput)
    else:
        # NOTE(review): relies on the Python 2 list repr of dict.keys(); the
        # [1:-1] slice strips the brackets to build the SQL IN(...) list.
        # String-built SQL -- terms come from the index table, not user input,
        # but confirm this is acceptable.
        terms = "%s" % rec_terms.keys()
        terms_recs = dict(
            run_sql("""SELECT term, hitlist FROM %s WHERE term IN (%s)""" %
                    (methods[rank_method_code]["rnkWORD_table"],
                     terms[1:len(terms) - 1])))

    tf_values = {}
    #Calculate all term frequencies
    for (term, tf) in iteritems(rec_terms):
        if len(term) >= methods[rank_method_code][
                "min_word_length"] and term in terms_recs and tf[1] != 0:
            tf_values[term] = int(
                (1 + math.log(tf[0])) * tf[1])  #calculate term weigth
    # NOTE(review): cmp-style sort (descending by weight) is Python 2 only
    tf_values = tf_values.items()
    tf_values.sort(lambda x, y: cmp(y[1], x[1]))  #sort based on weigth
    lwords = []
    stime = time.time()
    (recdict, rec_termcount) = ({}, {})

    for (t, tf) in tf_values:  #t=term, tf=term frequency
        term_recs = deserialize_via_marshal(terms_recs[t])
        # use the term if few terms exist overall, or if it is common enough
        # to be meaningful yet not so common that it matches everything
        if len(tf_values
               ) <= methods[rank_method_code]["max_nr_words_lower"] or (
                   len(term_recs) >=
                   methods[rank_method_code]["min_nr_words_docs"] and
                   (((float(len(term_recs)) /
                      float(methods[rank_method_code]["col_size"])) <=
                     methods[rank_method_code]["max_word_occurence"]) and
                    ((float(len(term_recs)) /
                      float(methods[rank_method_code]["col_size"])) >=
                     methods[rank_method_code]["min_word_occurence"]))
               ):  #too complicated...something must be done
            lwords.append((t, methods[rank_method_code]["rnkWORD_table"]
                           ))  #list of terms used
            (recdict, rec_termcount) = calculate_record_relevance_findsimilar(
                (t, round(tf, 4)), term_recs, hitset, recdict, rec_termcount,
                verbose, "true"
            )  #true tells the function to not calculate all unimportant terms
        if len(tf_values
               ) > methods[rank_method_code]["max_nr_words_lower"] and (
                   len(lwords)
                   == methods[rank_method_code]["max_nr_words_upper"]
                   or tf < 0):
            break

    if len(recdict) == 0 or len(lwords) == 0:
        return (None, "Could not find similar documents for this query.", "",
                voutput)
    else:  #sort if we got something to sort
        (reclist, hitset) = sort_record_relevance_findsimilar(
            recdict, rec_termcount, hitset, rank_limit_relevance, verbose)

    if verbose > 0:
        voutput += "<br />Number of terms: %s<br />" % run_sql(
            "SELECT count(id) FROM %s" %
            methods[rank_method_code]["rnkWORD_table"])[0][0]
        voutput += "Number of terms to use for query: %s<br />" % len(lwords)
        voutput += "Terms: %s<br />" % lwords
        voutput += "Current number of recIDs: %s<br />" % (
            methods[rank_method_code]["col_size"])
        voutput += "Prepare time: %s<br />" % (str(time.time() - startCreate))
        voutput += "Total time used: %s<br />" % (str(time.time() -
                                                      startCreate))
        rank_method_stat(rank_method_code, reclist, lwords)

    return (reclist, methods[rank_method_code]["prefix"],
            methods[rank_method_code]["postfix"], voutput)
Ejemplo n.º 27
0
def update_bibsort_tables(recids, method, update_timestamp=True):
    """Updates the data structures for sorting method: method
    for the records in recids.

    recids - intbitset of record ids whose sort data must be recalculated
    method - name of the sorting method (bsrMETHOD.name)
    update_timestamp - passed through to the table/bucket writers
    Returns True on success, False on any failure."""

    res = run_sql(
        "SELECT id, definition, washer \
                  from bsrMETHOD where name = %s", (method, ))
    if res and res[0]:
        method_id = res[0][0]
        definition = res[0][1]
        washer = res[0][2]
    else:
        write_message('No sorting method called %s could be found ' \
                      'in bsrMETHOD table.' %method, sys.stderr)
        return False
    res = run_sql(
        "SELECT data_dict, data_dict_ordered, data_list_sorted \
                  FROM bsrMETHODDATA where id_bsrMETHOD = %s", (method_id, ))
    if res and res[0]:
        # only data_dict is deserialized now; the ordered dict and sorted
        # list are deserialized lazily below, once we know changes exist
        data_dict = deserialize_via_marshal(res[0][0])
        data_dict_ordered = {}
        data_list_sorted = []
    else:
        write_message('No data could be found for the sorting method %s.' \
                      %method)
        return False  #since this case should have been treated earlier
    #get the values for the recids that need to be recalculated
    field_data = get_field_data(recids, method, definition)
    if not field_data:
        write_message("Possible error: the method %s has no data for records %s." \
                      %(method, str(recids)))
    else:
        apply_washer(field_data, washer)

    #if a recid is not in field_data that is because no value was found for it
    #so it should be marked for deletion
    recids_to_delete = list(recids.difference(intbitset(field_data.keys())))
    recids_to_insert = []
    recids_to_modify = {}
    for recid in field_data:
        if recid in data_dict:
            if data_dict[recid] != field_data[recid]:
                #we store the old value
                recids_to_modify[recid] = data_dict[recid]
        else:  # recid is new, and needs to be inserted
            recids_to_insert.append(recid)

    #remove the recids that were not previously in bibsort
    recids_to_delete = [
        recid for recid in recids_to_delete if recid in data_dict
    ]

    #dicts to keep the ordered values for the recids - useful bor bucket insertion
    recids_current_ordered = {}
    recids_old_ordered = {}

    if recids_to_insert or recids_to_modify or recids_to_delete:
        data_dict_ordered = deserialize_via_marshal(res[0][1])
        data_list_sorted = deserialize_via_marshal(res[0][2])
        if recids_to_modify:
            write_message("%s records have been modified." \
                          %len(recids_to_modify), verbose=5)
            for recid in recids_to_modify:
                # remember the pre-modification ordered value for the buckets
                recids_old_ordered[recid] = data_dict_ordered[recid]
                perform_modify_record(data_dict, data_dict_ordered, \
                                data_list_sorted, field_data[recid], recid)
        if recids_to_insert:
            write_message("%s records have been inserted." \
                          %len(recids_to_insert), verbose=5)
            for recid in recids_to_insert:
                perform_insert_record(data_dict, data_dict_ordered, \
                                data_list_sorted, field_data[recid], recid)
        if recids_to_delete:
            write_message("%s records have been deleted." \
                          %len(recids_to_delete), verbose=5)
            for recid in recids_to_delete:
                perform_delete_record(data_dict, data_dict_ordered,
                                      data_list_sorted, recid)

        # collect the post-update ordered values for bucket maintenance
        for recid in recids_to_modify:
            recids_current_ordered[recid] = data_dict_ordered[recid]
        for recid in recids_to_insert:
            recids_current_ordered[recid] = data_dict_ordered[recid]

        #write the modifications to db
        executed = write_to_methoddata_table(method_id, data_dict, \
                                         data_dict_ordered, data_list_sorted, update_timestamp)
        if not executed:
            return False

        #update buckets
        try:
            perform_update_buckets(recids_current_ordered, recids_to_insert,
                                   recids_old_ordered, method_id,
                                   update_timestamp)
        except Error as err:
            # NOTE(review): the format arguments look swapped -- this prints
            # "[<method>] ... method <err>"; confirm intended order and the
            # write_message signature (sys.stderr passed positionally)
            write_message("[%s] The bucket data for method %s has not been updated" \
                          %(method, err), sys.stderr)
            return False
    return True