Ejemplo n.º 1
0
        def fill():
            """Build and return the citation cache dictionaries.

            Loads citation and self-citation weights — from the cache when
            a serialized copy is present, otherwise from the rank-method
            tables — and derives the preprocessed structures used by
            cited:M->N queries and count-ordered listings.
            """
            from invenio.legacy.bibrank.tag_based_indexer import fromDB

            cached = cache.get('citations_weights')
            if cached:
                weights = deserialize_via_marshal(cached)
            else:
                weights = fromDB('citation')

            # (recid, count) pairs ordered by citation count, descending.
            counts = list(iteritems(weights))
            counts.sort(key=itemgetter(1), reverse=True)

            cached = cache.get('selfcites_weights')
            if cached:
                selfcites = deserialize_via_marshal(cached)
            else:
                selfcites = fromDB('selfcites')

            # Citation counts with each record's self-citations removed.
            selfcites_weights = dict(
                (recid, total - selfcites.get(recid, 0))
                for recid, total in counts
            )
            selfcites_counts = [
                (recid, selfcites_weights.get(recid, total))
                for recid, total in counts
            ]
            selfcites_counts.sort(key=itemgetter(1), reverse=True)

            return {
                'citations_weights': weights,
                # for cited:M->N queries, it is interesting to cache also
                # some preprocessed citationdict:
                'citations_keys': intbitset(weights.keys()),
                'citations_counts': counts,
                'selfcites_weights': selfcites_weights,
                'selfcites_counts': selfcites_counts,
            }
Ejemplo n.º 2
0
        def fill():
            """Build and return the citation cache dictionaries (redis-backed).

            Loads citation and self-citation weights from redis when a
            serialized copy exists, otherwise from the rank-method tables,
            and derives the preprocessed structures used by cited:M->N
            queries and count-ordered listings.
            """
            alldicts = {}
            from invenio.legacy.bibrank.tag_based_indexer import fromDB
            redis = get_redis()
            serialized_weights = redis.get('citations_weights')
            if serialized_weights:
                weights = deserialize_via_marshal(serialized_weights)
            else:
                weights = fromDB('citation')

            alldicts['citations_weights'] = weights
            # for cited:M->N queries, it is interesting to cache also
            # some preprocessed citationdict:
            alldicts['citations_keys'] = intbitset(weights.keys())

            # Citation counts, ordered by count descending.
            # (list(...) replaces the redundant `[t for t in iteritems(...)]`.)
            alldicts['citations_counts'] = list(iteritems(weights))
            alldicts['citations_counts'].sort(key=itemgetter(1), reverse=True)

            # Self-cites: subtract each record's self-citations from its total.
            serialized_weights = redis.get('selfcites_weights')
            if serialized_weights:
                selfcites = deserialize_via_marshal(serialized_weights)
            else:
                selfcites = fromDB('selfcites')
            selfcites_weights = {}
            for recid, counts in alldicts['citations_counts']:
                selfcites_weights[recid] = counts - selfcites.get(recid, 0)
            alldicts['selfcites_weights'] = selfcites_weights
            alldicts['selfcites_counts'] = [
                (recid, selfcites_weights.get(recid, cites))
                for recid, cites in alldicts['citations_counts']
            ]
            alldicts['selfcites_counts'].sort(key=itemgetter(1), reverse=True)

            return alldicts
Ejemplo n.º 3
0
def rank_method_stat(rank_method_code, reclist, lwords):
    """Shows some statistics about the searchresult.
    rank_method_code - name field from rnkMETHOD
    reclist - a list of sorted and ranked records
    lwords - the words in the query"""

    # NOTE(review): voutput is assigned locally and never returned or printed
    # in this snippet -- other variants of this function use a module-level
    # `global voutput`; confirm against the full file.
    voutput = ""
    # Report at most the 20 entries taken from the END of reclist
    # (presumably the top-ranked ones -- TODO confirm sort direction).
    if len(reclist) > 20:
        j = 20
    else:
        j = len(reclist)

    voutput += "<br />Rank statistics:<br />"
    for i in range(1, j + 1):
        # reclist entries are (recid, score) pairs, read back-to-front.
        voutput += "%s,Recid:%s,Score:%s<br />" % (i,reclist[len(reclist) - i][0],reclist[len(reclist) - i][1])
        for (term, table) in lwords:
            # Table name is interpolated; the term itself is parameterized.
            term_recs = run_sql("""SELECT hitlist FROM %s WHERE term=%%s""" % table, (term,))
            if term_recs:
                term_recs = deserialize_via_marshal(term_recs[0][0])
                if reclist[len(reclist) - i][0] in term_recs:
                    voutput += "%s-%s / " % (term, term_recs[reclist[len(reclist) - i][0]])
        voutput += "<br />"

    voutput += "<br />Score variation:<br />"
    # Histogram of scores over the whole result list.
    count = {}
    for i in range(0, len(reclist)):
        count[reclist[i][1]] = count.get(reclist[i][1], 0) + 1
    # Emit buckets from 100 down to 0; scores outside [0, 100] are skipped.
    i = 100
    while i >= 0:
        if i in count:
            voutput += "%s-%s<br />" % (i, count[i])
        i -= 1
Ejemplo n.º 4
0
def rank_method_stat(rank_method_code, reclist, lwords):
    """Collect statistics about a ranked search result.

    rank_method_code -- name field from rnkMETHOD
    reclist -- list of (recid, score) pairs, sorted and ranked
    lwords -- (term, table) pairs from the query
    """
    voutput = ""
    total = len(reclist)
    top = total if total <= 20 else 20

    voutput += "<br />Rank statistics:<br />"
    # Walk the last `top` entries of reclist, back to front.
    for i in range(1, top + 1):
        entry = reclist[total - i]
        voutput += "%s,Recid:%s,Score:%s<br />" % (i, entry[0], entry[1])
        for term, table in lwords:
            rows = run_sql(
                """SELECT hitlist FROM %s WHERE term=%%s""" % table, (term, ))
            if rows:
                hits = deserialize_via_marshal(rows[0][0])
                if entry[0] in hits:
                    voutput += "%s-%s / " % (term, hits[entry[0]])
        voutput += "<br />"

    voutput += "<br />Score variation:<br />"
    # Histogram of the scores across the whole result list.
    count = {}
    for entry in reclist:
        count[entry[1]] = count.get(entry[1], 0) + 1
    # Emit buckets from 100 down to 0.
    for score in range(100, -1, -1):
        if score in count:
            voutput += "%s-%s<br />" % (score, count[score])
Ejemplo n.º 5
0
def fromDB(rank_method_code):
    """Return the stored relevance data for a rank method.

    rank_method_code -- value of the rnkMETHOD.name column

    Returns the marshal-deserialized relevance dictionary, or {} when
    the method or its data row does not exist.
    """
    # Renamed from `id` (shadowed the builtin). Guard against an unknown
    # method name: the original indexed id[0][0] unconditionally and raised
    # IndexError on an empty result set (cf. the guarded variant elsewhere
    # in this file).
    method_id = run_sql("SELECT id from rnkMETHOD where name=%s",
                        (rank_method_code, ))
    if method_id:
        res = run_sql(
            "SELECT relevance_data FROM rnkMETHODDATA WHERE id_rnkMETHOD=%s",
            (method_id[0][0], ))
        if res:
            return deserialize_via_marshal(res[0][0])
    return {}
Ejemplo n.º 6
0
def get_cit_dict(name):
    """Fetch the citation dictionary stored under *name* in the db.

    Returns the deserialized dict, or {} when the row is missing or empty.
    """
    rows = run_sql("""SELECT object_value FROM rnkCITATIONDATA
                       WHERE object_name = %s""", (name, ))

    # Deserialize only when a non-empty blob was actually returned.
    if rows and rows[0] and rows[0][0]:
        return deserialize_via_marshal(rows[0][0])
    return {}
Ejemplo n.º 7
0
def do_upgrade():
    """Rename the legacy 'c_cfg-file' harvest argument to 'c_stylesheet'."""
    rows = run_sql("SELECT id, arguments FROM oaiHARVEST", with_dict=True)
    # Move away from old columns
    for row in rows:
        if not row['arguments']:
            continue
        arguments = deserialize_via_marshal(row['arguments'])
        if "c_cfg-file" not in arguments:
            continue
        arguments['c_stylesheet'] = arguments['c_cfg-file']
        del arguments['c_cfg-file']
        run_sql("UPDATE oaiHARVEST set arguments=%s WHERE id=%s",
                (serialize_via_marshal(arguments), row['id']))
Ejemplo n.º 8
0
def fromDB(rank_method_code):
    """Return the stored relevance data for *rank_method_code*, or {}."""
    method_rows = run_sql("SELECT id from rnkMETHOD where name=%s",
                          (rank_method_code, ))
    if not method_rows:
        return {}
    data_rows = run_sql(
        "SELECT relevance_data FROM rnkMETHODDATA WHERE id_rnkMETHOD=%s",
        (method_rows[0][0], ))
    if not data_rows:
        return {}
    return deserialize_via_marshal(data_rows[0][0])
Ejemplo n.º 9
0
def get_initial_author_dict():
    """Read the author -> cited-in-list mapping from rnkAUTHORDATA.

    Returns a dict mapping each author term to its marshal-deserialized
    hitlist, or {} (after alerting the admin) when the table cannot be
    read or a row fails to deserialize.
    """
    adict = {}
    try:
        ah = run_sql("SELECT aterm,hitlist FROM rnkAUTHORDATA")
        for (a, h) in ah:
            adict[a] = deserialize_via_marshal(h)
        return adict
    # Narrowed from a bare `except:`, which would also swallow
    # SystemExit/KeyboardInterrupt; keep the best-effort fallback.
    except Exception:
        register_exception(prefix="could not read rnkAUTHORDATA",
                           alert_admin=True)
        return {}
Ejemplo n.º 10
0
def get_author_cited_by(authorstring):
    """Return a list of doc ids [y1,y2,..] for the
       author given as param, such that y1,y2.. cite that author
    """
    citations = []
    res = run_sql("select hitlist from rnkAUTHORDATA where aterm=%s",
                  (authorstring,))
    if res and res[0] and res[0][0]:
        # has to be prepared for corrupted data!
        try:
            citations = deserialize_via_marshal(res[0][0])
        # Narrowed from a bare `except:`, which would also swallow
        # SystemExit/KeyboardInterrupt; any real error means corrupt data.
        except Exception:
            citations = []
    return citations
Ejemplo n.º 11
0
def get_data_for_definition_rnk(method_name, rnk_name):
    '''Returns the dictionary with data for method_name ranking method.

    method_name -- name used in log messages
    rnk_name -- rnkMETHOD.name whose rnkMETHODDATA row is loaded

    Returns {} when no data is found or the query fails (failures are
    logged to stderr).
    '''
    try:
        res = run_sql('SELECT d.relevance_data \
                          from rnkMETHODDATA d, rnkMETHOD r WHERE \
                          d.id_rnkMETHOD = r.id AND \
                          r.name = %s', (rnk_name, ))
        if res and res[0]:
            write_message('Data extracted from table rnkMETHODDATA for sorting method %s' \
                          %method_name, verbose=5)
            return deserialize_via_marshal(res[0][0])
    except Error as err:
        write_message("No data could be found for sorting method %s. " \
                      "The following errror occured: [%s]" \
                      %(method_name, err), stream=sys.stderr)
    # The original returned None implicitly when the query succeeded but
    # produced no rows (the `return {}` sat inside the except clause);
    # always return a dict as the docstring promises.
    return {}
Ejemplo n.º 12
0
def del_recids(rank_method_code, range_rec):
    """Delete some records from the rank method.

    rank_method_code -- name field from rnkMETHOD
    range_rec -- iterable of (start, end) pairs; every recid in
                 range(int(start), int(end)) is removed from the stored
                 relevance data.
    """
    # Renamed from `id` (shadowed the builtin). Guard against an unknown
    # method: the original indexed id[0][0] unconditionally and raised
    # IndexError on an empty result set.
    method_id = run_sql("SELECT id from rnkMETHOD where name=%s",
                        (rank_method_code, ))
    if not method_id:
        write_message("Create before deleting!")
        return
    res = run_sql(
        "SELECT relevance_data FROM rnkMETHODDATA WHERE id_rnkMETHOD=%s",
        (method_id[0][0], ))
    if res:
        rec_dict = deserialize_via_marshal(res[0][0])
        write_message("Old size: %s" % len(rec_dict))
        for (recids, recide) in range_rec:
            # NOTE(review): range() excludes recide itself -- confirm the
            # upper bound is meant to be exclusive.
            for i in range(int(recids), int(recide)):
                if i in rec_dict:
                    del rec_dict[i]
        write_message("New size: %s" % len(rec_dict))
        begin_date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        intoDB(rec_dict, begin_date, rank_method_code)
    else:
        write_message("Create before deleting!")
Ejemplo n.º 13
0
def get_data_for_definition_rnk(method_name, rnk_name):
    '''Returns the dictionary with data for method_name ranking method.

    method_name -- name used in log messages
    rnk_name -- rnkMETHOD.name whose rnkMETHODDATA row is loaded

    Returns {} when no data is found or the query fails (failures are
    logged to stderr).
    '''
    try:
        res = run_sql(
            'SELECT d.relevance_data \
                          from rnkMETHODDATA d, rnkMETHOD r WHERE \
                          d.id_rnkMETHOD = r.id AND \
                          r.name = %s', (rnk_name, ))
        if res and res[0]:
            write_message('Data extracted from table rnkMETHODDATA for sorting method %s' \
                          %method_name, verbose=5)
            return deserialize_via_marshal(res[0][0])
    except Error as err:
        write_message("No data could be found for sorting method %s. " \
                      "The following errror occured: [%s]" \
                      %(method_name, err), stream=sys.stderr)
    # The original returned None implicitly when the query succeeded but
    # produced no rows (the `return {}` sat inside the except clause);
    # always return a dict as the docstring promises.
    return {}
Ejemplo n.º 14
0
def last_updated_result(rank_method_code):
    """ return the last value of dictionary in rnkMETHODDATA table if it
        exists and initialize the value of last updated records by zero,
        otherwise an initial dictionary with zero as value for all recids
    """
    # Parameterized query: the original interpolated rank_method_code into
    # the SQL string with `%`, which breaks on names containing quotes and
    # is an SQL-injection vector.
    query = """SELECT relevance_data FROM rnkMETHOD, rnkMETHODDATA WHERE
               rnkMETHOD.id = rnkMETHODDATA.id_rnkMETHOD
               AND rnkMETHOD.Name = %s"""

    try:
        rdict = run_sql(query, (rank_method_code, ))[0][0]
    except IndexError:
        # No row for this method: start from an empty dictionary.
        dic = {}
    else:
        dic = deserialize_via_marshal(rdict)

    return dic
def do_upgrade():
    """Migrate oaiHARVEST: widen `arguments` to a blob and fold the legacy
    bibconvertcfgfile/bibfilterprogram columns into the arguments dict."""
    create_statement = run_sql('SHOW CREATE TABLE oaiHARVEST')[0][1]
    if '`arguments` text' in create_statement:
        run_sql("ALTER TABLE oaiHARVEST CHANGE arguments arguments blob")
    # translate old values
    if '`bibconvertcfgfile`' not in create_statement:
        return
    rows_to_change = run_sql("SELECT id, bibconvertcfgfile, bibfilterprogram, arguments FROM oaiHARVEST", with_dict=True)
    # Move away from old columns
    for row in rows_to_change:
        if row['arguments']:
            arguments = deserialize_via_marshal(row['arguments'])
        else:
            arguments = {}
        arguments['c_cfg-file'] = row['bibconvertcfgfile']
        arguments['f_filter-file'] = row['bibfilterprogram']
        run_sql("UPDATE oaiHARVEST set arguments=%s WHERE id=%s",
                (serialize_via_marshal(arguments), row['id']))
    run_sql("ALTER TABLE oaiHARVEST DROP COLUMN bibconvertcfgfile")
    run_sql("ALTER TABLE oaiHARVEST DROP COLUMN bibfilterprogram")
Ejemplo n.º 16
0
def del_recids(rank_method_code, range_rec):
    """Remove ranges of record ids from a rank method's relevance data."""
    method_id = run_sql("SELECT id from rnkMETHOD where name=%s",
                        (rank_method_code, ))
    data_rows = run_sql(
        "SELECT relevance_data FROM rnkMETHODDATA WHERE id_rnkMETHOD=%s",
        (method_id[0][0], ))
    if not data_rows:
        write_message("Create before deleting!")
        return
    rec_dict = deserialize_via_marshal(data_rows[0][0])
    write_message("Old size: %s" % len(rec_dict))
    # Drop every recid in [low, high) from the relevance dictionary.
    for recid_low, recid_high in range_rec:
        for recid in range(int(recid_low), int(recid_high)):
            if recid in rec_dict:
                del rec_dict[recid]
    write_message("New size: %s" % len(rec_dict))
    begin_date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    intoDB(rec_dict, begin_date, rank_method_code)
Ejemplo n.º 17
0
def do_upgrade():
    """Upgrade oaiHARVEST: store `arguments` as a blob and migrate the
    legacy bibconvertcfgfile/bibfilterprogram columns into it."""
    create_statement = run_sql('SHOW CREATE TABLE oaiHARVEST')[0][1]
    # Widen the column type first so marshal blobs are not truncated.
    if '`arguments` text' in create_statement:
        run_sql("ALTER TABLE oaiHARVEST CHANGE arguments arguments blob")
    # translate old values
    # Only run the column migration when the old columns still exist
    # (makes the upgrade idempotent).
    if '`bibconvertcfgfile`' in create_statement:
        rows_to_change = run_sql(
            "SELECT id, bibconvertcfgfile, bibfilterprogram, arguments FROM oaiHARVEST",
            with_dict=True)
        # Move away from old columns
        for row in rows_to_change:
            if row['arguments']:
                arguments = deserialize_via_marshal(row['arguments'])
            else:
                arguments = {}
            arguments['c_cfg-file'] = row['bibconvertcfgfile']
            arguments['f_filter-file'] = row['bibfilterprogram']
            run_sql("UPDATE oaiHARVEST set arguments=%s WHERE id=%s",
                    (serialize_via_marshal(arguments), row['id']))
        run_sql("ALTER TABLE oaiHARVEST DROP COLUMN bibconvertcfgfile")
        run_sql("ALTER TABLE oaiHARVEST DROP COLUMN bibfilterprogram")
Ejemplo n.º 18
0
 def cache_filler():
     """Load every rnkCITATIONDATA object into a dict for the cache.

     Returns {} on database problems; a corrupted blob deserializes
     to {} for that object only. For 'citationdict' two preprocessed
     key collections are cached alongside the dict itself.
     """
     alldicts = {}
     try:
         res = run_sql("SELECT object_name,object_value FROM rnkCITATIONDATA")
     except OperationalError:
         # database problems, return empty cache
         return {}
     for object_name, object_value in res:
         try:
             object_value_dict = deserialize_via_marshal(object_value)
         # Narrowed from a bare `except:` (which would also swallow
         # SystemExit/KeyboardInterrupt); any real error means corrupt data.
         except Exception:
             object_value_dict = {}
         alldicts[object_name] = object_value_dict
         if object_name == 'citationdict':
             # for cited:M->N queries, it is interesting to cache also
             # some preprocessed citationdict:
             alldicts['citationdict_keys'] = object_value_dict.keys()
             alldicts['citationdict_keys_intbitset'] = intbitset(object_value_dict.keys())
     return alldicts
Ejemplo n.º 19
0
def update_bibsort_tables(recids, method, update_timestamp=True):
    """Updates the data structures for sorting method: method
    for the records in recids

    recids -- set of record ids to (re)compute (supports .difference
              with an intbitset, so presumably an intbitset -- confirm)
    method -- name of the sorting method (bsrMETHOD.name)
    update_timestamp -- forwarded to write_to_methoddata_table

    Returns True on success, False on any failure (unknown method,
    missing data, failed write, or bucket-update error)."""

    res = run_sql(
        "SELECT id, definition, washer \
                  from bsrMETHOD where name = %s", (method, ))
    if res and res[0]:
        method_id = res[0][0]
        definition = res[0][1]
        washer = res[0][2]
    else:
        write_message('No sorting method called %s could be found ' \
                      'in bsrMETHOD table.' %method, sys.stderr)
        return False
    res = run_sql(
        "SELECT data_dict, data_dict_ordered, data_list_sorted \
                  FROM bsrMETHODDATA where id_bsrMETHOD = %s", (method_id, ))
    if res and res[0]:
        data_dict = deserialize_via_marshal(res[0][0])
        # The ordered dict and sorted list are only deserialized later,
        # and only if there is actually something to change (see below).
        data_dict_ordered = {}
        data_list_sorted = []
    else:
        write_message('No data could be found for the sorting method %s.' \
                      %method)
        return False  #since this case should have been treated earlier
    #get the values for the recids that need to be recalculated
    field_data = get_field_data(recids, method, definition)
    if not field_data:
        write_message("Possible error: the method %s has no data for records %s." \
                      %(method, str(recids)))
    else:
        apply_washer(field_data, washer)

    #if a recid is not in field_data that is because no value was found for it
    #so it should be marked for deletion
    recids_to_delete = list(recids.difference(intbitset(field_data.keys())))
    recids_to_insert = []
    recids_to_modify = {}
    for recid in field_data:
        if recid in data_dict:
            if data_dict[recid] != field_data[recid]:
                #we store the old value
                recids_to_modify[recid] = data_dict[recid]
        else:  # recid is new, and needs to be inserted
            recids_to_insert.append(recid)

    #remove the recids that were not previously in bibsort
    recids_to_delete = [
        recid for recid in recids_to_delete if recid in data_dict
    ]

    #dicts to keep the ordered values for the recids - useful bor bucket insertion
    recids_current_ordered = {}
    recids_old_ordered = {}

    if recids_to_insert or recids_to_modify or recids_to_delete:
        # Now the heavyweight structures are actually needed.
        data_dict_ordered = deserialize_via_marshal(res[0][1])
        data_list_sorted = deserialize_via_marshal(res[0][2])
        if recids_to_modify:
            write_message("%s records have been modified." \
                          %len(recids_to_modify), verbose=5)
            for recid in recids_to_modify:
                recids_old_ordered[recid] = data_dict_ordered[recid]
                perform_modify_record(data_dict, data_dict_ordered, \
                                data_list_sorted, field_data[recid], recid)
        if recids_to_insert:
            write_message("%s records have been inserted." \
                          %len(recids_to_insert), verbose=5)
            for recid in recids_to_insert:
                perform_insert_record(data_dict, data_dict_ordered, \
                                data_list_sorted, field_data[recid], recid)
        if recids_to_delete:
            write_message("%s records have been deleted." \
                          %len(recids_to_delete), verbose=5)
            for recid in recids_to_delete:
                perform_delete_record(data_dict, data_dict_ordered,
                                      data_list_sorted, recid)

        # Collect the ordered values AFTER the perform_* calls, since they
        # mutate data_dict_ordered in place.
        for recid in recids_to_modify:
            recids_current_ordered[recid] = data_dict_ordered[recid]
        for recid in recids_to_insert:
            recids_current_ordered[recid] = data_dict_ordered[recid]

        #write the modifications to db
        executed = write_to_methoddata_table(method_id, data_dict, \
                                         data_dict_ordered, data_list_sorted, update_timestamp)
        if not executed:
            return False

        #update buckets
        try:
            perform_update_buckets(recids_current_ordered, recids_to_insert,
                                   recids_old_ordered, method_id,
                                   update_timestamp)
        except Error as err:
            write_message("[%s] The bucket data for method %s has not been updated" \
                          %(method, err), sys.stderr)
            return False
    return True
Ejemplo n.º 20
0
def update_bibsort_tables(recids, method, update_timestamp = True):
    """Updates the data structures for sorting method: method
    for the records in recids

    recids -- set of record ids to (re)compute (used with .difference
              against an intbitset -- presumably an intbitset; confirm)
    method -- name of the sorting method (bsrMETHOD.name)
    update_timestamp -- forwarded to write_to_methoddata_table

    Returns True on success, False on any failure."""

    res = run_sql("SELECT id, definition, washer \
                  from bsrMETHOD where name = %s", (method, ))
    if res and res[0]:
        method_id = res[0][0]
        definition = res[0][1]
        washer = res[0][2]
    else:
        write_message('No sorting method called %s could be found ' \
                      'in bsrMETHOD table.' %method, sys.stderr)
        return False
    res = run_sql("SELECT data_dict, data_dict_ordered, data_list_sorted \
                  FROM bsrMETHODDATA where id_bsrMETHOD = %s", (method_id, ))
    if res and res[0]:
        data_dict = deserialize_via_marshal(res[0][0])
        # Ordered dict / sorted list are deserialized lazily further down,
        # only when there is something to change.
        data_dict_ordered = {}
        data_list_sorted = []
    else:
        write_message('No data could be found for the sorting method %s.' \
                      %method)
        return False #since this case should have been treated earlier
    #get the values for the recids that need to be recalculated
    field_data = get_field_data(recids, method, definition)
    if not field_data:
        write_message("Possible error: the method %s has no data for records %s." \
                      %(method, str(recids)))
    else:
        apply_washer(field_data, washer)

    #if a recid is not in field_data that is because no value was found for it
    #so it should be marked for deletion
    recids_to_delete = list(recids.difference(intbitset(field_data.keys())))
    recids_to_insert = []
    recids_to_modify = {}
    for recid in field_data:
        if recid in data_dict:
            if data_dict[recid] != field_data[recid]:
                #we store the old value
                recids_to_modify[recid] = data_dict[recid]
        else: # recid is new, and needs to be inserted
            recids_to_insert.append(recid)

    #remove the recids that were not previously in bibsort
    recids_to_delete = [recid for recid in recids_to_delete if recid in data_dict]

    #dicts to keep the ordered values for the recids - useful bor bucket insertion
    recids_current_ordered = {}
    recids_old_ordered = {}

    if recids_to_insert or recids_to_modify or recids_to_delete:
        # Heavyweight structures are needed only on this path.
        data_dict_ordered = deserialize_via_marshal(res[0][1])
        data_list_sorted = deserialize_via_marshal(res[0][2])
        if recids_to_modify:
            write_message("%s records have been modified." \
                          %len(recids_to_modify), verbose=5)
            for recid in recids_to_modify:
                recids_old_ordered[recid] = data_dict_ordered[recid]
                perform_modify_record(data_dict, data_dict_ordered, \
                                data_list_sorted, field_data[recid], recid)
        if recids_to_insert:
            write_message("%s records have been inserted." \
                          %len(recids_to_insert), verbose=5)
            for recid in recids_to_insert:
                perform_insert_record(data_dict, data_dict_ordered, \
                                data_list_sorted, field_data[recid], recid)
        if recids_to_delete:
            write_message("%s records have been deleted." \
                          %len(recids_to_delete), verbose=5)
            for recid in recids_to_delete:
                perform_delete_record(data_dict, data_dict_ordered, data_list_sorted, recid)

        # Collect ordered values after the perform_* calls, which mutate
        # data_dict_ordered in place.
        for recid in recids_to_modify:
            recids_current_ordered[recid] = data_dict_ordered[recid]
        for recid in recids_to_insert:
            recids_current_ordered[recid] = data_dict_ordered[recid]

        #write the modifications to db
        executed = write_to_methoddata_table(method_id, data_dict, \
                                         data_dict_ordered, data_list_sorted, update_timestamp)
        if not executed:
            return False

        #update buckets
        try:
            perform_update_buckets(recids_current_ordered, recids_to_insert, recids_old_ordered, method_id, update_timestamp)
        except Error as err:
            write_message("[%s] The bucket data for method %s has not been updated" \
                          %(method, err), sys.stderr)
            return False
    return True
Ejemplo n.º 21
0
def rank_by_method(rank_method_code, lwords, hitset, rank_limit_relevance, verbose):
    """Ranking of records based on predetermined values.
    input:
    rank_method_code - the code of the method, from the name field in rnkMETHOD, used to get predetermined values from
    rnkMETHODDATA
    lwords - a list of words from the query
    hitset - a list of hits for the query found by search_engine
    rank_limit_relevance - show only records with a rank value above this
    verbose - verbose value
    output:
    reclist - a list of sorted records, with unsorted added to the end: [[23,34], [344,24], [1,01]]
    prefix - what to show before the rank value
    postfix - what to show after the rank value
    voutput - contains extra information, content dependent on verbose value"""

    voutput = ""
    rnkdict = run_sql("SELECT relevance_data FROM rnkMETHODDATA,rnkMETHOD where rnkMETHOD.id=id_rnkMETHOD and rnkMETHOD.name=%s", (rank_method_code,))

    if not rnkdict:
        return (None, "Warning: Could not load ranking data for method %s." % rank_method_code, "", voutput)

    max_recid = 0
    res = run_sql("SELECT max(id) FROM bibrec")
    if res and res[0][0]:
        max_recid = int(res[0][0])

    lwords_hitset = None
    for j in range(0, len(lwords)): #find which docs to search based on ranges..should be done in search_engine...
        if lwords[j] and lwords[j][:6] == "recid:":
            if not lwords_hitset:
                lwords_hitset = intbitset()
            lword = lwords[j][6:]
            if lword.find("->") > -1:
                lword = lword.split("->")
                # NOTE(review): the bounds use >= max_recid and
                # >= max_recid + 1 inconsistently -- confirm intent.
                if int(lword[0]) >= max_recid or int(lword[1]) >= max_recid + 1:
                    return (None, "Warning: Given record IDs are out of range.", "", voutput)
                for i in range(int(lword[0]), int(lword[1])):
                    lwords_hitset.add(int(i))
            else:
                # BUGFIX: the original compared the *string* lword to an int
                # (`lword < max_recid + 1`), which is always False under
                # Python 2's mixed-type ordering (TypeError under Python 3),
                # so single "recid:N" terms always hit the out-of-range
                # branch. Convert first; keep the warning for bad input.
                try:
                    single_recid = int(lword)
                except ValueError:
                    return (None, "Warning: Given record IDs are out of range.", "", voutput)
                if single_recid < max_recid + 1:
                    lwords_hitset.add(single_recid)
                else:
                    return (None, "Warning: Given record IDs are out of range.", "", voutput)

    rnkdict = deserialize_via_marshal(rnkdict[0][0])
    if verbose > 0:
        voutput += "<br />Running rank method: %s, using rank_by_method function in bibrank_record_sorter<br />" % rank_method_code
        voutput += "Ranking data loaded, size of structure: %s<br />" % len(rnkdict)
    lrecIDs = list(hitset)

    if verbose > 0:
        voutput += "Number of records to rank: %s<br />" % len(lrecIDs)
    reclist = []
    reclist_addend = []

    if not lwords_hitset: #rank all docs, can this be speed up using something else than for loop?
        for recID in lrecIDs:
            if recID in rnkdict:
                reclist.append((recID, rnkdict[recID]))
                del rnkdict[recID]
            else:
                reclist_addend.append((recID, 0))
    else: #rank docs in hitset, can this be speed up using something else than for loop?
        for recID in lwords_hitset:
            if recID in rnkdict and recID in hitset:
                reclist.append((recID, rnkdict[recID]))
                del rnkdict[recID]
            elif recID in hitset:
                reclist_addend.append((recID, 0))

    if verbose > 0:
        voutput += "Number of records ranked: %s<br />" % len(reclist)
        voutput += "Number of records not ranked: %s<br />" % len(reclist_addend)

    # key-based sort replaces the Python-2-only cmp form; ordering is the
    # same (ascending by rank value, stable).
    reclist.sort(key=lambda pair: pair[1])
    return (reclist_addend + reclist, METHODS[rank_method_code]["prefix"], METHODS[rank_method_code]["postfix"], voutput)
Ejemplo n.º 22
0
def find_similar(rank_method_code, recID, hitset, rank_limit_relevance,verbose, methods):
    """Finding terms to use for calculating similarity. Terms are taken from the recid given, returns a list of recids's and relevance,
    input:
    rank_method_code - the code of the method, from the name field in rnkMETHOD
    recID - records to use for find similar
    hitset - a list of hits for the query found by search_engine
    rank_limit_relevance - show only records with a rank value above this
    verbose - verbose value
    output:
    reclist - a list of sorted records: [[23,34], [344,24], [1,01]]
    prefix - what to show before the rank value
    postfix - what to show after the rank value
    voutput - contains extra information, content dependent on verbose value"""

    startCreate = time.time()
    # voutput is module-level here (unlike some sibling functions).
    global voutput
    voutput = ""

    if verbose > 0:
        voutput += "<br />Running rank method: %s, using find_similar/word_frequency in bibrank_record_sorter<br />" % rank_method_code
    # NOTE(review): the rank_limit_relevance parameter is immediately
    # overwritten by the method's configured default -- confirm intended.
    rank_limit_relevance = methods[rank_method_code]["default_min_relevance"]

    try:
        recID = int(recID)
    except Exception as e :
        return (None, "Warning: Error in record ID, please check that a number is given.", "", voutput)

    # rnkWORD_table ends in 'R' for the per-record table; [:-1] strips it.
    rec_terms = run_sql("""SELECT termlist FROM %sR WHERE id_bibrec=%%s""" % methods[rank_method_code]["rnkWORD_table"][:-1],  (recID,))
    if not rec_terms:
        return (None, "Warning: Requested record does not seem to exist.", "", voutput)
    rec_terms = deserialize_via_marshal(rec_terms[0][0])

    #Get all documents using terms from the selected documents
    if len(rec_terms) == 0:
        return (None, "Warning: Record specified has no content indexed for use with this method.", "", voutput)
    else:
        # terms[1:-1] strips the brackets of the repr'd key list to build
        # the SQL IN (...) clause (relies on Python 2 str repr of keys).
        terms = "%s" % rec_terms.keys()
        terms_recs = dict(run_sql("""SELECT term, hitlist FROM %s WHERE term IN (%s)""" % (methods[rank_method_code]["rnkWORD_table"], terms[1:len(terms) - 1])))

    tf_values = {}
    #Calculate all term frequencies
    for (term, tf) in iteritems(rec_terms):
        # tf is presumably (count, weight) -- TODO confirm schema.
        if len(term) >= methods[rank_method_code]["min_word_length"] and term in terms_recs and tf[1] != 0:
            tf_values[term] =  int((1 + math.log(tf[0])) * tf[1]) #calculate term weigth
    tf_values = tf_values.items()
    tf_values.sort(lambda x, y: cmp(y[1], x[1])) #sort based on weigth

    lwords = []
    stime = time.time()
    (recdict, rec_termcount) = ({}, {})

    for (t, tf) in tf_values: #t=term, tf=term frequency
        term_recs = deserialize_via_marshal(terms_recs[t])
        # Use the term either when there are few candidate terms overall, or
        # when its document frequency falls inside the configured window.
        if len(tf_values) <= methods[rank_method_code]["max_nr_words_lower"] or (len(term_recs) >= methods[rank_method_code]["min_nr_words_docs"] and (((float(len(term_recs)) / float(methods[rank_method_code]["col_size"])) <=  methods[rank_method_code]["max_word_occurence"]) and ((float(len(term_recs)) / float(methods[rank_method_code]["col_size"])) >= methods[rank_method_code]["min_word_occurence"]))): #too complicated...something must be done
            lwords.append((t, methods[rank_method_code]["rnkWORD_table"])) #list of terms used
            (recdict, rec_termcount) = calculate_record_relevance_findsimilar((t, round(tf, 4)) , term_recs, hitset, recdict, rec_termcount, verbose, "true") #true tells the function to not calculate all unimportant terms
        # Stop once enough terms have been selected (or weights go negative).
        if len(tf_values) > methods[rank_method_code]["max_nr_words_lower"] and (len(lwords) ==  methods[rank_method_code]["max_nr_words_upper"] or tf < 0):
            break

    if len(recdict) == 0 or len(lwords) == 0:
        return (None, "Could not find similar documents for this query.", "", voutput)
    else: #sort if we got something to sort
        (reclist, hitset) = sort_record_relevance_findsimilar(recdict, rec_termcount, hitset, rank_limit_relevance, verbose)

    if verbose > 0:
        voutput += "<br />Number of terms: %s<br />" % run_sql("SELECT count(id) FROM %s" % methods[rank_method_code]["rnkWORD_table"])[0][0]
        voutput += "Number of terms to use for query: %s<br />" % len(lwords)
        voutput += "Terms: %s<br />" % lwords
        voutput += "Current number of recIDs: %s<br />" % (methods[rank_method_code]["col_size"])
        voutput += "Prepare time: %s<br />" % (str(time.time() - startCreate))
        voutput += "Total time used: %s<br />" % (str(time.time() - startCreate))
        rank_method_stat(rank_method_code, reclist, lwords)

    return (reclist, methods[rank_method_code]["prefix"], methods[rank_method_code]["postfix"], voutput)
Ejemplo n.º 23
0
def word_similarity(rank_method_code, lwords, hitset, rank_limit_relevance,
                    verbose, methods):
    """Ranking a records containing specified words and returns a sorted list.
    input:
    rank_method_code - the code of the method, from the name field in rnkMETHOD
    lwords - a list of words from the query
    hitset - a list of hits for the query found by search_engine
    rank_limit_relevance - show only records with a rank value above this
    verbose - verbose value
    output:
    reclist - a list of sorted records: [[23,34], [344,24], [1,01]]
    prefix - what to show before the rank value
    postfix - what to show after the rank value
    voutput - contains extra information, content dependent on verbose value"""
    voutput = ""
    startCreate = time.time()
    # Hoist the repeated methods[rank_method_code] lookups.
    method = methods[rank_method_code]

    if verbose > 0:
        voutput += "<br />Running rank method: %s, using word_frequency function in bibrank_record_sorter<br />" % rank_method_code

    # Normalise the query terms: lowercase, split on the configured
    # non-alphanumeric separators, and index both the unstemmed and (when a
    # stemmer is configured) the stemmed form of every term.
    lwords_old = lwords
    lwords = []
    for i in range(0, len(lwords_old)):
        term = lwords_old[i].lower()
        # NOTE(review): operator precedence makes this read as
        # (not stopwords == "True") or (stopwords and not is_stopword(term));
        # kept unchanged to preserve the original behaviour.
        if not method["stopwords"] == "True" or method[
                "stopwords"] and not is_stopword(term):
            lwords.append((term, method["rnkWORD_table"]))
            terms = re.sub(method["chars_alphanumericseparators"], ' ',
                           term).lower().split()
            for term in terms:
                if "stemmer" in method:  # stem word
                    term = stem(term.replace(' ', ''), method["stemmer"])
                if lwords_old[i] != term:  # add if stemmed word is different than original word
                    lwords.append((term, method["rnkWORD_table"]))

    (recdict, rec_termcount) = ({}, {})
    # For each term, if accepted, get a list of the records using the term,
    # then calculate the relevance for each term before sorting the records.
    for (term, table) in lwords:
        term_recs = run_sql(
            """SELECT term, hitlist FROM %s WHERE term=%%s""" %
            method["rnkWORD_table"], (term, ))
        if term_recs:  # if term exists in database, use for ranking
            term_recs = deserialize_via_marshal(term_recs[0][1])
            (recdict, rec_termcount) = calculate_record_relevance(
                (term, int(term_recs["Gi"][1])),
                term_recs,
                hitset,
                recdict,
                rec_termcount,
                verbose,
                quick=None)
            del term_recs  # free the (possibly large) posting list early

    # NOTE(review): lwords holds (term, table) tuples, so lwords[0] == ""
    # can never be true; kept unchanged to preserve the original behaviour.
    if len(recdict) == 0 or (len(lwords) == 1 and lwords[0] == ""):
        return (
            None,
            "Records not ranked. The query is not detailed enough, or not enough records found, for ranking to be possible.",
            "", voutput)
    else:  # sort if we got something to sort
        (reclist, hitset) = sort_record_relevance(recdict, rec_termcount,
                                                  hitset, rank_limit_relevance,
                                                  verbose)

    # Add any documents not ranked to the end of the list
    if hitset:
        lrecIDs = list(hitset)  # using 2-3mb
        reclist = zip(lrecIDs, [0] * len(lrecIDs)) + reclist  # using 6mb

    if verbose > 0:
        voutput += "<br />Current number of recIDs: %s<br />" % (
            method["col_size"])
        voutput += "Number of terms: %s<br />" % run_sql(
            "SELECT count(id) FROM %s" % method["rnkWORD_table"])[0][0]
        voutput += "Terms: %s<br />" % lwords
        voutput += "Prepare and pre calculate time: %s<br />" % (
            str(time.time() - startCreate))
        voutput += "Total time used: %s<br />" % (str(time.time() -
                                                      startCreate))
        voutput += str(reclist) + "<br />"
        rank_method_stat(rank_method_code, reclist, lwords)
    return (reclist, method["prefix"], method["postfix"], voutput)
Ejemplo n.º 24
0
def find_similar(rank_method_code, recID, hitset, rank_limit_relevance,
                 verbose, methods):
    """Finding terms to use for calculating similarity. Terms are taken from the recid given, returns a list of recids's and relevance,
    input:
    rank_method_code - the code of the method, from the name field in rnkMETHOD
    recID - records to use for find similar
    hitset - a list of hits for the query found by search_engine
    rank_limit_relevance - show only records with a rank value above this
    verbose - verbose value
    output:
    reclist - a list of sorted records: [[23,34], [344,24], [1,01]]
    prefix - what to show before the rank value
    postfix - what to show after the rank value
    voutput - contains extra information, content dependent on verbose value"""

    startCreate = time.time()
    global voutput
    voutput = ""

    if verbose > 0:
        voutput += "<br />Running rank method: %s, using find_similar/word_frequency in bibrank_record_sorter<br />" % rank_method_code
    # Hoist the repeated methods[rank_method_code] lookups.
    method = methods[rank_method_code]
    # The rank_limit_relevance argument is ignored: the method's configured
    # default minimum relevance is always used instead.
    rank_limit_relevance = method["default_min_relevance"]

    try:
        recID = int(recID)
    except (ValueError, TypeError):
        return (
            None,
            "Warning: Error in record ID, please check that a number is given.",
            "", voutput)

    # The record->terms table shares the rnkWORD table name plus an "R" suffix.
    rec_terms = run_sql(
        """SELECT termlist FROM %sR WHERE id_bibrec=%%s""" %
        method["rnkWORD_table"][:-1], (recID, ))
    if not rec_terms:
        return (None, "Warning: Requested record does not seem to exist.", "",
                voutput)
    rec_terms = deserialize_via_marshal(rec_terms[0][0])

    # Get all documents using terms from the selected document
    if len(rec_terms) == 0:
        return (
            None,
            "Warning: Record specified has no content indexed for use with this method.",
            "", voutput)
    else:
        # Use bound parameters for the IN (...) list instead of pasting the
        # repr of the keys into the SQL string: terms may contain quotes and
        # would previously break (or inject into) the statement.
        terms = list(rec_terms.keys())
        placeholders = ','.join(['%s'] * len(terms))
        terms_recs = dict(
            run_sql(
                """SELECT term, hitlist FROM %s WHERE term IN (%s)""" %
                (method["rnkWORD_table"], placeholders), tuple(terms)))

    # Calculate the weight of every usable term of the record.
    tf_values = {}
    for (term, tf) in iteritems(rec_terms):
        if len(term) >= method["min_word_length"] and \
                term in terms_recs and tf[1] != 0:
            tf_values[term] = int((1 + math.log(tf[0])) * tf[1])  # calculate term weigth
    # Highest-weight terms first.
    tf_values = sorted(tf_values.items(), key=lambda x: x[1], reverse=True)

    lwords = []
    (recdict, rec_termcount) = ({}, {})

    col_size = float(method["col_size"])
    for (t, tf) in tf_values:  # t=term, tf=term weight
        term_recs = deserialize_via_marshal(terms_recs[t])
        # Accept the term either because the candidate list is short, or
        # because the term appears in enough documents while being neither too
        # common nor too rare across the collection.
        doc_fraction = float(len(term_recs)) / col_size
        if len(tf_values) <= method["max_nr_words_lower"] or (
                len(term_recs) >= method["min_nr_words_docs"] and
                method["min_word_occurence"] <= doc_fraction <=
                method["max_word_occurence"]):
            lwords.append((t, method["rnkWORD_table"]))  # list of terms used
            (recdict, rec_termcount) = calculate_record_relevance_findsimilar(
                (t, round(tf, 4)), term_recs, hitset, recdict, rec_termcount,
                verbose, "true"
            )  # "true" tells the function to not calculate all unimportant terms
        # Stop once enough terms have been accepted (only applies when the
        # candidate list is longer than max_nr_words_lower).
        if len(tf_values) > method["max_nr_words_lower"] and (
                len(lwords) == method["max_nr_words_upper"] or tf < 0):
            break

    if len(recdict) == 0 or len(lwords) == 0:
        return (None, "Could not find similar documents for this query.", "",
                voutput)
    else:  # sort if we got something to sort
        (reclist, hitset) = sort_record_relevance_findsimilar(
            recdict, rec_termcount, hitset, rank_limit_relevance, verbose)

    if verbose > 0:
        voutput += "<br />Number of terms: %s<br />" % run_sql(
            "SELECT count(id) FROM %s" % method["rnkWORD_table"])[0][0]
        voutput += "Number of terms to use for query: %s<br />" % len(lwords)
        voutput += "Terms: %s<br />" % lwords
        voutput += "Current number of recIDs: %s<br />" % (method["col_size"])
        voutput += "Prepare time: %s<br />" % (str(time.time() - startCreate))
        voutput += "Total time used: %s<br />" % (str(time.time() -
                                                      startCreate))
        rank_method_stat(rank_method_code, reclist, lwords)

    return (reclist, method["prefix"], method["postfix"], voutput)
Ejemplo n.º 25
0
def rank_by_method(rank_method_code, lwords, hitset, rank_limit_relevance,verbose):
    """Ranking of records based on predetermined values.
    input:
    rank_method_code - the code of the method, from the name field in rnkMETHOD, used to get predetermined values from
    rnkMETHODDATA
    lwords - a list of words from the query
    hitset - a list of hits for the query found by search_engine
    rank_limit_relevance - show only records with a rank value above this
    verbose - verbose value
    output:
    reclist - a list of sorted records, with unsorted added to the end: [[23,34], [344,24], [1,01]]
    prefix - what to show before the rank value
    postfix - what to show after the rank value
    voutput - contains extra information, content dependent on verbose value"""

    global voutput
    voutput = ""
    rnkdict = run_sql("SELECT relevance_data FROM rnkMETHODDATA,rnkMETHOD where rnkMETHOD.id=id_rnkMETHOD and rnkMETHOD.name=%s", (rank_method_code,))

    if not rnkdict:
        return (None, "Warning: Could not load ranking data for method %s." % rank_method_code, "", voutput)

    # Highest record ID in the database, used to validate recid: ranges below.
    max_recid = 0
    res = run_sql("SELECT max(id) FROM bibrec")
    if res and res[0][0]:
        max_recid = int(res[0][0])

    lwords_hitset = None
    for j in range(0, len(lwords)): #find which docs to search based on ranges..should be done in search_engine...
        if lwords[j] and lwords[j][:6] == "recid:":
            if not lwords_hitset:
                lwords_hitset = intbitset()
            lword = lwords[j][6:]
            if "->" in lword:
                lword = lword.split("->")
                if int(lword[0]) >= max_recid or int(lword[1]) >= max_recid + 1:
                    return (None, "Warning: Given record IDs are out of range.", "", voutput)
                # NOTE(review): the upper bound of the range is exclusive here;
                # confirm this matches the intended "recid:A->B" semantics.
                for i in range(int(lword[0]), int(lword[1])):
                    lwords_hitset.add(int(i))
            else:
                # BUGFIX: the original compared the *string* lword against an
                # int (always false in Python 2), so every single "recid:N"
                # query was rejected as out of range. Convert to int first.
                try:
                    single_recid = int(lword)
                except ValueError:
                    return (None, "Warning: Given record IDs are out of range.", "", voutput)
                if single_recid < max_recid + 1:
                    lwords_hitset.add(single_recid)
                else:
                    return (None, "Warning: Given record IDs are out of range.", "", voutput)

    rnkdict = deserialize_via_marshal(rnkdict[0][0])
    if verbose > 0:
        voutput += "<br />Running rank method: %s, using rank_by_method function in bibrank_record_sorter<br />" % rank_method_code
        voutput += "Ranking data loaded, size of structure: %s<br />" % len(rnkdict)
    lrecIDs = list(hitset)

    if verbose > 0:
        voutput += "Number of records to rank: %s<br />" % len(lrecIDs)
    reclist = []
    reclist_addend = []

    if not lwords_hitset: #rank all docs, can this be speed up using something else than for loop?
        for recID in lrecIDs:
            if recID in rnkdict:
                reclist.append((recID, rnkdict[recID]))
                del rnkdict[recID]
            else:
                reclist_addend.append((recID, 0))
    else: #rank docs in hitset, can this be speed up using something else than for loop?
        for recID in lwords_hitset:
            if recID in rnkdict and recID in hitset:
                reclist.append((recID, rnkdict[recID]))
                del rnkdict[recID]
            elif recID in hitset:
                reclist_addend.append((recID, 0))

    if verbose > 0:
        voutput += "Number of records ranked: %s<br />" % len(reclist)
        voutput += "Number of records not ranked: %s<br />" % len(reclist_addend)

    # Ascending by rank value; unranked records are prepended.
    reclist.sort(key=lambda x: x[1])
    return (reclist_addend + reclist, methods[rank_method_code]["prefix"], methods[rank_method_code]["postfix"], voutput)
Ejemplo n.º 26
0
def word_similarity(rank_method_code, lwords, hitset, rank_limit_relevance, verbose, methods):
    """Ranking a records containing specified words and returns a sorted list.
    input:
    rank_method_code - the code of the method, from the name field in rnkMETHOD
    lwords - a list of words from the query
    hitset - a list of hits for the query found by search_engine
    rank_limit_relevance - show only records with a rank value above this
    verbose - verbose value
    output:
    reclist - a list of sorted records: [[23,34], [344,24], [1,01]]
    prefix - what to show before the rank value
    postfix - what to show after the rank value
    voutput - contains extra information, content dependent on verbose value"""
    voutput = ""
    startCreate = time.time()
    # Hoist the repeated methods[rank_method_code] lookups.
    method = methods[rank_method_code]

    if verbose > 0:
        voutput += "<br />Running rank method: %s, using word_frequency function in bibrank_record_sorter<br />" % rank_method_code

    # Build the (term, table) pairs to rank with: lowercase every query word,
    # split on the configured separators, and also index the stemmed form when
    # a stemmer is configured.
    lwords_old = lwords
    lwords = []
    for i in range(0, len(lwords_old)):
        term = lwords_old[i].lower()
        # NOTE(review): precedence makes this (not stopwords == "True") or
        # (stopwords and not is_stopword(term)); kept as in the original.
        if not method["stopwords"] == "True" or method["stopwords"] and not is_stopword(term):
            lwords.append((term, method["rnkWORD_table"]))
            terms = re.sub(method["chars_alphanumericseparators"], ' ', term).lower().split()
            for term in terms:
                if "stemmer" in method: # stem word
                    term = stem(term.replace(' ', ''), method["stemmer"])
                if lwords_old[i] != term: #add if stemmed word is different than original word
                    lwords.append((term, method["rnkWORD_table"]))

    (recdict, rec_termcount) = ({}, {})
    #For each term, if accepted, get a list of the records using the term
    #calculate then relevance for each term before sorting the list of records
    for (term, table) in lwords:
        term_recs = run_sql("""SELECT term, hitlist FROM %s WHERE term=%%s""" % method["rnkWORD_table"], (term,))
        if term_recs: #if term exists in database, use for ranking
            term_recs = deserialize_via_marshal(term_recs[0][1])
            (recdict, rec_termcount) = calculate_record_relevance((term, int(term_recs["Gi"][1])), term_recs, hitset, recdict, rec_termcount, verbose, quick=None)
            del term_recs  # free the (possibly large) posting list early

    # NOTE(review): lwords holds (term, table) tuples, so lwords[0] == ""
    # never fires; kept unchanged to preserve the original behaviour.
    if len(recdict) == 0 or (len(lwords) == 1 and lwords[0] == ""):
        return (None, "Records not ranked. The query is not detailed enough, or not enough records found, for ranking to be possible.", "", voutput)
    else: #sort if we got something to sort
        (reclist, hitset) = sort_record_relevance(recdict, rec_termcount, hitset, rank_limit_relevance, verbose)

    #Add any documents not ranked to the end of the list
    if hitset:
        lrecIDs = list(hitset)                       #using 2-3mb
        reclist = zip(lrecIDs, [0] * len(lrecIDs)) + reclist      #using 6mb

    if verbose > 0:
        voutput += "<br />Current number of recIDs: %s<br />" % (method["col_size"])
        voutput += "Number of terms: %s<br />" % run_sql("SELECT count(id) FROM %s" % method["rnkWORD_table"])[0][0]
        voutput += "Terms: %s<br />" % lwords
        voutput += "Prepare and pre calculate time: %s<br />" % (str(time.time() - startCreate))
        voutput += "Total time used: %s<br />" % (str(time.time() - startCreate))
        voutput += str(reclist) + "<br />"
        rank_method_stat(rank_method_code, reclist, lwords)
    return (reclist, method["prefix"], method["postfix"], voutput)