def last_updated_result(rank_method_code):
    """ return the last value of dictionary in rnkMETHODDATA table if it exists and
        initialize the value of last updated records by zero,
        otherwise an initial dictionary with zero as value for all recids
    """
    result = [{}, {}, {}]
    query = """select relevance_data from rnkMETHOD, rnkMETHODDATA where
               rnkMETHOD.id = rnkMETHODDATA.id_rnkMETHOD
               and rnkMETHOD.Name = '%s'""" % rank_method_code
    rdict = run_sql(query)
    if rdict and rdict[0] and rdict[0][0]:
        #has to be prepared for corrupted data!
        try:
            dic = deserialize_via_marshal(rdict[0][0])
        except zlib.error:
            return [{}, {}, {}]
        query = "select object_value from rnkCITATIONDATA where object_name='citationdict'"
        cit_compressed = run_sql(query)
        cit = []
        if cit_compressed and cit_compressed[0] and cit_compressed[0][0]:
            cit = deserialize_via_marshal(cit_compressed[0][0])
            if cit:
                query = """select object_value from rnkCITATIONDATA
                           where object_name='reversedict'"""
                ref_compressed = run_sql(query)
                if ref_compressed and ref_compressed[0] and ref_compressed[0][0]:
                    ref = deserialize_via_marshal(ref_compressed[0][0])
                    result = (dic, cit, ref)
    return result
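# Usage sketch for last_updated_result() above; 'citation' is the rank method
# name used elsewhere in these examples, any name from rnkMETHOD would work.
dic, cit, ref = last_updated_result('citation')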
        def fill():
            alldicts = {}
            from invenio.bibrank_tag_based_indexer import fromDB
            redis = get_redis()
            serialized_weights = redis.get('citations_weights')
            if serialized_weights:
                weights = deserialize_via_marshal(serialized_weights)
            else:
                weights = fromDB('citation')

            alldicts['citations_weights'] = weights
            # for cited:M->N queries, it is interesting to cache also
            # some preprocessed citationdict:
            alldicts['citations_keys'] = intbitset(weights.keys())

            # Citation counts
            alldicts['citations_counts'] = [t for t in weights.iteritems()]
            alldicts['citations_counts'].sort(key=itemgetter(1), reverse=True)

            # Self-cites
            serialized_weights = redis.get('selfcites_weights')
            if serialized_weights:
                selfcites = deserialize_via_marshal(serialized_weights)
            else:
                selfcites = fromDB('selfcites')
            selfcites_weights = {}
            for recid, counts in alldicts['citations_counts']:
                selfcites_weights[recid] = counts - selfcites.get(recid, 0)
            alldicts['selfcites_weights'] = selfcites_weights
            alldicts['selfcites_counts'] = [(recid, selfcites_weights.get(recid, cites)) for recid, cites in alldicts['citations_counts']]
            alldicts['selfcites_counts'].sort(key=itemgetter(1), reverse=True)

            return alldicts
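# A minimal consumption sketch, assuming the nested fill() above is exposed by
# its enclosing cache factory; the recid 12345 is purely illustrative.
alldicts = fill()
top_cited = alldicts['citations_counts'][:10]        # ten most cited (recid, count) pairs
cited_at_all = 12345 in alldicts['citations_keys']   # fast membership test via intbitset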
 def test_filetypes_of_records(self):
     """tests files extensions of record 1 and 77"""
     query1 = "SELECT termlist FROM idxWORD%02dR WHERE id_bibrec=1" \
              % get_index_id_from_index_name('filetype')
     query2 = "SELECT termlist FROM idxWORD%02dR WHERE id_bibrec=77" \
              % get_index_id_from_index_name('filetype')
     res1 = run_sql(query1)
     res2 = run_sql(query2)
     set1 = deserialize_via_marshal(res1[0][0])
     set2 = deserialize_via_marshal(res2[0][0])
     self.assertEqual(set1, ['gif', 'jpg'])
     self.assertEqual(set2, ['pdf', 'ps.gz'])
def rank_method_stat(rank_method_code, reclist, lwords):
    """Shows some statistics about the searchresult.
    rank_method_code - name field from rnkMETHOD
    reclist - a list of sorted and ranked records
    lwords - the words in the query"""

    voutput = ""
    if len(reclist) > 20:
        j = 20
    else:
        j = len(reclist)

    voutput += "<br />Rank statistics:<br />"
    for i in range(1, j + 1):
        voutput += "%s,Recid:%s,Score:%s<br />" % (i, reclist[len(reclist) - i][0], reclist[len(reclist) - i][1])
        for (term, table) in lwords:
            term_recs = run_sql("""SELECT hitlist FROM %s WHERE term=%%s""" % table, (term,))
            if term_recs:
                term_recs = deserialize_via_marshal(term_recs[0][0])
                if term_recs.has_key(reclist[len(reclist) - i][0]):
                    voutput += "%s-%s / " % (term, term_recs[reclist[len(reclist) - i][0]])
        voutput += "<br />"

    voutput += "<br />Score variation:<br />"
    count = {}
    for i in range(0, len(reclist)):
        count[reclist[i][1]] = count.get(reclist[i][1], 0) + 1
    i = 100
    while i >= 0:
        if count.has_key(i):
            voutput += "%s-%s<br />" % (i, count[i])
        i -= 1
def get_user_preferences(uid):
    pref = run_sql("SELECT id, settings FROM user WHERE id=%s", (uid, ))
    if pref:
        try:
            return deserialize_via_marshal(pref[0][1])
        except:
            pass
    return get_default_user_preferences()  # empty dict means no preferences
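# Hedged usage sketch: read preferences for an illustrative uid and write them
# back with the serialize_via_marshal counterpart; set_user_preferences below is
# a sketch mirroring get_user_preferences, not necessarily the shipped helper.
from invenio.dbquery import run_sql, serialize_via_marshal

def set_user_preferences(uid, preferences):
    """Store the preferences dict for user uid (illustrative sketch)."""
    run_sql("UPDATE user SET settings=%s WHERE id=%s",
            (serialize_via_marshal(preferences), uid))

prefs = get_user_preferences(42)   # 42 is an illustrative uid
prefs['language'] = 'en'           # illustrative preference key
set_user_preferences(42, prefs)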
 def test_splliting_and_indexing_CJK_characters_reversed_table(self):
     """CJK Tokenizer - comparing terms for record with chinese poetry in title index, reverse table"""
     query = "SELECT * from test_idxWORD%02dR where id_bibrec='104'" % get_index_id_from_index_name('title')
     res = run_sql(query)
     iset = []
     if res:
         iset = deserialize_via_marshal(res[0][1])
     self.assertEqual(iset, ['\xe6\x95\xac', '\xe7\x8d\xa8', '\xe4\xba\xad', '\xe5\x9d\x90'])
def fromDB(rank_method_code):
    """Get the data for a rank method"""
    id = run_sql("SELECT id from rnkMETHOD where name=%s", (rank_method_code, ))
    res = run_sql("SELECT relevance_data FROM rnkMETHODDATA WHERE id_rnkMETHOD=%s", (id[0][0], ))
    if res:
        return deserialize_via_marshal(res[0][0])
    else:
        return {}
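# Usage sketch for fromDB(); 'citation' matches the rank method name used in
# the other examples here.
weights = fromDB('citation')
print "records with citation weights:", len(weights)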
 def process_result_value(self, value, dialect):
     if value is not None:
         try:
             value = deserialize_via_marshal(value)
         except:
             value = None
     return value if value is not None else \
         (self.default_value() if callable(self.default_value) else
          self.default_value)
def get_cit_dict(name):
    """get a named citation dict from the db"""
    cdict = run_sql("""SELECT object_value FROM rnkCITATIONDATA
                       WHERE object_name = %s""", (name, ))

    if cdict and cdict[0] and cdict[0][0]:
        dict_from_db = deserialize_via_marshal(cdict[0][0])
    else:
        dict_from_db = {}

    return dict_from_db
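# Usage sketch: 'citationdict' and 'reversedict' are the object names queried in
# the examples above; the recid is illustrative.
cited_by = get_cit_dict('citationdict')    # recid -> recids citing it
references = get_cit_dict('reversedict')   # recid -> recids it cites
ncites = len(cited_by.get(12345, []))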
def do_upgrade():
    rows_to_change = run_sql("SELECT id, arguments FROM oaiHARVEST", with_dict=True)
    # Move away from old columns
    for row in rows_to_change:
        if row['arguments']:
            arguments = deserialize_via_marshal(row['arguments'])
            if "c_cfg-file" in arguments:
                arguments['c_stylesheet'] = arguments['c_cfg-file']
                del arguments['c_cfg-file']
                run_sql("UPDATE oaiHARVEST set arguments=%s WHERE id=%s",
                        (serialize_via_marshal(arguments), row['id']))
def get_initial_author_dict():
    """read author->citedinlist dict from the db"""
    adict = {}
    try:
        ah = run_sql("select aterm,hitlist from rnkAUTHORDATA")
        for (a, h) in ah:
            adict[a] = deserialize_via_marshal(h)
        return adict
    except:
        register_exception(prefix="could not read rnkAUTHORDATA", alert_admin=True)
        return {}
def get_uid_based_on_pref(prefname, prefvalue):
    """get the user's UID based where his/her preference prefname has value prefvalue in preferences"""
    prefs = run_sql("SELECT id, settings FROM user WHERE settings is not NULL")
    the_uid = None
    for pref in prefs:
        try:
            settings = deserialize_via_marshal(pref[1])
            if (settings.has_key(prefname)) and (settings[prefname] == prefvalue):
                the_uid = pref[0]
        except:
            pass
    return the_uid
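# Usage sketch; the preference name/value pair is purely illustrative.
uid = get_uid_based_on_pref('language', 'en')
if uid is not None:
    print "matching uid:", uid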
def get_author_cited_by(authorstring):
    """Return a list of doc ids [y1,y2,..] for the
       author given as param, such that y1,y2.. cite that author
    """
    citations = []
    res = run_sql("select hitlist from rnkAUTHORDATA where aterm=%s", (authorstring,))
    if res and res[0] and res[0][0]:
        # has to be prepared for corrupted data!
        try:
            citations = deserialize_via_marshal(res[0][0])
        except:
            citations = []
    return citations
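# Usage sketch; the author string format ("Ellis, J") mirrors how author terms
# are typically stored in rnkAUTHORDATA, but the exact value is illustrative.
citing_recids = get_author_cited_by('Ellis, J')
print "%s records cite this author" % len(citing_recids)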
def get_cit_dict(name):
    """get a named citation dict from the db"""
    cdict = {}
    try:
        cdict = run_sql("select object_value from rnkCITATIONDATA where object_name = %s",
                       (name,))
        if cdict and cdict[0] and cdict[0][0]:
            dict_from_db = deserialize_via_marshal(cdict[0][0])
            return dict_from_db
        else:
            return {}
    except:
        register_exception(prefix="could not read "+name+" from db", alert_admin=True)
    return {}
    def test_authority_record_enriched_index(self):
        """bibindex - test whether reverse index for bibliographic record
        contains words from referenced authority records"""
        bibRecID = 9
        authority_string = 'jonathan'
        index_name = 'author'
        table = "idxWORD%02dR" % get_index_id_from_index_name(index_name)

        reindex_for_type_with_bibsched(index_name)
        self.assertTrue(
            authority_string in deserialize_via_marshal(
                run_sql("SELECT termlist FROM %s WHERE id_bibrec = %s" % (table, bibRecID))[0][0]
            )
        )
def del_recids(rank_method_code, range_rec):
    """Delete some records from the rank method"""
    id = run_sql("SELECT id from rnkMETHOD where name=%s", (rank_method_code, ))
    res = run_sql("SELECT relevance_data FROM rnkMETHODDATA WHERE id_rnkMETHOD=%s", (id[0][0], ))
    if res:
        rec_dict = deserialize_via_marshal(res[0][0])
        write_message("Old size: %s" % len(rec_dict))
        for (recids, recide) in range_rec:
            for i in range(int(recids), int(recide)):
                if rec_dict.has_key(i):
                    del rec_dict[i]
        write_message("New size: %s" % len(rec_dict))
        intoDB(rec_dict, begin_date, rank_method_code)
    else:
        write_message("Create before deleting!")
def get_data_for_definition_rnk(method_name, rnk_name):
    '''Returns the dictionary with data for method_name ranking method'''
    try:
        res = run_sql('SELECT d.relevance_data \
                          from rnkMETHODDATA d, rnkMETHOD r WHERE \
                          d.id_rnkMETHOD = r.id AND \
                          r.name = %s', (rnk_name, ))
        if res and res[0]:
            write_message('Data extracted from table rnkMETHODDATA for sorting method %s' \
                          %method_name, verbose=5)
            return deserialize_via_marshal(res[0][0])
    except Error, err:
        write_message("No data could be found for sorting method %s. " \
                      "The following errror occured: [%s]" \
                      %(method_name, err), stream=sys.stderr)
        return {}
def last_updated_result(rank_method_code):
    """ return the last value of dictionary in rnkMETHODDATA table if it
        exists and initialize the value of last updated records by zero,
        otherwise an initial dictionary with zero as value for all recids
    """
    query = """SELECT relevance_data FROM rnkMETHOD, rnkMETHODDATA WHERE
               rnkMETHOD.id = rnkMETHODDATA.id_rnkMETHOD
               AND rnkMETHOD.Name = '%s'""" % rank_method_code

    try:
        rdict = run_sql(query)[0][0]
    except IndexError:
        dic = {}
    else:
        dic = deserialize_via_marshal(rdict)

    return dic
def do_upgrade():
    create_statement = run_sql('SHOW CREATE TABLE oaiHARVEST')[0][1]
    if '`arguments` text' in create_statement:
        run_sql("ALTER TABLE oaiHARVEST CHANGE arguments arguments blob")
    # translate old values
    if '`bibconvertcfgfile`' in create_statement:
        rows_to_change = run_sql("SELECT id, bibconvertcfgfile, bibfilterprogram, arguments FROM oaiHARVEST", with_dict=True)
        # Move away from old columns
        for row in rows_to_change:
            if row['arguments']:
                arguments = deserialize_via_marshal(row['arguments'])
            else:
                arguments = {}
            arguments['c_cfg-file'] = row['bibconvertcfgfile']
            arguments['f_filter-file'] = row['bibfilterprogram']
            run_sql("UPDATE oaiHARVEST set arguments=%s WHERE id=%s", (serialize_via_marshal(arguments), row['id']))
        run_sql("ALTER TABLE oaiHARVEST DROP COLUMN bibconvertcfgfile")
        run_sql("ALTER TABLE oaiHARVEST DROP COLUMN bibfilterprogram")
def get_citations_from_db():
    """gets the citation data (who cites who) from the rnkCITATIONDATA table,
    and returns:
    -a dictionary of type x:{x1,x2..}, where x is cited by x1,x2..
    -a dict of type a:{b} where recid 'a' is asociated with an index 'b'"""
    dict_of_ids = {}
    count = 0
    query = "select object_value from rnkCITATIONDATA \
                where object_name = 'citationdict'"

    cit_compressed = run_sql(query)
    cit = []
    if cit_compressed and cit_compressed[0] and cit_compressed[0][0]:
        cit = deserialize_via_marshal(cit_compressed[0][0])
        if cit:
            for item in cit:
                #check for duplicates in citation dictionary
                cit[item] = set(cit[item])
                if item in cit[item]:
                    cit[item].remove(item)
                if item not in dict_of_ids:
                    dict_of_ids[item] = count
                    count += 1
                for value in cit[item]:
                    if value not in dict_of_ids:
                        dict_of_ids[value] = count
                        count += 1
            write_message("Citation data collected\
from rnkCITATIONDATA",
                          verbose=2)
            write_message("Ids and recids corespondace: %s" \
                % str(dict_of_ids), verbose=9)
            write_message("Citations: %s" % str(cit), verbose=9)
            return cit, dict_of_ids
        else:
            write_message("Error while extracting citation data \
from rnkCITATIONDATA table",
                          verbose=1)
    else:
        write_message("Error while extracting citation data \
from rnkCITATIONDATA table",
                      verbose=1)
    return {}, {}
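# Consumption sketch for the two dictionaries returned above: 'cit' maps a recid
# to the set of recids citing it, 'dict_of_ids' maps each recid to a dense index
# (handy e.g. when building an adjacency matrix); the record picked is arbitrary.
cit, dict_of_ids = get_citations_from_db()
if cit:
    some_recid = iter(cit).next()
    row = dict_of_ids[some_recid]
    citing_rows = [dict_of_ids[r] for r in cit[some_recid]]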
 def cache_filler():
     alldicts = {}
     try:
         res = run_sql("SELECT object_name,object_value FROM rnkCITATIONDATA")
     except OperationalError:
         # database problems, return empty cache
         return {}
     for row in res:
         object_name = row[0]
         object_value = row[1]
         try:
             object_value_dict = deserialize_via_marshal(object_value)
         except:
             object_value_dict = {}
         alldicts[object_name] = object_value_dict
         if object_name == 'citationdict':
             # for cited:M->N queries, it is interesting to cache also
             # some preprocessed citationdict:
             alldicts['citationdict_keys'] = object_value_dict.keys()
             alldicts['citationdict_keys_intbitset'] = intbitset(object_value_dict.keys())
     return alldicts
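# Usage sketch: cache_filler() above is normally handed to a cache object, but
# assuming it is callable in this scope its result can be queried directly like
# this (the recid is illustrative).
cache = cache_filler()
citing = cache.get('citationdict', {}).get(12345, [])
is_cited = 12345 in cache.get('citationdict_keys_intbitset', ())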
def _get_users_invenio12(*args, **kwargs):
    """Get user accounts Invenio 1."""
    from invenio.dbquery import run_sql, deserialize_via_marshal
    User = namedtuple('User', [
        'id', 'email', 'password', 'password_salt', 'note', 'full_name',
        'settings', 'nickname', 'last_login'
    ])
    users = run_sql(
        'SELECT id, email, password, note, settings, nickname, last_login'
        ' FROM user',
        run_on_slave=True)
    return len(users), [
        User(
            id=user[0],
            email=user[1],
            password=user[2].decode('latin1'),
            password_salt=user[1],
            note=user[3],
            full_name=user[5],
            settings=deserialize_via_marshal(user[4]) if user[4] else {},
            # we don't have proper nicknames on Invenio v1
            nickname='id_{0}'.format(user[0]),
            last_login=user[6]) for user in users
    ]
 def test_records_for_number_of_copies_record32(self):
     """checks content of itemcount index for record: 32"""
     query = "SELECT termlist FROM idxWORD%02dR WHERE id_bibrec=32" \
              % get_index_id_from_index_name('itemcount')
     res = run_sql(query)
     self.assertEqual(deserialize_via_marshal(res[0][0]),['3'])
def update_bibsort_tables(recids, method, update_timestamp = True):
    """Updates the data structures for sorting method: method
    for the records in recids"""

    res = run_sql("SELECT id, definition, washer \
                  from bsrMETHOD where name = %s", (method, ))
    if res and res[0]:
        method_id = res[0][0]
        definition = res[0][1]
        washer = res[0][2]
    else:
        write_message('No sorting method called %s could be found ' \
                      'in bsrMETHOD table.' %method, sys.stderr)
        return False
    res = run_sql("SELECT data_dict, data_dict_ordered, data_list_sorted \
                  FROM bsrMETHODDATA where id_bsrMETHOD = %s", (method_id, ))
    if res and res[0]:
        data_dict = deserialize_via_marshal(res[0][0])
        data_dict_ordered = {}
        data_list_sorted = []
    else:
        write_message('No data could be found for the sorting method %s.' \
                      %method)
        return False #since this case should have been treated earlier
    #get the values for the recids that need to be recalculated
    field_data = get_field_data(recids, method, definition)
    if not field_data:
        write_message("Possible error: the method %s has no data for records %s." \
                      %(method, str(recids)))
    else:
        apply_washer(field_data, washer)

    #if a recid is not in field_data that is because no value was found for it
    #so it should be marked for deletion
    recids_to_delete = list(recids.difference(intbitset(field_data.keys())))
    recids_to_insert = []
    recids_to_modify = {}
    for recid in field_data:
        if recid in data_dict:
            if data_dict[recid] != field_data[recid]:
                #we store the old value
                recids_to_modify[recid] = data_dict[recid]
        else: # recid is new, and needs to be inserted
            recids_to_insert.append(recid)

    #remove the recids that were not previously in bibsort
    recids_to_delete = [recid for recid in recids_to_delete if recid in data_dict]

    #dicts to keep the ordered values for the recids - useful for bucket insertion
    recids_current_ordered = {}
    recids_old_ordered = {}

    if recids_to_insert or recids_to_modify or recids_to_delete:
        data_dict_ordered = deserialize_via_marshal(res[0][1])
        data_list_sorted = deserialize_via_marshal(res[0][2])
        if recids_to_modify:
            write_message("%s records have been modified." \
                          %len(recids_to_modify), verbose=5)
            for recid in recids_to_modify:
                recids_old_ordered[recid] = data_dict_ordered[recid]
                perform_modify_record(data_dict, data_dict_ordered, \
                                data_list_sorted, field_data[recid], recid)
        if recids_to_insert:
            write_message("%s records have been inserted." \
                          %len(recids_to_insert), verbose=5)
            for recid in recids_to_insert:
                perform_insert_record(data_dict, data_dict_ordered, \
                                data_list_sorted, field_data[recid], recid)
        if recids_to_delete:
            write_message("%s records have been deleted." \
                          %len(recids_to_delete), verbose=5)
            for recid in recids_to_delete:
                perform_delete_record(data_dict, data_dict_ordered, data_list_sorted, recid)

        for recid in recids_to_modify:
            recids_current_ordered[recid] = data_dict_ordered[recid]
        for recid in recids_to_insert:
            recids_current_ordered[recid] = data_dict_ordered[recid]

        #write the modifications to db
        executed = write_to_methoddata_table(method_id, data_dict, \
                                         data_dict_ordered, data_list_sorted, update_timestamp)
        if not executed:
            return False

        #update buckets
        try:
            perform_update_buckets(recids_current_ordered, recids_to_insert, recids_old_ordered, method_id, update_timestamp)
        except Error, err:
            write_message("[%s] The bucket data for method %s has not been updated" \
                          %(method, err), sys.stderr)
            return False
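# Usage sketch for update_bibsort_tables(); recids is expected to be an intbitset
# (the function calls recids.difference), and 'title' is an illustrative bsrMETHOD
# name; import path as used elsewhere in Invenio 1.x.
from invenio.intbitset import intbitset
update_bibsort_tables(intbitset([10, 11, 12]), 'title')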
for p in ptrs:
    pdata[p] = {}
    pdata[p]['tickets'] = run_sql("select * from aidPERSONID where tag like 'rt%%' and personid = %s", (p,))
    pdata[p]['cid'] = run_sql("select data from aidPERSONID where tag = 'canonical_name' and personid = %s", (p,))

sm = serialize_via_marshal(pdata)
fp = open('pdata2.dat', 'w')
fp.write(sm)
fp.close()


### THEN ON TEST
from invenio.dbquery import run_sql, deserialize_via_marshal, serialize_via_marshal
fp = open('pdata.dat', 'r')
dm = deserialize_via_marshal(fp.read())
fp.close()
for d in dm:
    dm[d]['tickets'] = list(dm[d]['tickets'])
    for i, t in enumerate(dm[d]['tickets']):
        dm[d]['tickets'][i] = list(t)

for pp in dm:
    cname = dm[pp]['cid'][0][0]
    dbs = run_sql('select personid from aidPERSONID where tag = "canonical_name" and data = %s', (cname,))
    pid = dbs[0][0]
    dbp = run_sql("select data from aidPERSONID where tag = 'paper' and personid = %s", (pid,))
    refs = [j[3] for j in dm[pp]['tickets'] if j[2]=='rt_confirm' or j[2]=='rt_repeal']
    common_refs = set(refs).intersection(set([i[0] for i in dbp]))
    for t in dm[pp]['tickets']:
        if t[3] in common_refs or not t[2] in ('rt_confirm', 'rt_repeal'):
def rank_by_method(rank_method_code, lwords, hitset, rank_limit_relevance,
                   verbose):
    """Ranking of records based on predetermined values.
    input:
    rank_method_code - the code of the method, from the name field in rnkMETHOD, used to get predetermined values from
    rnkMETHODDATA
    lwords - a list of words from the query
    hitset - a list of hits for the query found by search_engine
    rank_limit_relevance - show only records with a rank value above this
    verbose - verbose value
    output:
    reclist - a list of sorted records, with unsorted added to the end: [[23,34], [344,24], [1,01]]
    prefix - what to show before the rank value
    postfix - what to show after the rank value
    voutput - contains extra information, content dependent on verbose value"""

    global voutput
    voutput = ""
    rnkdict = run_sql(
        "SELECT relevance_data FROM rnkMETHODDATA,rnkMETHOD where rnkMETHOD.id=id_rnkMETHOD and rnkMETHOD.name=%s",
        (rank_method_code, ))

    if not rnkdict:
        return (None, "Warning: Could not load ranking data for method %s." %
                rank_method_code, "", voutput)

    max_recid = 0
    res = run_sql("SELECT max(id) FROM bibrec")
    if res and res[0][0]:
        max_recid = int(res[0][0])

    lwords_hitset = None
    for j in range(
            0, len(lwords)
    ):  #find which docs to search based on ranges..should be done in search_engine...
        if lwords[j] and lwords[j][:6] == "recid:":
            if not lwords_hitset:
                lwords_hitset = intbitset()
            lword = lwords[j][6:]
            if string.find(lword, "->") > -1:
                lword = string.split(lword, "->")
                if int(lword[0]) >= max_recid or int(
                        lword[1]) >= max_recid + 1:
                    return (None,
                            "Warning: Given record IDs are out of range.", "",
                            voutput)
                for i in range(int(lword[0]), int(lword[1])):
                    lwords_hitset.add(int(i))
            elif int(lword) < max_recid + 1:
                lwords_hitset.add(int(lword))
            else:
                return (None, "Warning: Given record IDs are out of range.",
                        "", voutput)

    rnkdict = deserialize_via_marshal(rnkdict[0][0])
    if verbose > 0:
        voutput += "<br />Running rank method: %s, using rank_by_method function in bibrank_record_sorter<br />" % rank_method_code
        voutput += "Ranking data loaded, size of structure: %s<br />" % len(
            rnkdict)
    lrecIDs = list(hitset)

    if verbose > 0:
        voutput += "Number of records to rank: %s<br />" % len(lrecIDs)
    reclist = []
    reclist_addend = []

    if not lwords_hitset:  #rank all docs, can this be sped up using something other than a for loop?
        for recID in lrecIDs:
            if rnkdict.has_key(recID):
                reclist.append((recID, rnkdict[recID]))
                del rnkdict[recID]
            else:
                reclist_addend.append((recID, 0))
    else:  #rank docs in hitset, can this be sped up using something other than a for loop?
        for recID in lwords_hitset:
            if rnkdict.has_key(recID) and recID in hitset:
                reclist.append((recID, rnkdict[recID]))
                del rnkdict[recID]
            elif recID in hitset:
                reclist_addend.append((recID, 0))

    if verbose > 0:
        voutput += "Number of records ranked: %s<br />" % len(reclist)
        voutput += "Number of records not ranked: %s<br />" % len(
            reclist_addend)

    reclist.sort(lambda x, y: cmp(x[1], y[1]))
    return (reclist_addend + reclist, methods[rank_method_code]["prefix"],
            methods[rank_method_code]["postfix"], voutput)
def rank_by_method(rank_method_code, lwords, hitset, rank_limit_relevance, verbose):
    """Ranking of records based on predetermined values.
    input:
    rank_method_code - the code of the method, from the name field in rnkMETHOD, used to get predetermined values from
    rnkMETHODDATA
    lwords - a list of words from the query
    hitset - a list of hits for the query found by search_engine
    rank_limit_relevance - show only records with a rank value above this
    verbose - verbose value
    output:
    reclist - a list of sorted records, with unsorted added to the end: [[23,34], [344,24], [1,01]]
    prefix - what to show before the rank value
    postfix - what to show after the rank value
    voutput - contains extra information, content dependent on verbose value"""

    voutput = ""
    rnkdict = run_sql(
        "SELECT relevance_data FROM rnkMETHODDATA,rnkMETHOD where rnkMETHOD.id=id_rnkMETHOD and rnkMETHOD.name=%s",
        (rank_method_code,),
    )

    if not rnkdict:
        return (None, "Warning: Could not load ranking data for method %s." % rank_method_code, "", voutput)

    max_recid = 0
    res = run_sql("SELECT max(id) FROM bibrec")
    if res and res[0][0]:
        max_recid = int(res[0][0])

    lwords_hitset = None
    for j in range(0, len(lwords)):  # find which docs to search based on ranges..should be done in search_engine...
        if lwords[j] and lwords[j][:6] == "recid:":
            if not lwords_hitset:
                lwords_hitset = intbitset()
            lword = lwords[j][6:]
            if lword.find("->") > -1:
                lword = lword.split("->")
                if int(lword[0]) >= max_recid or int(lword[1]) >= max_recid + 1:
                    return (None, "Warning: Given record IDs are out of range.", "", voutput)
                for i in range(int(lword[0]), int(lword[1])):
                    lwords_hitset.add(int(i))
            elif int(lword) < max_recid + 1:
                lwords_hitset.add(int(lword))
            else:
                return (None, "Warning: Given record IDs are out of range.", "", voutput)

    rnkdict = deserialize_via_marshal(rnkdict[0][0])
    if verbose > 0:
        voutput += (
            "<br />Running rank method: %s, using rank_by_method function in bibrank_record_sorter<br />"
            % rank_method_code
        )
        voutput += "Ranking data loaded, size of structure: %s<br />" % len(rnkdict)
    lrecIDs = list(hitset)

    if verbose > 0:
        voutput += "Number of records to rank: %s<br />" % len(lrecIDs)
    reclist = []
    reclist_addend = []

    if not lwords_hitset:  # rank all docs, can this be sped up using something other than a for loop?
        for recID in lrecIDs:
            if recID in rnkdict:
                reclist.append((recID, rnkdict[recID]))
                del rnkdict[recID]
            else:
                reclist_addend.append((recID, 0))
    else:  # rank docs in hitset, can this be sped up using something other than a for loop?
        for recID in lwords_hitset:
            if recID in rnkdict and recID in hitset:
                reclist.append((recID, rnkdict[recID]))
                del rnkdict[recID]
            elif recID in hitset:
                reclist_addend.append((recID, 0))

    if verbose > 0:
        voutput += "Number of records ranked: %s<br />" % len(reclist)
        voutput += "Number of records not ranked: %s<br />" % len(reclist_addend)

    reclist.sort(lambda x, y: cmp(x[1], y[1]))
    return (
        reclist_addend + reclist,
        METHODS[rank_method_code]["prefix"],
        METHODS[rank_method_code]["postfix"],
        voutput,
    )
    global voutput
    voutput = ""

    if verbose > 0:
        voutput += "<br />Running rank method: %s, using find_similar/word_frequency in bibrank_record_sorter<br />" % rank_method_code
    rank_limit_relevance = methods[rank_method_code]["default_min_relevance"]

    try:
        recID = int(recID)
    except Exception, e:
        return (None, "Warning: Error in record ID, please check that a number is given.", "", voutput)

    rec_terms = run_sql("""SELECT termlist FROM %sR WHERE id_bibrec=%%s""" % methods[rank_method_code]["rnkWORD_table"][:-1],  (recID,))
    if not rec_terms:
        return (None, "Warning: Requested record does not seem to exist.", "", voutput)
    rec_terms = deserialize_via_marshal(rec_terms[0][0])

    #Get all documents using terms from the selected documents
    if len(rec_terms) == 0:
        return (None, "Warning: Record specified has no content indexed for use with this method.", "", voutput)
    else:
        terms = "%s" % rec_terms.keys()
        terms_recs = dict(run_sql("""SELECT term, hitlist FROM %s WHERE term IN (%s)""" % (methods[rank_method_code]["rnkWORD_table"], terms[1:len(terms) - 1])))

    tf_values = {}
    #Calculate all term frequencies
    for (term, tf) in rec_terms.iteritems():
        if len(term) >= methods[rank_method_code]["min_word_length"] and terms_recs.has_key(term) and tf[1] != 0:
            tf_values[term] = int((1 + math.log(tf[0])) * tf[1]) #calculate term weight
    tf_values = tf_values.items()
    tf_values.sort(lambda x, y: cmp(y[1], x[1])) #sort based on weight
def word_similarity(rank_method_code, lwords, hitset, rank_limit_relevance, verbose, methods):
    """Ranking a records containing specified words and returns a sorted list.
    input:
    rank_method_code - the code of the method, from the name field in rnkMETHOD
    lwords - a list of words from the query
    hitset - a list of hits for the query found by search_engine
    rank_limit_relevance - show only records with a rank value above this
    verbose - verbose value
    output:
    reclist - a list of sorted records: [[23,34], [344,24], [1,01]]
    prefix - what to show before the rank value
    postfix - what to show after the rank value
    voutput - contains extra information, content dependent on verbose value"""

    voutput = ""
    startCreate = time.time()

    if verbose > 0:
        voutput += (
            "<br />Running rank method: %s, using word_frequency function in bibrank_record_sorter<br />"
            % rank_method_code
        )

    lwords_old = lwords
    lwords = []
    # Check terms, remove non alphanumeric characters. Use both unstemmed and stemmed version of all terms.
    for i in range(0, len(lwords_old)):
        term = string.lower(lwords_old[i])
        if (
            not methods[rank_method_code]["stopwords"] == "True"
            or methods[rank_method_code]["stopwords"]
            and not is_stopword(term, 1)
        ):
            lwords.append((term, methods[rank_method_code]["rnkWORD_table"]))
            terms = string.split(
                string.lower(re.sub(methods[rank_method_code]["chars_alphanumericseparators"], " ", term))
            )
            for term in terms:
                if methods[rank_method_code].has_key("stemmer"):  # stem word
                    term = stem(string.replace(term, " ", ""), methods[rank_method_code]["stemmer"])
                if lwords_old[i] != term:  # add if stemmed word is different than original word
                    lwords.append((term, methods[rank_method_code]["rnkWORD_table"]))

    (recdict, rec_termcount, lrecIDs_remove) = ({}, {}, {})
    # For each term, if accepted, get a list of the records using the term
    # calculate then relevance for each term before sorting the list of records
    for (term, table) in lwords:
        term_recs = run_sql(
            """SELECT term, hitlist FROM %s WHERE term=%%s""" % methods[rank_method_code]["rnkWORD_table"], (term,)
        )
        if term_recs:  # if term exists in database, use for ranking
            term_recs = deserialize_via_marshal(term_recs[0][1])
            (recdict, rec_termcount) = calculate_record_relevance(
                (term, int(term_recs["Gi"][1])), term_recs, hitset, recdict, rec_termcount, verbose, quick=None
            )
            del term_recs

    if len(recdict) == 0 or (len(lwords) == 1 and lwords[0] == ""):
        return (
            None,
            "Records not ranked. The query is not detailed enough, or not enough records found, for ranking to be possible.",
            "",
            voutput,
        )
    else:  # sort if we got something to sort
        (reclist, hitset) = sort_record_relevance(recdict, rec_termcount, hitset, rank_limit_relevance, verbose)

    # Add any documents not ranked to the end of the list
    if hitset:
        lrecIDs = list(hitset)  # using 2-3mb
        reclist = zip(lrecIDs, [0] * len(lrecIDs)) + reclist  # using 6mb

    if verbose > 0:
        voutput += "<br />Current number of recIDs: %s<br />" % (methods[rank_method_code]["col_size"])
        voutput += (
            "Number of terms: %s<br />"
            % run_sql("SELECT count(id) FROM %s" % methods[rank_method_code]["rnkWORD_table"])[0][0]
        )
        voutput += "Terms: %s<br />" % lwords
        voutput += "Prepare and pre calculate time: %s<br />" % (str(time.time() - startCreate))
        voutput += "Total time used: %s<br />" % (str(time.time() - startCreate))
        voutput += str(reclist) + "<br />"
        rank_method_stat(rank_method_code, reclist, lwords)

    return (reclist, methods[rank_method_code]["prefix"], methods[rank_method_code]["postfix"], voutput)
            "<br />Running rank method: %s, using find_similar/word_frequency in bibrank_record_sorter<br />"
            % rank_method_code
        )
    rank_limit_relevance = methods[rank_method_code]["default_min_relevance"]

    try:
        recID = int(recID)
    except Exception, e:
        return (None, "Warning: Error in record ID, please check that a number is given.", "", voutput)

    rec_terms = run_sql(
        """SELECT termlist FROM %sR WHERE id_bibrec=%%s""" % methods[rank_method_code]["rnkWORD_table"][:-1], (recID,)
    )
    if not rec_terms:
        return (None, "Warning: Requested record does not seem to exist.", "", voutput)
    rec_terms = deserialize_via_marshal(rec_terms[0][0])

    # Get all documents using terms from the selected documents
    if len(rec_terms) == 0:
        return (None, "Warning: Record specified has no content indexed for use with this method.", "", voutput)
    else:
        terms = "%s" % rec_terms.keys()
        terms_recs = dict(
            run_sql(
                """SELECT term, hitlist FROM %s WHERE term IN (%s)"""
                % (methods[rank_method_code]["rnkWORD_table"], terms[1 : len(terms) - 1])
            )
        )

    tf_values = {}
    # Calculate all term frequencies
def word_similarity(rank_method_code, lwords, hitset, rank_limit_relevance, verbose, methods):
    """Ranking a records containing specified words and returns a sorted list.
    input:
    rank_method_code - the code of the method, from the name field in rnkMETHOD
    lwords - a list of words from the query
    hitset - a list of hits for the query found by search_engine
    rank_limit_relevance - show only records with a rank value above this
    verbose - verbose value
    output:
    reclist - a list of sorted records: [[23,34], [344,24], [1,01]]
    prefix - what to show before the rank value
    postfix - what to show after the rank value
    voutput - contains extra information, content dependent on verbose value"""

    voutput = ""
    startCreate = time.time()

    if verbose > 0:
        voutput += "<br />Running rank method: %s, using word_frequency function in bibrank_record_sorter<br />" % rank_method_code

    lwords_old = lwords
    lwords = []
    #Check terms, remove non alphanumeric characters. Use both unstemmed and stemmed version of all terms.
    for i in range(0, len(lwords_old)):
        term = string.lower(lwords_old[i])
        if not methods[rank_method_code]["stopwords"] == "True" or methods[rank_method_code]["stopwords"] and not is_stopword(term):
            lwords.append((term, methods[rank_method_code]["rnkWORD_table"]))
            terms = string.split(string.lower(re.sub(methods[rank_method_code]["chars_alphanumericseparators"], ' ', term)))
            for term in terms:
                if methods[rank_method_code].has_key("stemmer"): # stem word
                    term = stem(string.replace(term, ' ', ''), methods[rank_method_code]["stemmer"])
                if lwords_old[i] != term: #add if stemmed word is different than original word
                    lwords.append((term, methods[rank_method_code]["rnkWORD_table"]))

    (recdict, rec_termcount, lrecIDs_remove) = ({}, {}, {})
    #For each term, if accepted, get a list of the records using the term
    #calculate then relevance for each term before sorting the list of records
    for (term, table) in lwords:
        term_recs = run_sql("""SELECT term, hitlist FROM %s WHERE term=%%s""" % methods[rank_method_code]["rnkWORD_table"], (term,))
        if term_recs: #if term exists in database, use for ranking
            term_recs = deserialize_via_marshal(term_recs[0][1])
            (recdict, rec_termcount) = calculate_record_relevance((term, int(term_recs["Gi"][1])) , term_recs, hitset, recdict, rec_termcount, verbose, quick=None)
            del term_recs

    if len(recdict) == 0 or (len(lwords) == 1 and lwords[0] == ""):
        return (None, "Records not ranked. The query is not detailed enough, or not enough records found, for ranking to be possible.", "", voutput)
    else: #sort if we got something to sort
        (reclist, hitset) = sort_record_relevance(recdict, rec_termcount, hitset, rank_limit_relevance, verbose)

    #Add any documents not ranked to the end of the list
    if hitset:
        lrecIDs = list(hitset)                       #using 2-3mb
        reclist = zip(lrecIDs, [0] * len(lrecIDs)) + reclist      #using 6mb

    if verbose > 0:
        voutput += "<br />Current number of recIDs: %s<br />" % (methods[rank_method_code]["col_size"])
        voutput += "Number of terms: %s<br />" % run_sql("SELECT count(id) FROM %s" % methods[rank_method_code]["rnkWORD_table"])[0][0]
        voutput += "Terms: %s<br />" % lwords
        voutput += "Prepare and pre calculate time: %s<br />" % (str(time.time() - startCreate))
        voutput += "Total time used: %s<br />" % (str(time.time() - startCreate))
        voutput += str(reclist) + "<br />"
        rank_method_stat(rank_method_code, reclist, lwords)

    return (reclist, methods[rank_method_code]["prefix"], methods[rank_method_code]["postfix"], voutput)