Python HitSet Examples, invenio.search_engine.HitSet Python Examples

Example #1

0

Show file

File: websearch_webcoll.py Project: metandrey/invenio-metandrey

 def __init__(self, name=""):
     """Create a collection instance from the 'collection' DB table.

     With an empty 'name' the instance describes the home collection
     (name CFG_SITE_NAME, id 1) and no query is run.  Otherwise the row
     whose name equals 'name' is looked up; when no such row exists the
     id, dbquery and nbrecs attributes are None and reclist is empty.

     A database error (Error) is reported on standard output and the
     process exits with status 1.
     """
     # flags used to speed things up without much refactoring
     self.calculate_reclist_run_already = 0
     self.update_reclist_run_already = 0
     self.reclist_with_nonpublic_subcolls = HitSet()
     # used to store the temporary result of the calculation of nbrecs of an external collection
     self.nbrecs_tmp = None
     if not name:
         self.name = CFG_SITE_NAME # by default we are working on the home page
         self.id = 1
         self.dbquery = None
         self.nbrecs = None
         self.reclist = HitSet()
     else:
         self.name = name
         try:
             res = run_sql("""SELECT id,name,dbquery,nbrecs,reclist FROM collection
                               WHERE name=%s""", (name,))
             if res:
                 self.id = res[0][0]
                 self.name = res[0][1]
                 self.dbquery = res[0][2]
                 self.nbrecs = res[0][3]
                 try:
                     self.reclist = HitSet(res[0][4])
                 except Exception:
                     # Stored reclist could not be turned into a HitSet:
                     # fall back to an empty set.  Deliberately not a bare
                     # 'except:' so that interpreter-exit requests are not
                     # swallowed here.
                     self.reclist = HitSet()
             else: # collection does not exist!
                 self.id = None
                 self.dbquery = None
                 self.nbrecs = None
                 self.reclist = HitSet()
         except Error:
             # sys.exc_info() instead of 'except Error, e' / 'as e' so the
             # handler parses on every Python version.
             e = sys.exc_info()[1]
             print("Error %d: %s" % (e.args[0], e.args[1]))
             sys.exit(1)

Example #2

0

Show file

File: bibrank_record_sorter_tests.py Project: pombredanne/invenio

 def test_record_sorter(self):
     """bibrank record sorter - sorting records

     Passes fixed per-record scores, term counts and a candidate HitSet
     to sort_record_relevance and compares both returned values with
     hard-coded expectations.
     """
     candidates = HitSet()
     candidates += (1, 2, 5)
     expected_second = HitSet()
     expected_second.add(5)
     term_counts = {1: 1, 2: 1, 5: 1}
     scores = {1: 50, 2: 30, 3: 70, 4: 10}
     outcome = bibrank_record_sorter.sort_record_relevance(
         scores, term_counts, candidates, 50, 0)
     (first_part, second_part) = outcome
     expected = ([(1, 71), (3, 100)], list(expected_second))
     self.assertEqual(expected, (first_part, list(second_part)))

Example #3

0

Show file

File: bibrank_tag_based_indexer.py Project: lbjay/cds-invenio

def single_tag_rank(config):
    """Connect the given tag with the data from the kb file given.

    'config' must offer get(section, option); the section used is the one
    named by the ("rank_method", "function") option.  From that section
    the options "kb_src" (path of the knowledge-base file), "tag" and
    "check_mandatory_tags" (", "-separated list, may be empty) are read.

    Knowledge-base lines that do not start with "#" are split on "---":
    the first field (stripped) is the key and the second field the value.
    A line without "---" raises IndexError.

    Returns a dict mapping id_bibrec to float(kb value) for records whose
    tag value is in the knowledge base, and to 0 for records whose
    (latest seen) tag value is not.  Only records in options["validset"]
    are kept.
    """
    write_message("Loading knowledgebase file", verbose=9)
    method_section = config.get("rank_method", "function")
    kb_src = config.get(method_section, "kb_src")

    write_message("Reading knowledgebase file: %s" % kb_src)
    kb_data = {}
    kb_file = open(kb_src, 'r')
    try:
        # try/finally rather than 'with' so the handle is always closed
        # without requiring a newer Python than the rest of the file.
        for line in kb_file:
            if line[0:1] == "#":
                continue
            fields = line.strip().split("---")
            kb_data[fields[0].strip()] = fields[1]
    finally:
        kb_file.close()
    write_message("Number of lines read from knowledgebase file: %s" % len(kb_data))

    tag = config.get(method_section, "tag")
    tags = config.get(method_section, "check_mandatory_tags").split(", ")
    if tags == ['']:
        # no mandatory tags configured
        tags = []

    # the first two characters of the tag select the bibXXx tables
    table_prefix = tag[0:2]
    records = []
    for (recids, recide) in options["recid_range"]:
        task_sleep_now_if_required(can_stop_too=True)
        write_message("......Processing records #%s-%s" % (recids, recide))
        recs = run_sql("SELECT id_bibrec, value FROM bib%sx, bibrec_bib%sx WHERE tag=%%s AND id_bibxxx=id and id_bibrec >=%%s and id_bibrec<=%%s" % (table_prefix, table_prefix), (tag, recids, recide))
        if tags:
            valid = HitSet(trailing_bits=1)
            valid.discard(0)
            for key in tags:
                # NOTE(review): the mandatory tag 'key' is searched in the
                # tables selected by 'tag', not by 'key' -- confirm that
                # this is intended.
                newset = HitSet()
                newset += [recid[0] for recid in (run_sql("SELECT id_bibrec FROM bib%sx, bibrec_bib%sx WHERE id_bibxxx=id AND tag=%%s AND id_bibxxx=id and id_bibrec >=%%s and id_bibrec<=%%s" % (table_prefix, table_prefix), (key, recids, recide)))]
                valid.intersection_update(newset)
            recs = [rec for rec in recs if rec[0] in valid]
        records.extend(recs)
        write_message("Number of records found with the necessary tags: %s" % len(records))

    records = [rec for rec in records if rec[0] in options["validset"]]
    rnkset = {}
    for key, value in records:
        if value not in kb_data:
            rnkset[key] = 0
        elif key not in rnkset:
            rnkset[key] = float(kb_data[value])
        # NOTE(review): the previous code had a further branch meant to
        # replace an existing rank by a larger one, but its guard looked
        # up the stored number among the string keys of kb_data and so
        # never fired.  The effective behaviour (an existing rank is only
        # ever reset to 0, never raised) is kept; confirm whether "keep
        # the maximum" was the intent.

    write_message("Number of records available in rank method: %s" % len(rnkset))
    return rnkset

Example #4

0

Show file

 def test_record_sorter(self):
     """bibrank record sorter - sorting records

     Calls sort_record_relevance with a fixed score dictionary, term
     counts and a HitSet of 1, 2 and 5, then checks the returned pair
     against literal expected values.
     """
     input_hits = HitSet()
     input_hits += (1, 2, 5)
     only_five = HitSet()
     only_five.add(5)
     score_by_record = {1: 50, 2: 30, 3: 70, 4: 10}
     count_by_record = {1: 1, 2: 1, 5: 1}
     (got_first, got_second) = bibrank_record_sorter.sort_record_relevance(
         score_by_record, count_by_record, input_hits, 50, 0)
     wanted = ([(1, 71), (3, 100)], list(only_five))
     actual = (got_first, list(got_second))
     self.assertEqual(wanted, actual)

Example #5

0

Show file

File: bibrank_tag_based_indexer.py Project: epfl-si/invenio-infoscience

def get_valid_range(rank_method_code):
    """Return a HitSet built from the collections of a rank method.

    The names of the collections linked to the rank method called
    'rank_method_code' are read from the database.  When there is at
    least one, perform_request_search(c=<names>) is called and its result
    is added to the returned HitSet; otherwise the HitSet stays empty.
    """
    write_message("Getting records from collections enabled for rank method.", verbose=9)

    rows = run_sql("SELECT collection.name FROM collection, collection_rnkMETHOD, rnkMETHOD WHERE collection.id=id_collection and id_rnkMETHOD=rnkMETHOD.id and rnkMETHOD.name=%s",  (rank_method_code, ))
    collection_names = [row[0] for row in rows]
    if collection_names:
        found = perform_request_search(c=collection_names)
    else:
        found = []
    valid = HitSet()
    valid += found
    return valid

Example #6

0

Show file

File: bibrank_tag_based_indexer.py Project: epfl-si/invenio-infoscience

def single_tag_rank(config):
    """Connect the given tag with the data from the kb file given.

    'config' must offer get(section, option); the section used is the one
    named by the ("rank_method", "function") option.  From that section
    the options "kb_src" (path of the knowledge-base file), "tag" and
    "check_mandatory_tags" (", "-separated list, may be empty) are read.

    Knowledge-base lines that do not start with "#" are split on "---":
    the first field (stripped) is the key and the second field the value.
    A line without "---" raises IndexError.

    Returns a dict mapping id_bibrec to float(kb value) for records whose
    tag value is in the knowledge base, and to 0 for records whose
    (latest seen) tag value is not.  Only records in options["validset"]
    are kept.
    """
    write_message("Loading knowledgebase file", verbose=9)
    method_section = config.get("rank_method", "function")
    kb_src = config.get(method_section, "kb_src")

    write_message("Reading knowledgebase file: %s" % kb_src)
    kb_data = {}
    kb_file = open(kb_src, 'r')
    try:
        # try/finally rather than 'with' so the handle is always closed
        # without requiring a newer Python than the rest of the file.
        for line in kb_file:
            if line[0:1] == "#":
                continue
            fields = line.strip().split("---")
            kb_data[fields[0].strip()] = fields[1]
    finally:
        kb_file.close()
    write_message("Number of lines read from knowledgebase file: %s" % len(kb_data))

    tag = config.get(method_section, "tag")
    tags = config.get(method_section, "check_mandatory_tags").split(", ")
    if tags == ['']:
        # no mandatory tags configured
        tags = []

    # the first two characters of the tag select the bibXXx tables
    table_prefix = tag[0:2]
    records = []
    for (recids, recide) in options["recid_range"]:
        task_sleep_now_if_required(can_stop_too=True)
        write_message("......Processing records #%s-%s" % (recids, recide))
        recs = run_sql("SELECT id_bibrec, value FROM bib%sx, bibrec_bib%sx WHERE tag=%%s AND id_bibxxx=id and id_bibrec >=%%s and id_bibrec<=%%s" % (table_prefix, table_prefix), (tag, recids, recide))
        if tags:
            valid = HitSet(trailing_bits=1)
            valid.discard(0)
            for key in tags:
                # NOTE(review): the mandatory tag 'key' is searched in the
                # tables selected by 'tag', not by 'key' -- confirm that
                # this is intended.
                newset = HitSet()
                newset += [recid[0] for recid in (run_sql("SELECT id_bibrec FROM bib%sx, bibrec_bib%sx WHERE id_bibxxx=id AND tag=%%s AND id_bibxxx=id and id_bibrec >=%%s and id_bibrec<=%%s" % (table_prefix, table_prefix), (key, recids, recide)))]
                valid.intersection_update(newset)
            recs = [rec for rec in recs if rec[0] in valid]
        records.extend(recs)
        write_message("Number of records found with the necessary tags: %s" % len(records))

    records = [rec for rec in records if rec[0] in options["validset"]]
    rnkset = {}
    for key, value in records:
        if value not in kb_data:
            rnkset[key] = 0
        elif key not in rnkset:
            rnkset[key] = float(kb_data[value])
        # NOTE(review): the previous code had a further branch meant to
        # replace an existing rank by a larger one, but its guard looked
        # up the stored number among the string keys of kb_data and so
        # never fired.  The effective behaviour (an existing rank is only
        # ever reset to 0, never raised) is kept; confirm whether "keep
        # the maximum" was the intent.

    write_message("Number of records available in rank method: %s" % len(rnkset))
    return rnkset

Example #7

0

Show file

 def test_calculate_record_relevance(self):
     """bibrank record sorter - calculating relevances

     Calls calculate_record_relevance with a fixed term tuple, a fixed
     per-record dictionary and a HitSet of 1, 2 and 5, and compares the
     result with a literal pair of dictionaries.
     """
     hits = HitSet()
     hits += (1, 2, 5)
     wanted = ({1: 7, 2: 7, 5: 5}, {1: 1, 2: 1, 5: 1})
     term = ("testterm", 2.0)
     term_data = {
         "Gi": (0, 50.0),
         1: (3, 4.0),
         2: (4, 5.0),
         5: (1, 3.5),
     }
     actual = bibrank_record_sorter.calculate_record_relevance(
         term, term_data, hits, {}, {}, 0, None)
     self.assertEqual(wanted, actual)

Example #8

0

Show file

File: websearch_webcoll.py Project: metandrey/invenio-metandrey

 def calculate_reclist(self):
     """Compute, store and return (reclist, reclist_with_nonpublic_subcolls).

     The stored pair is returned untouched when the calculation already
     ran for this instance or when dbquery starts with
     "hostedcollection:".  Otherwise both sets are recomputed, nbrecs is
     set to the size of the second set, and the run-already flag is set.
     """
     if self.calculate_reclist_run_already:
         # already computed for this instance
         return (self.reclist, self.reclist_with_nonpublic_subcolls)
     if str(self.dbquery).startswith("hostedcollection:"):
         # hosted collections are not recalculated here
         return (self.reclist, self.reclist_with_nonpublic_subcolls)
     write_message("... calculating reclist of %s" % self.name, verbose=6)
     if self.dbquery:
         # The collection has its own dbquery: run it, explicitly
         # excluding DELETED (and, on CERN sites, DUMMY) records.
         if CFG_CERN_SITE:
             exclusions = ' -980__:"DELETED" -980__:"DUMMY"'
         else:
             exclusions = ' -980__:"DELETED"'
         public_hits = search_pattern(None, self.dbquery + exclusions)
         all_hits = copy.deepcopy(public_hits)
     else:
         # No dbquery: merge the results of the sons, recursively.
         public_hits = HitSet() # public sons only; good for storing into DB
         all_hits = HitSet() # public and nonpublic sons; good for the total count
         for son in self.get_sons():
             son_public, son_all = son.calculate_reclist()
             # a son counts "for real" only when it is non-restricted or
             # shares this collection's restriction
             counts_as_public = ((son.restricted_p() is None) or
                                 (son.restricted_p() == self.restricted_p()))
             if counts_as_public:
                 public_hits.union_update(son_public)
             all_hits.union_update(son_all)
     # remember the outcome and mark the calculation as done
     self.nbrecs = len(all_hits)
     self.reclist = public_hits
     self.reclist_with_nonpublic_subcolls = all_hits
     self.calculate_reclist_run_already = 1
     return (self.reclist, self.reclist_with_nonpublic_subcolls)

Example #9

0

Show file

File: websearch_webcoll.py Project: lbjay/cds-invenio

 def calculate_reclist(self):
     """Compute, store and return (reclist, reclist_with_nonpublic_subcolls).

     Nothing is recomputed when the calculation already ran for this
     instance or when dbquery starts with "hostedcollection:"; the stored
     pair is returned as is.  Otherwise both sets are rebuilt, nbrecs is
     set to the size of the second set, and the run-already flag is set.
     """
     if self.calculate_reclist_run_already:
         # already computed for this instance
         return (self.reclist, self.reclist_with_nonpublic_subcolls)
     if str(self.dbquery).startswith("hostedcollection:"):
         # hosted collections are not recalculated here
         return (self.reclist, self.reclist_with_nonpublic_subcolls)
     write_message("... calculating reclist of %s" % self.name, verbose=6)
     if self.dbquery:
         # Own dbquery: run it, explicitly leaving out DELETED (and, on
         # CERN sites, DUMMY) records.
         if CFG_CERN_SITE:
             exclusions = ' -collection:"DELETED" -collection:"DUMMY"'
         else:
             exclusions = ' -collection:"DELETED"'
         public_hits = search_pattern(None, self.dbquery + exclusions)
         all_hits = copy.deepcopy(public_hits)
     else:
         # No dbquery: gather the results of the sons, recursively.
         public_hits = HitSet() # public sons only; good for storing into DB
         all_hits = HitSet() # public and nonpublic sons; good for the total count
         for son in self.get_sons():
             son_public, son_all = son.calculate_reclist()
             # a son counts "for real" only when it is non-restricted or
             # shares this collection's restriction
             counts_as_public = ((son.restricted_p() is None) or
                                 (son.restricted_p() == self.restricted_p()))
             if counts_as_public:
                 public_hits.union_update(son_public)
             all_hits.union_update(son_all)
     # remember the outcome and mark the calculation as done
     self.nbrecs = len(all_hits)
     self.reclist = public_hits
     self.reclist_with_nonpublic_subcolls = all_hits
     self.calculate_reclist_run_already = 1
     return (self.reclist, self.reclist_with_nonpublic_subcolls)