Ejemplo n.º 1
0
def eprints():
    total = 0
    fermilab = get_collection_reclist('Fermilab')
    print '{0:4s} {1:3s} {2:3s} {3:3s}'.format('Date', 'All', 'FNA', '%')
    date_range = ['1904', '1905', '1906']
    #date_range = range(1, 20)
    for yymm in date_range:
        yymm = str(yymm)
        if len(yymm) == 1:
            yymm = '0' + yymm
        search_f = '037__a:fermilab* 037__c:physics.acc-ph 037__a:"arXiv:' + \
                   yymm + '*"'
        search = '037__c:physics.acc-ph 037__a:"arXiv:' + yymm + '*"'
        x = perform_request_search(p=search, cc='HEP')
        search = '037__c:acc-phys 037__a:"acc-phys/' + yymm + '*"'
        y = perform_request_search(p=search, cc='HEP')
        x_f = intbitset(x) & fermilab
        y_f = intbitset(y) & fermilab
        length = len(x) + len(y)
        length_f = len(x_f) + len(y_f)
        try:
            ratio = float(length_f) / float(length) * 100.0
        except ZeroDivisionError:
            ratio = 0
        print '{0:4s} {1:3d} {2:3d} {3:3f}'.format(yymm, length, length_f,
                                                   ratio)
        total += length
    print "Total =", total
Ejemplo n.º 2
0
    def tokenize_for_phrases(self, recID):
        """Get the country names and country codes of the institutions
           affiliated with the authors of the publication
        """
        # Collect every affiliation value stored under the institution tags.
        institution_names = []
        for tag in self.institution_tags:
            institution_names.extend(get_fieldvalues(recID, tag))

        # Build the union of all known institution-collection hitsets.
        institution_collection_hitset = intbitset([])
        for collection in CFG_WEBSEARCH_INSTITUTION_COLLECTIONS:
            institution_collection_hitset += get_collection_reclist(collection)

        # Resolve each non-blank institution name to institution record ids,
        # keeping only hits that belong to an institution collection.
        institution_ids = intbitset([])
        for name in institution_names:
            if not name.strip():
                continue
            result_hitset = search_pattern(p=name,
                                           f=self.institution_name_field)
            institution_ids += list(result_hitset & institution_collection_hitset)

        # Tokenize each institution by country name and country code.
        tokens = []
        for instID in institution_ids:
            tokens.extend(self._tokenize_from_country_name_tag(instID))
            tokens.extend(self._tokenize_from_country_code_tag(instID))

        # Drop duplicate tokens before returning.
        return list(set(tokens))
def get_normalized_ranking_scores(response, hitset_filter=None, recids=()):
    """
    Return ``(ranked_result, matched_recs)`` with ranking scores
    normalized to the interval [0, 100].

    hitset_filter - optional filter for the results
    recids - optional recids that shall remain in the result despite the
             filter (default is an immutable empty tuple; the previous
             ``recids=[]`` was a shared mutable default argument)
    """
    if not len(response.results):
        return ([], intbitset())

    # response.maxScore does not work in case of something was added to the response
    max_score = float(response.results[0]['score'])
    ranked_result = []
    matched_recs = intbitset()

    for hit in response.results:
        recid = int(hit['id'])

        # Keep the hit when no filter is in effect, or when the filter or
        # the explicit recid list admits it.
        if (not hitset_filter and hitset_filter != []) or recid in hitset_filter or recid in recids:
            normalised_score = 0
            if max_score > 0:
                normalised_score = int(100.0 / max_score * float(hit['score']))
            ranked_result.append((recid, normalised_score))
            matched_recs.add(recid)

    # Reverse so the ordering of the hits is inverted for the caller.
    ranked_result.reverse()

    return (ranked_result, matched_recs)
    def tmpl_citations_box(self, summarize_records, pubs, ln, add_box=True, loading=False):
        """Render the 'Citations' box, or its loading placeholder."""
        _ = gettext_set_language(ln)
        addition = ' (from papers in INSPIRE)' if CFG_INSPIRE_SITE else ''
        line1 = "<strong>" + _("Citations%s" % addition) + "</strong>"
        if loading:
            line2 = self.loading_html()
        else:
            summarize_records, rec_query = summarize_records
            # Re-hydrate the per-key record lists into intbitsets.
            for key in summarize_records[0].keys():
                summarize_records[0][key] = intbitset(summarize_records[0][key])

            str_buffer = StringIO()
            render_citation_summary(str_buffer, ln, intbitset(pubs),
                                    stats=summarize_records,
                                    searchpattern=rec_query,
                                    searchfield="")
            str_buffer.write(websearch_templates.tmpl_citesummary_footer())
            line2 = ('<span style="white-space: nowrap;">'
                     + str_buffer.getvalue() + "</span>")
        if not add_box:
            return line2
        return self.tmpl_print_searchresultbox('citations', line1, line2)
def generate_list_to_send(search):
    '''
    Generate a list to send to MSNET.
    '''

    filename = 'tmp_' + __file__
    filename = re.sub('.py', '_send.txt', filename)
    output = open(filename,'w')

    recids_nomatch = find_recids_nomatch()

    print search
    result_m = perform_request_search(p=search, cc='HEP')
    print search, len(result_m)
    search = "035__9:msnet"
    result_i = perform_request_search(p=search, cc='HEP')
    search = "0247_2:doi"
    result_d = perform_request_search(p=search, cc='HEP')
    result = intbitset(result_m) & intbitset(result_d) - intbitset(result_i)
    result = result - intbitset(recids_nomatch)
    for recid in result:
        try:
            doi = get_fieldvalues(recid, '0247_a')[0]
        except IndexError:
            print 'Problem with:', recid, doi
            break
        output.write(str(recid) + ',' + doi + '\n')
    output.close()
    print filename
Ejemplo n.º 6
0
def filter_out_based_on_date_range(recids, fromdate="", untildate="", set_spec=None):
    """ Filter out recids based on date range."""
    # Normalize the lower bound, defaulting to the earliest known datestamp.
    if fromdate:
        fromdate = normalize_date(fromdate, "T00:00:00Z")
    else:
        fromdate = get_earliest_datestamp()
    fromdate = utc_to_localtime(fromdate)

    # Normalize the upper bound, defaulting to the latest known datestamp.
    if untildate:
        untildate = normalize_date(untildate, "T23:59:59Z")
    else:
        untildate = get_latest_datestamp()
    untildate = utc_to_localtime(untildate)

    if set_spec is not None: ## either it has a value or it empty, thus meaning all records
        last_updated = get_set_last_update(set_spec)
        if last_updated is not None:
            last_updated = utc_to_localtime(last_updated)
            # If the set was updated after the requested start date, widen
            # the window back to the earliest datestamp — presumably so the
            # whole set is re-exposed after a definition change; TODO confirm.
            if last_updated > fromdate:
                fromdate = utc_to_localtime(get_earliest_datestamp())

    recids = intbitset(recids) ## Let's clone :-)

    # Intersect with the records modified inside the requested window;
    # either bound may be the only one present.
    if fromdate and untildate:
        recids &= intbitset(run_sql("SELECT id FROM bibrec WHERE modification_date BETWEEN %s AND %s", (fromdate, untildate)))
    elif fromdate:
        recids &= intbitset(run_sql("SELECT id FROM bibrec WHERE modification_date >= %s", (fromdate, )))
    elif untildate:
        recids &= intbitset(run_sql("SELECT id FROM bibrec WHERE modification_date <= %s", (untildate, )))
    # Restricted records are never exposed.
    return recids - get_all_restricted_recids()
    def test_get_ranked_larger_hitset(self):
        """solrutils - ranking larger hitset"""
        # A hitset with no matching records ranks to an empty tuple.
        no_match = intbitset.intbitset([47, 56, 58, 68, 85, 89])
        ranked = self._get_ranked_result_sequence(query='Willnotfind', hitset=no_match)
        self.assertEqual(tuple(), ranked)

        # When matching records (55, 56) are present, only they come back.
        with_match = intbitset.intbitset([47, 56, 55, 56, 58, 68, 85, 89])
        ranked = self._get_ranked_result_sequence(query='"higgs boson"', hitset=with_match)
        self.assertEqual((55, 56), ranked)
Ejemplo n.º 8
0
def get_data_for_definition_marc(tags, recids):
    '''Having a list of tags and a list of recids, it returns a dictionary
    with the values correspondig to the tags'''
    #x = all_recids; [get_fieldvalues(recid, '037__a') for recid in x]
    #user: 140s, sys: 21s, total: 160s - cdsdev
    # Accept a single recid as well as a collection of recids.
    if isinstance(recids, (int, long)):
        recids = intbitset([recids, ])
    # for each recid we need only one value
    #on which we sort, so we can stop looking for a value
    # as soon as we find one
    tag_index = 0
    field_data_dict = {}
    # Try each tag in order, querying only the recids that still lack a value.
    while len(recids) > 0 and tag_index < len(tags):
        write_message('%s records queried for values for tags %s.' \
                      %(len(recids), tags), verbose=5)
        res = _get_values_from_marc_tag(tags[tag_index], recids)
        res_dict = dict(res)
        #field_data_dict.update(res_dict)
        #we can not use this, because res_dict might contain recids
        #that are already in field_data_dict, and we should not overwrite their value
        # dict(res_dict, **field_data_dict) keeps existing entries: values
        # already in field_data_dict win over the newly fetched ones.
        field_data_dict = dict(res_dict, **field_data_dict)
        #there might be keys that we do not want (ex: using 'between')
        #so we should remove them
        res_dict_keys = intbitset(res_dict.keys())
        recids_not_needed = res_dict_keys.difference(recids)
        for recid in recids_not_needed:
            del field_data_dict[recid]
        #update the recids to contain only the recid that do not have values yet
        recids.difference_update(res_dict_keys)
        tag_index += 1
    return field_data_dict
Ejemplo n.º 9
0
 def test_set_consistence(self):
     """intbitset - set consistence"""
     # Each triple is (members, expected fastdump() of the plain set,
     # expected fastdump() of the same members with trailing_bits=True).
     tests = (
         (
             (20, 30, 1000, 40),
             'x\x9cc`\x10p``d\x18\x18\x80d/\x00*\xb6\x00S',
             'x\x9cc`\x10p`\x18(\xf0\x1f\x01\x00k\xe6\x0bF'
         ),
         (
             (20, 30, 1000, 41),
             'x\x9cc`\x10p``b\x18\x18\xc0\x88`\x02\x00+9\x00T',
             'x\x9cc`\x10p`\x18(\xf0\x1f\x01\x00k\xe6\x0bF'
         ),
         (
             (20, 30, 1001, 41),
             'x\x9cc`\x10p``b\x18\x18\x80d/\x00+D\x00U',
             'x\x9cc`\x10p`\x18(\xf0\xef?\x1c\x00\x00k\xdb\x0bE'
         )
     )
     for original, dumped, dumped_trails in tests:
         # Build the set four ways: from the members, from the members
         # with trailing bits, and from each serialized form.
         intbitset1 = intbitset(original)
         intbitset2 = intbitset(original, trailing_bits=True)
         intbitset3 = intbitset(dumped)
         intbitset4 = intbitset(dumped_trails)
         self._helper_sanity_test(intbitset1)
         self._helper_sanity_test(intbitset2)
         self._helper_sanity_test(intbitset3)
         self._helper_sanity_test(intbitset4)
         # Serialization must round-trip: dumps match the expected bytes
         # and sets rebuilt from dumps equal the originals.
         self.assertEqual(intbitset1.fastdump(), dumped)
         self.assertEqual(intbitset1, intbitset3)
         self.assertEqual(intbitset2.fastdump(), dumped_trails)
         self.assertEqual(intbitset2, intbitset4)
def get_normalized_ranking_scores(response, hitset_filter=None, recids=()):
    """
    Return ``(ranked_result, matched_recs)`` with ranking scores
    normalized to the interval [0, 100].

    hitset_filter - optional filter for the results
    recids - optional recids that shall remain in the result despite the
             filter (default is an immutable empty tuple; the previous
             ``recids=[]`` was a shared mutable default argument)
    """
    if not len(response.results):
        return ([], intbitset())

    # response.maxScore does not work in case of something was added to the response
    max_score = float(response.results[0]['score'])
    ranked_result = []
    matched_recs = intbitset()

    for hit in response.results:
        recid = int(hit['id'])

        # Keep the hit when no filter is in effect, or when the filter or
        # the explicit recid list admits it.
        if (not hitset_filter and hitset_filter != []) or recid in hitset_filter or recid in recids:
            normalised_score = 0
            if max_score > 0:
                normalised_score = int(100.0 / max_score * float(hit['score']))
            ranked_result.append((recid, normalised_score))
            matched_recs.add(recid)

    # Reverse so the ordering of the hits is inverted for the caller.
    ranked_result.reverse()

    return (ranked_result, matched_recs)
Ejemplo n.º 11
0
 def test_pickling(self):
     """intbitset - pickling"""
     import cPickle
     # Round-trip every sample set (plus the empty set) through the
     # highest pickle protocol and require equality afterwards.
     for set1 in self.sets + [[]]:
         self.assertEqual(intbitset(set1), cPickle.loads(cPickle.dumps(intbitset(set1), -1)))
     # Same round-trip for the trailing_bits=True variant.
     for set1 in self.sets + [[]]:
         self.assertEqual(intbitset(set1, trailing_bits=True), cPickle.loads(cPickle.dumps(intbitset(set1, trailing_bits=True), -1)))
Ejemplo n.º 12
0
def query_records(params):
    """Produce record IDs from given query parameters.

    By passing the appropriate CLI options, we can query here for
    additional records.
    """
    write_message("Querying database (records query)...")
    res = intbitset()
    has_query = params['field'] or params['collection'] or params['pattern']
    if has_query:
        if params['collection']:
            # use perform_request_search when '-c' argument has been
            # defined, as it is not supported by search_pattern()
            res = intbitset(
                perform_request_search(req=None,
                                       of='id',
                                       c=params['collection'],
                                       p=params['pattern'],
                                       f=params['field']))
        else:
            # use search_pattern() whenever possible, as it can search
            # even in private collections
            res = search_pattern(p=params['pattern'],
                                 f=params['field'],
                                 m=params['matching'])
    return res
Ejemplo n.º 13
0
def process_affiliations(record_ids=None, all_records=False):
    # Task name used for the last-updated bookkeeping.
    name = 'affiliations'

    # Select which records to (re)process:
    if all_records:
        # every record in the system; remember when this run started
        records = intbitset(run_sql("SELECT id FROM bibrec"))
        start_time = datetime.now()
    elif record_ids:
        # an explicit list of records; no last-updated bookkeeping
        records = intbitset(record_ids)
        start_time = None
    else:
        # incremental mode: records modified since the last stored run
        dummy_last_recid, last_updated = fetch_last_updated(name)
        start_time = datetime.now()
        sql = """SELECT `id` FROM `bibrec`
                 WHERE `modification_date` >= %s
                 AND `modification_date` <= %s
                 ORDER BY `modification_date`"""
        records = intbitset(run_sql(sql, [last_updated.isoformat(), start_time.isoformat()]))

    records_iter = iter(records)
    processed_records_count = 0
    # Process in fixed-size chunks so progress can be reported and the
    # task can yield between chunks when the scheduler requires it.
    while True:
        task_sleep_now_if_required()
        chunk = list(islice(records_iter, CHUNK_SIZE))
        if not chunk:
            break
        process_and_store(chunk)
        processed_records_count += len(chunk)
        task_update_progress('processed %s out of %s records' % (processed_records_count, len(records)))
    # Persist the new high-water mark (skipped for explicit record lists).
    if start_time:
        store_last_updated(None, start_time, name)
Ejemplo n.º 14
0
 def test_pickling(self):
     """intbitset - pickling"""
     import cPickle
     # Round-trip every sample set (plus the empty set) through the
     # highest pickle protocol and require equality afterwards.
     for set1 in self.sets + [[]]:
         self.assertEqual(intbitset(set1), cPickle.loads(cPickle.dumps(intbitset(set1), -1)))
     # Same round-trip for the trailing_bits=True variant.
     for set1 in self.sets + [[]]:
         self.assertEqual(intbitset(set1, trailing_bits=True), cPickle.loads(cPickle.dumps(intbitset(set1, trailing_bits=True), -1)))
Ejemplo n.º 15
0
    def test_get_ranked_larger_hitset(self):
        """solrutils - ranking larger hitset"""
        # A query with no matches in the hitset ranks to an empty tuple.
        hitset = intbitset.intbitset([47, 56, 58, 68, 85, 89])
        self.assertEqual(tuple(), self._get_ranked_result_sequence(query='Willnotfind', hitset=hitset))

        # Records 55 and 56 match the phrase query; only they are returned.
        hitset = intbitset.intbitset([47, 56, 55, 56, 58, 68, 85, 89])
        self.assertEqual((55, 56), self._get_ranked_result_sequence(query='"higgs boson"', hitset=hitset))
Ejemplo n.º 16
0
def query_records(params):
    """Produce record IDs from given query parameters.

    By passing the appropriate CLI options, we can query here for additional
    records.
    """
    write_message("Querying database (records query)...")
    res = intbitset()
    if params['field'] or params['collection'] or params['pattern']:

        if not params['collection']:
            # use search_pattern() whenever possible, as it can search
            # even in private collections
            res = search_pattern(p=params['pattern'],
                                 f=params['field'],
                                 m=params['matching'])
        else:
            # use perform_request_search when '-c' argument has been
            # defined, as it is not supported by search_pattern()
            res = intbitset(perform_request_search(req=None,
                                                   of='id',
                                                   c=params['collection'],
                                                   p=params['pattern'],
                                                   f=params['field']))
    return res
Ejemplo n.º 17
0
def get_recids_for_set_spec(set_spec):
    """
    Returns the list (as intbitset) of recids belonging to 'set'

    Parameters:

      set_spec - *str* the set_spec for which we would like to get the
                 recids
    """
    recids = intbitset()

    # A set spec may have several definitions; the result is their union.
    for set_def in get_set_definitions(set_spec):
        # Assemble the search arguments from the set definition fields.
        query_args = {
            'c': [coll.strip() for coll in set_def['c'].split(',')],
            'ap': 0,
        }
        for suffix in ('1', '2', '3'):
            for prefix in ('p', 'f', 'm'):
                key = prefix + suffix
                query_args[key] = set_def[key]
        for op_key in ('op1', 'op2'):
            query_args[op_key] = set_def[op_key]

        recids |= intbitset(perform_request_search(**query_args))

    return recids
Ejemplo n.º 18
0
def missing_caches(fmt, chunk_size=100000):
    """Produce record IDs to be formatted, because their fmt cache is missing.

    @param fmt: format to query for
    @param chunk_size: retained for interface compatibility; unused by the
        subquery-based implementation
    @return: intbitset of record IDs without a pre-created format cache
    """
    write_message("Querying database for records without cache...")

    # https://mariadb.com/kb/en/library/subqueries-and-joins/
    # "select id from bibrec left join bibfmt on id=id_bibrec where id_bibrec is NULL" is slow
    # subquery is a lot faster here
    #
    # NOTE: the original function carried an older chunked implementation
    # after this return statement; that code was unreachable and has been
    # removed.
    return intbitset(
        run_sql(
            'select id from bibrec where id not in (select id_bibrec from bibfmt where format=%s)',
            (fmt, )))
Ejemplo n.º 19
0
    def setUp(self):
        # Sample member lists chosen to exercise machine-word boundaries
        # (30-33, 62-65, 126-129) and sparse/dense mixtures.
        self.sets = [[1024], [10, 20], [10, 40], [60, 70], [60, 80],
                     [10, 20, 60, 70], [10, 40, 60, 80], [1000], [10000],
                     [23, 45, 67, 89, 110, 130, 174, 1002, 2132, 23434],
                     [700, 2000],
                     range(1000, 1100), [30], [31], [32], [33], [62], [63],
                     [64], [65], [126], [127], [128], [129]]
        # Tuples of (intbitset op, reference set op, reference int op,
        # whether the operation is in-place).
        self.fncs_list = [
            (intbitset.__and__, set.__and__, int.__and__, False),
            (intbitset.__or__, set.__or__, int.__or__, False),
            (intbitset.__xor__, set.__xor__, int.__xor__, False),
            (intbitset.__sub__, set.__sub__, int.__sub__, False),
            (intbitset.__iand__, set.__iand__, int.__and__, True),
            (intbitset.__ior__, set.__ior__, int.__or__, True),
            (intbitset.__ixor__, set.__ixor__, int.__xor__, True),
            (intbitset.__isub__, set.__isub__, int.__sub__, True),
        ]

        # Tuples of (intbitset comparison, reference set comparison,
        # reference comparison expressed via cmp()).
        self.cmp_list = [
            (intbitset.__eq__, set.__eq__, lambda x, y: cmp(x, y) == 0),
            (intbitset.__ge__, set.__ge__, lambda x, y: cmp(x, y) >= 0),
            (intbitset.__gt__, set.__gt__, lambda x, y: cmp(x, y) > 0),
            (intbitset.__le__, set.__le__, lambda x, y: cmp(x, y) <= 0),
            (intbitset.__lt__, set.__lt__, lambda x, y: cmp(x, y) < 0),
            (intbitset.__ne__, set.__ne__, lambda x, y: cmp(x, y) != 0),
        ]

        # A large pre-serialized example set for stress tests.
        self.big_examples = [list(intbitset(CFG_INTBITSET_BIG_EXAMPLE))]

        # Deliberately invalid serialized dumps for corruption handling tests.
        self.corrupted_strdumps = [
            "ciao",
            intbitset([2, 6000000]).strbits(),
            "djflsdkfjsdljfsldkfjsldjlfk",
        ]
Ejemplo n.º 20
0
 def __init__(self, name=""):
     "Creates collection instance by querying the DB configuration database about 'name'."
     self.calculate_reclist_run_already = 0 # to speed things up without much refactoring
     self.update_reclist_run_already = 0 # to speed things up without much refactoring
     self.reclist_with_nonpublic_subcolls = intbitset()
     # used to store the temporary result of the calculation of nbrecs of an external collection
     self.nbrecs_tmp = None
     if not name:
         self.name = CFG_SITE_NAME # by default we are working on the home page
         self.id = 1
         self.dbquery = None
         self.nbrecs = None
         self.reclist = intbitset()
     else:
         self.name = name
         try:
             res = run_sql("""SELECT id,name,dbquery,nbrecs,reclist FROM collection
                               WHERE name=%s""", (name,))
             if res:
                 self.id = res[0][0]
                 self.name = res[0][1]
                 self.dbquery = res[0][2]
                 self.nbrecs = res[0][3]
                 try:
                     # reclist is stored serialized; deliberately fall back
                     # to an empty set if the stored blob cannot be parsed.
                     self.reclist = intbitset(res[0][4])
                 except:
                     self.reclist = intbitset()
             else: # collection does not exist!
                 self.id = None
                 self.dbquery = None
                 self.nbrecs = None
                 self.reclist = intbitset()
         except Error, e:
             # A DB-level failure is fatal for collection loading.
             print "Error %d: %s" % (e.args[0], e.args[1])
             sys.exit(1)
Ejemplo n.º 21
0
def get_recids_for_set_spec(set_spec):
    """
    Returns the list (as intbitset) of recids belonging to 'set'

    Parameters:

      set_spec - *str* the set_spec for which we would like to get the
                 recids
    """
    recids = intbitset()

    # A set spec may have several definitions; the result is the union of
    # the search results for each of them.
    for set_def in get_set_definitions(set_spec):
        new_recids = perform_request_search(c=[coll.strip() \
                                               for coll in set_def['c'].split(',')],
                                            p1=set_def['p1'],
                                            f1=set_def['f1'],
                                            m1=set_def['m1'],
                                            op1=set_def['op1'],
                                            p2=set_def['p2'],
                                            f2=set_def['f2'],
                                            m2=set_def['m2'],
                                            op2=set_def['op2'],
                                            p3=set_def['p3'],
                                            f3=set_def['f3'],
                                            m3=set_def['m3'],
                                            ap=0)

        recids |= intbitset(new_recids)

    return recids
Ejemplo n.º 22
0
def filter_out_based_on_date_range(recids, fromdate="", untildate=""):
    """ Filter out recids based on date range."""
    # Normalize each bound, falling back to the full known datestamp range.
    fromdate = (normalize_date(fromdate, "T00:00:00Z")
                if fromdate != "" else get_earliest_datestamp())
    fromdate = utc_to_localtime(fromdate)

    untildate = (normalize_date(untildate, "T23:59:59Z")
                 if untildate != "" else get_latest_datestamp())
    untildate = utc_to_localtime(untildate)

    recids = intbitset(recids)  ## Let's clone :-)

    # Pick the modification-date clause matching the bounds in effect.
    if fromdate and untildate:
        sql = "SELECT id FROM bibrec WHERE modification_date BETWEEN %s AND %s"
        args = (fromdate, untildate)
    elif fromdate:
        sql = "SELECT id FROM bibrec WHERE modification_date >= %s"
        args = (fromdate, )
    elif untildate:
        sql = "SELECT id FROM bibrec WHERE modification_date <= %s"
        args = (untildate, )
    else:
        sql = None
    if sql is not None:
        recids &= intbitset(run_sql(sql, args))
    # Restricted records are never exposed.
    return recids - get_all_restricted_recids()
    def tmpl_citations_box(self, summarize_records, pubs, ln, add_box=True, loading=False):
        """Render the 'Citations' summary box, or a loading placeholder.

        summarize_records - a (stats, rec_query) pair when not loading
        pubs - recids of the publications to summarize
        ln - language code used for localization
        add_box - wrap the content in a search-result box when True
        loading - emit the loading placeholder instead of real content
        """
        _ = gettext_set_language(ln)
        if CFG_INSPIRE_SITE:
            addition = ' (from papers in INSPIRE)'
        else:
            addition = ''
        line1 = "<strong>" + _("Citations%s" % addition) + "</strong>"
        if not loading:
            summarize_records, rec_query = summarize_records
            # Convert the stored per-key record lists back into intbitsets.
            for i in summarize_records[0].keys():
                summarize_records[0][i] = intbitset(summarize_records[0][i])

            str_buffer = StringIO()
            render_citation_summary(str_buffer, ln, intbitset(pubs),
                                    stats=summarize_records,
                                    searchpattern=rec_query,
                                    searchfield="")
            str_buffer.write(websearch_templates.tmpl_citesummary_footer())
            line2 = str_buffer.getvalue()
            line2 = '<span style="white-space: nowrap;">' + line2 + "</span>"
        else:
            line2 = self.loading_html()
        if add_box:
            citations_box = self.tmpl_print_searchresultbox('citations', line1, line2)
            return citations_box
        else:
            return line2
Ejemplo n.º 24
0
def eprints():
    total = 0
    fermilab = get_collection_reclist('Fermilab')
    print '{0:4s} {1:3s} {2:3s} {3:3s}'.format('Date', 'All', 'FNA',
                                                   '%')
    date_range = ['1901', '1902', '1903']
    date_range = range(1, 20)
    for yymm in date_range:
        yymm = str(yymm)
        if len(yymm) == 1:
            yymm = '0' + yymm
        search_f = '037__a:fermilab* 037__c:physics.acc-ph 037__a:"arXiv:' + \
                   yymm + '*"'
        search = '037__c:physics.acc-ph 037__a:"arXiv:' + yymm + '*"'
        x = perform_request_search(p=search, cc='HEP')
        search = '037__c:acc-phys 037__a:"acc-phys/' + yymm + '*"'
        y = perform_request_search(p=search, cc='HEP')
        x_f = intbitset(x) & fermilab
        y_f = intbitset(y) & fermilab
        length = len(x) + len(y)
        length_f = len(x_f) + len(y_f)
        try:
            ratio = float(length_f)/float(length)*100.0
        except ZeroDivisionError:
            ratio = 0
        print '{0:4s} {1:3d} {2:3d} {3:3f}'.format(yymm, length, length_f, 
                                                   ratio)
        total += length
    print "Total =", total
Ejemplo n.º 25
0
def generate_list_to_send(search):
    '''
    Generate a list to send to MSNET.
    '''

    filename = 'tmp_' + __file__
    filename = re.sub('.py', '_send.txt', filename)
    output = open(filename, 'w')

    recids_nomatch = find_recids_nomatch()

    print search
    result_m = perform_request_search(p=search, cc='HEP')
    print search, len(result_m)
    search = "035__9:msnet"
    result_i = perform_request_search(p=search, cc='HEP')
    search = "0247_2:doi"
    result_d = perform_request_search(p=search, cc='HEP')
    result = intbitset(result_m) & intbitset(result_d) - intbitset(result_i)
    result = result - intbitset(recids_nomatch)
    for recid in result:
        try:
            doi = get_fieldvalues(recid, '0247_a')[0]
        except IndexError:
            print 'Problem with:', recid, doi
            break
        output.write(str(recid) + ',' + doi + '\n')
    output.close()
    print filename
def get_recids_for_set_spec(set_spec):
    """
    Returns the list (as intbitset) of recids belonging to 'set'

    Parameters:

      set_spec - *str* the set_spec for which we would like to get the
                 recids
    """
    recids = intbitset()

    # A set spec may have several definitions; the result is the union of
    # the search results for each of them.
    for set_def in get_set_definitions(set_spec):
        new_recids = perform_request_search(
            c=[coll.strip() for coll in set_def["c"].split(",")],
            p1=set_def["p1"],
            f1=set_def["f1"],
            m1=set_def["m1"],
            op1=set_def["op1"],
            p2=set_def["p2"],
            f2=set_def["f2"],
            m2=set_def["m2"],
            op2=set_def["op2"],
            p3=set_def["p3"],
            f3=set_def["f3"],
            m3=set_def["m3"],
            ap=0,
        )

        recids |= intbitset(new_recids)

    return recids
Ejemplo n.º 27
0
def main(key, value, start, end):
    '''Add up all citations over a period.'''

    # Cited papers of the entity (with at least one citation).
    search = 'find {0} {1} and topcite 1+'.format(key, value)
    if key == 'exp':
        # Experiment values are matched as a prefix (trailing wildcard).
        search = 'find {0} {1}* and topcite 1+'.format(key, value)
    entity_papers = intbitset(perform_request_search(p=search, cc='HEP'))
    citation_list = get_cited_by_list(entity_papers)
    # Presumably maps recid -> set of recids citing it — verify against
    # get_cited_by_list's contract.
    citation_dict = dict(
        (cite[0], intbitset(cite[1])) for cite in citation_list)
    print 'The {0} papers of {1}'.format(len(entity_papers), value)

    all_papers = {}
    # NOTE(review): range(start, end) excludes ``end`` itself — confirm
    # the final year is intentionally left out.
    years = range(start, end)
    for year in years:
        search = 'earliestdate:' + str(year)
        all_papers[year] = intbitset(perform_request_search(p=search,
                                                            cc='HEP'))
    citations_year = {}
    total = 0
    for year in years:
        citations_year[year] = 0
        # Count the citing papers of each entity paper that fall in ``year``.
        for entity_paper in entity_papers:
            citations_year[year] += len(citation_dict[entity_paper]
                                        & all_papers[year])
        total += citations_year[year]
        # Print the per-year count alongside the running total.
        print '{0:6d}\t{1:6d}\t{2:6d}'.format(year, citations_year[year],
                                              total)
Ejemplo n.º 28
0
def outdated_caches(fmt, last_updated, chunk_size=2000000):
    """Return the intbitset of record ids whose cached format is stale.

    A bibfmt cache row is considered outdated when its record (bibrec)
    was modified after the cache row was last regenerated.

    :param fmt: format code compared against bibfmt.format
    :param last_updated: datetime of the last run; the scan window
        starts slightly before it (see tdelta below)
    :param chunk_size: number of record ids covered per SQL query, so
        each query stays bounded
    """
    sql = """SELECT br.id
             FROM bibrec AS br
             INNER JOIN bibfmt AS bf ON bf.id_bibrec = br.id
             WHERE br.modification_date >= %s
             AND bf.format = %s
             AND bf.last_updated < br.modification_date
             AND br.id BETWEEN %s AND %s"""

    # Usually look back 4 hours before last_updated; in ~2% of runs look
    # back a full year -- presumably a randomized deep re-check for rows
    # missed by the short window. NOTE(review): confirm the intent.
    random.seed()
    if random.random() < 0.98:
        tdelta = timedelta(hours=4)
    else:
        tdelta = timedelta(days=365)

    last_updated_str = (last_updated - tdelta).strftime('%Y-%m-%d %H:%M:%S')
    write_message("Querying database for outdated cache since %s" %
                  last_updated_str)
    recids = intbitset()
    # Walk the id space in chunks so a single query never scans the
    # whole bibrec table at once.
    max_id = run_sql("SELECT max(id) FROM bibrec")[0][0] or 0
    for start in xrange(1, max_id + 1, chunk_size):
        end = start + chunk_size
        recids += intbitset(run_sql(sql, (last_updated_str, fmt, start, end)))

    return recids
Ejemplo n.º 29
0
 def test_no_segmentation_fault(self):
     """intbitset - test no segmentation fault with foreign data types"""
     # Mixing intbitset and plain set operands must raise TypeError
     # cleanly rather than crash the C extension.
     for fncs in self.fncs_list:
         ib_fnc = fncs[0]
         plain_fnc = fncs[1]
         self.assertRaises(TypeError, ib_fnc,
                           (intbitset([1, 2, 3]), set([1, 2, 3])))
         self.assertRaises(TypeError, plain_fnc,
                           (set([1, 2, 3]), intbitset([1, 2, 3])))
Ejemplo n.º 30
0
 def test_set_consistence(self):
     """intbitset - set consistence"""
     # Each case pairs a tuple of members with the two expected
     # fastdump() serializations: one for a plain set, one for the same
     # set built with trailing_bits=True.  The dumps are compressed
     # binary blobs (the 'x\x9c' prefix suggests zlib), hence the
     # escape-heavy literals -- do not reformat them.
     tests = (
         (
             (20, 30, 1000, 40),
             'x\x9cc`\x10p``d\x18\x18\x80d/\x00*\xb6\x00S',
             'x\x9cc`\x10p`\x18(\xf0\x1f\x01\x00k\xe6\x0bF'
         ),
         (
             (20, 30, 1000, 41),
             'x\x9cc`\x10p``b\x18\x18\xc0\x88`\x02\x00+9\x00T',
             'x\x9cc`\x10p`\x18(\xf0\x1f\x01\x00k\xe6\x0bF'
         ),
         (
             (20, 30, 1001, 41),
             'x\x9cc`\x10p``b\x18\x18\x80d/\x00+D\x00U',
             'x\x9cc`\x10p`\x18(\xf0\xef?\x1c\x00\x00k\xdb\x0bE'
         )
     )
     for original, dumped, dumped_trails in tests:
         intbitset1 = intbitset(original)
         intbitset2 = intbitset(original, trailing_bits=True)
         # Constructing from a dump must deserialize it.
         intbitset3 = intbitset(dumped)
         intbitset4 = intbitset(dumped_trails)
         self._helper_sanity_test(intbitset1)
         self._helper_sanity_test(intbitset2)
         self._helper_sanity_test(intbitset3)
         self._helper_sanity_test(intbitset4)
         # Serialization must match the golden dump, and deserializing
         # the golden dump must reproduce the original set.
         self.assertEqual(intbitset1.fastdump(), dumped)
         self.assertEqual(intbitset1, intbitset3)
         self.assertEqual(intbitset2.fastdump(), dumped_trails)
         self.assertEqual(intbitset2, intbitset4)
Ejemplo n.º 31
0
    def test_set_getitem(self):
        """intbitset - __getitem__"""
        # Single-element access must mirror list indexing, including
        # negative indices and IndexError on out-of-range values.
        for elems in self.sets + [[]]:
            bitset = intbitset(elems)
            reference = list(elems)
            span = 2 * len(elems) + 2
            for idx in xrange(-span, span):
                try:
                    expected = reference[idx]
                except IndexError:
                    self.assertRaises(IndexError, bitset.__getitem__, idx)
                else:
                    self.assertEqual(expected, bitset[idx])

        # Slicing with every start/stop combination and small positive
        # steps must agree with list slicing.
        for elems in self.sets + [[]]:
            bitset = intbitset(elems)
            reference = list(elems)
            span = 2 * len(elems) + 2
            for lo in xrange(-span, span):
                for hi in xrange(-span, span):
                    for stride in xrange(1, 3):
                        expected = reference[lo:hi:stride]
                        got = bitset[lo:hi:stride]
                        self.assertEqual(
                            expected, list(got),
                            "Failure with set %s, start %s, stop %s, step %s, found %s, expected %s, indices: %s"
                            % (elems, lo, hi, stride, list(got), expected,
                               slice(lo, hi, stride).indices(
                                   len(reference))))
Ejemplo n.º 32
0
    def related_records(recids, recids_processed):
        """Expand *recids* with records whose cache depends on them, then
        drop ids already handled in this run.

        Relies on ``fmt`` and ``latest_bibrank_run`` from the enclosing
        scope.  NOTE(review): mutates *recids_processed* in place so the
        caller accumulates the processed set across invocations.
        """
        if fmt == "HDREF" and recids:
            # HDREF represents the references tab
            # the tab needs to be recomputed not only when the record changes
            # but also when one of the citations changes
            sql = """SELECT id, modification_date FROM bibrec
                     WHERE id in (%s)""" % ','.join(str(r) for r in recids)

            def check_date(mod_date):
                # Record is stale if it changed since the last bibrank run.
                return mod_date.strftime(
                    "%Y-%m-%d %H:%M:%S") < latest_bibrank_run

            rel_recids = intbitset([
                recid for recid, mod_date in run_sql(sql)
                if check_date(mod_date)
            ])
            # Pull in every record citing a stale record.
            for r in rel_recids:
                recids |= intbitset(get_cited_by(r))

        # To not process recids twice
        recids -= recids_processed
        # Adds to the set of processed recids
        recids_processed += recids

        return recids
Ejemplo n.º 33
0
    def tokenize_for_phrases(self, recID):
        """Get the country names and country codes of the institutions
           affiliated with the authors of the publication

        :param recID: id of the record whose affiliations are tokenized
        :return: de-duplicated list of country name/code tokens
        """

        # Collect the raw affiliation strings from every configured tag.
        institution_names = []
        for tag in self.institution_tags:
            institution_names += get_fieldvalues(recID, tag)

        # Build the hitset of every record in an institution collection,
        # so free-text matches can be narrowed to institution records.
        institution_collection_hitset = intbitset([])
        for collection in CFG_WEBSEARCH_INSTITUTION_COLLECTIONS:
            institution_collection_hitset += get_collection_reclist(collection)

        # Resolve each affiliation string to institution record ids.
        # Skip blank/whitespace-only names (consistent with the other
        # copy of this tokenizer): searching an empty pattern would match
        # far too much and is never a real affiliation.
        institution_ids = intbitset([])
        for name in institution_names:
            if not name.strip():
                continue
            result_hitset = search_pattern(
                p=name,
                f=self.institution_name_field
            )
            institution_hitset = result_hitset & institution_collection_hitset
            institution_ids += list(institution_hitset)

        # Derive the country tokens from each matched institution record.
        tokens = []
        for instID in institution_ids:
            tokens += self._tokenize_from_country_name_tag(instID)
            tokens += self._tokenize_from_country_code_tag(instID)

        # Remove duplicates
        tokens = list(set(tokens))

        return tokens
def get_records_with_num_cites(numstr, allrecs=None):
    """Return an intbitset of record IDs that are cited X times,
       X defined in numstr.

       Supported forms of numstr: "10" (exactly ten cites),
       "1->10" (between one and ten cites, inclusive) and
       "10+" (strictly more than ten cites).

       Warning: numstr is a string and may not be numeric!

       :param numstr: textual description of the wanted citation count
       :param allrecs: intbitset of all candidate records; needed to
           resolve the zero-citation case (records absent from the
           citation dictionary). Defaults to an empty set.
    """
    # BUGFIX(style): avoid a mutable default argument -- build the empty
    # set per call instead of sharing one instance across calls.
    if allrecs is None:
        allrecs = intbitset([])
    cache_cited_by_dictionary = get_citation_dict("citationdict")
    cache_cited_by_dictionary_keys = get_citation_dict("citationdict_keys")
    cache_cited_by_dictionary_keys_intbitset = get_citation_dict(
        "citationdict_keys_intbitset")
    matches = intbitset([])
    # Once again, check that the parameter is a string.
    if not isinstance(numstr, str):
        return intbitset([])
    numstr = numstr.replace(" ", '')
    numstr = numstr.replace('"', '')

    # First, check if numstr is just a number.
    singlenum = re.findall(r"(^\d+$)", numstr)
    if singlenum:
        num = int(singlenum[0])
        if num == 0:
            # Zero cites: all records that are not citation-dict keys.
            return allrecs - cache_cited_by_dictionary_keys_intbitset
        for k in cache_cited_by_dictionary_keys:
            if len(cache_cited_by_dictionary[k]) == num:
                matches.add(k)
        return matches

    # Try to parse a range such as "1->10".
    firstsec = re.findall(r"(\d+)->(\d+)", numstr)
    if firstsec:
        try:
            first = int(firstsec[0][0])
            sec = int(firstsec[0][1])
        except (ValueError, IndexError):
            # Narrowed from a bare except: only parse failures are expected.
            return intbitset([])
        if first == 0:
            # Start with those that have no cites at all.
            matches = allrecs - cache_cited_by_dictionary_keys_intbitset
        if first <= sec:
            for k in cache_cited_by_dictionary_keys:
                if first <= len(cache_cited_by_dictionary[k]) <= sec:
                    matches.add(k)
            return matches

    # Finally, "10+" means strictly more than that many cites.
    firstsec = re.findall(r"(\d+)\+", numstr)
    if firstsec:
        first = int(firstsec[0])
        for k in cache_cited_by_dictionary_keys:
            if len(cache_cited_by_dictionary[k]) > first:
                matches.add(k)
    return matches
Ejemplo n.º 35
0
def get_citedby_hitset(ahitset, input_limit=None):
    """
    Return a hitset of records that are cited by records in the given
    ahitset. Useful for search engine's citedby:author:ellis feature.

    The parameter 'input_limit' is the maximum number of records of 'ahitset'
    to consider. If it is None (the default value) all the records will be
    used.
    """
    out = intbitset()
    if not ahitset:
        return out
    try:
        iter(ahitset)
    except OverflowError:
        # ignore attempt to iterate over infinite ahitset
        return out
    # We don't want to overwrite the input parameter
    if input_limit is None:
        limited = ahitset
    else:
        limited = ahitset[:input_limit]
    placeholders = ','.join('%s' for dummy in limited)
    rows = run_sql(
        """SELECT citee FROM rnkCITATIONDICT
                              WHERE citer IN (%s)""" % placeholders, limited)
    return intbitset(rows)
Ejemplo n.º 36
0
def get_data_for_definition_marc(tags, recids):
    '''Having a list of tags and a list of recids, it returns a dictionary
    with the values correspondig to the tags

    :param tags: MARC tags tried in order; the first tag that yields a
        value for a record wins for that record
    :param recids: intbitset of record ids (a single int/long is wrapped).
        NOTE(review): consumed in place via difference_update -- callers
        must not rely on its contents afterwards.
    :return: dict mapping recid -> value found for the first matching tag
    '''
    #x = all_recids; [get_fieldvalues(recid, '037__a') for recid in x]
    #user: 140s, sys: 21s, total: 160s - cdsdev
    if isinstance(recids, (int, long)):
        recids = intbitset([
            recids,
        ])
    # for each recid we need only one value
    #on which we sort, so we can stop looking for a value
    # as soon as we find one
    tag_index = 0
    field_data_dict = {}
    while len(recids) > 0 and tag_index < len(tags):
        write_message('%s records queried for values for tags %s.' \
                      %(len(recids), tags), verbose=5)
        res = _get_values_from_marc_tag(tags[tag_index], recids)
        res_dict = dict(res)
        #field_data_dict.update(res_dict)
        #we can not use this, because res_dict might contain recids
        #that are already in field_data_dict, and we should not overwrite their value
        field_data_dict = dict(res_dict, **field_data_dict)
        #there might be keys that we do not want (ex: using 'between')
        #so we should remove them
        res_dict_keys = intbitset(res_dict.keys())
        recids_not_needed = res_dict_keys.difference(recids)
        for recid in recids_not_needed:
            del field_data_dict[recid]
        #update the recids to contain only the recid that do not have values yet
        recids.difference_update(res_dict_keys)
        tag_index += 1
    return field_data_dict
Ejemplo n.º 37
0
def unlinked(req):
    """
    Render an HTML page listing author profiles (BAIs) that look useful
    (they carry external ids or manually claimed papers) but are neither
    linked to an INSPIRE id nor matched by a HepNames record.
    """
    from invenio.dbquery import run_sql
    from invenio.search_engine import get_fieldvalues, get_collection_reclist
    # Profiles carrying any external identifier.
    useful_personids1 = intbitset(run_sql("SELECT distinct personid FROM aidPERSONIDDATA WHERE tag LIKE 'extid:%'"))
    # Profiles with at least one manually confirmed paper (flag=2).
    useful_personids2 = intbitset(run_sql("SELECT distinct personid from aidPERSONIDPAPERS where flag=2"))
    # Profiles already linked to an INSPIRE id.
    linked_personids = intbitset(run_sql("SELECT personid FROM aidPERSONIDDATA WHERE tag='extid:INSPIREID'"))
    names = dict(run_sql("SELECT personid, data FROM aidPERSONIDDATA WHERE tag='canonical_name'"))
    # NOTE(review): get_fieldvalues receives the whole HepNames reclist
    # rather than a single recid -- presumably this variant accepts a
    # list of recids; confirm against the search_engine API.
    matched_names = [name.lower().strip() for name in get_fieldvalues(get_collection_reclist('HepNames'), '035__a')]
    personid_to_match = (useful_personids1 | useful_personids2) - linked_personids

    body = ['<ol>']
    for personid in personid_to_match:
        # Fall back to the numeric id when no canonical name exists.
        name = names.get(personid, str(personid))
        if name.lower().strip() in matched_names:
            continue
        body.append('<li><a href="%(siteurl)s/author/profile/%(bai)s" target="_blank">%(bai)s</a></li>' % {
                'siteurl': escape(CFG_SITE_SECURE_URL, True),
                'bai': escape(name, True)})
    body.append('</ol>')
    body = '\n'.join(body)

    return page(req=req, body=body, title="Unlinked useful BAIs")
    def tmpl_papers_box(self, req, pubs, bibauthorid_data, num_downloads, ln, add_box=True, loading=False):
        """Render the "Papers" box of an author profile page.

        :param req: request object (unused here, kept for template API)
        :param pubs: list of record ids authored by the person
        :param bibauthorid_data: dict with 'cid' (canonical author id)
            and 'pid' (person id) used to build the author search query
        :param num_downloads: download count shown when stats are enabled
        :param ln: language code for localisation
        :param add_box: when False, return only the inner HTML fragment
        :param loading: when True, render a loading placeholder instead
        :return: HTML string (a search-result box, or the bare fragment)
        """
        _ = gettext_set_language(ln)
        if not loading and pubs:
            ib_pubs = intbitset(pubs)
            # BUGFIX: initialise baid_query so the concatenation below
            # cannot raise NameError when neither a canonical id nor a
            # valid person id is available.
            baid_query = ''
            if bibauthorid_data["cid"]:
                baid_query = 'exactauthor:%s' % wrap_author_name_in_quotes_if_needed(bibauthorid_data["cid"])
            elif bibauthorid_data["pid"] > -1:
                baid_query = 'exactauthor:%s' % wrap_author_name_in_quotes_if_needed(bibauthorid_data["pid"])
            baid_query = baid_query + " "

            rec_query = baid_query
            searchstr = create_html_link(websearch_templates.build_search_url(p=rec_query),
                                         {}, "<strong>" + "All papers (" + str(len(pubs)) + ")" + "</strong>",)

            line2 = searchstr

            if CFG_BIBRANK_SHOW_DOWNLOAD_STATS and num_downloads:
                line2 += " (" + _("downloaded") + " "
                line2 += str(num_downloads) + " " + _("times") + ")"

            # Collections used to break the paper count down.
            if CFG_INSPIRE_SITE:
                CFG_COLLS = ['Book',
                             'ConferencePaper',
                             'Introductory',
                             'Lectures',
                             'Preprint',
                             'Published',
                             'Review',
                             'Thesis']
            else:
                CFG_COLLS = ['Article',
                             'Book',
                             'Preprint', ]
            collsd = {}
            for coll in CFG_COLLS:
                coll_papers = list(ib_pubs & intbitset(perform_request_search(rg=0, f="collection", p=coll)))
                if coll_papers:
                    collsd[coll] = coll_papers
            colls = collsd.keys()
            colls.sort(lambda x, y: cmp(len(collsd[y]), len(collsd[x]))) # sort by number of papers
            for coll in colls:
                rec_query = baid_query + 'collection:' + wrap_author_name_in_quotes_if_needed(coll)
                line2 += "<br />" + create_html_link(websearch_templates.build_search_url(p=rec_query),
                                                                           {}, coll + " (" + str(len(collsd[coll])) + ")",)

        elif not pubs and not loading:
            line2 = _("No Papers")

        elif loading:
            line2 = self.loading_html()

        else:
            line2 = 'This is a bug and should be corrected'

        if not add_box:
            return line2
        line1 = "<strong>" + _("Papers") + "</strong>"
        papers_box = self.tmpl_print_searchresultbox("papers", line1, line2)
        return papers_box
def get_records_with_num_cites(numstr, allrecs=None):
    """Return an intbitset of record IDs that are cited X times,
       X defined in numstr.

       numstr may be "10" (exactly ten cites), "1->10" (between one and
       ten cites, inclusive) or "10+" (more than ten cites).

       Warning: numstr is a string and may not be numeric!

       :param numstr: textual description of the wanted citation count
       :param allrecs: intbitset of all candidate records, used to
           resolve the zero-citation case. Defaults to an empty set.
    """
    # BUGFIX(style): replace the mutable default argument intbitset([])
    # with a None sentinel built fresh on every call.
    if allrecs is None:
        allrecs = intbitset([])
    cache_cited_by_dictionary = get_citation_dict("citationdict")
    cache_cited_by_dictionary_keys = get_citation_dict("citationdict_keys")
    cache_cited_by_dictionary_keys_intbitset = get_citation_dict("citationdict_keys_intbitset")
    matches = intbitset([])
    # once again, check that the parameter is a string
    if not isinstance(numstr, str):
        return intbitset([])
    numstr = numstr.replace(" ", '')
    numstr = numstr.replace('"', '')

    # first, check if numstr is just a number
    singlenum = re.findall(r"(^\d+$)", numstr)
    if singlenum:
        num = int(singlenum[0])
        if num == 0:
            # we return recids that are not in keys
            return allrecs - cache_cited_by_dictionary_keys_intbitset
        for k in cache_cited_by_dictionary_keys:
            if len(cache_cited_by_dictionary[k]) == num:
                matches.add(k)
        return matches

    # try to get 1->10 or such
    firstsec = re.findall(r"(\d+)->(\d+)", numstr)
    if firstsec:
        try:
            first = int(firstsec[0][0])
            sec = int(firstsec[0][1])
        except (ValueError, IndexError):
            # narrowed from a bare except: only parse failures expected
            return intbitset([])
        if first == 0:
            # start with those that have no cites..
            matches = allrecs - cache_cited_by_dictionary_keys_intbitset
        if first <= sec:
            for k in cache_cited_by_dictionary_keys:
                if first <= len(cache_cited_by_dictionary[k]) <= sec:
                    matches.add(k)
            return matches

    # finally "10+": strictly more than that many cites
    firstsec = re.findall(r"(\d+)\+", numstr)
    if firstsec:
        first = int(firstsec[0])
        for k in cache_cited_by_dictionary_keys:
            if len(cache_cited_by_dictionary[k]) > first:
                matches.add(k)
    return matches
Ejemplo n.º 40
0
    def tmpl_papers_box(self, req, pubs, bibauthorid_data, num_downloads, ln, add_box=True, loading=False):
        """Render the "Papers" box of an author profile page.

        :param req: request object (unused here, kept for template API)
        :param pubs: list of record ids authored by the person
        :param bibauthorid_data: dict with 'cid' (canonical author id)
            and 'pid' (person id) used to build the author search query
        :param num_downloads: download count shown when stats are enabled
        :param ln: language code for localisation
        :param add_box: when False, return only the inner HTML fragment
        :param loading: when True, render a loading placeholder instead
        :return: HTML string (a search-result box, or the bare fragment)
        """
        _ = gettext_set_language(ln)
        if not loading and pubs:
            ib_pubs = intbitset(pubs)
            # BUGFIX: initialise baid_query so the concatenation below
            # cannot raise NameError when neither a canonical id nor a
            # valid person id is available.
            baid_query = ''
            if bibauthorid_data["cid"]:
                baid_query = 'exactauthor:%s' % wrap_author_name_in_quotes_if_needed(bibauthorid_data["cid"])
            elif bibauthorid_data["pid"] > -1:
                baid_query = 'exactauthor:%s' % wrap_author_name_in_quotes_if_needed(bibauthorid_data["pid"])
            baid_query = baid_query + " "

            rec_query = baid_query
            searchstr = create_html_link(websearch_templates.build_search_url(p=rec_query),
                                         {}, "<strong>" + "All papers (" + str(len(pubs)) + ")" + "</strong>",)

            line2 = searchstr

            if CFG_BIBRANK_SHOW_DOWNLOAD_STATS and num_downloads:
                line2 += " (" + _("downloaded") + " "
                line2 += str(num_downloads) + " " + _("times") + ")"

            # Collections used to break the paper count down.
            if CFG_INSPIRE_SITE:
                CFG_COLLS = ['Book',
                             'ConferencePaper',
                             'Introductory',
                             'Lectures',
                             'Preprint',
                             'Published',
                             'Review',
                             'Thesis']
            else:
                CFG_COLLS = ['Article',
                             'Book',
                             'Preprint', ]
            collsd = {}
            for coll in CFG_COLLS:
                coll_papers = list(ib_pubs & intbitset(perform_request_search(rg=0, f="collection", p=coll)))
                if coll_papers:
                    collsd[coll] = coll_papers
            colls = collsd.keys()
            colls.sort(lambda x, y: cmp(len(collsd[y]), len(collsd[x]))) # sort by number of papers
            for coll in colls:
                rec_query = baid_query + 'collection:' + wrap_author_name_in_quotes_if_needed(coll)
                line2 += "<br />" + create_html_link(websearch_templates.build_search_url(p=rec_query),
                                                                           {}, coll + " (" + str(len(collsd[coll])) + ")",)

        elif not pubs and not loading:
            line2 = _("No Papers")

        elif loading:
            line2 = self.loading_html()

        else:
            line2 = 'This is a bug and should be corrected'

        if not add_box:
            return line2
        line1 = "<strong>" + _("Papers") + "</strong>"
        papers_box = self.tmpl_print_searchresultbox("papers", line1, line2)
        return papers_box
Ejemplo n.º 41
0
def bst_prodsync(method='afs', with_citations='yes', with_claims='yes', skip_collections=''):
    """
    Synchronize to either 'afs' or 'redis'

    with_citations: yes/no, whether records that now matches a record will need to be re-exported.abs
    with_claims: yes/no, whether record involved in some new claim need to be re-exported.
    skip_collections: comma-separated-lists of values for which records having 980:VALUE should be ignored,
        e.g. skip_collections='HEP,HEPNAMES,HEPHIDDEN'
    """
    # Redis sync is only possible when a Labs redis host is configured.
    if not CFG_REDIS_HOST_LABS:
        method = 'afs'

    write_message("Prodsync started using %s method" % method)
    now = datetime.datetime.now()
    future_lastrun = now.strftime('%Y-%m-%d %H:%M:%S')
    lastrun_path = os.path.join(CFG_TMPSHAREDDIR, 'prodsync_%s_lastrun.txt' % method)
    try:
        # Close the lastrun file promptly instead of leaking the handle.
        with open(lastrun_path) as lastrun_file:
            last_run = lastrun_file.read().strip()
        write_message("Syncing records modified since %s" % last_run)
        with run_ro_on_slave_db():
            # Records touched directly since the last run.
            modified_records = intbitset(run_sql("SELECT id FROM bibrec WHERE modification_date>=%s", (last_run, )))
            # Records whose MARC 005 timestamp moved without a bibrec change.
            compacttime = last_run.replace('-', '').replace(' ', '').replace(':', '')
            notimechangerecs = search_unit("%s->20250101000000" % compacttime, f='005', m='a')
            modified_records += notimechangerecs
            if with_citations.lower() == 'yes':
                # Re-export records whose citations changed.
                for citee, citer in run_sql("SELECT citee, citer FROM rnkCITATIONDICT WHERE last_updated>=%s", (last_run, )):
                    modified_records.add(citer)
            if with_claims.lower() == 'yes':
                # Re-export records affected by new authorship claims.
                modified_records |= intbitset(run_sql("SELECT bibrec FROM aidPERSONIDPAPERS WHERE last_updated>=%s", (last_run, )))
                modified_records |= intbitset(run_sql('SELECT bibrec FROM aidPERSONIDPAPERS AS p JOIN aidPERSONIDDATA as d'
                                                      ' ON p.personid = d.personid WHERE d.tag = "canonical_name" and d.last_updated>=%s', (last_run, )))
    except IOError:
        # No lastrun file yet: default to everything.
        with run_ro_on_slave_db():
            modified_records = intbitset(run_sql("SELECT id FROM bibrec"))
        write_message("Syncing all records")

    # BUGFIX: the original called skip_collections.remove('') which raised
    # ValueError whenever a non-empty skip list was supplied ('' is only
    # present when the argument is empty). Filter empty entries instead.
    skip_collections = [collection for collection in skip_collections.split(',') if collection]
    for collection in skip_collections:
        modified_records -= search_pattern(p='980:%s' % collection)

    if not modified_records:
        write_message("Nothing to do")
        return True

    tot = len(modified_records)
    time_estimator = get_time_estimator(tot)
    write_message("Adding %s new or modified records" % tot)
    if method == 'afs':
        afs_sync(reversed(modified_records), time_estimator, tot, now)
        # Persist the run timestamp only after a successful sync.
        with open(lastrun_path, "w") as lastrun_file:
            lastrun_file.write(future_lastrun)
        write_message("DONE!")
    else:
        if redis_sync(reversed(modified_records), time_estimator, tot):
            with open(lastrun_path, "w") as lastrun_file:
                lastrun_file.write(future_lastrun)
            write_message("DONE!")
        else:
            write_message("Skipping prodsync: Redis queue is not yet empty")
Ejemplo n.º 42
0
 def test_tuple_of_tuples(self):
     """intbitset - support tuple of tuples"""
     # A tuple of 1-element tuples must construct the same set as the
     # flat sequence, both with and without trailing bits.
     for elems in self.sets + [[]]:
         nested = tuple((member, ) for member in elems)
         self.assertEqual(list(intbitset(elems)), list(intbitset(nested)))
     for elems in self.sets + [[]]:
         nested = tuple((member, ) for member in elems)
         self.assertEqual(intbitset(elems, trailing_bits=True), intbitset(nested, trailing_bits=True))
Ejemplo n.º 43
0
 def test_set_repr(self):
     """intbitset - Pythonic representation"""
     # repr() must round-trip through eval() for plain sets ...
     for elems in self.sets + [[]]:
         candidate = intbitset(elems)
         self.assertEqual(candidate, eval(repr(candidate)))
     # ... and for sets created with trailing bits enabled.
     for elems in self.sets + [[]]:
         candidate = intbitset(elems, trailing_bits=True)
         self.assertEqual(candidate, eval(repr(candidate)))
def get_institution_ids(text):
    # HACK: I know... I am sorry for that. It's for a good cause
    # FIXME: use redis
    global INSTITUTION_CACHE
    if text not in INSTITUTION_CACHE:
        # Try the address field first; only query the name field when the
        # address search came back empty (or-short-circuit preserved).
        by_address = intbitset(perform_request_search(cc='Institutions', p='110__u:"%s"' % text))
        INSTITUTION_CACHE[text] = by_address or \
            intbitset(perform_request_search(cc='Institutions', p='110__t:"%s"' % text))
    return INSTITUTION_CACHE[text]
Ejemplo n.º 45
0
 def get_records_for_user(qid, uid):
     """Return the cached search-result records for query *qid*, filtered
     to those the current user is allowed to see.

     :param qid: query id identifying the cached search result
     :param uid: user id -- NOTE(review): unused here; visibility is
         resolved via current_user. Confirm it is intentionally ignored.
     :return: intbitset of displayable record ids (empty on cache miss)
     """
     key = get_search_results_cache_key_from_qid(qid)
     data = search_results_cache.get(key)
     if data is None:
         return intbitset([])
     # The collection the query ran in is cached alongside the result.
     cc = search_results_cache.get(key + '::cc')
     return get_records_that_can_be_displayed(current_user,
                                              intbitset().fastload(data), cc)
Ejemplo n.º 46
0
    def test_get_ranked_smaller_hitset(self):
        """solrutils - ranking smaller hitset"""
        # Ranking must reorder a hitset by relevance for the query.
        hitset = intbitset.intbitset([47, 56, 58, 68, 85, 89])
        self.assertEqual((47, 56, 58, 68, 89, 85), self._get_ranked_result_sequence(query='higgs', hitset=hitset))

        hitset = intbitset.intbitset([45, 50, 61, 74, 94])
        self.assertEqual((50, 61, 74, 45, 94), self._get_ranked_result_sequence(query='of', hitset=hitset))
        # A rows limit truncates the ranked sequence -- here to its
        # last three entries.
        self.assertEqual((74, 45, 94), self._get_ranked_result_sequence(query='of', hitset=hitset, rows=3))
    def test_get_ranked_smaller_hitset(self):
        """solrutils - ranking smaller hitset"""
        # Duplicate of the method above (paste artifact); in Python the
        # later definition wins, so behavior is unchanged.
        hitset = intbitset.intbitset([47, 56, 58, 68, 85, 89])
        self.assertEqual((47, 56, 58, 68, 89, 85), self._get_ranked_result_sequence(query='higgs', hitset=hitset))

        hitset = intbitset.intbitset([45, 50, 61, 74, 94])
        self.assertEqual((50, 61, 74, 45, 94), self._get_ranked_result_sequence(query='of', hitset=hitset))
        # A rows limit truncates the ranked sequence to its tail.
        self.assertEqual((74, 45, 94), self._get_ranked_result_sequence(query='of', hitset=hitset, rows=3))
Ejemplo n.º 48
0
 def test_tuple_of_tuples(self):
     """intbitset - support tuple of tuples"""
     # Wrapping every member in a 1-tuple must not change the resulting
     # set, with or without trailing bits.
     for members in self.sets + [[]]:
         wrapped = tuple([(m, ) for m in members])
         self.assertEqual(list(intbitset(members)), list(intbitset(wrapped)))
     for members in self.sets + [[]]:
         wrapped = tuple([(m, ) for m in members])
         self.assertEqual(intbitset(members, trailing_bits=True), intbitset(wrapped, trailing_bits=True))
Ejemplo n.º 49
0
def find_records():
    '''Looks for candidate records.'''
    # Candidate papers for the target year, per the curator query.
    math_hits = perform_request_search(
        p="find fc g not fc m not fc t and tc p and jy " + str(YEAR),
        cc='HEP')
    # Records that already carry an MSNET identifier.
    already_tagged = perform_request_search(p="035__9:msnet", cc='HEP')
    # Keep only candidates not yet tagged.
    return intbitset(math_hits) - intbitset(already_tagged)
Ejemplo n.º 50
0
 def test_set_repr(self):
     """intbitset - Pythonic representation"""
     # eval(repr(x)) must reproduce x, including the large examples and
     # the trailing-bits variant.
     cases = self.sets + [[]] + self.big_examples
     for members in cases:
         plain = intbitset(members)
         self.assertEqual(plain, eval(repr(plain)))
     for members in cases:
         infinite = intbitset(members, trailing_bits=True)
         self.assertEqual(infinite, eval(repr(infinite)))
def get_institution_ids(text):
    # HACK: I know... I am sorry for that. It's for a good cause
    # FIXME: use redis
    global INSTITUTION_CACHE
    if text in INSTITUTION_CACHE:
        return INSTITUTION_CACHE[text]
    # Address-field search first; fall back to the name field only when
    # it finds nothing (the `or` keeps the second query lazy).
    hits = intbitset(perform_request_search(cc='Institutions', p='110__u:"%s"' % text)) or \
        intbitset(perform_request_search(cc='Institutions', p='110__t:"%s"' % text))
    INSTITUTION_CACHE[text] = hits
    return hits
Ejemplo n.º 52
0
    def calculate_reclist(self):
        """Calculate, set and return the (reclist, reclist_with_nonpublic_subcolls) tuple for given collection."""
        if self.calculate_reclist_run_already or str(self.dbquery).startswith("hostedcollection:"):
            # do we have to recalculate?
            return (self.reclist, self.reclist_with_nonpublic_subcolls)
        write_message("... calculating reclist of %s" % self.name, verbose=6)
        reclist = intbitset() # will hold results for public sons only; good for storing into DB
        reclist_with_nonpublic_subcolls = intbitset() # will hold results for both public and nonpublic sons; good for deducing total
                                                   # number of documents
        if not self.dbquery:
            # A - collection does not have dbquery, so query recursively all its sons
            #     that are either non-restricted or that have the same restriction rules
            for coll in self.get_sons():
                coll_reclist, coll_reclist_with_nonpublic_subcolls = coll.calculate_reclist()
                if ((coll.restricted_p() is None) or
                    (coll.restricted_p() == self.restricted_p())):
                    # add this reclist ``for real'' only if it is public
                    reclist.union_update(coll_reclist)
                reclist_with_nonpublic_subcolls.union_update(coll_reclist_with_nonpublic_subcolls)
        elif self.dbquery and self.get_sons():
            # B' - collection has BOTH a dbquery and sons: first gather the
            #      sons' reclists (public ones ``for real''), then add the
            #      collection's own query results below.
            #      (The comment previously copy-pasted from branch A wrongly
            #      claimed there was no dbquery here.)
            for coll in self.get_sons():
                coll_reclist, coll_reclist_with_nonpublic_subcolls = coll.calculate_reclist()
                if ((coll.restricted_p() is None) or
                    (coll.restricted_p() == self.restricted_p())):
                    # add this reclist ``for real'' only if it is public
                    reclist.union_update(coll_reclist)
                reclist_with_nonpublic_subcolls.union_update(coll_reclist_with_nonpublic_subcolls)

            # B - collection does have dbquery, so compute it:
            #     (note: explicitly remove DELETED records)
            reclist_self = None
            if CFG_CERN_SITE:
                reclist_self = search_pattern_parenthesised(None, self.dbquery + \
                                         ' -980__:"DELETED" -980__:"DUMMY"')
            else:
                reclist_self = search_pattern_parenthesised(None, self.dbquery + ' -980__:"DELETED"')
            reclist.union_update(reclist_self)
            self_reclist_with_nonpublic_subcolls = copy.deepcopy(reclist_self)
            reclist_with_nonpublic_subcolls.union_update(self_reclist_with_nonpublic_subcolls)
        else:
            # B - collection does have dbquery, so compute it:
            #     (note: explicitly remove DELETED records)
            if CFG_CERN_SITE:
                reclist = search_pattern_parenthesised(None, self.dbquery + \
                                         ' -980__:"DELETED" -980__:"DUMMY"')
            else:
                reclist = search_pattern_parenthesised(None, self.dbquery + ' -980__:"DELETED"')
            reclist_with_nonpublic_subcolls = copy.deepcopy(reclist)
        # store the results:
        self.nbrecs = len(reclist_with_nonpublic_subcolls)
        self.reclist = reclist
        self.reclist_with_nonpublic_subcolls = reclist_with_nonpublic_subcolls
        # last but not least, update the speed-up flag:
        self.calculate_reclist_run_already = 1
        # return the two sets:
        return (self.reclist, self.reclist_with_nonpublic_subcolls)
 def test_record_sorter(self):
     """bibrank record sorter - sorting records"""
     # Candidate hitset holds records 1, 2 and 5; record 5 is expected
     # to remain in the second (leftover) result set.
     hitset = intbitset()
     hitset += (1,2,5)
     hitset2 = intbitset()
     hitset2.add(5)
     rec_termcount = {1: 1, 2: 1, 5: 1}
     # Relevance scores {1: 50, 2: 30, 3: 70, 4: 10} with cutoff 50 are
     # expected to normalize to [(1, 71), (3, 100)].
     # NOTE(review): record 3 appears in the output although it is not
     # in the hitset; this mirrors the implementation under test.
     (res1, res2) = bibrank_word_searcher.sort_record_relevance({1: 50, 2:30, 3:70,4:10},rec_termcount,hitset, 50,0)
     self.assertEqual(([(1, 71), (3, 100)], list(hitset2)), (res1, list(res2)))
Ejemplo n.º 54
0
 def test_set_clear(self):
     """intbitset - clearing"""
     # clear() must empty both a finite set and one created with
     # trailing bits enabled.
     for members in self.sets + [[]]:
         candidate = intbitset(members)
         candidate.clear()
         self.assertEqual(list(candidate), [])
         candidate = intbitset(members, trailing_bits=True)
         candidate.clear()
         self.assertEqual(list(candidate), [])
Ejemplo n.º 55
0
 def test_set_clear(self):
     """intbitset - clearing"""
     # Duplicate of the method above (paste artifact); verifies clear()
     # empties finite and trailing-bits sets alike.
     for members in self.sets + [[]]:
         subject = intbitset(members)
         subject.clear()
         self.assertEqual([], list(subject))
         subject = intbitset(members, trailing_bits=True)
         subject.clear()
         self.assertEqual([], list(subject))
Ejemplo n.º 56
0
def citation(rank_method_code, related_to, hitset, rank_limit_relevance, verbose):
    """Sort records by number of citations"""
    # Without any 'related_to' patterns, rank the whole hitset directly.
    if not related_to:
        return rank_by_citations(hitset, verbose)
    # Otherwise restrict the hitset to records referring to any pattern.
    from invenio.search_engine import search_pattern
    selected = intbitset()
    for pattern in related_to:
        selected |= hitset & intbitset(search_pattern(p='refersto:%s' % pattern))
    return rank_by_citations(selected, verbose)
Ejemplo n.º 57
0
 def test_marshalling(self):
     """intbitset - marshalling"""
     # fastdump()/fastload() must round-trip both finite sets and sets
     # with trailing bits.  (The original repeated both loops verbatim a
     # second time; the duplicates added no coverage and were removed.)
     for set1 in self.sets + [[]]:
         self.assertEqual(intbitset(set1), intbitset().fastload((intbitset(set1).fastdump())))
     for set1 in self.sets + [[]]:
         self.assertEqual(intbitset(set1, trailing_bits=True), intbitset().fastload(intbitset(set1, trailing_bits=True).fastdump()))
def get_citedby_hitset(ahitset):
    """
    Return a hitset of records that are cited by records in the given
    ahitset.  Useful for search engine's citedby:author:ellis feature.
    """
    # reversedict maps a record id to the list of records it cites.
    reverse_dict = get_citation_dict("reversedict")
    out = intbitset()
    if not ahitset:
        return out
    for recid in ahitset:
        out |= intbitset(reverse_dict.get(recid, []))
    return out
Ejemplo n.º 59
0
def print_rec_ids(rec_ids):
   """Print, for each record id, its click count and how many of its
   citations come from papers dated 2009-2010.

   :param rec_ids: dict mapping record id -> click count
   """
   complete_paper_list = intbitset(perform_request_search(p='year:2009->2010'))

   print "Rec ID, Clicks, Citations:"

   for key in rec_ids:

      paper_citation_list = intbitset(get_cited_by(key))

      # Count only citations coming from the 2009-2010 window.
      narrowed_citation_count = len(paper_citation_list & complete_paper_list)
      print "%d %d %d" % (key, rec_ids[key], narrowed_citation_count)