def filter_out_based_on_date_range(recids, fromdate="", untildate="", set_spec=None):
    """Filter out recids based on date range."""
    if fromdate:
        fromdate = normalize_date(fromdate, "T00:00:00Z")
    else:
        fromdate = get_earliest_datestamp()
    fromdate = utc_to_localtime(fromdate)

    if untildate:
        untildate = normalize_date(untildate, "T23:59:59Z")
    else:
        untildate = get_latest_datestamp()
    untildate = utc_to_localtime(untildate)

    if set_spec is not None:  ## either it has a value or it is empty, thus meaning all records
        last_updated = get_set_last_update(set_spec)
        if last_updated is not None:
            last_updated = utc_to_localtime(last_updated)
            if last_updated > fromdate:
                fromdate = utc_to_localtime(get_earliest_datestamp())

    recids = intbitset(recids)  ## Let's clone :-)

    if fromdate and untildate:
        recids &= intbitset(run_sql(
            "SELECT id FROM bibrec WHERE modification_date BETWEEN %s AND %s",
            (fromdate, untildate)))
    elif fromdate:
        recids &= intbitset(run_sql(
            "SELECT id FROM bibrec WHERE modification_date >= %s",
            (fromdate, )))
    elif untildate:
        recids &= intbitset(run_sql(
            "SELECT id FROM bibrec WHERE modification_date <= %s",
            (untildate, )))

    if cfg.get('CFG_OAI_FILTER_RESTRICTED_RECORDS', True):
        recids = recids - get_all_restricted_recids()

    return recids

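# A minimal sketch of why the function above clones its input: '&=' on an
# intbitset mutates in place, so without the intbitset(recids) copy the
# caller's set would be narrowed as a side effect.
from intbitset import intbitset

caller_set = intbitset([1, 2, 3, 4])
alias = caller_set
alias &= intbitset([2, 3])
assert caller_set == intbitset([2, 3])          # the caller's set was changed

caller_set = intbitset([1, 2, 3, 4])
clone = intbitset(caller_set)                   # clone first, as above
clone &= intbitset([2, 3])
assert caller_set == intbitset([1, 2, 3, 4])    # the caller's set is intact
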
def burmeister(filename):
    with open(filename, "r") as f:
        # File format marker
        txt = f.readline().strip()
        if txt != "B":  # 'is not' compared identity, not equality
            raise Exception("Bad file format")
        # The name of the context
        name = f.readline().strip()
        # Read the volume of the extent and the intent
        G = int(f.readline().strip())
        M = int(f.readline().strip())
        skip_empty(f)
        # Read the labels: first the extent, last the intent
        extent = [f.readline().strip() for x in range(G)]
        intent = [f.readline().strip() for x in range(M)]
        skip_empty(f)
        # Load and process the object-attribute relationship,
        # expecting that it is stored extent-wise
        ctx0 = []
        for g in range(G):
            line = f.readline().rstrip()
            ctx0.append(
                bs.intbitset([m for m in range(M) if line[m] in "X"], M))
        # Transpose the context
        ctxt = [
            bs.intbitset([g for g in range(G) if m in ctx0[g]], G)
            for m in range(M)
        ]
        return Context(extent, intent, (ctx0, ctxt), name)

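# A tiny context in the layout the parser above expects: format marker,
# context name, object and attribute counts, blank separators (consumed by
# skip_empty), labels, then one X/. row per object. File name and contents
# are hypothetical:
#
#   B
#   toy-context
#   2
#   3
#
#   obj1
#   obj2
#   attr1
#   attr2
#   attr3
#
#   X.X
#   .XX
#
# burmeister("toy.cxt") would then build the object extents
# ctx0 == [bs.intbitset([0, 2]), bs.intbitset([1, 2])].
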
def get_citedby_hitset(ahitset, record_limit=None):
    """
    Return a hitset of records that are cited by records in the given
    ahitset. Useful for the search engine's citedby:author:ellis feature.

    The parameter 'record_limit' is the maximum number of records of
    'ahitset' to consider. If it is None (the default value) all the
    records will be used.
    """
    out = intbitset()
    if ahitset:
        try:
            iter(ahitset)
        except OverflowError:
            # ignore attempt to iterate over infinite ahitset
            pass
        else:
            # We don't want to overwrite the input parameter
            if record_limit is not None:
                limited_ahitset = ahitset[:record_limit]
            else:
                limited_ahitset = ahitset
            in_sql = ','.join('%s' for dummy in limited_ahitset)
            rows = run_sql("""SELECT citee FROM rnkCITATIONDICT
                              WHERE citer IN (%s)""" % in_sql,
                           limited_ahitset)
            out = intbitset(rows)
    return out

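# The OverflowError guard above exists because an intbitset can be infinite
# (trailing_bits=1 sets every bit from some point on), and infinite sets
# refuse iteration. A minimal sketch:
from intbitset import intbitset

universe = intbitset(trailing_bits=1)   # contains every non-negative integer
assert 10 ** 9 in universe
try:
    iter(universe)
except OverflowError:
    pass  # exactly the case the guard in get_citedby_hitset ignores
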
def query_records(params):
    """Produce record IDs from given query parameters.

    By passing the appropriate CLI options, we can query here for
    additional records.
    """
    write_message("Querying database (records query)...")
    res = intbitset()
    if params['field'] or params['collection'] or params['pattern']:
        if not params['collection']:
            # use search_pattern() whenever possible, as it can search
            # even in private collections
            res = search_pattern(p=params['pattern'],
                                 f=params['field'],
                                 m=params['matching'])
        else:
            # use perform_request_search when '-c' argument has been
            # defined, as it is not supported by search_pattern()
            res = intbitset(perform_request_search(req=None,
                                                   of='id',
                                                   c=params['collection'],
                                                   p=params['pattern'],
                                                   f=params['field']))
    return res

def related_records(recids, recids_processed):
    # NOTE: 'fmt' and 'latest_bibrank_run' are free variables taken from
    # the enclosing scope in the original module.
    if fmt == "HDREF" and recids:
        # HDREF represents the references tab
        # the tab needs to be recomputed not only when the record changes
        # but also when one of the citations changes
        sql = """SELECT id, modification_date FROM bibrec
                 WHERE id in (%s)""" % ','.join(str(r) for r in recids)

        def check_date(mod_date):
            return mod_date.strftime("%Y-%m-%d %H:%M:%S") < latest_bibrank_run

        rel_recids = intbitset([recid for recid, mod_date in run_sql(sql)
                                if check_date(mod_date)])
        for r in rel_recids:
            recids |= intbitset(get_cited_by(r))

    # To not process recids twice
    recids -= recids_processed
    # Adds to the set of processed recids
    recids_processed += recids

    return recids

def get_normalized_ranking_scores(response, hitset_filter=None, recids=[]):
    """
    Returns the result having normalized ranking scores, interval [0, 100].

    hitset_filter - optional filter for the results
    recids - optional recids that shall remain in the result despite the filter
    """
    if not len(response.results):
        return ([], intbitset())

    # response.maxScore is unreliable once anything has been added to the
    # response, so take the score of the first (top-ranked) hit instead
    max_score = float(response.results[0]['score'])

    ranked_result = []
    matched_recs = intbitset()
    for hit in response.results:
        recid = int(hit['id'])
        if (not hitset_filter and hitset_filter != []) \
                or recid in hitset_filter or recid in recids:
            normalised_score = 0
            if max_score > 0:
                normalised_score = int(100.0 / max_score * float(hit['score']))
            ranked_result.append((recid, normalised_score))
            matched_recs.add(recid)

    ranked_result.reverse()
    return (ranked_result, matched_recs)

def search_unit(query, f, m, wl=None):
    """Search for similar records."""
    from invenio.legacy.search_engine import record_exists
    from invenio.legacy.bibrank.record_sorter import METHODS
    from invenio.legacy.bibrank.word_searcher import find_similar

    results = intbitset([])
    if query:
        if isinstance(query, intbitset):
            ahitset = query
        else:
            recid = int(query)
            ahitset = [recid] if record_exists(recid) == 1 else []
        if len(ahitset):
            for recid in ahitset:
                results |= intbitset(
                    find_similar('jif', recid, intbitset([]),
                                 rank_limit_relevance=0,
                                 verbose=0,
                                 methods=METHODS)[0])
    return results

def test_compare_sets_tids_sets_match_with_more_than_min_and_low_len(self):
    thresholds = Thresholds(high_len=3, low_len=1, length=4,
                            min_high=2, small=False, min_len=2)
    qlow, qhigh = intbitset(), intbitset([3, 4, 6])
    ilow, ihigh = intbitset([1]), intbitset([3, 4, 6])
    candidate = match_set.compare_sets(qhigh, qlow, ihigh, ilow, thresholds,
                                       match_set.tids_sets_intersector,
                                       match_set.tids_set_counter)
    assert candidate

def browse_pattern_phrases(req, colls, p, f, rg, ln=CFG_SITE_LANG):
    """Returns either bibliographic phrases or words indexes."""
    ## is p enclosed in quotes? (coming from exact search)
    if p.startswith('"') and p.endswith('"'):
        p = p[1:-1]

    ## okay, "real browse" follows:
    ## FIXME: the maths in the get_nearest_terms_in_bibxxx is just a test
    if not f and p.find(":") > 0:  # does 'p' contain ':'?
        f, p = p.split(":", 1)

    coll_hitset = intbitset()
    for coll_name in colls:
        coll_hitset |= get_collection_reclist(coll_name)

    index_id = get_index_id_from_field(f)
    if index_id != 0:
        browsed_phrases_in_colls = get_nearest_terms_in_idxphrase_with_collection(
            p, index_id, rg / 2, rg / 2, coll_hitset)
    else:
        browsed_phrases = get_nearest_terms_in_bibxxx(p, f, (rg + 1) / 2 + 1,
                                                      (rg - 1) / 2 + 1)
        while not browsed_phrases:
            # try again and again with shorter and shorter pattern:
            try:
                p = p[:-1]
                browsed_phrases = get_nearest_terms_in_bibxxx(
                    p, f, (rg + 1) / 2 + 1, (rg - 1) / 2 + 1)
            except:
                register_exception(req=req, alert_admin=True)
                # probably there are no hits at all:
                return []

        ## try to check hits in these particular collections:
        browsed_phrases_in_colls = []
        if 0:  # dead code kept from the original: per-collection hit check
            for phrase in browsed_phrases:
                phrase_hitset = intbitset()
                phrase_hitsets = search_pattern("", phrase, f, 'e')
                for coll in colls:
                    phrase_hitset.union_update(phrase_hitsets[coll])
                if len(phrase_hitset) > 0:
                    # okay, this phrase has some hits in colls, so add it:
                    browsed_phrases_in_colls.append(
                        [phrase, len(phrase_hitset)])

        ## were there hits in collections?
        if browsed_phrases_in_colls == []:
            if browsed_phrases != []:
                #write_warning(req, """<p>No match close to <em>%s</em> found in given collections.
                #Please try different term.<p>Displaying matches in any collection...""" % p_orig)
                ## try to get nbhits for these phrases in any collection:
                for phrase in browsed_phrases:
                    nbhits = get_nbhits_in_bibxxx(phrase, f, coll_hitset)
                    if nbhits > 0:
                        browsed_phrases_in_colls.append([phrase, nbhits])

    return browsed_phrases_in_colls

def get_recids_for_set_spec(set_spec):
    """
    Returns the list (as intbitset) of recids belonging to 'set'

    Parameters:

      set_spec - *str* the set_spec for which we would like to get the recids
    """
    recids = intbitset()
    for set_def in get_set_definitions(set_spec):
        new_recids = perform_request_search(
            c=[coll.strip() for coll in set_def['c'].split(',')],
            p1=set_def['p1'], f1=set_def['f1'], m1=set_def['m1'], op1=set_def['op1'],
            p2=set_def['p2'], f2=set_def['f2'], m2=set_def['m2'], op2=set_def['op2'],
            p3=set_def['p3'], f3=set_def['f3'], m3=set_def['m3'],
            ap=0)
        recids |= intbitset(new_recids)
    return recids

def test_get_ranked_larger_hitset(self):
    """solrutils - ranking larger hitset"""
    hitset = intbitset.intbitset([47, 56, 58, 68, 85, 89])
    self.assertEqual(tuple(),
                     self._get_ranked_result_sequence(query='Willnotfind',
                                                      hitset=hitset))

    hitset = intbitset.intbitset([47, 56, 55, 56, 58, 68, 85, 89])
    self.assertEqual((55, 56),
                     self._get_ranked_result_sequence(query='"higgs boson"',
                                                      hitset=hitset))

def get_data_for_definition_marc(tags, recids):
    '''Having a list of tags and a list of recids, it returns a dictionary
    with the values corresponding to the tags'''
    #x = all_recids; [get_fieldvalues(recid, '037__a') for recid in x]
    #user: 140s, sys: 21s, total: 160s - cdsdev
    if isinstance(recids, (int, long)):
        recids = intbitset([recids, ])
    # for each recid we need only one value on which we sort,
    # so we can stop looking for a value as soon as we find one
    tag_index = 0
    field_data_dict = {}
    while len(recids) > 0 and tag_index < len(tags):
        write_message('%s records queried for values for tags %s.'
                      % (len(recids), tags), verbose=5)
        res = _get_values_from_marc_tag(tags[tag_index], recids)
        res_dict = dict(res)
        #field_data_dict.update(res_dict)
        #we can not use this, because res_dict might contain recids
        #that are already in field_data_dict, and we should not overwrite their value
        field_data_dict = dict(res_dict, **field_data_dict)
        #there might be keys that we do not want (ex: using 'between')
        #so we should remove them
        res_dict_keys = intbitset(res_dict.keys())
        recids_not_needed = res_dict_keys.difference(recids)
        for recid in recids_not_needed:
            del field_data_dict[recid]
        #update the recids to contain only the recids that do not have values yet
        recids.difference_update(res_dict_keys)
        tag_index += 1
    return field_data_dict

def test_calc_mean_onbit_density(self):
    bitsets = {"a": intbitset([1, 2, 3]),
               "b": intbitset([1, 2, 4, 5, 8]),
               "c": intbitset([1, 2, 4, 8])}

    result = modifiedtanimoto.calc_mean_onbit_density(bitsets.values(),
                                                      self.number_of_bits)

    expected = 0.04
    eq_(result, expected)

def remove_member_from_node(G, node, member):
    # add in replacement edges if required
    mem_edges = list(
        set([e[1] for e in G.edges(node) if member in G.edges[e]['members']]))
    if len(mem_edges) > 1:
        for n1, n2 in itertools.combinations(mem_edges, 2):
            if G.has_edge(n1, n2):
                G[n1][n2]['members'] |= intbitset([member])
                G[n1][n2]['size'] = len(G[n1][n2]['members'])
            else:
                G.add_edge(n1, n2, size=1, members=intbitset([member]))

    # remove member from node
    G.nodes[node]['members'].discard(member)
    G.nodes[node]['seqIDs'] = set([
        sid for sid in G.nodes[node]['seqIDs']
        if sid.split("_")[0] != str(member)
    ])
    G.nodes[node]['size'] -= 1

    # remove member from edges of node
    edges_to_remove = []
    for e in G.edges(node):
        if member in G.edges[e]['members']:
            if len(G.edges[e]['members']) == 1:
                edges_to_remove.append(e)
            else:
                G.edges[e]['members'].discard(member)
                G.edges[e]['size'] = len(G.edges[e]['members'])
    for e in edges_to_remove:
        G.remove_edge(*e)

    return G

def index_token_sets(token_ids, len_junk, len_good):
    """
    Return a 4-tuple of low & high tids sets, low & high tids multisets
    given a token_ids sequence.
    """
    # For multisets, we use a defaultdict rather than a Counter. This is
    # mildly faster than a Counter for sparse sets.
    # This variant uses intbitset to evaluate its performance wrt bitarray.
    low_tids_set = intbitset(len_junk)
    low_tids_set_add = low_tids_set.add
    high_tids_set = intbitset(len_good)
    high_tids_set_add = high_tids_set.add
    low_tids_mset = defaultdict(int)
    high_tids_mset = defaultdict(int)
    for tid in token_ids:
        # this skips unknown token ids that are -1 as well as possible None
        if tid < 0:
            continue
        if tid < len_junk:
            low_tids_mset[tid] += 1
            low_tids_set_add(tid)
        else:
            high_tids_mset[tid] += 1
            high_tids_set_add(tid)
    # sparsify for speed
    sparsify(low_tids_mset)
    sparsify(high_tids_mset)
    return low_tids_set, high_tids_set, low_tids_mset, high_tids_mset

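# How the junk/good split above behaves on a hypothetical token stream
# (assumes the module's sparsify helper is in scope, as in the original):
low_set, high_set, low_mset, high_mset = index_token_sets(
    token_ids=[2, 7, 7, -1, 9],  # -1 marks an unknown token and is skipped
    len_junk=5, len_good=20)
assert low_set == intbitset([2])      # tid 2 < len_junk: the "junk"/low side
assert high_set == intbitset([7, 9])  # tids >= len_junk: the "good"/high side
assert high_mset[7] == 2              # the multiset keeps counts, the set does not
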
def get_records_with_num_cites(numstr, allrecs=intbitset([])):
    """Return an intbitset of record IDs that are cited X times,
       X defined in numstr.
       Warning: numstr is a string and may not be numeric! It can
       be 10, 0->100 etc.
    """
    cache_cited_by_dictionary = get_citation_dict("citationdict")
    cache_cited_by_dictionary_keys = get_citation_dict("citationdict_keys")
    cache_cited_by_dictionary_keys_intbitset = get_citation_dict("citationdict_keys_intbitset")
    matches = intbitset([])
    #once again, check that the parameter is a string
    if not isinstance(numstr, str):
        return intbitset([])
    numstr = numstr.replace(" ", '')
    numstr = numstr.replace('"', '')

    num = 0
    #first, check if numstr is just a number
    singlenum = re.findall(r"(^\d+$)", numstr)
    if singlenum:
        num = int(singlenum[0])
        if num == 0:
            #we return recids that are not in keys
            return allrecs - cache_cited_by_dictionary_keys_intbitset
        for k in cache_cited_by_dictionary_keys:
            li = cache_cited_by_dictionary[k]
            if len(li) == num:
                matches.add(k)
        return matches

    #try to get 1->10 or such
    firstsec = re.findall(r"(\d+)->(\d+)", numstr)
    if firstsec:
        first = 0
        sec = -1
        try:
            first = int(firstsec[0][0])
            sec = int(firstsec[0][1])
        except:
            return intbitset([])
        if first == 0:
            #start with those that have no cites..
            matches = allrecs - cache_cited_by_dictionary_keys_intbitset
        if first <= sec:
            for k in cache_cited_by_dictionary_keys:
                li = cache_cited_by_dictionary[k]
                if len(li) >= first:
                    if len(li) <= sec:
                        matches.add(k)
            return matches

    firstsec = re.findall(r"(\d+)\+", numstr)
    if firstsec:
        first = firstsec[0]
        for k in cache_cited_by_dictionary_keys:
            li = cache_cited_by_dictionary[k]
            if len(li) > int(first):
                matches.add(k)
    return matches

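# The numstr shapes the function above accepts, illustrated; all_records is
# a hypothetical intbitset of every record id:
get_records_with_num_cites("10", all_records)     # exactly 10 citations
get_records_with_num_cites("1->10", all_records)  # between 1 and 10, inclusive
get_records_with_num_cites("100+", all_records)   # strictly more than 100
get_records_with_num_cites("0", all_records)      # never cited at all
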
def tmpl_papers_box(self, req, pubs, bibauthorid_data, num_downloads,
                    ln, add_box=True, loading=False):
    _ = gettext_set_language(ln)
    if not loading and pubs:
        ib_pubs = intbitset(pubs)
        if bibauthorid_data["cid"]:
            baid_query = 'exactauthor:%s' % wrap_author_name_in_quotes_if_needed(
                bibauthorid_data["cid"])
        elif bibauthorid_data["pid"] > -1:
            baid_query = 'exactauthor:%s' % wrap_author_name_in_quotes_if_needed(
                bibauthorid_data["pid"])
        baid_query = baid_query + " "

        rec_query = baid_query
        searchstr = create_html_link(
            websearch_templates.build_search_url(p=rec_query), {},
            "<strong>" + "All papers (" + str(len(pubs)) + ")" + "</strong>",)
        line2 = searchstr

        if CFG_BIBRANK_SHOW_DOWNLOAD_STATS and num_downloads:
            line2 += " (" + _("downloaded") + " "
            line2 += str(num_downloads) + " " + _("times") + ")"

        if CFG_INSPIRE_SITE:
            CFG_COLLS = ['Book', 'ConferencePaper', 'Introductory', 'Lectures',
                         'Preprint', 'Published', 'Review', 'Thesis']
        else:
            CFG_COLLS = ['Article', 'Book', 'Preprint', ]

        collsd = {}
        for coll in CFG_COLLS:
            coll_papers = list(ib_pubs
                               & intbitset(perform_request_search(
                                   rg=0, f="collection", p=coll)))
            if coll_papers:
                collsd[coll] = coll_papers
        colls = collsd.keys()
        colls.sort(lambda x, y: cmp(len(collsd[y]), len(collsd[x])))  # sort by number of papers
        for coll in colls:
            rec_query = baid_query + 'collection:' + \
                wrap_author_name_in_quotes_if_needed(coll)
            line2 += "<br />" + create_html_link(
                websearch_templates.build_search_url(p=rec_query), {},
                coll + " (" + str(len(collsd[coll])) + ")",)
    elif not pubs and not loading:
        line2 = _("No Papers")
    elif loading:
        line2 = self.loading_html()
    else:
        line2 = 'This is a bug and should be corrected'

    if not add_box:
        return line2
    line1 = "<strong>" + _("Papers") + "</strong>"
    papers_box = self.tmpl_print_searchresultbox("papers", line1, line2)
    return papers_box

def get_records_that_can_be_displayed(permitted_restricted_collections,
                                      hitset_in_any_collection,
                                      current_coll=None, colls=None):
    """Return records that can be displayed."""
    current_coll = current_coll or cfg['CFG_SITE_NAME']
    records_that_can_be_displayed = intbitset()

    if colls is None:
        colls = [current_coll]

    policy = cfg['CFG_WEBSEARCH_VIEWRESTRCOLL_POLICY'].strip().upper()

    # real & virtual
    current_coll_children = get_collection_allchildren(current_coll)

    # Add all restricted collections that the user has access to and that are
    # under the current collection, in a specific order: children of 'cc'
    # (real, virtual, restricted), then the rest of 'c' that are not cc's
    # children.
    colls_to_be_displayed = set([coll for coll in current_coll_children
                                 if coll in colls
                                 or coll in permitted_restricted_collections])
    colls_to_be_displayed |= set([coll for coll in colls
                                  if coll not in colls_to_be_displayed])

    # Get all records in applicable collections
    records_that_can_be_displayed = intbitset()
    for coll in colls_to_be_displayed:
        records_that_can_be_displayed |= get_collection_reclist(coll)

    if policy == 'ANY':
        # User needs to have access to at least one collection that restricts
        # the records. We need this to be able to remove records that are both
        # in a public and in a restricted collection.
        permitted_recids = intbitset()
        notpermitted_recids = intbitset()
        for collection in restricted_collection_cache.cache:
            if collection in permitted_restricted_collections:
                permitted_recids |= get_collection_reclist(collection)
            else:
                notpermitted_recids |= get_collection_reclist(collection)
        notpermitted_recids -= permitted_recids
    else:
        # User needs to have access to all collections that restrict a record.
        notpermitted_recids = intbitset()
        for collection in restricted_collection_cache.cache:
            if collection not in permitted_restricted_collections:
                notpermitted_recids |= get_collection_reclist(collection)

    # Remove records that can not be seen by the user
    records_that_can_be_displayed -= notpermitted_recids

    # Intersect only if there are some matched records
    if not hitset_in_any_collection.is_infinite():
        records_that_can_be_displayed &= hitset_in_any_collection

    return records_that_can_be_displayed

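# A worked miniature of the ANY-policy subtraction above, with hypothetical
# collections: a record sitting in both a permitted and a non-permitted
# restricted collection stays visible, because permitted recids are removed
# from the not-permitted set before the final subtraction.
from intbitset import intbitset

permitted_recids = intbitset([1, 2, 3])     # restricted, user has access
notpermitted_recids = intbitset([3, 4])     # restricted, no access
notpermitted_recids -= permitted_recids     # record 3 is "rescued"

displayable = intbitset([1, 2, 3, 4, 5])
displayable -= notpermitted_recids
assert displayable == intbitset([1, 2, 3, 5])   # only record 4 is hidden
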
def load_graphs(graph_files, n_cpu=1):
    for graph_file in graph_files:
        if not os.path.isfile(graph_file):
            print("Missing:", graph_file)
            raise RuntimeError("Missing graph file!")

    graphs = [nx.read_gml(graph_file) for graph_file in tqdm(graph_files)]

    isolate_names = list(
        itertools.chain.from_iterable(
            [G.graph['isolateNames'] for G in graphs]))

    member_count = 0
    node_count = 0
    id_mapping = []
    for i, G in enumerate(graphs):
        id_mapping.append({})

        # relabel nodes to be consecutive integers from 1
        mapping = {}
        for n in G.nodes():
            mapping[n] = node_count
            node_count += 1
        G = nx.relabel_nodes(G, mapping, copy=True)

        # set up edge members and remove conflicts.
        for e in G.edges():
            G[e[0]][e[1]]['members'] = intbitset([
                m + member_count for m in conv_list(G[e[0]][e[1]]['members'])
            ])

        # set up node parameters and remove conflicts.
        max_mem = -1
        for n in G.nodes():
            ncentroids = []
            for sid in G.nodes[n]['centroid'].split(";"):
                nid = update_sid(sid, member_count)
                id_mapping[i][sid] = nid
                if "refound" not in nid:
                    ncentroids.append(nid)
            G.nodes[n]['centroid'] = ncentroids

            new_ids = set()
            for sid in conv_list(G.nodes[n]['seqIDs']):
                nid = update_sid(sid, member_count)
                id_mapping[i][sid] = nid
                new_ids.add(nid)
            G.nodes[n]['seqIDs'] = new_ids

            G.nodes[n]['protein'] = del_dups(
                G.nodes[n]['protein'].replace('*', 'J').split(";"))
            G.nodes[n]['dna'] = del_dups(G.nodes[n]['dna'].split(";"))
            G.nodes[n]['lengths'] = conv_list(G.nodes[n]['lengths'])
            G.nodes[n]['longCentroidID'][1] = update_sid(
                G.nodes[n]['longCentroidID'][1], member_count)
            G.nodes[n]['members'] = intbitset(
                [m + member_count for m in conv_list(G.nodes[n]['members'])])
            max_mem = max(max_mem, max(G.nodes[n]['members']))

        member_count = max_mem + 1
        graphs[i] = G

    return graphs, isolate_names, id_mapping

def test_similarity(self):
    bitset1 = intbitset([1, 2, 3])
    bitset2 = intbitset([1, 2, 4, 8])

    result = modifiedtanimoto.similarity(bitset1, bitset2,
                                         self.number_of_bits,
                                         self.corr_st, self.corr_sto)

    expected = 0.5779523809525572
    assert_almost_equal(result, expected)

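# For orientation: the plain Tanimoto coefficient these tests build on is
# pure set algebra over the bitsets; modifiedtanimoto additionally corrects
# the score using the fingerprint length and the corr_st/corr_sto terms,
# hence the different expected value above. An uncorrected sketch:
from intbitset import intbitset

def plain_tanimoto(a, b):
    """Uncorrected Tanimoto: |a & b| / |a | b|."""
    union = len(a | b)
    return len(a & b) / float(union) if union else 0.0

assert plain_tanimoto(intbitset([1, 2, 3]), intbitset([1, 2, 4, 8])) == 0.4
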
def test_get_ranked_smaller_hitset(self):
    """solrutils - ranking smaller hitset"""
    hitset = intbitset.intbitset([47, 56, 58, 68, 85, 89])
    self.assertEqual((47, 56, 58, 68, 89, 85),
                     self._get_ranked_result_sequence(query='higgs',
                                                      hitset=hitset))

    hitset = intbitset.intbitset([45, 50, 61, 74, 94])
    self.assertEqual((50, 61, 74, 45, 94),
                     self._get_ranked_result_sequence(query='of',
                                                      hitset=hitset))
    self.assertEqual((74, 45, 94),
                     self._get_ranked_result_sequence(query='of',
                                                      hitset=hitset, rows=3))

def get_nbhits_in_idxphrases(word, f):
    """Return number of hits for 'word' inside phrase index for field 'f'."""
    model = IdxINDEX.idxPHRASEF(f or "anyfield")
    if model is None:
        return 0
    hitlist = intbitset()
    for item in model.query.filter_by(term=word).values('hitlist'):
        hitlist |= intbitset(item[0])
    return len(hitlist)

def get_records_for_user(qid, uid):
    key = get_search_results_cache_key_from_qid(qid)
    data = search_results_cache.get(key)
    if data is None:
        return intbitset([])
    cc = search_results_cache.get(key + '::cc')
    return get_records_that_can_be_displayed(
        current_user.get('precached_permitted_restricted_collections', []),
        intbitset().fastload(data), cc)

def __init__(self, extent, intent, ctx, name):
    super(Context, self).__init__()
    self.intent = intent
    self.extent = extent
    self.name = name
    ## Prepare the context
    self.__ctx = ctx
    self.__extent = bs.intbitset(range(len(self.extent)))
    self.__intent = bs.intbitset(range(len(self.intent)))

def single_tag_rank(config):
    """Connect the given tag with the data from the kb file given"""
    write_message("Loading knowledgebase file", verbose=9)
    kb_data = {}
    records = []
    write_message("Reading knowledgebase file: %s"
                  % config.get(config.get("rank_method", "function"), "kb_src"))
    kb_src = config.get(config.get("rank_method", "function"), "kb_src").strip()
    # Find path from configuration registry by knowledge base name.
    kb_src_clean = configuration.get(kb_src)

    with open(kb_src_clean, 'r') as kb_file:
        data = kb_file.readlines()

    for line in data:
        if not line[0:1] == "#":
            kb_data[string.strip((string.split(string.strip(line), "---"))[0])] = \
                (string.split(string.strip(line), "---"))[1]
    write_message("Number of lines read from knowledgebase file: %s" % len(kb_data))

    tag = config.get(config.get("rank_method", "function"), "tag")
    tags = config.get(config.get("rank_method", "function"),
                      "check_mandatory_tags").split(", ")
    if tags == ['']:
        tags = ""

    records = []
    for (recids, recide) in options["recid_range"]:
        task_sleep_now_if_required(can_stop_too=True)
        write_message("......Processing records #%s-%s" % (recids, recide))
        recs = run_sql("SELECT id_bibrec, value FROM bib%sx, bibrec_bib%sx "
                       "WHERE tag=%%s AND id_bibxxx=id and id_bibrec >=%%s "
                       "and id_bibrec<=%%s" % (tag[0:2], tag[0:2]),
                       (tag, recids, recide))
        valid = intbitset(trailing_bits=1)
        valid.discard(0)
        for key in tags:
            newset = intbitset()
            newset += [recid[0] for recid in run_sql(
                "SELECT id_bibrec FROM bib%sx, bibrec_bib%sx WHERE "
                "id_bibxxx=id AND tag=%%s AND id_bibxxx=id and id_bibrec "
                ">=%%s and id_bibrec<=%%s" % (tag[0:2], tag[0:2]),
                (key, recids, recide))]
            valid.intersection_update(newset)
        if tags:
            recs = filter(lambda x: x[0] in valid, recs)
        records = records + list(recs)
        write_message("Number of records found with the necessary tags: %s"
                      % len(records))

    records = filter(lambda x: x[0] in options["validset"], records)
    rnkset = {}
    for key, value in records:
        if value in kb_data:
            if key not in rnkset:
                rnkset[key] = float(kb_data[value])
            else:
                if rnkset[key] in kb_data and \
                        float(kb_data[value]) > float((rnkset[key])[1]):
                    rnkset[key] = float(kb_data[value])
        else:
            rnkset[key] = 0
    write_message("Number of records available in rank method: %s" % len(rnkset))
    return rnkset

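# intbitset(trailing_bits=1) above is the idiom for "the universe": an
# infinite set that intersection_update() can only narrow, so the first
# mandatory-tag query needs no special case. A minimal sketch:
from intbitset import intbitset

valid = intbitset(trailing_bits=1)   # every non-negative integer
valid.discard(0)                     # record id 0 is never used
valid.intersection_update(intbitset([1, 2, 3, 4]))
valid.intersection_update(intbitset([2, 4, 9]))
assert valid == intbitset([2, 4])
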
def citation(rank_method_code, related_to, hitset, rank_limit_relevance, verbose):
    """Sort records by number of citations"""
    if related_to:
        from invenio.legacy.search_engine import search_pattern
        hits = intbitset()
        for pattern in related_to:
            hits |= hitset & intbitset(search_pattern(p='refersto:%s' % pattern))
    else:
        hits = hitset
    return rank_by_citations(hits, verbose)

def get_records_for_user(qid, uid):
    from invenio.legacy.search_engine import \
        get_records_that_can_be_displayed
    key = get_search_results_cache_key_from_qid(qid)
    data = search_results_cache.get(key)
    if data is None:
        return intbitset([])
    cc = search_results_cache.get(key + '::cc')
    return get_records_that_can_be_displayed(current_user,
                                             intbitset().fastload(data), cc)

def _add_user_hard(self, username, password):
    # type: (unicode, unicode) -> User
    points = 0
    empty_buf = buffer(intbitset().fastdump())
    self.db.cursor.execute(
        'INSERT INTO users VALUES (NULL, ?, ?, ?, ?, ?)',
        [username, hash_password(password), points, empty_buf, empty_buf])
    user_id = self.db.cursor.lastrowid
    self.commit()
    return User(user_id, username, points, intbitset(), intbitset())

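# fastdump()/fastload() are intbitset's compact binary serialization, which
# is what lets the empty hitsets above be stored as database blobs. A
# round-trip sketch:
from intbitset import intbitset

original = intbitset([1, 5, 1000000])
blob = original.fastdump()           # compact binary representation

restored = intbitset()
restored.fastload(blob)              # fastload returns the set itself
assert restored == original
assert intbitset(blob) == original   # the constructor accepts a dump too
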
def test_record_sorter(self):
    """bibrank record sorter - sorting records"""
    from invenio.legacy.bibrank import word_searcher as bibrank_word_searcher
    from intbitset import intbitset
    hitset = intbitset()
    hitset += (1, 2, 5)
    hitset2 = intbitset()
    hitset2.add(5)
    rec_termcount = {1: 1, 2: 1, 5: 1}
    (res1, res2) = bibrank_word_searcher.sort_record_relevance(
        {1: 50, 2: 30, 3: 70, 4: 10}, rec_termcount, hitset, 50, 0)
    self.assertEqual(([(1, 71), (3, 100)], list(hitset2)),
                     (res1, list(res2)))

def test_similarities_ignore_upper_triangle(self):
    bitsets = {"a": intbitset([1, 2, 3]),
               "b": intbitset([1, 2, 4, 5, 8]),
               "c": intbitset([1, 2, 4, 8])}

    iterator = modifiedtanimoto.similarities(bitsets, bitsets,
                                             self.number_of_bits,
                                             self.corr_st, self.corr_sto,
                                             0.55, True)
    result = [r for r in iterator]

    expected = [("a", "c", 0.5779523809525572),
                ("b", "c", 0.8357708333333689)]
    # pair a-b is below the cutoff with a similarity of 0.53
    assert_similarities(result, expected)

def modified_requested_recids(self):
    """Record IDs of records that match the filters of this task.

    This property takes (0) `requested_ids`, (1) `filter_pattern` and, if
    `force_run_on_unmodified_records` is enabled, (2)
    `CheckerRecord.last_run_version_id` into consideration to figure out
    which recids a record-centric task should run on.

    :rtype: intbitset
    """
    # Get all records that are already associated to this rule
    # If this is returning an empty set, you forgot to run bibindex
    try:
        associated_records = intbitset(zip(
            *db.session
            .query(CheckerRecord.rec_id)
            .filter(
                CheckerRecord.rule_name == self.name
            ).all()
        )[0])
    except IndexError:
        associated_records = intbitset()

    # Store requested records that were until now unknown to this rule
    requested_ids = self.requested_recids
    for requested_id in requested_ids - associated_records:
        new_record = CheckerRecord(rec_id=requested_id, rule_name=self.name)
        db.session.add(new_record)
    db.session.commit()

    # Figure out which records have been edited since the last time we ran
    # this rule
    try:
        recids = zip(
            *db.session
            .query(CheckerRecord.rec_id)
            .outerjoin(RecordMetadata)
            .filter(
                CheckerRecord.rec_id.in_(requested_ids),
                CheckerRecord.rule_name == self.name,
                db.or_(
                    self.force_run_on_unmodified_records,
                    db.or_(
                        CheckerRecord.last_run_version_id == 1,
                        CheckerRecord.last_run_version_id <
                        RecordMetadata.version_id,
                    ),
                )
            )
        )[0]
    except IndexError:
        recids = set()
    return intbitset(recids)

def find(self, column, value):
    if self.inverse.has_column(column):
        if hasattr(value, '__call__'):
            result = intbitset()
            for k in self.inverse.keys(column):
                if value(k):
                    result.union_update(self.inverse.get(column, k))
            return result
        else:
            if self.inverse.has_key(column, value):
                return self.inverse.get(column, value)
    return intbitset()

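# The value argument above may be an exact key or a predicate; when it is
# callable, the hitsets of all satisfying keys are unioned. Hypothetical
# usage against some index instance:
rows = index.find('lang', 'python')                      # exact key lookup
rows = index.find('path', lambda k: k.endswith('.py'))   # predicate lookup
assert index.find('no_such_column', 'x') == intbitset()  # empty, never None
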
def __init__(self, location=None, query_string=None, idx=None,
             line_threshold=4, _test_mode=False, tokenizer=query_tokenizer):
    """
    Initialize the query from a file `location` or `query_string` string
    for an `idx` LicenseIndex.

    Break the query in runs when there are at least `line_threshold`
    empty lines or junk-only lines.
    """
    assert (location or query_string) and idx
    self.location = location
    self.query_string = query_string
    self.idx = idx
    self.line_threshold = line_threshold

    # token ids array
    self.tokens = []

    # index of position -> line number where the pos is the list index
    self.line_by_pos = []

    # index of known position -> number of unknown tokens after that pos
    # for unknowns at the start, the pos is -1
    self.unknowns_by_pos = defaultdict(int)

    # Span of known positions followed by unknown token(s)
    self.unknowns_span = None

    # set of query positions where there is a short, single-letter token
    # or a digits-only token
    # TODO: consider using an intbitset
    self.shorts_and_digits_pos = set()

    self.query_runs = []
    if _test_mode:
        return

    self.tokenize_and_build_runs(self.tokens_by_line(tokenizer=tokenizer),
                                 line_threshold=line_threshold)

    # sets of integers initialized after query tokenization
    len_junk = idx.len_junk
    self.high_matchables = intbitset(
        [p for p, t in enumerate(self.tokens) if t >= len_junk])
    self.low_matchables = intbitset(
        [p for p, t in enumerate(self.tokens) if t < len_junk])

def get_all_recids(including_deleted=True):  # 6.68s on cdsdev
    """Returns a list of all records available in the system"""
    res = run_sql("SELECT id FROM bibrec")
    if not res:
        return intbitset([])
    all_recs = intbitset(res)
    if not including_deleted:  # we want to exclude deleted records
        if CFG_CERN_SITE:
            deleted = search_pattern(p='980__:"DELETED" OR 980__:"DUMMY"')
        else:
            deleted = search_pattern(p='980__:"DELETED"')
        all_recs.difference_update(deleted)
    return all_recs

def without_fmt(queries, chunk_size=2000):
    """
    List of record IDs to be reformatted, not having the specified format yet

    @param queries: a dictionary with sql queries to pick from
    @return: a list of record IDs without pre-created format cache
    """
    sql = queries['missing']
    recids = intbitset()
    max_id = run_sql("SELECT max(id) FROM bibrec")[0][0]
    for start in xrange(1, max_id + 1, chunk_size):
        end = start + chunk_size
        recids += intbitset(run_sql(sql, (start, end)))
    return recids

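# The same id-range chunking pattern in isolation: each query stays bounded
# while the intbitset accumulates cheaply. fetch_ids is a hypothetical
# callable standing in for run_sql:
from intbitset import intbitset

def scan_in_chunks(fetch_ids, max_id, chunk_size=2000):
    recids = intbitset()
    for start in range(1, max_id + 1, chunk_size):
        recids |= intbitset(fetch_ids(start, start + chunk_size))
    return recids
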
def test_compare_sets_tids_sets_match_with_less_than_ilow_len(self):
    thresholds = Thresholds(high_len=3, low_len=1, length=3,
                            min_high=2, small=False, min_len=2)
    qlow, qhigh = intbitset(), intbitset([3, 4, 6])
    ilow, ihigh = intbitset([1]), intbitset([3, 4, 6])
    candidate = match_set.compare_sets(qhigh, qlow, ihigh, ilow, thresholds,
                                       match_set.tids_sets_intersector,
                                       match_set.tids_set_counter)
    assert candidate

def merge(self, index):
    for column in index.store.columns():
        for key in index.store.keys(column):
            self.store.set(column, key, index.store.get(column, key))
    for column in index.inverse.columns():
        for key in index.inverse.keys(column):
            if not self.inverse.has_key(column, key):
                self.inverse.set(column, key, index.inverse.get(column, key))
            else:
                me = intbitset()
                me.fastload(self.inverse.get(column, key))
                other = intbitset()
                other.fastload(index.inverse.get(column, key))
                me.union_update(other)
                self.inverse.set(column, key, me)

def get_collection_reclist(coll, recreate_cache_if_needed=True):
    """Return hitset of recIDs that belong to the collection 'coll'."""
    from invenio.modules.search.searchext.engines.native import \
        search_unit_in_idxphrases
    if recreate_cache_if_needed:
        collection_reclist_cache.recreate_cache_if_needed()
    if coll not in collection_reclist_cache.cache:
        return intbitset()
    if not collection_reclist_cache.cache[coll]:
        c_coll = Collection.query.filter_by(name=coll).first()
        if c_coll:
            collection_reclist_cache.cache[coll] = search_unit_in_idxphrases(
                c_coll.name, 'collection', 'e')
    return collection_reclist_cache.cache[coll] or intbitset()

def fill():
    alldicts = {}
    from invenio.legacy.bibrank.tag_based_indexer import fromDB
    serialized_weights = cache.get('citations_weights')
    if serialized_weights:
        weights = deserialize_via_marshal(serialized_weights)
    else:
        weights = fromDB('citation')

    alldicts['citations_weights'] = weights
    # for cited:M->N queries, it is interesting to cache also
    # some preprocessed citationdict:
    alldicts['citations_keys'] = intbitset(weights.keys())

    # Citation counts
    alldicts['citations_counts'] = [t for t in iteritems(weights)]
    alldicts['citations_counts'].sort(key=itemgetter(1), reverse=True)

    # Self-cites
    serialized_weights = cache.get('selfcites_weights')
    if serialized_weights:
        selfcites = deserialize_via_marshal(serialized_weights)
    else:
        selfcites = fromDB('selfcites')
    selfcites_weights = {}
    for recid, counts in alldicts['citations_counts']:
        selfcites_weights[recid] = counts - selfcites.get(recid, 0)
    alldicts['selfcites_weights'] = selfcites_weights
    alldicts['selfcites_counts'] = [
        (recid, selfcites_weights.get(recid, cites))
        for recid, cites in alldicts['citations_counts']]
    alldicts['selfcites_counts'].sort(key=itemgetter(1), reverse=True)

    return alldicts

def search_unit(query, f, m, wl=None):
    """Search in fulltext."""
    from invenio.legacy.search_engine import (search_unit_in_bibwords,
                                              search_pattern)
    from invenio.legacy.miscutil.solrutils_bibindex_searcher import (
        solr_get_bitset)
    from invenio.legacy.miscutil.xapianutils_bibindex_searcher import (
        xapian_get_bitset)
    from ...utils import get_idx_indexer

    def fix(p):
        if m and (m == 'a' or m == 'r'):
            # phrase/regexp query
            if p.startswith('%') and p.endswith('%'):
                p = p[1:-1]  # fix for partial phrase
            p = '"' + p + '"'
        return p

    indexers = {
        'SOLR': solr_get_bitset,
        'XAPIAN': xapian_get_bitset,
    }
    indexer = get_idx_indexer('fulltext')
    if indexer in indexers and \
            current_app.config.get('CFG_{}_ENABLED'.format(indexer), False):
        try:
            return indexers[indexer](fix(query), f, m)
        except:
            current_app.logger.exception("Fulltext search is broken.")
            return intbitset()
    elif m == 'a' or m == 'r':
        # FIXME: workaround for not having phrase index yet
        return search_pattern(p=query, f=f, m='w')
        # FIXME raise ContinueSearch(query, f, m, wl)
    return search_unit_in_bibwords(query, f, wl=wl)

def all_recids(self):
    """Get all recids that are assumed to exist by tasks of this master."""
    identifier = self.fmt(master_all_recids)
    recids_set = self.conn.get(identifier)
    if recids_set is None:
        return None
    return intbitset(recids_set)
