def commit(self):
    try:
        conn = make_connection()
        conn.commit(wait_searcher=False)
    except Exception, e:
        log.exception(e)
        raise SearchIndexError(e)
def main():
    single_action = (len(sys.argv) == 2) and (sys.argv[1] == '--single-action')
    conn = make_connection()
    try:
        with conn.cursor() as cur:
            parser = PolyParser(single_action, conn, cur)
            try:
                while not parser.is_done():
                    act_inc(cur)
                    parser.parse_all()
                    global_live = act_dec(cur)
                    if single_action:
                        break
                    else:
                        future_live = parser.cond_notify()
                        if global_live or future_live:
                            parser.wait()
                        else:
                            parser.do_notify()
                            print("all done", file=sys.stderr)
                            break
            finally:
                parser.close()
    finally:
        conn.close()
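# The crawler-style main() functions in this excerpt assume a make_connection()
# helper that returns a DB-API connection whose cursors can be used as context
# managers. The real helper is not part of the excerpt; the sketch below is an
# assumption (psycopg2 with environment-based settings), not the original code.
import os
import psycopg2

def make_connection():
    # Hypothetical: read connection settings from the environment; the real
    # helper may load them from a config file instead.
    return psycopg2.connect(
        host=os.environ.get("PGHOST", "localhost"),
        dbname=os.environ.get("PGDATABASE", "crawler"),
        user=os.environ.get("PGUSER", "crawler"),
        password=os.environ.get("PGPASSWORD", ""))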
def contacts():
    """Shows the contact person for the specified school."""
    conn = common.make_connection()
    rows = query.contacts(conn)
    conn.close()
    return render_template('contacts.html', rows=rows)
def all_school():
    """Lists all schools with their mentors, even if a school has no mentor."""
    conn = common.make_connection()
    rows = query.all_school(conn)
    conn.close()
    return render_template('all_school.html', rows=rows)
def applicants_and_mentors():
    """Shows each applicant's name, code and mentor's name."""
    conn = common.make_connection()
    rows = query.applicants_and_mentors(conn)
    conn.close()
    return render_template("applicants_and_mentors.html", rows=rows)
def applicants():
    """Lists each applicant's name, code and application date, if the date is later than 2016-01-01."""
    conn = common.make_connection()
    rows = query.applicants(conn)
    conn.close()
    return render_template('applicants.html', rows=rows)
def commit(self):
    try:
        conn = make_connection()
        conn.commit(waitSearcher=False)
    except Exception, e:
        log.exception(e)
        raise SearchIndexError(e)
def main():
    dump_header = False
    if (len(sys.argv) > 1) and (sys.argv[1] == '-H'):
        dump_header = True
        del sys.argv[1]

    conn = make_connection()
    try:
        with conn.cursor() as cur:
            dumper = Dumper(cur, dump_header)
            try:
                for url in sys.argv[1:]:
                    cur.execute(
                        """select id, checkd from field where url=%s""",
                        (url,))
                    row = cur.fetchone()
                    if not row:
                        print(url + " not found", file=sys.stderr)
                    else:
                        if row[1] is None:
                            print(url + " not downloaded", file=sys.stderr)
                        else:
                            dumper.dump(url, row[0])
            finally:
                dumper.close()
    finally:
        conn.close()
def mentors_by_country():
    """Shows the number of mentors per country."""
    conn = common.make_connection()
    rows = query.mentors_by_country(conn)
    conn.close()
    return render_template('mentors_by_country.html', rows=rows)
def mentors_and_schools():
    """Lists all mentors with their school's name and country, ordered by mentor id."""
    conn = common.make_connection()
    rows = query.mentors(conn)
    conn.close()
    return render_template('mentors.html', rows=rows)
def main():
    conn = make_connection()
    try:
        with conn.cursor() as cur:
            inner_select = """select url_id from download_error"""
            cur.execute("""update field set checkd=null where id in (%s)""" % inner_select)
            cur.execute("""delete from locality where url_id in (%s)""" % inner_select)
            cur.execute("""delete from content where url_id in (%s)""" % inner_select)
            cur.execute("delete from download_error")

            kicker = Kicker(cur)
            kicker.run()

            seeder = Seeder(cur)
            seeder.seed_queue()
    finally:
        conn.close()
def main():
    conn = make_connection()
    try:
        with conn.cursor() as cur:
            retriever = Retriever(cur)
            retriever.retrieve_all()
    finally:
        conn.close()
def main():
    conn = make_connection()
    try:
        with conn.cursor() as cur:
            decompressor = Decompressor(cur)
            decompressor.decompress()
    finally:
        conn.close()
def index_package(self, pkg_dict):
    if pkg_dict is None:
        return

    if (not pkg_dict.get('state')) or ('active' not in pkg_dict.get('state')):
        return self.delete_package(pkg_dict)

    conn = make_connection()
    index_fields = RESERVED_FIELDS + pkg_dict.keys()

    # include the extras in the main namespace
    extras = pkg_dict.get('extras', {})
    for (key, value) in extras.items():
        if isinstance(value, (tuple, list)):
            value = " ".join(map(unicode, value))
        key = ''.join([c for c in key if c in KEY_CHARS])
        pkg_dict['extras_' + key] = value
        if key not in index_fields:
            pkg_dict[key] = value
    if 'extras' in pkg_dict:
        del pkg_dict['extras']

    # flatten the structure for indexing:
    for resource in pkg_dict.get('resources', []):
        for (okey, nkey) in [('description', 'res_description'),
                             ('format', 'res_format'),
                             ('url', 'res_url')]:
            pkg_dict[nkey] = pkg_dict.get(nkey, []) + [resource.get(okey, u'')]
    if 'resources' in pkg_dict:
        del pkg_dict['resources']

    # index relationships as <type>:<object>
    rel_dict = {}
    rel_types = list(itertools.chain(RELATIONSHIP_TYPES))
    for rel in pkg_dict.get('relationships', []):
        _type = rel.get('type', 'rel')
        if (_type in pkg_dict.keys()) or (_type not in rel_types):
            continue
        rel_dict[_type] = rel_dict.get(_type, []) + [rel.get('object')]
    pkg_dict.update(rel_dict)
    if 'relationships' in pkg_dict:
        del pkg_dict['relationships']

    pkg_dict[TYPE_FIELD] = PACKAGE_TYPE
    pkg_dict = dict([(k.encode('ascii', 'ignore'), v) for (k, v) in pkg_dict.items()])

    # mark this CKAN instance as data source:
    pkg_dict['site_id'] = config.get('ckan.site_id')

    # send to solr:
    try:
        conn.add_many([pkg_dict])
        conn.commit(wait_flush=False, wait_searcher=False)
    except Exception, e:
        log.exception(e)
        raise SearchIndexError(e)
def main():
    conn = make_connection()
    try:
        with conn.cursor() as cur:
            builder = Builder(cur)
            builder.prepare()
            builder.process()
    finally:
        conn.close()
def clear_index():
    conn = make_connection()
    query = "+site_id:\"%s\"" % (config.get('ckan.site_id'))
    try:
        conn.delete_query(query)
        conn.commit()
    except socket.error, e:
        log.error('Could not connect to SOLR: %r' % e)
        raise
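# Most of the CKAN search-index functions in this excerpt (index_package,
# delete_package, clear_index, run) also call a make_connection() helper, here
# one returning a Solr client that supports add_many(), commit(), delete_query()
# and raw_query(); later variants appear to use pysolr instead. The helper is
# not shown, so the following is only a minimal sketch under the solrpy
# assumption, using a conventional 'solr_url' setting.
import solr

def make_connection():
    # Hypothetical: take the Solr core URL from the application config.
    solr_url = config.get('solr_url', 'http://127.0.0.1:8983/solr')
    return solr.SolrConnection(solr_url)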
def clear_index():
    conn = make_connection()
    query = "+site_id:\"%s\"" % (config.get('ckan.site_id'))
    try:
        conn.delete(q=query)
    except socket.error, e:
        err = 'Could not connect to SOLR %r: %r' % (conn.url, e)
        log.error(err)
        raise SearchIndexError(err)
def delete_package(self, pkg_dict):
    conn = make_connection()
    query = "+%s:%s AND +(id:\"%s\" OR name:\"%s\") AND +site_id:\"%s\"" % \
            (TYPE_FIELD, PACKAGE_TYPE, pkg_dict.get('id'), pkg_dict.get('id'),
             config.get('ckan.site_id'))
    try:
        commit = asbool(config.get('ckan.search.solr_commit', 'true'))
        conn.delete(q=query, commit=commit)
    except Exception as e:
        log.exception(e)
        raise SearchIndexError(e)
def main():
    conn = make_connection()
    try:
        with conn.cursor() as cur:
            compressor = Compressor(cur)
            try:
                compressor.compress_all()
            finally:
                compressor.close()
    finally:
        conn.close()
def main():
    conn = make_connection()
    try:
        with conn.cursor() as cur:
            # maybe check here whether download and/or parse is running? it shouldn't...
            act_reset(cur)

            kicker = Kicker(cur)
            kicker.run()
    finally:
        conn.close()
def delete_package(self, pkg_dict):
    conn = make_connection()
    query = "+%s:%s (+id:\"%s\" OR +name:\"%s\") +site_id:\"%s\"" % (
        TYPE_FIELD, PACKAGE_TYPE, pkg_dict.get('id'), pkg_dict.get('id'),
        config.get('ckan.site_id'))
    try:
        conn.delete_query(query)
        conn.commit()
    except Exception, e:
        log.exception(e)
        raise SearchIndexError(e)
def delete_package(self, pkg_dict):
    conn = make_connection()
    query = "+%s:%s (+id:\"%s\" OR +name:\"%s\") +site_id:\"%s\"" % (
        TYPE_FIELD, PACKAGE_TYPE, pkg_dict.get('id'), pkg_dict.get('id'),
        config.get('ckan.site_id'))
    try:
        commit = asbool(config.get('ckan.search.solr_commit', 'true'))
        conn.delete(q=query, commit=commit)
    except Exception, e:
        log.exception(e)
        raise SearchIndexError(e)
def delete_package(self, pkg_dict):
    conn = make_connection()
    query = "+%s:%s +id:\"%s\" +site_id:\"%s\"" % (
        TYPE_FIELD, PACKAGE_TYPE, pkg_dict.get('id'), config.get('ckan.site_id'))
    try:
        conn.delete_query(query)
        conn.commit()
    except Exception, e:
        log.exception(e)
        raise SearchIndexError(e)
def clear_index():
    import solr.core
    conn = make_connection()
    query = '+site_id:"%s"' % (config.get("ckan.site_id"))
    try:
        conn.delete_query(query)
        conn.commit()
    except socket.error, e:
        err = "Could not connect to SOLR %r: %r" % (conn.url, e)
        log.error(err)
        raise SearchIndexError(err)
def get_index(self, reference):
    query = {
        'rows': 1,
        'q': 'name:%s OR id:%s' % (reference, reference),
        'wt': 'json',
        'fq': 'site_id:"%s"' % config.get('ckan.site_id')}

    conn = make_connection()
    log.debug('Package query: %r' % query)
    try:
        solr_response = conn.raw_query(**query)
    except SolrException, e:
        raise SearchError('SOLR returned an error running query: %r Error: %r' %
                          (query, e.reason))
def main():
    conn = make_connection()
    try:
        with conn.cursor() as cur:
            builder = Builder(cur)
            cur.execute("""select from_id, to_id from redirect order by from_id, to_id""")
            rows = cur.fetchall()
            for row in rows:
                builder.add(*row)

            builder.dump()
    finally:
        conn.close()
def get_all_entity_ids(self, max_results=1000):
    """
    Return a list of the IDs of all indexed packages.
    """
    query = "*:*"
    fq = "+site_id:\"%s\" " % config.get('ckan.site_id')
    fq += "+state:active "

    conn = make_connection()
    try:
        data = conn.query(query, fq=fq, rows=max_results, fields='id')
    finally:
        conn.close()

    return [r.get('id') for r in data.results]
def delete_package(self, pkg_dict):
    conn = make_connection()
    query = '+%s:%s (+id:"%s" OR +name:"%s") +site_id:"%s"' % (
        TYPE_FIELD,
        PACKAGE_TYPE,
        pkg_dict.get("id"),
        pkg_dict.get("id"),
        config.get("ckan.site_id"),
    )
    try:
        conn.delete_query(query)
        conn.commit()
    except Exception, e:
        log.exception(e)
        raise SearchIndexError(e)
def get_index(self, reference):
    query = {
        'rows': 1,
        'q': 'name:%s OR id:%s' % (reference, reference),
        'wt': 'json',
        'fq': 'site_id:"%s"' % config.get('ckan.site_id')
    }

    conn = make_connection()
    log.debug('Package query: %r' % query)
    try:
        solr_response = conn.raw_query(**query)
    except SolrException, e:
        raise SearchError(
            'SOLR returned an error running query: %r Error: %r' %
            (query, e.reason))
def main():
    conn = make_connection()
    try:
        with conn.cursor() as cur:
            tracer = Tracer(cur)
            try:
                cur.execute("""select url, id from field where checkd is not null order by url""")
                rows = cur.fetchall()
                for row in rows:
                    tracer.parse(*row)
            finally:
                tracer.close()
    finally:
        conn.close()
def main():
    conn = make_connection()
    try:
        with conn.cursor() as cur:
            squisher = Squisher(cur)
            cur.execute("""select url_id from nodes order by url_id""")
            rows = cur.fetchall()
            idx = 0
            for row in rows:
                squisher.add(row[0])
                idx += 1
                if not (idx % 10000):
                    print("id %d..." % (idx,))
    finally:
        conn.close()
def main():
    conn = make_connection()
    try:
        with conn.cursor() as cur:
            builder = Builder(cur)
            try:
                cur.execute("""select url, id from field left join nodes on id=url_id where checkd is not null and (url_id is null or depth=0) order by url""")
                rows = cur.fetchall()
                for row in rows:
                    builder.add(*row)
            finally:
                builder.close()
    finally:
        conn.close()
def main():
    conn = make_connection()
    try:
        with conn.cursor() as cur:
            extender = Extender(cur)
            try:
                cur.execute("""select url, id from field left join extra on field.id=extra.url_id where checkd is not null and has_body is null order by url""")
                rows = cur.fetchall()
                for row in rows:
                    extender.extend(*row)
            finally:
                extender.close()
    finally:
        conn.close()
def main():
    conn = make_connection()
    try:
        with conn.cursor() as cur:
            purger = Purger(cur)
            for url in sys.argv[1:]:
                cur.execute("""select id from field where url=%s""", (url,))
                row = cur.fetchone()
                if not row:
                    print(url + " not found", file=sys.stderr)
                else:
                    purger.purge_fast(row[0])

            purger.purge_rest()
    finally:
        conn.close()
def main():
    conn = make_connection()
    try:
        with conn.cursor() as cur:
            bfs = BreathFirstSearch(cur)
            # graph.py actually doesn't fill the root (it only fills
            # nodes with parents) - it should be done by seed.py
            bfs.check_pre()
            depth = 0
            count = 1
            while count:
                count = bfs.step(depth)
                depth += 1
                print("found %d nodes at depth %d" % (count, depth))

            bfs.check_post()
    finally:
        conn.close()
def main():
    paydirt = get_mandatory_option('paydirt_rx')
    conn = make_connection()
    try:
        with conn.cursor() as cur:
            adder = Adder(cur)
            cur.execute(
                """select url, url_id, depth from nodes join field on url_id=id where url ~ %s and depth is not null order by url""",
                (paydirt,))
            rows = cur.fetchall()
            for row in rows:
                adder.add(*row)
    finally:
        conn.close()
def main():
    single_action = (len(sys.argv) == 2) and (sys.argv[1] == '--single-action')
    conn = make_connection()
    try:
        with conn.cursor() as cur:
            retriever = Retriever(single_action, conn, cur)
            while True:
                act_inc(cur)
                retriever.retrieve_all()
                global_live = act_dec(cur)
                if single_action:
                    break
                else:
                    future_live = retriever.cond_notify()
                    if global_live or future_live or retriever.has_holds():
                        retriever.wait()
                    else:
                        retriever.last_notify()
                        print("all done", file=sys.stderr)
                        break
    finally:
        conn.close()
def run(self, query):
    '''
    Performs a dataset search using the given query.

    @param query - dictionary with keys like: q, fq, sort, rows, facet
    @return - dictionary with keys results and count

    May raise SearchQueryError or SearchError.
    '''
    assert isinstance(query, (dict, MultiDict))

    # check that query keys are valid
    if not set(query.keys()) <= VALID_SOLR_PARAMETERS:
        invalid_params = [s for s in set(query.keys()) - VALID_SOLR_PARAMETERS]
        raise SearchQueryError("Invalid search parameters: %s" % invalid_params)

    # default query is to return all documents
    q = query.get('q')
    if not q or q == '""' or q == "''":
        query['q'] = "*:*"

    # number of results
    rows_to_return = min(1000, int(query.get('rows', 10)))
    if rows_to_return > 0:
        # #1683 Work around problem of last result being out of order
        # in SOLR 1.4
        rows_to_query = rows_to_return + 1
    else:
        rows_to_query = rows_to_return
    query['rows'] = rows_to_query

    # order by score if no 'sort' term given
    order_by = query.get('sort')
    if order_by == 'rank' or order_by is None:
        query['sort'] = 'score desc, name asc'

    # show only results from this CKAN instance
    fq = query.get('fq', '')
    if not '+site_id:' in fq:
        fq += ' +site_id:"%s"' % config.get('ckan.site_id')

    # filter for package status
    if not '+state:' in fq:
        fq += " +state:active"
    query['fq'] = fq

    # faceting
    query['facet'] = query.get('facet', 'true')
    query['facet.limit'] = query.get('facet.limit',
                                     config.get('search.facets.limit', '50'))
    query['facet.mincount'] = query.get('facet.mincount', 1)

    # return the package ID and search scores
    query['fl'] = query.get('fl', 'name')

    # return results as json encoded string
    query['wt'] = query.get('wt', 'json')

    # If the query has a colon in it then consider it a fielded search and
    # do not use dismax.
    if ':' not in query['q']:
        query['defType'] = 'dismax'
        query['tie'] = '0.1'
        query['mm'] = '1'
        query['qf'] = query.get('qf', QUERY_FIELDS)

    conn = make_connection()
    log.debug('Package query: %r' % query)
    try:
        solr_response = conn.raw_query(**query)
    except SolrException, e:
        raise SearchError('SOLR returned an error running query: %r Error: %r' %
                          (query, e.reason))
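# For illustration only: a hypothetical caller building the plain-Solr query
# dict that run() above expects. The class name PackageSearchQuery and the way
# the instance is obtained are assumptions; only the dictionary keys come from
# the docstring and the body of run().
example_query = {
    'q': 'water quality',       # free-text term; dismax is applied when no ':' is present
    'fq': '+tags:environment',  # extra filter; site_id and state filters are appended by run()
    'rows': 20,                 # capped at 1000 by run()
    'sort': 'rank',             # rewritten to 'score desc, name asc'
}
PackageSearchQuery().run(example_query)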
def index_package(self, pkg_dict):
    if pkg_dict is None:
        return

    # add to string field for sorting
    title = pkg_dict.get("title")
    if title:
        pkg_dict["title_string"] = title

    if (not pkg_dict.get("state")) or ("active" not in pkg_dict.get("state")):
        return self.delete_package(pkg_dict)

    index_fields = RESERVED_FIELDS + pkg_dict.keys()

    # include the extras in the main namespace
    extras = pkg_dict.get("extras", [])
    for extra in extras:
        key, value = extra["key"], json.loads(extra["value"])
        if isinstance(value, (tuple, list)):
            value = " ".join(map(unicode, value))
        key = "".join([c for c in key if c in KEY_CHARS])
        pkg_dict["extras_" + key] = value
        if key not in index_fields:
            pkg_dict[key] = value
    pkg_dict.pop("extras", None)

    # Add tags and groups
    tags = pkg_dict.pop("tags", [])
    pkg_dict["tags"] = [tag["name"] for tag in tags]

    groups = pkg_dict.pop("groups", [])
    # Capacity is different to the default only if using organizations
    # where the dataset is only in one group. We will add the capacity
    # from the single group that it is a part of if we have a group
    if len(groups):
        pkg_dict["capacity"] = groups[0].get("capacity", "public")
    else:
        pkg_dict["capacity"] = "public"
    pkg_dict["groups"] = [group["name"] for group in groups]

    # tracking
    tracking_summary = pkg_dict.pop("tracking_summary", None)
    if tracking_summary:
        pkg_dict["views_total"] = tracking_summary["total"]
        pkg_dict["views_recent"] = tracking_summary["recent"]

    # flatten the structure for indexing:
    for resource in pkg_dict.get("resources", []):
        for (okey, nkey) in [("description", "res_description"),
                             ("format", "res_format"),
                             ("url", "res_url")]:
            pkg_dict[nkey] = pkg_dict.get(nkey, []) + [resource.get(okey, u"")]
    pkg_dict.pop("resources", None)

    rel_dict = collections.defaultdict(list)
    subjects = pkg_dict.pop("relationships_as_subject", [])
    objects = pkg_dict.pop("relationships_as_object", [])
    for rel in objects:
        type = model.PackageRelationship.forward_to_reverse_type(rel["type"])
        rel_dict[type].append(model.Package.get(rel["subject_package_id"]).name)
    for rel in subjects:
        type = rel["type"]
        rel_dict[type].append(model.Package.get(rel["object_package_id"]).name)
    for key, value in rel_dict.iteritems():
        if key not in pkg_dict:
            pkg_dict[key] = value

    pkg_dict[TYPE_FIELD] = PACKAGE_TYPE
    pkg_dict = dict([(k.encode("ascii", "ignore"), v) for (k, v) in pkg_dict.items()])

    for k in ("title", "notes", "title_string"):
        if k in pkg_dict and pkg_dict[k]:
            pkg_dict[k] = escape_xml_illegal_chars(pkg_dict[k])

    # modify dates (SOLR is quite picky with dates, and only accepts ISO dates
    # with UTC time (i.e trailing Z)
    # See http://lucene.apache.org/solr/api/org/apache/solr/schema/DateField.html
    pkg_dict["metadata_created"] += "Z"
    pkg_dict["metadata_modified"] += "Z"

    # mark this CKAN instance as data source:
    pkg_dict["site_id"] = config.get("ckan.site_id")

    # Strip a selection of the fields.
    # These fields are possible candidates for sorting search results on,
    # so we strip leading spaces because solr will sort " " before "a" or "A".
    for field_name in ["title"]:
        try:
            value = pkg_dict.get(field_name)
            if value:
                pkg_dict[field_name] = value.lstrip()
        except KeyError:
            pass

    # add a unique index_id to avoid conflicts
    import hashlib
    pkg_dict["index_id"] = hashlib.md5("%s%s" % (pkg_dict["id"],
                                                 config.get("ckan.site_id"))).hexdigest()

    for item in PluginImplementations(IPackageController):
        pkg_dict = item.before_index(pkg_dict)

    assert pkg_dict, "Plugin must return non empty package dict on index"

    # send to solr:
    try:
        conn = make_connection()
        conn.add_many([pkg_dict])
        conn.commit(wait_flush=False, wait_searcher=False)
    except Exception, e:
        log.exception(e)
        raise SearchIndexError(e)
def index_package(self, pkg_dict):
    if pkg_dict is None:
        return

    if (not pkg_dict.get('state')) or ('active' not in pkg_dict.get('state')):
        return self.delete_package(pkg_dict)

    conn = make_connection()
    index_fields = RESERVED_FIELDS + pkg_dict.keys()

    # include the extras in the main namespace
    extras = pkg_dict.get('extras', {})
    for (key, value) in extras.items():
        if isinstance(value, (tuple, list)):
            value = " ".join(map(unicode, value))
        key = ''.join([c for c in key if c in KEY_CHARS])
        pkg_dict['extras_' + key] = value
        if key not in index_fields:
            pkg_dict[key] = value
    if 'extras' in pkg_dict:
        del pkg_dict['extras']

    # flatten the structure for indexing:
    for resource in pkg_dict.get('resources', []):
        for (okey, nkey) in [('description', 'res_description'),
                             ('format', 'res_format'),
                             ('url', 'res_url'),
                             ('name', 'res_name')]:
            pkg_dict[nkey] = pkg_dict.get(nkey, []) + [resource.get(okey, u'')]
    if 'resources' in pkg_dict:
        del pkg_dict['resources']

    # index relationships as <type>:<object>
    rel_dict = {}
    rel_types = list(itertools.chain(RELATIONSHIP_TYPES))
    for rel in pkg_dict.get('relationships', []):
        _type = rel.get('type', 'rel')
        if (_type in pkg_dict.keys()) or (_type not in rel_types):
            continue
        rel_dict[_type] = rel_dict.get(_type, []) + [rel.get('object')]
    pkg_dict.update(rel_dict)
    if 'relationships' in pkg_dict:
        del pkg_dict['relationships']

    pkg_dict[TYPE_FIELD] = PACKAGE_TYPE
    pkg_dict = dict([(k.encode('ascii', 'ignore'), v) for (k, v) in pkg_dict.items()])

    # modify dates (SOLR is quite picky with dates, and only accepts ISO dates
    # with UTC time (i.e trailing Z)
    # See http://lucene.apache.org/solr/api/org/apache/solr/schema/DateField.html
    pkg_dict['metadata_created'] += 'Z'
    pkg_dict['metadata_modified'] += 'Z'

    # mark this CKAN instance as data source:
    pkg_dict['site_id'] = config.get('ckan.site_id')

    # add a unique index_id to avoid conflicts
    import hashlib
    pkg_dict['index_id'] = hashlib.md5('%s%s' % (pkg_dict['id'],
                                                 config.get('ckan.site_id'))).hexdigest()

    for item in PluginImplementations(IPackageController):
        pkg_dict = item.before_index(pkg_dict)

    assert pkg_dict, 'Plugin must return non empty package dict on index'

    # send to solr:
    try:
        conn.add_many([pkg_dict])
        conn.commit(wait_flush=False, wait_searcher=False)
    except Exception, e:
        log.exception(e)
        raise SearchIndexError(e)
def run(self, query):
    '''
    Performs a dataset search using the given query.

    @param query - dictionary with keys like: q, fq, sort, rows, facet
    @return - dictionary with keys results and count

    May raise SearchQueryError or SearchError.
    '''
    from solr import SolrException
    assert isinstance(query, (dict, MultiDict))

    # check that query keys are valid
    if not set(query.keys()) <= VALID_SOLR_PARAMETERS:
        invalid_params = [s for s in set(query.keys()) - VALID_SOLR_PARAMETERS]
        raise SearchQueryError("Invalid search parameters: %s" % invalid_params)

    # default query is to return all documents
    q = query.get('q')
    if not q or q == '""' or q == "''":
        query['q'] = "*:*"

    # number of results
    query['rows'] = min(1000, int(query.get('rows', 10)))

    # order by score if no 'sort' term given
    order_by = query.get('sort')
    if order_by == 'rank' or order_by is None:
        query['sort'] = 'score desc, name asc'

    # show only results from this CKAN instance
    fq = query.get('fq', '')
    if not '+site_id:' in fq:
        fq += ' +site_id:"%s"' % config.get('ckan.site_id')

    # filter for package status
    if not '+state:' in fq:
        fq += " +state:active"
    query['fq'] = fq

    # faceting
    query['facet'] = query.get('facet', 'true')
    query['facet.limit'] = query.get('facet.limit',
                                     config.get('search.facets.limit', '50'))
    query['facet.mincount'] = query.get('facet.mincount', 1)

    # return the package ID and search scores
    query['fl'] = query.get('fl', 'name')

    # return results as json encoded string
    query['wt'] = query.get('wt', 'json')

    # query field weighting: disabled for now as solr 3.* is required for
    # the 'edismax' query parser, our current Ubuntu version only has
    # packages for 1.4
    #
    # query['defType'] = 'edismax'
    # query['tie'] = '0.5'
    # query['qf'] = query.get('qf', QUERY_FIELDS)

    conn = make_connection()
    log.debug('Package query: %r' % query)
    try:
        solr_response = conn.raw_query(**query)
    except SolrException, e:
        raise SearchError('SOLR returned an error running query: %r Error: %r' %
                          (query, e.reason))
def run(self, query):
    """
    Performs a dataset search using the given query.

    @param query - dictionary with keys like: q, fq, sort, rows, facet
    @return - dictionary with keys results and count

    May raise SearchQueryError or SearchError.
    """
    from solr import SolrException
    assert isinstance(query, (dict, MultiDict))

    # check that query keys are valid
    if not set(query.keys()) <= VALID_SOLR_PARAMETERS:
        invalid_params = [s for s in set(query.keys()) - VALID_SOLR_PARAMETERS]
        raise SearchQueryError("Invalid search parameters: %s" % invalid_params)

    # default query is to return all documents
    q = query.get("q")
    if not q or q == '""' or q == "''":
        query["q"] = "*:*"

    # number of results
    rows_to_return = min(1000, int(query.get("rows", 10)))
    if rows_to_return > 0:
        # #1683 Work around problem of last result being out of order
        # in SOLR 1.4
        rows_to_query = rows_to_return + 1
    else:
        rows_to_query = rows_to_return
    query["rows"] = rows_to_query

    # order by score if no 'sort' term given
    order_by = query.get("sort")
    if order_by == "rank" or order_by is None:
        query["sort"] = "score desc, name asc"

    # show only results from this CKAN instance
    fq = query.get("fq", "")
    if not "+site_id:" in fq:
        fq += ' +site_id:"%s"' % config.get("ckan.site_id")

    # filter for package status
    if not "+state:" in fq:
        fq += " +state:active"
    query["fq"] = fq

    # faceting
    query["facet"] = query.get("facet", "true")
    query["facet.limit"] = query.get("facet.limit",
                                     config.get("search.facets.limit", "50"))
    query["facet.mincount"] = query.get("facet.mincount", 1)

    # return the package ID and search scores
    query["fl"] = query.get("fl", "name")

    # return results as json encoded string
    query["wt"] = query.get("wt", "json")

    # If the query has a colon in it then consider it a fielded search and
    # do not use dismax.
    if ":" not in query["q"]:
        query["defType"] = "dismax"
        query["tie"] = "0.1"
        query["mm"] = "1"
        query["qf"] = query.get("qf", QUERY_FIELDS)

    conn = make_connection()
    log.debug("Package query: %r" % query)
    try:
        solr_response = conn.raw_query(**query)
    except SolrException, e:
        raise SearchError("SOLR returned an error running query: %r Error: %r" %
                          (query, e.reason))
def index_package(self, pkg_dict):
    if pkg_dict is None:
        return

    pkg_dict['data_dict'] = json.dumps(pkg_dict)

    # add to string field for sorting
    title = pkg_dict.get('title')
    if title:
        pkg_dict['title_string'] = title

    if (not pkg_dict.get('state')) or ('active' not in pkg_dict.get('state')):
        return self.delete_package(pkg_dict)

    index_fields = RESERVED_FIELDS + pkg_dict.keys()

    # include the extras in the main namespace
    extras = pkg_dict.get('extras', [])
    for extra in extras:
        key, value = extra['key'], json.loads(extra['value'])
        if isinstance(value, (tuple, list)):
            value = " ".join(map(unicode, value))
        key = ''.join([c for c in key if c in KEY_CHARS])
        pkg_dict['extras_' + key] = value
        if key not in index_fields:
            pkg_dict[key] = value
    pkg_dict.pop('extras', None)

    # Add tags and groups
    tags = pkg_dict.pop('tags', [])
    pkg_dict['tags'] = [tag['name'] for tag in tags]

    groups = pkg_dict.pop('groups', [])
    # Capacity is different to the default only if using organizations
    # where the dataset is only in one group. We will add the capacity
    # from the single group that it is a part of if we have a group
    if len(groups):
        pkg_dict['capacity'] = groups[0].get('capacity', 'public')
    else:
        pkg_dict['capacity'] = 'public'
    pkg_dict['groups'] = [group['name'] for group in groups]

    # tracking
    tracking_summary = pkg_dict.pop('tracking_summary', None)
    if tracking_summary:
        pkg_dict['views_total'] = tracking_summary['total']
        pkg_dict['views_recent'] = tracking_summary['recent']

    # flatten the structure for indexing:
    for resource in pkg_dict.get('resources', []):
        for (okey, nkey) in [('description', 'res_description'),
                             ('format', 'res_format'),
                             ('url', 'res_url')]:
            pkg_dict[nkey] = pkg_dict.get(nkey, []) + [resource.get(okey, u'')]
    pkg_dict.pop('resources', None)

    rel_dict = collections.defaultdict(list)
    subjects = pkg_dict.pop("relationships_as_subject", [])
    objects = pkg_dict.pop("relationships_as_object", [])
    for rel in objects:
        type = model.PackageRelationship.forward_to_reverse_type(rel['type'])
        rel_dict[type].append(model.Package.get(rel['subject_package_id']).name)
    for rel in subjects:
        type = rel['type']
        rel_dict[type].append(model.Package.get(rel['object_package_id']).name)
    for key, value in rel_dict.iteritems():
        if key not in pkg_dict:
            pkg_dict[key] = value

    pkg_dict[TYPE_FIELD] = PACKAGE_TYPE
    pkg_dict = dict([(k.encode('ascii', 'ignore'), v) for (k, v) in pkg_dict.items()])

    for k in ('title', 'notes', 'title_string'):
        if k in pkg_dict and pkg_dict[k]:
            pkg_dict[k] = escape_xml_illegal_chars(pkg_dict[k])

    # modify dates (SOLR is quite picky with dates, and only accepts ISO dates
    # with UTC time (i.e trailing Z)
    # See http://lucene.apache.org/solr/api/org/apache/solr/schema/DateField.html
    pkg_dict['metadata_created'] += 'Z'
    pkg_dict['metadata_modified'] += 'Z'

    # mark this CKAN instance as data source:
    pkg_dict['site_id'] = config.get('ckan.site_id')

    # Strip a selection of the fields.
    # These fields are possible candidates for sorting search results on,
    # so we strip leading spaces because solr will sort " " before "a" or "A".
    for field_name in ['title']:
        try:
            value = pkg_dict.get(field_name)
            if value:
                pkg_dict[field_name] = value.lstrip()
        except KeyError:
            pass

    # add a unique index_id to avoid conflicts
    import hashlib
    pkg_dict['index_id'] = hashlib.md5('%s%s' % (pkg_dict['id'],
                                                 config.get('ckan.site_id'))).hexdigest()

    for item in PluginImplementations(IPackageController):
        pkg_dict = item.before_index(pkg_dict)

    assert pkg_dict, 'Plugin must return non empty package dict on index'

    # send to solr:
    try:
        conn = make_connection()
        conn.add_many([pkg_dict])
        conn.commit(wait_flush=False, wait_searcher=False)
    except Exception, e:
        log.exception(e)
        raise SearchIndexError(e)
def index_package(self, pkg_dict, defer_commit=False):
    if pkg_dict is None:
        return

    data_dict_json = json.dumps(pkg_dict)

    if config.get("ckan.cache_validated_datasets", True):
        package_plugin = lib_plugins.lookup_package_plugin(pkg_dict.get("type"))
        schema = package_plugin.show_package_schema()
        validated_pkg_dict, errors = lib_plugins.plugin_validate(
            package_plugin, {"model": model, "session": model.Session},
            pkg_dict, schema, "package_show")
        pkg_dict["validated_data_dict"] = json.dumps(
            validated_pkg_dict,
            cls=ckan.lib.navl.dictization_functions.MissingNullEncoder)

    pkg_dict["data_dict"] = data_dict_json

    # add to string field for sorting
    title = pkg_dict.get("title")
    if title:
        pkg_dict["title_string"] = title

    if (not pkg_dict.get("state")) or ("active" not in pkg_dict.get("state")):
        return self.delete_package(pkg_dict)

    index_fields = RESERVED_FIELDS + pkg_dict.keys()

    # include the extras in the main namespace
    extras = pkg_dict.get("extras", [])
    for extra in extras:
        key, value = extra["key"], extra["value"]
        if isinstance(value, (tuple, list)):
            value = " ".join(map(unicode, value))
        key = "".join([c for c in key if c in KEY_CHARS])
        pkg_dict["extras_" + key] = value
        if key not in index_fields:
            pkg_dict[key] = value
    pkg_dict.pop("extras", None)

    # add tags, removing vocab tags from 'tags' list and adding them as
    # vocab_<tag name> so that they can be used in facets
    non_vocab_tag_names = []
    tags = pkg_dict.pop("tags", [])
    context = {"model": model}

    for tag in tags:
        if tag.get("vocabulary_id"):
            data = {"id": tag["vocabulary_id"]}
            vocab = logic.get_action("vocabulary_show")(context, data)
            key = u"vocab_%s" % vocab["name"]
            if key in pkg_dict:
                pkg_dict[key].append(tag["name"])
            else:
                pkg_dict[key] = [tag["name"]]
        else:
            non_vocab_tag_names.append(tag["name"])

    pkg_dict["tags"] = non_vocab_tag_names

    # add groups
    groups = pkg_dict.pop("groups", [])

    # we use the capacity to make things private in the search index
    if pkg_dict["private"]:
        pkg_dict["capacity"] = "private"
    else:
        pkg_dict["capacity"] = "public"

    pkg_dict["groups"] = [group["name"] for group in groups]

    # if there is an owner_org we want to add this to groups for index
    # purposes
    if pkg_dict.get("organization"):
        pkg_dict["organization"] = pkg_dict["organization"]["name"]
    else:
        pkg_dict["organization"] = None

    # tracking
    tracking_summary = pkg_dict.pop("tracking_summary", None)
    if tracking_summary:
        pkg_dict["views_total"] = tracking_summary["total"]
        pkg_dict["views_recent"] = tracking_summary["recent"]

    resource_fields = [("name", "res_name"),
                       ("description", "res_description"),
                       ("format", "res_format"),
                       ("url", "res_url"),
                       ("resource_type", "res_type")]
    resource_extras = [(e, "res_extras_" + e)
                       for e in model.Resource.get_extra_columns()]
    # flatten the structure for indexing:
    for resource in pkg_dict.get("resources", []):
        for (okey, nkey) in resource_fields + resource_extras:
            pkg_dict[nkey] = pkg_dict.get(nkey, []) + [resource.get(okey, u"")]
    pkg_dict.pop("resources", None)

    rel_dict = collections.defaultdict(list)
    subjects = pkg_dict.pop("relationships_as_subject", [])
    objects = pkg_dict.pop("relationships_as_object", [])
    for rel in objects:
        type = model.PackageRelationship.forward_to_reverse_type(rel["type"])
        rel_dict[type].append(model.Package.get(rel["subject_package_id"]).name)
    for rel in subjects:
        type = rel["type"]
        rel_dict[type].append(model.Package.get(rel["object_package_id"]).name)
    for key, value in rel_dict.iteritems():
        if key not in pkg_dict:
            pkg_dict[key] = value

    pkg_dict[TYPE_FIELD] = PACKAGE_TYPE

    # Save dataset type
    pkg_dict["dataset_type"] = pkg_dict["type"]

    # clean the dict fixing keys and dates
    # FIXME where are we getting these dirty keys from? can we not just
    # fix them in the correct place or is this something that always will
    # be needed? For my data not changing the keys seems to not cause a
    # problem.
    new_dict = {}
    bogus_date = datetime.datetime(1, 1, 1)
    for key, value in pkg_dict.items():
        key = key.encode("ascii", "ignore")
        if key.endswith("_date"):
            try:
                date = parse(value, default=bogus_date)
                if date != bogus_date:
                    value = date.isoformat() + "Z"
                else:
                    # The date field was empty, so dateutil filled it with
                    # the default bogus date
                    value = None
            except ValueError:
                continue
        new_dict[key] = value
    pkg_dict = new_dict

    for k in ("title", "notes", "title_string"):
        if k in pkg_dict and pkg_dict[k]:
            pkg_dict[k] = escape_xml_illegal_chars(pkg_dict[k])

    # modify dates (SOLR is quite picky with dates, and only accepts ISO dates
    # with UTC time (i.e trailing Z)
    # See http://lucene.apache.org/solr/api/org/apache/solr/schema/DateField.html
    pkg_dict["metadata_created"] += "Z"
    pkg_dict["metadata_modified"] += "Z"

    # mark this CKAN instance as data source:
    pkg_dict["site_id"] = config.get("ckan.site_id")

    # Strip a selection of the fields.
    # These fields are possible candidates for sorting search results on,
    # so we strip leading spaces because solr will sort " " before "a" or "A".
    for field_name in ["title"]:
        try:
            value = pkg_dict.get(field_name)
            if value:
                pkg_dict[field_name] = value.lstrip()
        except KeyError:
            pass

    # add a unique index_id to avoid conflicts
    import hashlib
    pkg_dict["index_id"] = hashlib.md5("%s%s" % (pkg_dict["id"],
                                                 config.get("ckan.site_id"))).hexdigest()

    for item in PluginImplementations(IPackageController):
        pkg_dict = item.before_index(pkg_dict)

    assert pkg_dict, "Plugin must return non empty package dict on index"

    # send to solr:
    try:
        conn = make_connection()
        commit = not defer_commit
        if not asbool(config.get("ckan.search.solr_commit", "true")):
            commit = False
        conn.add_many([pkg_dict], _commit=commit)
    except solr.core.SolrException, e:
        msg = "Solr returned an error: {0} {1} - {2}".format(
            e.httpcode, e.reason, e.body[:1000]  # limit huge responses
        )
        raise SearchIndexError(msg)
def index_package(self, pkg_dict, defer_commit=False):
    if pkg_dict is None:
        return

    # tracking summary values will be stale, never store them
    tracking_summary = pkg_dict.pop('tracking_summary', None)
    for r in pkg_dict.get('resources', []):
        r.pop('tracking_summary', None)

    data_dict_json = json.dumps(pkg_dict)

    if config.get('ckan.cache_validated_datasets', True):
        package_plugin = lib_plugins.lookup_package_plugin(
            pkg_dict.get('type'))
        schema = package_plugin.show_package_schema()
        validated_pkg_dict, errors = lib_plugins.plugin_validate(
            package_plugin, {'model': model, 'session': model.Session},
            pkg_dict, schema, 'package_show')
        pkg_dict['validated_data_dict'] = json.dumps(validated_pkg_dict,
            cls=ckan.lib.navl.dictization_functions.MissingNullEncoder)

    pkg_dict['data_dict'] = data_dict_json

    # add to string field for sorting
    title = pkg_dict.get('title')
    if title:
        pkg_dict['title_string'] = title

    # delete the package if there is no state, or the state is `deleted`
    if (not pkg_dict.get('state') or 'deleted' in pkg_dict.get('state')):
        return self.delete_package(pkg_dict)

    index_fields = RESERVED_FIELDS + pkg_dict.keys()

    # include the extras in the main namespace
    extras = pkg_dict.get('extras', [])
    for extra in extras:
        key, value = extra['key'], extra['value']
        if isinstance(value, (tuple, list)):
            value = " ".join(map(unicode, value))
        key = ''.join([c for c in key if c in KEY_CHARS])
        pkg_dict['extras_' + key] = value
        if key not in index_fields:
            pkg_dict[key] = value
    pkg_dict.pop('extras', None)

    # add tags, removing vocab tags from 'tags' list and adding them as
    # vocab_<tag name> so that they can be used in facets
    non_vocab_tag_names = []
    tags = pkg_dict.pop('tags', [])
    context = {'model': model}

    for tag in tags:
        if tag.get('vocabulary_id'):
            data = {'id': tag['vocabulary_id']}
            vocab = logic.get_action('vocabulary_show')(context, data)
            key = u'vocab_%s' % vocab['name']
            if key in pkg_dict:
                pkg_dict[key].append(tag['name'])
            else:
                pkg_dict[key] = [tag['name']]
        else:
            non_vocab_tag_names.append(tag['name'])

    pkg_dict['tags'] = non_vocab_tag_names

    # add groups
    groups = pkg_dict.pop('groups', [])

    # we use the capacity to make things private in the search index
    if pkg_dict['private']:
        pkg_dict['capacity'] = 'private'
    else:
        pkg_dict['capacity'] = 'public'

    pkg_dict['groups'] = [group['name'] for group in groups]

    # if there is an owner_org we want to add this to groups for index
    # purposes
    if pkg_dict.get('organization'):
        pkg_dict['organization'] = pkg_dict['organization']['name']
    else:
        pkg_dict['organization'] = None

    # tracking
    if not tracking_summary:
        tracking_summary = model.TrackingSummary.get_for_package(
            pkg_dict['id'])
    pkg_dict['views_total'] = tracking_summary['total']
    pkg_dict['views_recent'] = tracking_summary['recent']

    resource_fields = [('name', 'res_name'),
                       ('description', 'res_description'),
                       ('format', 'res_format'),
                       ('url', 'res_url'),
                       ('resource_type', 'res_type')]
    resource_extras = [(e, 'res_extras_' + e)
                       for e in model.Resource.get_extra_columns()]
    # flatten the structure for indexing:
    for resource in pkg_dict.get('resources', []):
        for (okey, nkey) in resource_fields + resource_extras:
            pkg_dict[nkey] = pkg_dict.get(nkey, []) + [resource.get(okey, u'')]
    pkg_dict.pop('resources', None)

    rel_dict = collections.defaultdict(list)
    subjects = pkg_dict.pop("relationships_as_subject", [])
    objects = pkg_dict.pop("relationships_as_object", [])
    for rel in objects:
        type = model.PackageRelationship.forward_to_reverse_type(rel['type'])
        rel_dict[type].append(model.Package.get(rel['subject_package_id']).name)
    for rel in subjects:
        type = rel['type']
        rel_dict[type].append(model.Package.get(rel['object_package_id']).name)
    for key, value in rel_dict.iteritems():
        if key not in pkg_dict:
            pkg_dict[key] = value

    pkg_dict[TYPE_FIELD] = PACKAGE_TYPE

    # Save dataset type
    pkg_dict['dataset_type'] = pkg_dict['type']

    # clean the dict fixing keys and dates
    # FIXME where are we getting these dirty keys from? can we not just
    # fix them in the correct place or is this something that always will
    # be needed? For my data not changing the keys seems to not cause a
    # problem.
    new_dict = {}
    bogus_date = datetime.datetime(1, 1, 1)
    for key, value in pkg_dict.items():
        key = key.encode('ascii', 'ignore')
        if key.endswith('_date'):
            try:
                date = parse(value, default=bogus_date)
                if date != bogus_date:
                    value = date.isoformat() + 'Z'
                else:
                    # The date field was empty, so dateutil filled it with
                    # the default bogus date
                    value = None
            except ValueError:
                continue
        new_dict[key] = value
    pkg_dict = new_dict

    for k in ('title', 'notes', 'title_string'):
        if k in pkg_dict and pkg_dict[k]:
            pkg_dict[k] = escape_xml_illegal_chars(pkg_dict[k])

    # modify dates (SOLR is quite picky with dates, and only accepts ISO dates
    # with UTC time (i.e trailing Z)
    # See http://lucene.apache.org/solr/api/org/apache/solr/schema/DateField.html
    pkg_dict['metadata_created'] += 'Z'
    pkg_dict['metadata_modified'] += 'Z'

    # mark this CKAN instance as data source:
    pkg_dict['site_id'] = config.get('ckan.site_id')

    # Strip a selection of the fields.
    # These fields are possible candidates for sorting search results on,
    # so we strip leading spaces because solr will sort " " before "a" or "A".
    for field_name in ['title']:
        try:
            value = pkg_dict.get(field_name)
            if value:
                pkg_dict[field_name] = value.lstrip()
        except KeyError:
            pass

    # add a unique index_id to avoid conflicts
    import hashlib
    pkg_dict['index_id'] = hashlib.md5('%s%s' % (pkg_dict['id'],
                                                 config.get('ckan.site_id'))).hexdigest()

    for item in PluginImplementations(IPackageController):
        pkg_dict = item.before_index(pkg_dict)

    assert pkg_dict, 'Plugin must return non empty package dict on index'

    # send to solr:
    try:
        conn = make_connection()
        commit = not defer_commit
        if not asbool(config.get('ckan.search.solr_commit', 'true')):
            commit = False
        conn.add(docs=[pkg_dict], commit=commit)
    except pysolr.SolrError, e:
        msg = 'Solr returned an error: {0}'.format(
            e[:1000]  # limit huge responses
        )
        raise SearchIndexError(msg)
def index_package(self, pkg_dict, defer_commit=False):
    if pkg_dict is None:
        return

    pkg_dict['data_dict'] = json.dumps(pkg_dict)

    # add to string field for sorting
    title = pkg_dict.get('title')
    if title:
        pkg_dict['title_string'] = title

    if (not pkg_dict.get('state')) or ('active' not in pkg_dict.get('state')):
        return self.delete_package(pkg_dict)

    index_fields = RESERVED_FIELDS + pkg_dict.keys()

    # include the extras in the main namespace
    extras = pkg_dict.get('extras', [])
    for extra in extras:
        key, value = extra['key'], extra['value']
        if isinstance(value, (tuple, list)):
            value = " ".join(map(unicode, value))
        key = ''.join([c for c in key if c in KEY_CHARS])
        pkg_dict['extras_' + key] = value
        if key not in index_fields:
            pkg_dict[key] = value
    pkg_dict.pop('extras', None)

    # add tags, removing vocab tags from 'tags' list and adding them as
    # vocab_<tag name> so that they can be used in facets
    non_vocab_tag_names = []
    tags = pkg_dict.pop('tags', [])
    context = {'model': model}

    for tag in tags:
        if tag.get('vocabulary_id'):
            data = {'id': tag['vocabulary_id']}
            vocab = logic.get_action('vocabulary_show')(context, data)
            key = u'vocab_%s' % vocab['name']
            if key in pkg_dict:
                pkg_dict[key].append(tag['name'])
            else:
                pkg_dict[key] = [tag['name']]
        else:
            non_vocab_tag_names.append(tag['name'])

    pkg_dict['tags'] = non_vocab_tag_names

    # add groups
    groups = pkg_dict.pop('groups', [])

    # we use the capacity to make things private in the search index
    if pkg_dict['private']:
        pkg_dict['capacity'] = 'private'
    else:
        pkg_dict['capacity'] = 'public'

    pkg_dict['groups'] = [group['name'] for group in groups]

    # if there is an owner_org we want to add this to groups for index
    # purposes
    if pkg_dict['owner_org']:
        pkg_dict['groups'].append(pkg_dict['organization']['name'])

    # tracking
    tracking_summary = pkg_dict.pop('tracking_summary', None)
    if tracking_summary:
        pkg_dict['views_total'] = tracking_summary['total']
        pkg_dict['views_recent'] = tracking_summary['recent']

    # flatten the structure for indexing:
    for resource in pkg_dict.get('resources', []):
        for (okey, nkey) in [('description', 'res_description'),
                             ('format', 'res_format'),
                             ('url', 'res_url')]:
            pkg_dict[nkey] = pkg_dict.get(nkey, []) + [resource.get(okey, u'')]
    pkg_dict.pop('resources', None)

    rel_dict = collections.defaultdict(list)
    subjects = pkg_dict.pop("relationships_as_subject", [])
    objects = pkg_dict.pop("relationships_as_object", [])
    for rel in objects:
        type = model.PackageRelationship.forward_to_reverse_type(rel['type'])
        rel_dict[type].append(model.Package.get(rel['subject_package_id']).name)
    for rel in subjects:
        type = rel['type']
        rel_dict[type].append(model.Package.get(rel['object_package_id']).name)
    for key, value in rel_dict.iteritems():
        if key not in pkg_dict:
            pkg_dict[key] = value

    pkg_dict[TYPE_FIELD] = PACKAGE_TYPE

    # Save dataset type
    pkg_dict['dataset_type'] = pkg_dict['type']

    pkg_dict = dict([(k.encode('ascii', 'ignore'), v) for (k, v) in pkg_dict.items()])

    for k in ('title', 'notes', 'title_string'):
        if k in pkg_dict and pkg_dict[k]:
            pkg_dict[k] = escape_xml_illegal_chars(pkg_dict[k])

    # modify dates (SOLR is quite picky with dates, and only accepts ISO dates
    # with UTC time (i.e trailing Z)
    # See http://lucene.apache.org/solr/api/org/apache/solr/schema/DateField.html
    pkg_dict['metadata_created'] += 'Z'
    pkg_dict['metadata_modified'] += 'Z'

    # mark this CKAN instance as data source:
    pkg_dict['site_id'] = config.get('ckan.site_id')

    # Strip a selection of the fields.
    # These fields are possible candidates for sorting search results on,
    # so we strip leading spaces because solr will sort " " before "a" or "A".
    for field_name in ['title']:
        try:
            value = pkg_dict.get(field_name)
            if value:
                pkg_dict[field_name] = value.lstrip()
        except KeyError:
            pass

    # add a unique index_id to avoid conflicts
    import hashlib
    pkg_dict['index_id'] = hashlib.md5('%s%s' % (pkg_dict['id'],
                                                 config.get('ckan.site_id'))).hexdigest()

    for item in PluginImplementations(IPackageController):
        pkg_dict = item.before_index(pkg_dict)

    assert pkg_dict, 'Plugin must return non empty package dict on index'

    # send to solr:
    try:
        conn = make_connection()
        commit = not defer_commit
        if not asbool(config.get('ckan.search.solr_commit', 'true')):
            commit = False
        conn.add_many([pkg_dict], _commit=commit)
    except Exception, e:
        log.exception(e)
        raise SearchIndexError(e)
def run(self, query):
    assert isinstance(query, (dict, MultiDict))

    # check that query keys are valid
    if not set(query.keys()) <= VALID_SOLR_PARAMETERS:
        invalid_params = [s for s in set(query.keys()) - VALID_SOLR_PARAMETERS]
        raise SearchQueryError("Invalid search parameters: %s" % invalid_params)

    # default query is to return all documents
    q = query.get('q')
    if not q or q == '""' or q == "''":
        query['q'] = "*:*"

    # number of results
    query['rows'] = min(1000, int(query.get('rows', 10)))

    # order by score if no 'sort' term given
    order_by = query.get('sort')
    if order_by == 'rank' or order_by is None:
        query['sort'] = 'score desc, name asc'

    # show only results from this CKAN instance
    fq = query.get('fq', '')
    if not '+site_id:' in fq:
        fq += ' +site_id:"%s"' % config.get('ckan.site_id')

    # filter for package status
    if not '+state:' in fq:
        fq += " +state:active"
    query['fq'] = fq

    # faceting
    query['facet'] = query.get('facet', 'true')
    query['facet.limit'] = query.get('facet.limit',
                                     config.get('search.facets.limit', '50'))
    query['facet.mincount'] = query.get('facet.mincount', 1)

    # return the package ID and search scores
    query['fl'] = query.get('fl', 'name')

    # return results as json encoded string
    query['wt'] = query.get('wt', 'json')

    # query field weighting: disabled for now as solr 3.* is required for
    # the 'edismax' query parser, our current Ubuntu version only has
    # packages for 1.4
    #
    # query['defType'] = 'edismax'
    # query['tie'] = '0.5'
    # query['qf'] = query.get('qf', QUERY_FIELDS)

    conn = make_connection()
    try:
        log.debug('Package query: %r' % query)
        data = json.loads(conn.raw_query(**query))
        response = data['response']
        self.count = response.get('numFound', 0)
        self.results = response.get('docs', [])

        # if just fetching the id or name, return a list instead of a dict
        if query.get('fl') in ['id', 'name']:
            self.results = [r.get(query.get('fl')) for r in self.results]

        # get facets and convert facets list to a dict
        self.facets = data.get('facet_counts', {}).get('facet_fields', {})
        for field, values in self.facets.iteritems():
            self.facets[field] = dict(zip(values[0::2], values[1::2]))
    except Exception, e:
        log.exception(e)
        raise SearchError(e)