def __call__(self):
    self.catalog = api.portal.get_tool('portal_catalog')
    self.request.response.setHeader('Content-type', 'application/json')
    query = {}
    for name in _valid_params:
        # accept both plain and repeated (`name[]`) request parameters
        if self.request.form.get(name):
            query[name] = self.request.form[name]
        elif self.request.form.get(name + '[]'):
            query[name] = self.request.form[name + '[]']
    try:
        page_size = int(self.request.form.get('pageSize'))
    except Exception:
        page_size = 20
    # cap the page size so a single request cannot demand too much
    page_size = min(page_size, 50)
    try:
        page = int(self.request.form.get('page'))
    except Exception:
        page = 1
    catalog = api.portal.get_tool('portal_catalog')
    es = ElasticSearchCatalog(catalog)
    if es.enabled:
        return self.get_es_results(page, page_size, query)
    else:
        return self.get_results(page, page_size, query)

def catalog_object(self, object, uid=None, idxs=[], update_metadata=1,
                   pghandler=None):
    es = ElasticSearchCatalog(self)
    return es.catalog_object(object, uid, idxs, update_metadata, pghandler)

def get_popularity(site):
    setSite(site)
    catalog = api.portal.get_tool('portal_catalog')
    es = ElasticSearchCatalog(catalog)
    if not es.enabled:
        return
    service = analytics.get_ga_service()
    if not service:
        return
    profile = analytics.get_ga_profile(service)
    if not profile:
        return

    bulk_data = []
    bulk_size = es.get_setting('bulk_size', 50)
    conn = es.connection
    site._p_jar.sync()
    for path, page_views in get_results(service, profile)['rows']:
        path = path.split('?')[0].lstrip('/').replace('/view', '').split('@@')[0]
        ob = site.restrictedTraverse(str(path), None)
        if ob is None:
            continue
        annotations = IAnnotations(ob)
        data = {'page_views': int(page_views)}
        counts = annotations.get(COUNT_ANNOTATION_KEY, OOBTree())
        counts['page_views'] = int(page_views)
        annotations[COUNT_ANNOTATION_KEY] = counts
        for key, value in counts.items():
            if key in ('page_views', ):
                continue
            data[key + '_shares'] = value
        if IPloneSiteRoot.providedBy(ob):
            ob = ob[get_default_page(ob)]
        bulk_data.extend([{
            'update': {
                '_index': es.index_name,
                '_id': IUUID(ob)
            }
        }, {
            'doc': data
        }])
        if len(bulk_data) % bulk_size == 0:
            conn.bulk(index=es.index_name, body=bulk_data)
            bulk_data = []
            transaction.commit()
            site._p_jar.sync()
    if len(bulk_data) > 0:
        conn.bulk(index=es.index_name, body=bulk_data)
    transaction.commit()

def _make_query(self, query):
    portal_catalog = api.portal.get_tool('portal_catalog')
    try:
        es = ElasticSearchCatalog(portal_catalog)
        return es.connection.search(
            index=es.index_name,
            doc_type=es.doc_type,
            body=query)['aggregations']['totals']['buckets']
    except TransportError:
        return []

def elasticsearch(self):
    catalog = getToolByName(self.context, 'portal_catalog')
    es = ElasticSearchCatalog(catalog)
    conn = es.connection
    try:
        conn.cluster.health()
        return True, 'ok'
    except elasticsearch.ConnectionError as e:
        return False, str(e)

def _es_update(self):
    registry = getUtility(IRegistry)
    settings = registry.forInterface(IElasticSettings)
    settings.enabled = True
    settings.sniffer_timeout = 1.0
    self.catalog = getToolByName(self.portal, 'portal_catalog')
    self.catalog._elasticcustomindex = 'plone-test-index'
    self.es = ElasticSearchCatalog(self.catalog)
    self.es.recreateCatalog()
    self.catalog.manage_catalogRebuild()

def doimport(args):
    start_time = datetime.now()
    if not os.path.exists(args.filepath):
        logger.critical("does not exist: {}".format(args.filepath))
        sys.exit(1)
    try:
        catalog = api.portal.get_tool('portal_catalog')
        es_catalog = ElasticSearchCatalog(catalog)
    except Exception:
        logger.critical('Error setting up ElasticSearchCatalog')
        sys.exit(1)
    if not es_catalog.enabled:
        logger.critical(
            'Elasticsearch not enabled on site `{}`'.format(args.site_id))
        return
    es_custom_index_name_enabled = api.portal.get_registry_record(
        'castle.es_index_enabled', default=False)
    custom_index_value = api.portal.get_registry_record(
        'castle.es_index', default=None)
    index_name = audit.get_index_name(
        site_path=None,
        es_custom_index_name_enabled=es_custom_index_name_enabled,
        custom_index_value=custom_index_value)
    logger.info('importing audit log into ES index `{}`'.format(index_name))
    es = ESConnectionFactoryFactory()()
    if not es.indices.exists(index_name):
        logger.info('creating index...')
        try:
            audit._create_index(es, index_name)
        except Exception:
            logger.critical(
                'could not create index `{}`'.format(index_name),
                exc_info=True)
            sys.exit(1)

    num = 0
    bulkdata = []
    for log in get_log_data(args.filepath):
        bulkdata.append({
            "_index": index_name,
            "_source": log,
        })
        num += 1
        if num % 10000 == 0:
            logger.info("at {}, performing bulk operation...".format(num))
            bulkupdate(es, bulkdata, index_name)
            bulkdata = []
    logger.info("at {}, performing final bulk operation...".format(num))
    bulkupdate(es, bulkdata, index_name)
    end_time = datetime.now()
    elapsed_time = end_time - start_time
    logger.info('{} entries indexed in {}'.format(num, elapsed_time))

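# `bulkupdate` is used above but not defined in this snippet; a minimal
# sketch, assuming it simply hands the accumulated actions to the standard
# elasticsearch-py bulk helper (the real implementation may differ):
from elasticsearch import helpers


def bulkupdate(es, bulkdata, index_name):
    if not bulkdata:
        return
    # each action already carries `_index`, so the helper can send it as-is
    helpers.bulk(es, bulkdata)
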
def convert(self):
    if self.request.method == 'POST':
        authenticator = getMultiAdapter((self.context, self.request),
                                        name=u'authenticator')
        if not authenticator.verify():
            raise Unauthorized
        es = ElasticSearchCatalog(self.context)
        es.convertToElastic()
    site = aq_parent(self.context)
    self.request.response.redirect(
        '%s/@@elastic-controlpanel' % site.absolute_url())

def __call__(self):
    self.catalog = api.portal.get_tool('portal_catalog')
    self.request.response.setHeader('Content-type', 'application/json')
    query = {}
    for name in _valid_params:
        real_name = name
        if real_name.endswith(':list'):
            real_name = real_name[:-len(':list')]
        if self.request.form.get(name):
            query[real_name] = self.request.form[name]
        elif self.request.form.get(name + '[]'):
            query[real_name] = self.request.form[name + '[]']
    if query.get('after'):
        if query.get('sort_on') not in ('effective', 'modified', 'created'):
            sort_on = query['sort_on'] = 'effective'
        else:
            sort_on = query['sort_on']
        try:
            date = dateutil.parser.parse(query.pop('after'))
            start = DateTime(date)
            query[sort_on] = {'query': start, 'range': 'min'}
        except (KeyError, AttributeError, ValueError, TypeError):
            pass
    query['review_state'] = 'published'
    registry = getUtility(IRegistry)
    if not registry.get('plone.allow_public_in_private_container', False):
        query['has_private_parents'] = False
    query['exclude_from_search'] = False
    try:
        page_size = int(self.request.form.get('pageSize'))
    except Exception:
        page_size = 20
    page_size = min(page_size, 50)
    try:
        page = int(self.request.form.get('page'))
    except Exception:
        page = 1
    catalog = api.portal.get_tool('portal_catalog')
    es = ElasticSearchCatalog(catalog)
    if es.enabled:
        return self.get_es_results(page, page_size, query)
    else:
        return self.get_results(page, page_size, query)

def setUp(self):
    self.portal = self.layer['portal']
    self.request = self.layer['request']
    login(self.portal, TEST_USER_NAME)
    setRoles(self.portal, TEST_USER_ID, ('Member', 'Manager'))
    registry = getUtility(IRegistry)
    settings = registry.forInterface(IElasticSettings)
    settings.enabled = True
    settings.sniffer_timeout = None
    catalog = getToolByName(self.portal, 'portal_catalog')
    catalog._elasticcustomindex = 'plone-test-index'
    es = ElasticSearchCatalog(catalog)
    es.recreateCatalog()
    catalog.manage_catalogRebuild()
    transaction.commit()  # have to do commit for es integration...
    folder = api.content.create(
        type='Folder',
        id='esfolder1',
        container=self.portal,
        title='Foobar folder')
    api.content.create(
        type='Document',
        id='esdoc1',
        container=folder,
        title='Foobar one')
    doc = api.content.create(
        type='Document',
        id='esdoc2',
        container=folder,
        subject=('foobar',),
        title='Foobar two')
    api.content.create(
        type='Document',
        id='esdoc3',
        container=folder,
        title='Foobar three')
    ann = IAnnotations(doc)
    ann[COUNT_ANNOTATION_KEY] = {
        'twitter_matomo': 5,
        'facebook': 5,
    }
    doc.reindexObject()
    transaction.commit()
    url = 'http://{}:9200/plone-test-index/_flush'.format(host)
    requests.post(url)

def moveObjectsByDelta(self, ids, delta, subset_ids=None,
                       suppress_events=False):
    res = self._old_moveObjectsByDelta(
        ids, delta, subset_ids=subset_ids, suppress_events=suppress_events)
    es = ElasticSearchCatalog(api.portal.get_tool('portal_catalog'))
    if es.enabled:
        if subset_ids is None:
            subset_ids = self.idsInOrder()
        hook.index_positions(self.context, subset_ids)
    return res

def search_es(self, query, start, size):
    user = _getAuthenticatedUser(self.catalog)
    query['allowedRolesAndUsers'] = self.catalog._listAllowedRolesAndUsers(
        user)
    es = ElasticSearchCatalog(self.catalog)
    qassembler = getMultiAdapter((self.request, es), IQueryAssembler)
    dquery, sort = qassembler.normalize(query)
    equery = qassembler(dquery)
    doc_type = es.doc_type
    if 'searchSite' in self.request.form:
        doc_type = CRAWLED_SITE_ES_DOC_TYPE
        equery = {
            'filtered': {
                'filter': {
                    "term": {
                        "domain": self.request.form['searchSite']
                    }
                },
                'query': equery['function_score']['query']['filtered']['query']
            }
        }
    query = {
        'query': equery,
        "suggest": {
            "SearchableText": {
                "text": query.get('SearchableText', ''),
                "term": {
                    "field": "SearchableText"
                }
            }
        },
        'sort': sort
    }
    query_params = {
        'from_': start,
        'size': size,
        'fields': ','.join(_search_attributes) + ',path.path'
    }
    return es.connection.search(
        index=es.index_name, doc_type=doc_type, body=query, **query_params)

def PloneSite_moveObjectsByDelta(self, ids, delta, subset_ids=None,
                                 suppress_events=False):
    res = self._old_moveObjectsByDelta(
        ids, delta, subset_ids=subset_ids, suppress_events=suppress_events)
    es = ElasticSearchCatalog(api.portal.get_tool('portal_catalog'))
    if es.enabled:
        if subset_ids is None:
            objects = list(self._objects)
            subset_ids = self.getIdsSubset(objects)
        hook.index_positions(self, subset_ids)
    return res

def search_es(self, query, start, size):
    user = _getAuthenticatedUser(self.catalog)
    query['allowedRolesAndUsers'] = self.catalog._listAllowedRolesAndUsers(
        user)
    es = ElasticSearchCatalog(self.catalog)
    qassembler = getMultiAdapter((self.request, es), IQueryAssembler)
    dquery, sort = qassembler.normalize(query)
    equery = qassembler(dquery)
    index_name = es.index_name
    if 'searchSite' in self.request.form:
        index_name = '{index_name}_crawler'.format(index_name=es.index_name)
        # get rid of allowedRolesAndUsers, trashed, popularity script, etc.
        # (n/a for public crawl)
        equery = equery['script_score']['query']
        equery['bool']['filter'] = [{
            'term': {
                'domain': self.request.form['searchSite']
            }
        }]
    query = {
        'query': equery,
        "suggest": {
            "SearchableText": {
                "text": query.get('SearchableText', ''),
                "term": {
                    "field": "SearchableText"
                }
            }
        },
        'sort': sort
    }
    query_params = {
        'stored_fields': ','.join(_search_attributes),
        'from_': start,
        'size': size,
    }
    return es.connection.search(index=index_name, body=query, **query_params)

def getHook(es=None):
    if es is None:
        from collective.elasticsearch.es import ElasticSearchCatalog
        es = ElasticSearchCatalog(api.portal.get_tool('portal_catalog'))
    if not es.enabled:
        return
    trns = transaction.get()
    hook = None
    for _hook in trns._after_commit:
        if isinstance(_hook[0], CommitHook):
            hook = _hook[0]
            break
    if hook is None:
        hook = CommitHook(es)
        trns.addAfterCommitHook(hook)
    return hook

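# A usage sketch, assuming CommitHook exposes the same index/remove/positions
# queues that index_batch() consumes at commit time (the attribute names here
# are assumptions, not confirmed by this snippet):
def queue_reindex(obj):
    hook = getHook()
    if hook is None:
        # Elasticsearch integration disabled; nothing to queue
        return
    hook.index[IUUID(obj)] = obj
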
def test_crawl_page(self):
    responses.add(
        responses.GET, "https://www.foobar.com",
        body=TEST_ARCHIVE_PAGE, content_type="text/html")
    catalog = api.portal.get_tool('portal_catalog')
    es = ElasticSearchCatalog(catalog)
    registry = getUtility(IRegistry)
    settings = registry.forInterface(ICrawlerConfiguration, prefix='castle')
    crawler = Crawler(self.portal, settings, es)
    data = crawler.crawl_page('https://www.foobar.com')
    self.assertEqual(data['domain'], 'www.foobar.com')
    self.assertEqual(data['url'], 'https://www.foobar.com')
    self.assertEqual(data['portal_type'], 'Form Folder')
    self.assertTrue(bool(data['Title']))
    self.assertTrue(bool(data['SearchableText']))

def setUp(self):
    super(BaseTest, self).setUp()
    self.portal = self.layer['portal']
    self.request = self.layer['request']
    self.request.environ['testing'] = True
    self.app = self.layer['app']
    registry = getUtility(IRegistry)
    settings = registry.forInterface(IElasticSettings, check=False)
    settings.enabled = True
    self.catalog = getToolByName(self.portal, 'portal_catalog')
    self.catalog._elasticcustomindex = 'plone-test-index'
    self.es = ElasticSearchCatalog(self.catalog)
    self.es.convertToElastic()
    self.catalog.manage_catalogRebuild()
    # need to commit here so all tests start with a baseline
    # of elastic enabled
    self.commit()

def get_totals(self):
    query = {
        "size": 0,
        "aggregations": {
            "totals": {
                "terms": {
                    "field": "portal_type"
                }
            }
        }
    }
    portal_catalog = api.portal.get_tool('portal_catalog')
    try:
        es = ElasticSearchCatalog(portal_catalog)
        result = es.connection.search(
            index=es.index_name, doc_type=es.doc_type, body=query)
    except TransportError:
        return []
    return result['aggregations']['totals']['buckets']

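# Example (a sketch): the buckets returned by get_totals() follow the
# standard Elasticsearch terms-aggregation shape, so a caller can render
# counts per portal_type like this (`view` is a hypothetical instance):
for bucket in view.get_totals():
    print('{}: {}'.format(bucket['key'], bucket['doc_count']))
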
def crawl_site(site, full=False):
    registry = getUtility(IRegistry)
    settings = registry.forInterface(ICrawlerConfiguration, prefix='castle')
    if not settings.crawler_active or not settings.crawler_site_maps:
        logger.info("Crawler must first be enabled in Site Setup")
        return False
    catalog = api.portal.get_tool('portal_catalog')
    es = ElasticSearchCatalog(catalog)
    index_name = '{site_index_name}_crawler'.format(
        site_index_name=es.index_name)
    if not es.enabled:
        logger.info(
            "Elasticsearch must be enabled in Site Setup to use crawler")
        return False

    # check index type is mapped, create if not
    try:
        es.connection.indices.get_mapping(index=index_name)
    except NotFoundError:
        # need to add it
        adapter = getMultiAdapter((getRequest(), es), IMappingProvider)
        mapping = adapter()
        mapping['properties'].update(CRAWLER_ES_MAPPING)
        if not es.connection.indices.exists(index_name):
            es.connection.indices.create(index_name)
        es.connection.indices.put_mapping(body=mapping, index=index_name)

    crawler = Crawler(site, settings, es)
    if settings.crawler_index_archive:
        crawler.crawl_archives()
    for sitemap in settings.crawler_site_maps:
        try:
            crawler.crawl_site_map(sitemap, full)
        except Exception:
            logger.error('Error crawling site map: %s' % sitemap,
                         exc_info=True)
    return True

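# A sketch of driving the crawler from a bin/instance script; `app` and
# `args` are assumed to come from the usual Zope script bootstrap:
from zope.component.hooks import setSite
import transaction

site = app[args.site_id]  # noqa
setSite(site)
if crawl_site(site, full=True):
    transaction.commit()
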
def get_index_summary(self):
    query = {
        "size": 0,
        "aggregations": {
            "totals": {
                "terms": {
                    "field": "domain"
                }
            }
        }
    }
    portal_catalog = api.portal.get_tool('portal_catalog')
    try:
        es = ElasticSearchCatalog(portal_catalog)
        result = es.connection.search(
            index=es.index_name,
            doc_type=CRAWLED_SITE_ES_DOC_TYPE,
            body=query)
    except TransportError:
        return []
    data = result['aggregations']['totals']['buckets']
    return data

def get_index_summary(self):
    query = {
        "size": 0,
        "aggregations": {
            "totals": {
                "terms": {
                    "field": "domain"
                }
            }
        }
    }
    portal_catalog = api.portal.get_tool('portal_catalog')
    try:
        es = ElasticSearchCatalog(portal_catalog)
        result = es.connection.search(
            index='{index_name}_crawler'.format(index_name=es.index_name),
            body=query)
    except TransportError:
        return []
    data = result['aggregations']['totals']['buckets']
    return data

def crawl_site(site, full=False):
    registry = getUtility(IRegistry)
    settings = registry.forInterface(ICrawlerConfiguration, prefix='castle')
    if not settings.crawler_active or not settings.crawler_site_maps:
        return False
    catalog = api.portal.get_tool('portal_catalog')
    es = ElasticSearchCatalog(catalog)
    if not es.enabled:
        return False

    # check index type is mapped, create if not
    try:
        es.connection.indices.get_mapping(
            index=es.index_name, doc_type=CRAWLED_SITE_ES_DOC_TYPE)
    except NotFoundError:
        # need to add it
        adapter = getMultiAdapter((getRequest(), es), IMappingProvider)
        mapping = adapter()
        mapping['properties'].update(CRAWLER_ES_MAPPING)
        es.connection.indices.put_mapping(
            doc_type=CRAWLED_SITE_ES_DOC_TYPE,
            body=mapping,
            index=es.index_name)

    crawler = Crawler(site, settings, es)
    if settings.crawler_index_archive:
        crawler.crawl_archives()
    for sitemap in settings.crawler_site_maps:
        try:
            crawler.crawl_site_map(sitemap, full)
        except Exception:
            logger.error('Error crawling site map: %s' % sitemap,
                         exc_info=True)
    return True

def get_key_from_url(url):
    # NOTE: the original definition line is missing from this snippet;
    # the function name here is an assumption.
    parsed = urlparse(url)
    # parsed url includes bucket so we strip off bucket to get actual key
    return '/'.join(parsed.path.split('/')[2:])


if __name__ == '__main__':
    login_as_admin(app)  # noqa
    site = app[args.site_id]  # noqa
    setSite(site)
    toremove = {}  # uid: path
    catalog = api.portal.get_tool('portal_catalog')
    registry = getUtility(IRegistry)
    crawler_settings = registry.forInterface(
        ICrawlerConfiguration, prefix='castle')
    es = ElasticSearchCatalog(catalog)
    crawler = Crawler(site, crawler_settings, es)
    storage = archival.Storage(site)
    for key, archive_data in storage.archives.items():
        for url in (archive_data.get('view_url'), archive_data['url']):
            if not url:
                continue
            resp = requests.get(url)
            # default to '' so a missing content-type header does not raise
            if 'html' not in resp.headers.get('content-type', ''):
                continue
            print('processing ' + url)
            dom = fromstring(resp.content)
            prop = dom.cssselect('meta[property="og:url"]')
            fix_urls(storage, dom)
            html = tostring(dom)

def index_in_es(obj):
    catalog = api.portal.get_tool('portal_catalog')
    es = ElasticSearchCatalog(catalog)
    if es.enabled:
        index_batch([], {IUUID(obj): obj}, [], es)

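# Example usage (a sketch): keep Elasticsearch in sync from an event
# subscriber. The handler name and its registration are assumptions for
# illustration; index_in_es() is already a no-op when ES is disabled.
def on_object_modified(obj, event):
    index_in_es(obj)
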
def index_site(site):
    setup_site(site)
    catalog = api.portal.get_tool('portal_catalog')
    es = ElasticSearchCatalog(catalog)
    if not es.enabled:
        return
    req = getRequest()
    assert req is not None
    alsoProvides(req, IReindexActive)

    # first we want to get all document ids from elastic
    page_size = 700
    ids = []
    result = es.connection.search(
        index=es.index_name,
        doc_type=es.doc_type,
        scroll='30s',
        size=page_size,
        fields=[],
        body={
            "query": {
                "match_all": {}
            }
        })
    ids.extend([r['_id'] for r in result['hits']['hits']])
    scroll_id = result['_scroll_id']
    while scroll_id:
        result = es.connection.scroll(
            scroll_id=scroll_id,
            scroll='30s'
        )
        if len(result['hits']['hits']) == 0:
            break
        ids.extend([r['_id'] for r in result['hits']['hits']])
        scroll_id = result['_scroll_id']

    index = {}
    count = 0
    for brain in catalog():
        count += 1
        # go through each object and reindex using bulk setting
        try:
            ob = brain.getObject()
        except Exception:
            print('Could not get object of %s' % brain.getPath())
            continue
        try:
            uid = IUUID(ob)
            index[uid] = ob
        except TypeError:
            print('Could not get UID of %s' % brain.getPath())
            continue
        if uid in ids:
            # remove from uids... When all said and done,
            # we'll make sure the uids left are in fact no longer on the
            # system and remove them from es
            ids.remove(uid)
        if len(index) > 300:
            print('finished indexing %i' % count)
            index_batch([], index, [], es)
            site._p_jar.invalidateCache()  # noqa
            transaction.begin()
            site._p_jar.sync()  # noqa
            index = {}
    index_batch([], index, [], es)

    remove = []
    for uid in ids:
        brains = catalog(UID=uid)
        if len(brains) == 0:
            remove.append(uid)
    index_batch(remove, {}, [], es)

def manage_catalogClear(self, *args, **kwargs):
    """ need to be publishable """
    es = ElasticSearchCatalog(self)
    return es.manage_catalogClear(*args, **kwargs)

def safeSearchResults(self, REQUEST=None, **kw):
    es = ElasticSearchCatalog(self)
    return es.searchResults(REQUEST, check_perms=True, **kw)

def unrestrictedSearchResults(self, REQUEST=None, **kw):
    es = ElasticSearchCatalog(self)
    return es.searchResults(REQUEST, check_perms=False, **kw)

def uncatalog_object(self, uid, obj=None, *args, **kwargs):
    es = ElasticSearchCatalog(self)
    return es.uncatalog_object(uid, obj, *args, **kwargs)

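# The delegating methods above take the catalog tool as `self`, which reads
# as monkey patching. A minimal wiring sketch (assumed, not shown in this
# snippet):
from Products.CMFPlone.CatalogTool import CatalogTool

CatalogTool.catalog_object = catalog_object
CatalogTool.uncatalog_object = uncatalog_object
CatalogTool.searchResults = safeSearchResults
CatalogTool.unrestrictedSearchResults = unrestrictedSearchResults
CatalogTool.manage_catalogClear = manage_catalogClear
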
def index_batch(remove, index, positions, es=None):
    if es is None:
        from collective.elasticsearch.es import ElasticSearchCatalog
        es = ElasticSearchCatalog(api.portal.get_tool('portal_catalog'))
    setSite(api.portal.get())
    conn = es.connection
    bulk_size = es.get_setting('bulk_size', 50)

    if len(remove) > 0:
        bulk_data = []
        for uid in remove:
            bulk_data.append({
                'delete': {
                    '_index': es.index_name,
                    '_type': es.doc_type,
                    '_id': uid
                }
            })
        es.connection.bulk(
            index=es.index_name, doc_type=es.doc_type, body=bulk_data)

    if len(index) > 0:
        if type(index) in (list, tuple, set):
            # does not contain objects, must be async, convert to dict
            index = dict([(k, None) for k in index])
        bulk_data = []
        for uid, obj in index.items():
            if obj is None:
                obj = uuidToObject(uid)
                if obj is None:
                    continue
            bulk_data.extend([{
                'index': {
                    '_index': es.index_name,
                    '_type': es.doc_type,
                    '_id': uid
                }
            }, get_index_data(obj, es)])
            # flush in chunks of bulk_size actions
            if len(bulk_data) % bulk_size == 0:
                conn.bulk(
                    index=es.index_name, doc_type=es.doc_type, body=bulk_data)
                bulk_data = []
        if len(bulk_data) > 0:
            conn.bulk(
                index=es.index_name, doc_type=es.doc_type, body=bulk_data)

    if len(positions) > 0:
        bulk_data = []
        index = getIndex(es.catalogtool._catalog, 'getObjPositionInParent')
        for uid, ids in positions.items():
            if uid == '/':
                parent = getSite()
            else:
                parent = uuidToObject(uid)
            if parent is None:
                logger.warning('could not find object to index positions')
                continue
            for _id in ids:
                ob = parent[_id]
                wrapped_object = get_wrapped_object(ob, es)
                try:
                    value = index.get_value(wrapped_object)
                except Exception:
                    continue
                bulk_data.extend([{
                    'update': {
                        '_index': es.index_name,
                        '_type': es.doc_type,
                        '_id': IUUID(ob)
                    }
                }, {
                    'doc': {
                        'getObjPositionInParent': value
                    }
                }])
                if len(bulk_data) % bulk_size == 0:
                    conn.bulk(
                        index=es.index_name,
                        doc_type=es.doc_type,
                        body=bulk_data)
                    bulk_data = []
        if len(bulk_data) > 0:
            conn.bulk(
                index=es.index_name, doc_type=es.doc_type, body=bulk_data)

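# Example usage (a sketch): push one changed object and purge one stale uid
# through the same bulk pathway; `es` may be omitted, in which case
# index_batch() resolves the ElasticSearchCatalog from portal_catalog itself.
from plone.uuid.interfaces import IUUID


def reindex_and_purge(changed_obj, stale_uid):
    index_batch([stale_uid], {IUUID(changed_obj): changed_obj}, [])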