import json
import urllib2  # Python 2; these snippets predate urllib.request

from pyes import ES


# VIDEO_INDEX (like the FACET_INDEX and COMPLETION_INDEX used further down) is
# assumed to be a module-level constant naming the Elasticsearch index.
def get_related_videos(video):
    related_videos = []
    conn = ES(['127.0.0.1:9200'])
    conn.default_indices = VIDEO_INDEX
    conn.refresh(VIDEO_INDEX)
    q = {
        "query": {
            "bool": {
                # Exclude the source video itself; relate on shared
                # category, topic, or language.
                "must_not": {"term": {"uid": video.uid}},
                "should": [
                    {"terms": {"category": [video.category]}},
                    {"terms": {"topic": [video.topic]}},
                    {"terms": {"language": [video.language]}},
                ],
                "minimum_should_match": 1,
            }
        }
    }
    try:
        query = json.dumps(q)
        url = "http://localhost:9200/%s/_search" % VIDEO_INDEX
        response = urllib2.urlopen(url, query)
        result = json.loads(response.read())
        for res in result['hits']['hits']:
            related_videos.append(res['_source'])
    except Exception:
        pass  # on any failure, fall through and return an empty list
    return related_videos
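# Design note: get_related_videos opens a pyes connection but then performs the
# search over raw urllib2. Below is a minimal sketch of issuing the same query
# through the already-open connection instead, assuming pyes's search_raw
# accepts a query dict and returns the decoded JSON response; the helper name
# is hypothetical, not from the original source.
def get_related_videos_via_pyes(conn, q):
    result = conn.search_raw(q, indices=VIDEO_INDEX)
    return [hit['_source'] for hit in result['hits']['hits']]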
# Assumes pyes's ES plus the surrounding app's Data source and redirect helper.
def init():
    conn = ES('127.0.0.1:9200')
    try:
        conn.delete_index("zhihu")
    except Exception:
        pass  # the index may not exist yet
    conn.create_index("zhihu")
    mapping = {
        u'id': {'store': 'yes', 'type': u'integer'},
        u'link': {'store': 'yes', 'type': u'string'},
        u'title': {'boost': 1.0, 'index': 'analyzed', 'store': 'yes', 'type': u'string'},
    }
    conn.put_mapping("answer", {'properties': mapping}, ["zhihu"])
    # (Re)index every answer, then refresh so the docs are searchable.
    for item in Data().getData():
        conn.index(item, "zhihu", "answer", item['id'])
    conn.refresh(["zhihu"])
    return redirect('/list')
def get_related_collections(collection, featured):
    related_collections = []
    conn = ES(["127.0.0.1:9200"])
    conn.default_indices = FACET_INDEX
    conn.refresh(FACET_INDEX)
    # Default: collections sharing a subject or topic, excluding this one.
    q = {
        "query": {
            "bool": {
                "must_not": {"term": {"uid": collection.uid}},
                "should": [
                    {"terms": {"subject": [collection.subject]}},
                    {"terms": {"topic": [collection.topic]}},
                ],
                "minimum_should_match": 1,
            }
        }
    }
    if featured:
        # Featured mode: any other featured collection counts as related.
        q = {
            "query": {
                "bool": {
                    "must_not": {"term": {"uid": collection.uid}},
                    "should": [{"term": {"featured": True}}],
                    "minimum_should_match": 1,
                }
            }
        }
    try:
        query = json.dumps(q)
        url = "http://localhost:9200/%s/_search" % FACET_INDEX
        response = urllib2.urlopen(url, query)
        result = json.loads(response.read())
        for res in result["hits"]["hits"]:
            related_collections.append(res["_source"])
    except Exception:
        pass
    return related_collections
# Variant of get_related_collections above, without the `featured` branch.
def get_related_collections(collection):
    related_collections = []
    conn = ES(['127.0.0.1:9200'])
    conn.default_indices = FACET_INDEX
    conn.refresh(FACET_INDEX)
    q = {
        "query": {
            "bool": {
                "must_not": {"term": {"uid": collection.uid}},
                "should": [
                    {"terms": {"subject": [collection.subject]}},
                    {"terms": {"topic": [collection.topic]}},
                ],
                "minimum_should_match": 1,
            }
        }
    }
    try:
        query = json.dumps(q)
        url = "http://localhost:9200/%s/_search" % FACET_INDEX
        response = urllib2.urlopen(url, query)
        result = json.loads(response.read())
        for res in result['hits']['hits']:
            related_collections.append(res['_source'])
    except Exception:
        pass
    return related_collections
# Django completion endpoint; assumes django.http.HttpResponse and a
# module-level MAX_RESULT_SIZE constant.
def searchCompletions(request):
    searchString = request.GET.get('searchString')
    maxCount = int(request.GET.get('maxCount'))
    conn = ES(['127.0.0.1:9200'])
    conn.default_indices = COMPLETION_INDEX
    conn.refresh(COMPLETION_INDEX)
    # Match on the edge-ngram "partial" field; the terms facet supplies
    # per-term counts for collection-type completions.
    q = {
        "query": {
            "query_string": {
                "fields": ["searchTerm.partial"],
                "query": searchString
            }
        },
        "facets": {
            "facet": {
                "terms": {
                    "fields": ["searchTerm"],
                    "size": MAX_RESULT_SIZE
                }
            }
        },
        "size": maxCount
    }
    try:
        query = json.dumps(q)
        url = "http://localhost:9200/%s/_search" % COMPLETION_INDEX
        response = urllib2.urlopen(url, query)
        result = json.loads(response.read())
        result_list = []
        done_list = []
        for res in result['hits']['hits']:
            if res['_source']['type'] != "Collections":
                result_list.append(res['_source'])
                res['_source']['count'] = 0
            elif res['_source']['searchTerm'] not in done_list:
                # Attach the facet count to each distinct collection term.
                val = str(res['_source']['searchTerm']).lower()
                for term in result['facets']['facet']['terms']:
                    if val == term['term']:
                        res['_source']['count'] = term['count']
                done_list.append(res['_source']['searchTerm'])
                result_list.append(res['_source'])
        if len(result_list) == 0:
            # For now, just display "No Results" when nothing is found
            # in the completion index.
            result_list.append({"searchTerm": "No Results"})
        resp = json.dumps({
            "responseCode": "OK",
            "requestParameters": {
                "searchString": searchString,
                "maxCount": unicode(maxCount)
            },
            "completions": result_list,
            "totalCount": unicode(maxCount)
        })
        return HttpResponse(resp)
    except Exception, ex:
        return HttpResponse('0')
# Bulk-loads newline-delimited JSON docs (optionally gzipped) into an index.
# Assumes Python 2 with sys, json, gzip and traceback imported; `flush` is a
# helper defined elsewhere that sends the pending bulk batch.
def main(fn, args):
    conn = ES(args.host, bulk_size=10 * args.bulksize)
    if fn.endswith(".gz"):
        fp = gzip.open(fn)
    else:
        fp = open(fn)
    count = 0
    total = 0
    try:
        for line in fp:
            doc = json.loads(line.strip())
            # Respect an explicit _id if the document carries one.
            if doc.get("_id"):
                _id = doc["_id"]
                del doc["_id"]
            else:
                _id = None
            conn.index(doc=doc, index=args.index, doc_type=args.doctype,
                       id=_id, bulk=True)
            count += 1
            total += 1
            if count % args.bulksize == 0:
                flush(conn, count)
                count = 0
    except:
        print "traceback", "".join(traceback.format_exception(*sys.exc_info()))
        raise
    finally:
        fp.close()
    try:
        # Flush the final partial batch and make it searchable.
        flush(conn, count)
        conn.refresh(args.index)
    except:
        pass
    print "Indexed %s docs total" % total
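# A minimal, hypothetical sketch of wiring main() above up from the command
# line; the flag names simply mirror the attributes main() reads (host,
# bulksize, index, doctype) and are not from the original source.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(
        description="Bulk-index JSON lines into Elasticsearch")
    parser.add_argument("file", help="input file, one JSON doc per line (.gz ok)")
    parser.add_argument("--host", default="127.0.0.1:9200")
    parser.add_argument("--index", required=True)
    parser.add_argument("--doctype", required=True)
    parser.add_argument("--bulksize", type=int, default=500)
    args = parser.parse_args()
    main(args.file, args)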
# Django view backing the collection browse page. Assumes django.http's
# HttpResponse, the Partner model, and a create_query helper from the
# surrounding project.
def get_collections_from_elasticsearch(request):
    params = request.GET
    language_name = params.get('language__name', None)
    # TODO: Change this from 'None'?
    searchString = params.get('searchString', 'None')
    partner_uid = params.get('uid', None)
    featured = params.get('featured', None)  # TODO: Change this from 'None'? (read but not used below)
    if searchString != 'None':
        # Fuzzy-like-this across all facet "partial" fields.
        match_query = {
            "flt": {
                "fields": ["_all", "subject.partial", "language.partial",
                           "partner.partial", "state.partial", "category.partial",
                           "subcategory.partial", "topic.partial"],
                "like_text": searchString
            }
        }
    elif partner_uid:
        partner_name = Partner.objects.get(uid=partner_uid).name
        match_query = {"match": {"partner": {"query": partner_name}}}
    else:
        match_query = {"match_all": {}}
    query = []
    filter = []
    if language_name == 'All Languages':
        language_name = None
    query = create_query(params, language_name)
    if query:
        filter = {"and": query}
    order_by = params.get('order_by', '-featured')
    offset = int(params.get('offset'))
    limit = int(params.get('limit'))
    order_by = order_by[1:]  # removing '-' since it will always be '-'
    conn = ES(['127.0.0.1:9200'])
    conn.default_indices = FACET_INDEX
    conn.refresh(FACET_INDEX)
    q = {
        "query": {
            "filtered": {
                "query": match_query,
                "filter": filter
            }
        },
        "facets": {
            "facet": {
                "terms": {
                    "fields": ["language", "partner", "state", "category",
                               "subcategory", "topic", "subject"],
                    "size": MAX_RESULT_SIZE
                }
            }
        },
        "sort": {order_by: {"order": "desc"}},
        "size": MAX_RESULT_SIZE
    }
    result_list = []
    try:
        query = json.dumps(q)
        url = "http://localhost:9200/%s/_search" % FACET_INDEX
        response = urllib2.urlopen(url, query)
        result = json.loads(response.read())
        for res in result['hits']['hits']:
            result_list.append(res['_source'])
        facets = json.dumps(result['facets']['facet']['terms'])
        if result_list:
            resp = json.dumps({
                "meta": {"limit": str(limit), "next": "", "offset": str(offset),
                         "previous": "null", "total_count": str(len(result_list))},
                "objects": result_list[offset:offset + limit],
                "facets": facets
            })
        else:
            resp = json.dumps({
                "meta": {"limit": str(limit), "next": "", "offset": str(offset),
                         "previous": "null", "total_count": "1"},
                "objects": [{'Message': 'No Collections Found', 'error': "1"}],
                "facets": facets
            })
        return HttpResponse(resp)
    except Exception, ex:
        print ex
        return HttpResponse(str(ex))
# Assumes pyes (ES, TextQuery, RangeQuery, MatchAllQuery, ESRange, and the
# NotFoundException/TypeMissingException/IndexMissingException exceptions),
# threading.Timer, logging, and the project's verify_url, retry_until_ok and
# bsjson (JSON codec) helpers.
class DocManager(object):
    """The DocManager class creates a connection to the backend engine and
    adds/removes documents, and in the case of rollback, searches for them.

    The reason for storing id/doc pairs, as opposed to docs alone, is so that
    multiple updates to the same doc reflect the most up-to-date version,
    rather than multiple, slightly different versions of a doc. We use
    Elastic's native fields for _id and ns, but we also store them as fields
    in the document, due to compatibility issues.
    """

    def __init__(self, url, auto_commit=True, unique_key='_id'):
        """Verify the Elastic URL and establish a connection."""
        if verify_url(url) is False:
            raise SystemError
        self.elastic = ES(server=url)
        self.auto_commit = auto_commit
        self.doc_type = 'string'  # default type is string, change if needed
        self.unique_key = unique_key
        if auto_commit:
            self.run_auto_commit()

    def stop(self):
        """Stops the instance."""
        self.auto_commit = False

    def upsert(self, doc):
        """Update or insert a document into Elastic.

        If you'd like to have different types of document in your database,
        you can store the doc type as a field in Mongo and set doc_type to
        that field (e.g. doc_type = doc['_type']).
        """
        doc_type = self.doc_type
        index = doc['ns']
        doc[self.unique_key] = str(doc[self.unique_key])
        doc_id = doc[self.unique_key]
        id_query = TextQuery('_id', doc_id)
        elastic_cursor = self.elastic.search(query=id_query, indices=index)  # result currently unused
        try:
            self.elastic.index(bsjson.dumps(doc), index, doc_type, doc_id)
        except ValueError:
            logging.info("Could not update %s" % (doc,))
        self.elastic.refresh()

    def remove(self, doc):
        """Removes documents from Elastic.

        The input is a python dictionary that represents a mongo document.
        """
        try:
            self.elastic.delete(doc['ns'], 'string', str(doc[self.unique_key]))
        except (NotFoundException, TypeMissingException, IndexMissingException):
            pass

    def _remove(self):
        """For test purposes only. Removes all documents in test.test."""
        try:
            self.elastic.delete('test.test', 'string', '')
        except (NotFoundException, TypeMissingException, IndexMissingException):
            pass

    def search(self, start_ts, end_ts):
        """Called to query Elastic for documents in a time range."""
        res = ESRange('_ts', from_value=start_ts, to_value=end_ts)
        results = self.elastic.search(RangeQuery(res))
        return results

    def _search(self):
        """For test purposes only. Performs a search on Elastic with an
        empty query. Does not have to be implemented.
        """
        results = self.elastic.search(MatchAllQuery())
        return results

    def commit(self):
        """This function is used to force a refresh/commit."""
        retry_until_ok(self.elastic.refresh)

    def run_auto_commit(self):
        """Periodically commits to the Elastic server."""
        self.elastic.refresh()
        if self.auto_commit:
            Timer(1, self.run_auto_commit).start()

    def get_last_doc(self):
        """Returns the last document stored in the Elastic engine."""
        result = self.elastic.search(MatchAllQuery(), size=1, sort='_ts:desc')
        for item in result:
            return item
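# A minimal, hypothetical usage sketch of DocManager, assuming a local
# Elasticsearch node, the module's verify_url/bsjson helpers in scope, and
# docs shaped as above ('ns' names the target index, '_ts' is a timestamp).
# The URL and sample doc are illustrative, not from the original source.
if __name__ == "__main__":
    dm = DocManager("http://localhost:9200", auto_commit=False)
    dm.upsert({"_id": "1", "ns": "test.test", "name": "example", "_ts": 1})
    dm.commit()              # force a refresh so the doc is searchable
    print dm.get_last_doc()  # most recently timestamped doc
    dm.remove({"_id": "1", "ns": "test.test"})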
# Assumes pyes (ES, StringQuery, and its exception classes), plus the
# project's ES_HOST, ES_INDEX_NAME, ask and timesofar helpers, and time.
class ESIndexerBase(object):
    ES_HOST = ES_HOST
    ES_INDEX_NAME = ES_INDEX_NAME
    ES_INDEX_TYPE = 'gene'

    def __init__(self):
        self.conn = ES(self.ES_HOST, default_indexes=[self.ES_INDEX_NAME],
                       timeout=10.0)
        self.step = 10000

    def create_index(self):
        try:
            print self.conn.open_index(self.ES_INDEX_NAME)
        except IndexMissingException:
            print self.conn.create_index(self.ES_INDEX_NAME)

    def delete_index_type(self, index_type):
        '''Delete all indexes for a given index_type.'''
        index_name = self.ES_INDEX_NAME
        # index_type = self.ES_INDEX_TYPE
        # Check if index_type exists
        mapping = self.conn.get_mapping(index_type, index_name)
        if index_name not in mapping or index_type not in mapping[index_name]:
            print 'Error: index type "%s" does not exist in index "%s".' % (index_type, index_name)
            return
        path = '/%s/%s' % (index_name, index_type)
        if ask('Confirm to delete all data under "%s":' % path) == 'Y':
            return self.conn.delete_mapping(index_name, index_type)

    def index(self, doc, index_type, id=None):
        '''Add a doc to the index. If id is not None, the existing doc
        will be updated.
        '''
        # index_type = self.ES_INDEX_TYPE
        return self.conn.index(doc, self.ES_INDEX_NAME, index_type, id=id)

    def delete_index(self, index_type, id):
        '''Delete a doc from the index based on the passed id.'''
        # index_type = self.ES_INDEX_TYPE
        return self.conn.delete(self.ES_INDEX_NAME, index_type, id)

    def optimize(self):
        return self.conn.optimize(self.ES_INDEX_NAME, wait_for_merge=True)

    def get_field_mapping(self):
        import dataload
        reload(dataload)
        dataload.register_sources()
        return dataload.get_mapping()

    def build_index(self, doc_d, update_mapping=False, bulk=True):
        index_name = self.ES_INDEX_NAME
        index_type = self.ES_INDEX_TYPE
        # Test if the index exists
        try:
            print "Opening index...", self.conn.open_index(index_name)
        except NotFoundException:
            print 'Error: index "%s" does not exist. Create it first.' % index_name
            return -1
        try:
            cur_mapping = self.conn.get_mapping(index_type, index_name)
            empty_mapping = False
        except ElasticSearchException:
            # If no existing mapping is available for index_type,
            # force update_mapping to True.
            empty_mapping = True
            update_mapping = True
        # empty_mapping = not cur_mapping[index_name].get(index_type, {})
        # if empty_mapping:
        #     #if no existing mapping available for index_type
        #     #force update_mapping to True
        #     update_mapping = True
        if update_mapping:
            print "Updating mapping...",
            if not empty_mapping:
                print "\n\tRemoving existing mapping...",
                print self.conn.delete_mapping(index_name, index_type)
            _mapping = self.get_field_mapping()
            print self.conn.put_mapping(index_type, _mapping, [index_name])
        print "Building index..."
        t0 = time.time()
        for doc_id, doc in doc_d.items():
            self.conn.index(doc, index_name, index_type, doc_id, bulk=bulk)
        print self.conn.flush()
        print self.conn.refresh()
        print "Done[%s]" % timesofar(t0)

    def query(self, qs, fields='symbol,name', **kwargs):
        _q = StringQuery(qs)
        res = self.conn.search(_q, fields=fields, **kwargs)
        return res
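# A minimal, hypothetical sketch of driving ESIndexerBase: build the index
# from an id->doc dict, then run a query. Assumes ES_HOST/ES_INDEX_NAME are
# configured and the project's dataload module is importable; the sample docs
# and the "symbol:CDK2" query string are illustrative, not from the original.
if __name__ == "__main__":
    indexer = ESIndexerBase()
    indexer.create_index()
    docs = {
        "1017": {"symbol": "CDK2", "name": "cyclin dependent kinase 2"},
        "1018": {"symbol": "CDK3", "name": "cyclin dependent kinase 3"},
    }
    indexer.build_index(docs, update_mapping=True)
    for hit in indexer.query("symbol:CDK2"):
        print hit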
# Variant of the view above: no `featured` parameter, the default sort is
# '-likes', and any failure returns '0' instead of the exception text.
def get_collections_from_elasticsearch(request):
    params = request.GET
    language_name = params.get('language__name', None)
    # TODO: Change this from 'None'?
    searchString = params.get('searchString', 'None')
    partner_uid = params.get('uid', None)  # TODO: Change this from 'None'?
    if searchString != 'None':
        match_query = {
            "flt": {
                "fields": ["_all", "subject.partial", "language.partial",
                           "partner.partial", "state.partial", "category.partial",
                           "subcategory.partial", "topic.partial"],
                "like_text": searchString
            }
        }
    elif partner_uid:
        partner_name = Partner.objects.get(uid=partner_uid).name
        match_query = {"match": {"partner": {"query": partner_name}}}
    else:
        match_query = {"match_all": {}}
    query = []
    filter = []
    if language_name == 'All Languages':
        language_name = None
    query = create_query(params, language_name)
    if query:
        filter = {"and": query}
    order_by = params.get('order_by', '-likes')
    offset = int(params.get('offset'))
    limit = int(params.get('limit'))
    order_by = order_by[1:]  # removing '-' since it will always be '-'
    conn = ES(['127.0.0.1:9200'])
    conn.default_indices = FACET_INDEX
    conn.refresh(FACET_INDEX)
    q = {
        "query": {
            "filtered": {
                "query": match_query,
                "filter": filter
            }
        },
        "facets": {
            "facet": {
                "terms": {
                    "fields": ["language", "partner", "state", "category",
                               "subcategory", "topic", "subject"],
                    "size": MAX_RESULT_SIZE
                }
            }
        },
        "sort": {order_by: {"order": "desc"}},
        "size": MAX_RESULT_SIZE
    }
    result_list = []
    try:
        query = json.dumps(q)
        url = "http://localhost:9200/%s/_search" % FACET_INDEX
        response = urllib2.urlopen(url, query)
        result = json.loads(response.read())
        for res in result['hits']['hits']:
            result_list.append(res['_source'])
        facets = json.dumps(result['facets']['facet']['terms'])
        resp = json.dumps({
            "meta": {"limit": str(limit), "next": "", "offset": str(offset),
                     "previous": "null", "total_count": str(len(result_list))},
            "objects": result_list[offset:offset + limit],
            "facets": facets
        })
        return HttpResponse(resp)
    except Exception, ex:
        return HttpResponse('0')