Example #1
def get_related_videos(video):
    related_videos = []
    conn = ES(['127.0.0.1:9200'])
    conn.default_indices = VIDEO_INDEX
    conn.refresh(VIDEO_INDEX)
    q = {
        "query": {
            "bool": {
                "should": [
                    {"term": {"uid": video.uid}},
                    {"terms": {"category": [video.category]}},
                    {"terms": {"topic": [video.topic]}},
                    {"terms": {"language": [video.language]}}
                ],
                "minimum_should_match": 1
            }
        }
    }
    try:
        query = json.dumps(q)
        url = "http://localhost:9200/%s/_search" % VIDEO_INDEX
        response = urllib2.urlopen(url, query)
        result = json.loads(response.read())
        for res in result['hits']['hits']:
            related_videos.append(res['_source'])
    except Exception:
        # network/parse failures fall through to an empty result list
        pass
    return related_videos
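A quick way to exercise get_related_videos; the Video stand-in below is hypothetical and only carries the four attributes the query reads (an Elasticsearch node must be listening on 127.0.0.1:9200 with VIDEO_INDEX populated):

from collections import namedtuple

# hypothetical stand-in for the project's video model
Video = namedtuple('Video', ['uid', 'category', 'topic', 'language'])
video = Video(uid='v-123', category='science', topic='physics', language='en')

# each element is a hit's _source dict, returned as-is from the index
for related in get_related_videos(video):
    print related.get('title')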
Example #2
File: app.py Project: iamsk/es-demo
def init():
    conn = ES('127.0.0.1:9200')
    try:
        conn.delete_index("zhihu")
    except Exception:
        # the index may not exist yet; ignore and recreate it below
        pass
    conn.create_index("zhihu")
    mapping = {
        u'id': {
            'store': 'yes',
            'type': u'integer'
        },
        u'link': {
            'store': 'yes',
            'type': u'string'
        },
        u'title': {
            'boost': 1.0,
            'index': 'analyzed',
            'store': 'yes',
            'type': u'string'
        },
    }
    conn.put_mapping("answer", {'properties': mapping}, ["zhihu"])
    for item in Data().getData():
        conn.index(item, "zhihu", "answer", item['id'])
    conn.refresh(["zhihu"])
    return redirect('/list')
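Once init() has run, a single answer can be read back by id; a minimal sketch, assuming pyes's 0.x get(index, doc_type, id) API and a live node:

conn = ES('127.0.0.1:9200')
doc = conn.get("zhihu", "answer", 1)  # id 1 is assumed; real ids come from Data().getData()
print doc['title'], doc['link']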
Example #3
def get_related_collections(collection, featured):
    related_collections = []
    conn = ES(["127.0.0.1:9200"])
    conn.default_indices = FACET_INDEX
    conn.refresh(FACET_INDEX)
    q = {
        "query": {
            "bool": {
                "must_not": {"term": {"uid": collection.uid}},
                "should": [{"terms": {"subject": [collection.subject]}}, {"terms": {"topic": [collection.topic]}}],
                "minimum_should_match": 1,
            }
        }
    }
    if featured:
        q = {
            "query": {
                "bool": {
                    "must_not": {"term": {"uid": collection.uid}},
                    "should": [{"term": {"featured": True}}],
                    "minimum_should_match": 1,
                }
            }
        }
    try:
        query = json.dumps(q)
        url = "http://localhost:9200/%s/_search" % FACET_INDEX
        response = urllib2.urlopen(url, query)
        result = json.loads(response.read())
        for res in result["hits"]["hits"]:
            related_collections.append(res["_source"])
    except Exception:
        pass
    return related_collections
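The same bool query can be built with pyes's query classes instead of a hand-written dict; a sketch assuming the 0.19-era pyes API (BoolQuery/TermQuery/TermsQuery and ES.search), with collection and FACET_INDEX as in the example above:

from pyes import ES
from pyes.query import BoolQuery, TermQuery, TermsQuery

def related_query(collection):
    # same clauses as the raw dict: exclude the collection itself,
    # match on shared subject or topic
    bq = BoolQuery(minimum_number_should_match=1)
    bq.add_must_not(TermQuery("uid", collection.uid))
    bq.add_should(TermsQuery("subject", [collection.subject]))
    bq.add_should(TermsQuery("topic", [collection.topic]))
    return bq

conn = ES(['127.0.0.1:9200'])
results = conn.search(related_query(collection), indices=FACET_INDEX)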
Example #4
def get_related_collections(collection):
    related_collections = []
    conn = ES(['127.0.0.1:9200'])
    conn.default_indices = FACET_INDEX
    conn.refresh(FACET_INDEX)
    q = {
        "query": {
            "bool": {
                "must_not": {"term": {"uid": collection.uid}},
                "should": [
                    {"terms": {"subject": [collection.subject]}},
                    {"terms": {"topic": [collection.topic]}}
                ],
                "minimum_should_match": 1
            }
        }
    }
    try:
        query = json.dumps(q)
        url = "http://localhost:9200/%s/_search" % FACET_INDEX
        response = urllib2.urlopen(url, query)
        result = json.loads(response.read())
        for res in result['hits']['hits']:
            related_collections.append(res['_source'])
    except Exception:
        pass
    return related_collections
Example #5
def searchCompletions(request):
    searchString = request.GET.get('searchString')
    maxCount = int(request.GET.get('maxCount'))
    conn = ES(['127.0.0.1:9200'])
    conn.default_indices = COMPLETION_INDEX
    conn.refresh(COMPLETION_INDEX)
    q = {
        "query": {
            "query_string": {
                "fields": ["searchTerm.partial"],
                "query": searchString
            }
        },
        "facets": {
            "facet": {
                "terms": {
                    "fields": ["searchTerm"],
                    "size": MAX_RESULT_SIZE
                }
            }
        },
        "size": maxCount
    }
    try:
        query = json.dumps(q)
        url = "http://localhost:9200/%s/_search" % COMPLETION_INDEX
        response = urllib2.urlopen(url, query)
        result = json.loads(response.read())
        result_list = []
        done_list = []
        for res in result['hits']['hits']:
            if res['_source']['type'] != "Collections":
                result_list.append(res['_source'])
                res['_source']['count'] = 0
            elif res['_source']['searchTerm'] not in done_list:
                val = str(res['_source']['searchTerm']).lower()
                for term in result['facets']['facet']['terms']:
                    if val == term['term']:
                        res['_source']['count'] = term['count']
                        done_list.append(res['_source']['searchTerm'])
                result_list.append(res['_source'])
        if len(result_list) == 0:
            result_list.append(
                {"searchTerm": "No Results"}
            )  # for now just displaying no results when nothing is found in completion
        resp = json.dumps({
            "responseCode": "OK",
            "requestParameters": {
                "searchString": searchString,
                "maxCount": unicode(maxCount)
            },
            "completions": result_list,
            "totalCount": unicode(maxCount)
        })
        return HttpResponse(resp)
    except Exception, ex:
        return HttpResponse('0')
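The view can be exercised end-to-end with Django's test client; the URL below is a placeholder, and the real path depends on the project's urls.py:

from django.test import Client

client = Client()
# '/searchCompletions/' is assumed; use whatever pattern urls.py maps to the view
resp = client.get('/searchCompletions/', {'searchString': 'alge', 'maxCount': '10'})
print resp.content  # JSON with responseCode, requestParameters, completions, totalCount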
Example #6
def main(fn, args):
    conn = ES(args.host, bulk_size=10*args.bulksize)
    if fn.endswith(".gz"):
        fp = gzip.open(fn)
    else:
        fp = open(fn)

    count = 0
    total = 0

    try:
        for line in fp:
            doc = json.loads(line.strip())
            if doc.get("_id"):
                _id = doc["_id"]
                del doc["_id"]
            else:
                _id = None

            conn.index(doc=doc,
                       index=args.index,
                       doc_type=args.doctype,
                       id=_id,
                       bulk=True)
            count += 1
            total += 1
            if count % args.bulksize == 0:
                flush(conn, count)
                count = 0
    except:
        print "traceback", "".join(traceback.format_exception(*sys.exc_info()))
        raise
    finally:
        fp.close()

    try:
        flush(conn, count)
        conn.refresh(args.index)
    except Exception:
        # the final flush/refresh is best-effort
        pass

    print "Indexed %s docs total"%total
Example #7
def get_collections_from_elasticsearch(request):
    params = request.GET
    language_name = params.get('language__name', None)
    # TODO: Change this from 'None'?
    searchString = params.get('searchString', 'None')
    partner_uid = params.get('uid', None)
    featured = params.get('featured', None)
    # TODO: Change this from 'None'?
    if searchString != 'None':
        match_query = {
            "flt": {
                "fields": ["_all", "subject.partial", "language.partial", "partner.partial",
                           "state.partial", "category.partial", "subcategory.partial", "topic.partial"],
                "like_text": searchString
            }
        }
    elif partner_uid:
        partner_name = Partner.objects.get(uid=partner_uid).name
        match_query = {"match": {"partner": {"query": partner_name}}}
    else:
        match_query = {"match_all" : {}}
    query = []
    filter = []
    if language_name == 'All Languages':
        language_name = None
    query = create_query(params, language_name)
    if query:
        filter = {"and" : query}
    order_by = params.get('order_by','-featured')
    offset = int(params.get('offset'))
    limit = int(params.get('limit'))
    order_by = order_by[1:]  # strip the leading '-'; order_by values always arrive with a '-' prefix
    conn = ES(['127.0.0.1:9200'])
    conn.default_indices = FACET_INDEX
    conn.refresh(FACET_INDEX)
    q = {
        "query": {
            "filtered": {
                "query": match_query,
                "filter": filter
            }
        },
        "facets": {
            "facet": {
                "terms": {
                    "fields": ["language", "partner", "state", "category",
                               "subcategory", "topic", "subject"],
                    "size": MAX_RESULT_SIZE
                }
            }
        },
        "sort": {
            order_by: {"order": "desc"}
        },
        "size": MAX_RESULT_SIZE
    }
    result_list = []
    try:
        query = json.dumps(q)
        url = "http://localhost:9200/%s/_search" % FACET_INDEX
        response = urllib2.urlopen(url, query)
        result = json.loads(response.read())
        for res in result['hits']['hits']:
            result_list.append(res['_source'])
        facets = json.dumps(result['facets']['facet']['terms'])
        if result_list:
            resp = json.dumps({
                "meta": {"limit": str(limit), "next": "", "offset": str(offset),
                         "previous": "null", "total_count": str(len(result_list))},
                "objects": result_list[offset:offset + limit],
                "facets": facets
            })
        else:
            resp = json.dumps({
                "meta": {"limit": str(limit), "next": "", "offset": str(offset),
                         "previous": "null", "total_count": "1"},
                "objects": [{'Message': 'No Collections Found', 'error': "1"}],
                "facets": facets
            })
        return HttpResponse(resp)
    except Exception, ex:
        print ex
        return HttpResponse(str(ex))
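Note that the view fetches MAX_RESULT_SIZE hits and then paginates in Python via result_list[offset:offset + limit]; Elasticsearch can do the same work server-side with from/size in the request body:

q["from"] = offset  # skip the first `offset` hits server-side
q["size"] = limit   # and return at most `limit` of them, overriding MAX_RESULT_SIZE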
Example #8
class DocManager(object):
    """The DocManager class creates a connection to the backend engine and
    adds/removes documents and, in the case of rollback, searches for them.

    The reason for storing id/doc pairs, as opposed to docs alone, is so that
    multiple updates to the same doc reflect the most up-to-date version, as
    opposed to multiple, slightly different versions of a doc.

    We are using Elastic's native fields for _id and ns, but we also store
    them as fields in the document, due to compatibility issues.
    """

    def __init__(self, url, auto_commit=True, unique_key='_id'):
        """Verify Elastic URL and establish a connection.
        """

        if verify_url(url) is False:
            raise SystemError
        self.elastic = ES(server=url)
        self.auto_commit = auto_commit
        self.doc_type = 'string'  # default type is string, change if needed
        self.unique_key = unique_key
        if auto_commit:
            self.run_auto_commit()

    def stop(self):
        """ Stops the instance
        """
        self.auto_commit = False

    def upsert(self, doc):
        """Update or insert a document into Elastic

        If you'd like to have different types of document in your database,
        you can store the doc type as a field in Mongo and set doc_type to
        that field. (e.g. doc_type = doc['_type'])

        """

        doc_type = self.doc_type
        index = doc['ns']
        doc[self.unique_key] = str(doc[self.unique_key])
        doc_id = doc[self.unique_key]
        id_query = TextQuery('_id', doc_id)
        # look up any existing copy; the result is unused because index() below
        # overwrites the doc with the same id
        elastic_cursor = self.elastic.search(query=id_query, indices=index)

        try:
            self.elastic.index(bsjson.dumps(doc), index, doc_type, doc_id)
        except ValueError:
            logging.info("Could not update %s" % (doc,))
        self.elastic.refresh()

    def remove(self, doc):
        """Removes documents from Elastic

        The input is a python dictionary that represents a mongo document.
        """
        try:
            self.elastic.delete(doc['ns'], 'string', str(doc[self.unique_key]))
        except (NotFoundException, TypeMissingException, IndexMissingException):
            pass

    def _remove(self):
        """For test purposes only. Removes all documents in test.test
        """
        try:
            self.elastic.delete('test.test', 'string', '')
        except (NotFoundException, TypeMissingException, IndexMissingException):
            pass

    def search(self, start_ts, end_ts):
        """Called to query Elastic for documents in a time range.
        """
        res = ESRange('_ts', from_value=start_ts, to_value=end_ts)
        results = self.elastic.search(RangeQuery(res))
        return results

    def _search(self):
        """For test purposes only. Performs search on Elastic with empty query.
        Does not have to be implemented.
        """
        results = self.elastic.search(MatchAllQuery())
        return results

    def commit(self):
        """This function is used to force a refresh/commit.
        """
        retry_until_ok(self.elastic.refresh)

    def run_auto_commit(self):
        """Periodically commits to the Elastic server.
        """
        self.elastic.refresh()

        if self.auto_commit:
            Timer(1, self.run_auto_commit).start()

    def get_last_doc(self):
        """Returns the last document stored in the Elastic engine.
        """

        result = self.elastic.search(MatchAllQuery(), size=1, sort='_ts:desc')
        for item in result:
            return item
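A minimal round-trip with DocManager; a sketch that assumes an Elastic node behind the URL, that verify_url accepts it, and that documents carry the 'ns' and '_ts' fields the class reads:

dm = DocManager('http://localhost:9200', auto_commit=False)
dm.upsert({'_id': '1', 'ns': 'test.test', '_ts': 1, 'name': 'example'})
dm.commit()                     # force a refresh so the doc is searchable
print dm.get_last_doc()         # most recent doc by _ts
dm.remove({'_id': '1', 'ns': 'test.test'})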
Example #9
class ESIndexerBase(object):
    ES_HOST = ES_HOST
    ES_INDEX_NAME = ES_INDEX_NAME
    ES_INDEX_TYPE = 'gene'

    def __init__(self):
        self.conn = ES(self.ES_HOST, default_indexes=[self.ES_INDEX_NAME],
                       timeout=10.0)
        self.step = 10000

    def create_index(self):
        try:
            print self.conn.open_index(self.ES_INDEX_NAME)
        except IndexMissingException:
            print self.conn.create_index(self.ES_INDEX_NAME)

    def delete_index_type(self, index_type):
        '''Delete all indexes for a given index_type.'''
        index_name = self.ES_INDEX_NAME
#        index_type = self.ES_INDEX_TYPE
        #Check if index_type exists
        mapping = self.conn.get_mapping(index_type, index_name)
        if index_name not in mapping or index_type not in mapping[index_name]:
            print 'Error: index type "%s" does not exist in index "%s".' % (index_type, index_name)
            return
        path = '/%s/%s' % (index_name, index_type)
        if ask('Confirm to delete all data under "%s":' % path) == 'Y':
            return self.conn.delete_mapping(index_name, index_type)

    def index(self, doc, index_type, id=None):
        '''add a doc to the index. If id is not None, the existing doc will be
           updated.
        '''
#        index_type = self.ES_INDEX_TYPE
        return self.conn.index(doc, self.ES_INDEX_NAME, index_type, id=id)

    def delete_index(self, index_type, id):
        '''delete a doc from the index based on passed id.'''
#        index_type = self.ES_INDEX_TYPE
        return self.conn.delete(self.ES_INDEX_NAME, index_type, id)

    def optimize(self):
        return self.conn.optimize(self.ES_INDEX_NAME, wait_for_merge=True)

    def get_field_mapping(self):
        import dataload
        reload(dataload)
        dataload.register_sources()
        return dataload.get_mapping()

    def build_index(self, doc_d, update_mapping=False, bulk=True):
        index_name = self.ES_INDEX_NAME
        index_type = self.ES_INDEX_TYPE

        #Test if index exists
        try:
            print "Opening index...", self.conn.open_index(index_name)
        except NotFoundException:
            print 'Error: index "%s" does not exist. Create it first.' % index_name
            return -1

        try:
            cur_mapping = self.conn.get_mapping(index_type, index_name)
            empty_mapping = False
        except ElasticSearchException:
            #if no existing mapping available for index_type
            #force update_mapping to True
            empty_mapping = True
            update_mapping = True

#        empty_mapping = not cur_mapping[index_name].get(index_type, {})
#        if empty_mapping:
#            #if no existing mapping available for index_type
#            #force update_mapping to True
#            update_mapping = True

        if update_mapping:
            print "Updating mapping...",
            if not empty_mapping:
                print "\n\tRemoving existing mapping...",
                print self.conn.delete_mapping(index_name, index_type)
            _mapping = self.get_field_mapping()
            print self.conn.put_mapping(index_type,
                                        _mapping,
                                        [index_name])
        print "Building index..."
        t0 = time.time()
        for doc_id, doc in doc_d.items():
            self.conn.index(doc, index_name, index_type, doc_id, bulk=bulk)
        print self.conn.flush()
        print self.conn.refresh()
        print "Done[%s]" % timesofar(t0)

    def query(self, qs, fields='symbol,name', **kwargs):
        _q = StringQuery(qs)
        res = self.conn.search(_q, fields=fields, **kwargs)
        return res
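A typical build sequence; a sketch assuming ES_HOST and ES_INDEX_NAME are configured, dataload supplies the field mapping, and doc_d maps ids to documents:

indexer = ESIndexerBase()
indexer.create_index()                      # opens the index, creating it if missing
indexer.build_index(doc_d, update_mapping=True, bulk=True)
res = indexer.query('BRCA1')                # StringQuery over the symbol,name fields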
Example #10
def get_collections_from_elasticsearch(request):
    params = request.GET
    language_name = params.get('language__name', None)
    # TODO: Change this from 'None'?
    searchString = params.get('searchString', 'None')
    partner_uid = params.get('uid', None)
    # TODO: Change this from 'None'?
    if searchString != 'None':
        match_query = {
            "flt": {
                "fields": ["_all", "subject.partial", "language.partial", "partner.partial",
                           "state.partial", "category.partial", "subcategory.partial", "topic.partial"],
                "like_text": searchString
            }
        }
    elif partner_uid:
        partner_name = Partner.objects.get(uid=partner_uid).name
        match_query = {"match": {"partner": {"query": partner_name}}}
    else:
        match_query = {"match_all" : {}}
    query = []
    filter = []
    if language_name == 'All Languages':
        language_name = None
    query = create_query(params, language_name)
    if query:
        filter = {"and" : query}
    order_by = params.get('order_by','-likes')
    offset = int(params.get('offset'))
    limit = int(params.get('limit'))
    order_by = order_by[1:]  # strip the leading '-'; order_by values always arrive with a '-' prefix
    conn = ES(['127.0.0.1:9200'])
    conn.default_indices = FACET_INDEX
    conn.refresh(FACET_INDEX)
    q = {
        "query": {
            "filtered": {
                "query": match_query,
                "filter": filter
            }
        },
        "facets": {
            "facet": {
                "terms": {
                    "fields": ["language", "partner", "state", "category",
                               "subcategory", "topic", "subject"],
                    "size": MAX_RESULT_SIZE
                }
            }
        },
        "sort": {
            order_by: {"order": "desc"}
        },
        "size": MAX_RESULT_SIZE
    }
    result_list = []
    try:
        query = json.dumps(q)
        url = "http://localhost:9200/%s/_search" % FACET_INDEX
        response = urllib2.urlopen(url, query)
        result = json.loads(response.read())
        for res in result['hits']['hits']:
            result_list.append(res['_source'])
        facets = json.dumps(result['facets']['facet']['terms'])
        resp = json.dumps({
            "meta": {"limit": str(limit), "next": "", "offset": str(offset),
                     "previous": "null", "total_count": str(len(result_list))},
            "objects": result_list[offset:offset + limit],
            "facets": facets
        })
        return HttpResponse(resp)
    except Exception, ex:
        return HttpResponse('0')