from collections import defaultdict

from pyelasticsearch import ElasticSearch
from pyelasticsearch.exceptions import ConnectionError


# BaseBackend is assumed to be supplied by the surrounding project.
class ElasticSearchBackend(BaseBackend):
    
    def __init__(self, es_url='http://localhost:9200/', batch_size=10, **kwargs):
        """
        Do what is necessary to create/open the index.
        """
        self.batch_size = batch_size
        self.batch_count = 0
        self.es_url = es_url
        self.fast = kwargs.get('fast', False)
        if kwargs.get('noisy', False):
            from logging import getLogger, StreamHandler, DEBUG
            import sys
            logger = getLogger('pyelasticsearch')
            logger.setLevel(DEBUG)
            logger.addHandler(StreamHandler(sys.stdout))
            
        self.es = ElasticSearch(self.es_url)
        try:
            self.es.count('*')
        except ConnectionError:
            print "Error connecting to ElasticSearch server!"
            raise
        self.urls = defaultdict(set)  # URLs to delete before committing new content
        self.batches = defaultdict(list)  # site -> [list of docs]
    
    def create_index(self, name):
        name = name.lower()
        try:
            self.es.create_index(name)
            self.update_mapping(name)
        except Exception as e:
            print e
            return
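
A minimal usage sketch (assuming a local Elasticsearch node and a project
BaseBackend that supplies update_mapping):

    backend = ElasticSearchBackend(es_url='http://localhost:9200/', noisy=True)
    backend.create_index('MySite')  # index names are lowercased before creation
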
Example #2
import cjson
from pyelasticsearch import ElasticSearch

# CONTEXT is assumed to be a project-level configuration dict (see the
# sketch below this function).
def dump_as_index_file():
    
    es = ElasticSearch(CONTEXT['datahub-store'])

    # Count the matching series first so the follow-up search can fetch all of them.
    total = es.count("owner:public AND display:timeline",
                     index=CONTEXT['datahub-index'], doc_type='_all')

    series = es.search("owner:public AND display:timeline",
                       index=CONTEXT['datahub-index'],
                       size=total['count'], doc_type='_all')

    with open(CONTEXT['correlation-index-path'], mode='w') as f:
        for serie in series['hits']['hits']:
            f.write("%s;%s;%s;%s\n" % (
                serie['_id'],
                serie['_source']['name'],
                cjson.encode(serie['_source']['data']['series'][0]['data']),
                serie['_source']['category']))
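
The function assumes a module-level CONTEXT configuration dict; the keys come
from the calls above, and the values here are only illustrative:

    CONTEXT = {
        'datahub-store': 'http://localhost:9200/',
        'datahub-index': 'datahub',
        'correlation-index-path': '/tmp/correlation.idx',
    }
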
Example #3
import json
import os
import re

from pyelasticsearch import ElasticSearch, bulk_chunks
from unidecode import unidecode

# BaseDB, GeoPoint, DataReader, tokenizer and STOP_WORDS are assumed to be
# supplied by the surrounding project.
class ESWrapper(BaseDB):
    def __init__(self,
                 index_name,
                 doc_type,
                 host='http://localhost',
                 port=9200):
        self.eserver = ElasticSearch(urls=host,
                                     port=port,
                                     timeout=60,
                                     max_retries=3)
        #self._base_query = {"query": {"bool": {"must": {"match": {}}}}}
        #self._base_query = {"query": {"bool": {}}}
        self._geo_filter = {"distance": "20km", "coordinates": {}}
        self._population_filter = {'population': {'gte': 5000}}
        self._index = index_name
        self._doctype = doc_type

    def getByid(self, geonameId):
        maincondition = {"match": {"id": geonameId}}
        q = {"query": {"bool": {"must": maincondition}}}
        return self.eserver.search(
            q, index=self._index,
            doc_type=self._doctype)['hits']['hits'][0]['_source']
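
    # Hypothetical usage (the geoname id value is illustrative):
    #   wrapper.getByid('5128581')  # -> the matching document's _source dict
    # Note: raises IndexError when the id is absent from the index.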

    def _query(self, qkey, **kwargs):
        q = {"query": {"bool": {}}}
        query_name = "should"
        q["query"]["bool"]["minimum_number_should_match"] = 1
        kwargs.pop("qtype", "")

        placetokens = [
            l.strip() for l in tokenizer.split(qkey)
            if l and l not in STOP_WORDS and l[-1] != '.'
        ]
        if placetokens:
            reduced_placename = u" ".join(placetokens[0:])
            if len(placetokens[0]) < 3 and len(
                    placetokens) > 1 and 3.0 / len(placetokens) >= .5:
                reduced_placename = u" ".join(placetokens[1:])
        else:
            reduced_placename = qkey

        # print "qkey", qkey, "reduced", reduced_placename
        # All clauses are OR'd together (bool/should): exact term matches on
        # the raw name fields, plus a fuzzy multi_match fallback on the
        # reduced place name.
        maincondition = [
            {
                "bool": {
                    "must": [
                        {
                            "multi_match": {
                                "query": qkey,
                                "fields": ["name.raw^5", "asciiname^5",
                                           "alternatenames"],
                                "operator": "and"
                            }
                        },
                        {"terms": {"featureClass": ["a", "p"]}}
                    ],
                }
            },
            {"term": {"name.raw": {"value": qkey}}},
            {"term": {"asciiname.raw": {"value": qkey}}},
            {"term": {"normalized_asciiname": {"value": qkey}}},
            {"term": {"alternatenames": {"value": qkey}}},
            {
                "multi_match": {
                    "query": (reduced_placename if 'fuzzy' in kwargs
                              else unicode(unidecode(reduced_placename))),
                    "fuzziness": kwargs.pop("fuzzy", 0),
                    "max_expansions": kwargs.pop("max_expansion", 10),
                    "prefix_length": kwargs.pop("prefix_length", 1),
                    "operator": kwargs.pop("operator", "and"),
                    "fields": ["name^3", "asciiname^3", "alternatenames",
                               "normalized_asciiname^3"]
                }
            }
        ]

        q["query"]["bool"][query_name] = maincondition

        if kwargs:
            filter_cond = []
            if 'min_popln' in kwargs:
                popln = kwargs.pop("min_popln")
                if popln is not None:
                    filter_cond.append(
                        {"range": {"population": {"gte": popln}}})

            # Any remaining keyword arguments become exact-match filters.
            for key, val in kwargs.viewitems():
                if not isinstance(val, basestring):
                    filter_cond.append({"terms": {key: list(val)}})
                else:
                    filter_cond.append({"term": {key: val}})

            q["query"]["bool"]["filter"] = {"bool": {"must": filter_cond}}

        q['from'] = 0
        q['size'] = 50
        return self.eserver.search(q,
                                   index=self._index,
                                   doc_type=self._doctype)

    def query(self, qkey, min_popln=None, **kwargs):
        res = self._query(qkey, min_popln=min_popln, **kwargs)['hits']
        max_score = res['max_score']
        gps = []
        if max_score == 0.0:
            # Elasticsearch found nothing relevant and returned only
            # zero-scoring noise; treat it as an empty result set.
            res['hits'] = []

        for t in res['hits']:
            t['_source']['geonameid'] = t["_source"]["id"]
            t['_source']['_score'] = t['_score'] / max_score
            pt = GeoPoint(**t["_source"])
            if t['_source']['featureCode'].lower() == "cont":
                # A continent match dominates everything else.
                gps = [pt]
                break

            gps.append(pt)

        if len(gps) == 1:
            # With a single candidate, score by name-length similarity
            # to the query string.
            gps[0]._score = (min(float(len(gps[0].name)), float(len(qkey))) /
                             max(float(len(gps[0].name)), float(len(qkey))))

        return gps
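
    # Hedged usage sketch (the index and doc_type names are assumptions):
    #   wrapper = ESWrapper(index_name='geonames', doc_type='places')
    #   for gp in wrapper.query(u"San Francisco", min_popln=5000):
    #       print gp.name, gp._score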

    def _oldquery(self,
                  qkey,
                  qtype="exact",
                  analyzer=None,
                  min_popln=None,
                  size=10,
                  **kwargs):
        """
        qtype values are exact, relaxed or geo_distance
        Always limit results to 10
        """
        q = {"query": {"bool": {}}}
        query_name = kwargs.pop('query_name', 'must')
        query_name = "should"
        if query_name == "should":
            q["query"]["bool"]["minimum_number_should_match"] = 1

        maincondition = {}
        if qtype == "exact":
            maincondition = [
                {"term": {"name.raw": {"value": qkey}}},
                {"term": {"asciiname.raw": {"value": qkey}}},
                {"term": {"alternatenames": {"value": qkey}}},
            ]
            # Term queries are not analyzed, so the analyzer argument only
            # applies to the "relaxed" match query below.

        elif qtype == "relaxed":
            maincondition["match"] = {"alternatenames": {"query": qkey}}
            if analyzer:
                maincondition["match"]["alternatenames"]["analyzer"] = analyzer

            #q["query"]["bool"][query_name]["match"].pop("name.raw", "")
        elif qtype == "combined":
            maincondition = [{
                "bool": {
                    "must": {
                        "multi_match": {
                            "query": qkey,
                            "fields":
                            ["name.raw", "asciiname", "alternatenames"]
                        }
                    },
                    "filter": {
                        "bool": {
                            "should": [{
                                "range": {
                                    "population": {
                                        "gte": 5000
                                    }
                                }
                            }, {
                                "terms": {
                                    "featureCode": [
                                        "pcla", "pcli", "cont", "rgn", "admd",
                                        "adm1", "adm2"
                                    ]
                                }
                            }]
                        }
                    }
                }
            }, {
                "term": {
                    "name.raw": {
                        "value": qkey
                    }
                }
            }, {
                "term": {
                    "asciiname.raw": {
                        "value": qkey
                    }
                }
            }, {
                "term": {
                    "alternatenames": {
                        "value": qkey[1:]
                    }
                }
            }, {
                "match": {
                    "alternatenames": {
                        "query": qkey,
                        'fuzziness': kwargs.pop("fuzzy", 0),
                        "max_expansions": kwargs.pop("max_expansion", 5),
                        "prefix_length": kwargs.pop("prefix_length", 1)
                    }
                }
            }]

        if maincondition:
            q["query"]["bool"][query_name] = maincondition

            if min_popln:
                filter_cond = [{"range": {"population": {"gte": min_popln}}}]
            else:
                filter_cond = []

            if kwargs:
                # Remaining keyword arguments become exact-match filters.
                filter_cond += [{"term": {key: val}}
                                for key, val in kwargs.viewitems()]
                q["query"]["bool"]["filter"] = {"bool": {"must": filter_cond}}
            elif min_popln:
                # With no extra filters, accept either a large population or
                # a major administrative feature code.
                filter_cond = [
                    {"range": {"population": {"gte": min_popln}}},
                    {"terms": {"featureCode": ["ppla", "pplx"]}}
                ]
                q["query"]["bool"]["filter"] = {
                    "bool": {"should": filter_cond}
                }

        return self.eserver.search(q,
                                   index=self._index,
                                   doc_type=self._doctype)

    def oldquery(self, qkey, min_popln=None, **kwargs):
        # Same post-processing as query(), but against the legacy
        # _oldquery builder.
        res = self._oldquery(qkey, min_popln=min_popln, **kwargs)['hits']
        max_score = res['max_score']
        gps = []
        if max_score == 0.0:
            # Elasticsearch found nothing relevant and returned only
            # zero-scoring noise; treat it as an empty result set.
            res['hits'] = []

        for t in res['hits']:
            t['_source']['geonameid'] = t["_source"]["id"]
            t['_source']['_score'] = t['_score'] / max_score
            pt = GeoPoint(**t["_source"])
            if t['_source']['featureCode'].lower() == "cont":
                # A continent match dominates everything else.
                gps = [pt]
                break

            gps.append(pt)

        if len(gps) == 1:
            gps[0]._score = (min(float(len(gps[0].name)), float(len(qkey))) /
                             max(float(len(gps[0].name)), float(len(qkey))))

        return gps

    def near_geo(self, geo_point, min_popln=5000, **kwargs):
        q2 = {
            "query": {
                "bool": {
                    "must": {"match_all": {}},
                    "filter": [
                        {
                            "geo_distance": {
                                "distance": "30km",
                                "coordinates": geo_point
                            }
                        },
                        {
                            "terms": {
                                "featureClass":
                                ["a", "h", "l", "t", "p", "v"]
                            }
                        }
                    ]
                }
            },
            # Return the most populous place within range first.
            "sort": {"population": "desc"}
        }
        if kwargs:
            for key in kwargs:
                q2['query']['bool']['filter'].append(
                    {"term": {key: kwargs[key]}})

        res = self.eserver.search(
            q2, index=self._index,
            doc_type=self._doctype)['hits']['hits'][0]['_source']
        res['confidence'] = 1.0
        return [GeoPoint(**res)]
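
    # Hedged usage sketch; coordinates follow the GeoJSON [lon, lat] order
    # used when indexing, and the point below is illustrative:
    #   nearest = wrapper.near_geo([-122.42, 37.77])
    #   print nearest[0].name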

    def create(self, datacsv, confDir="../data/"):
        with open(os.path.join(confDir, "es_settings.json")) as jf:
            settings = json.load(jf)
            settings['mappings'][self._doctype] = settings['mappings'].pop(
                'places')

        try:
            self.eserver.create_index(index=self._index, settings=settings)
        except Exception:
            # The index probably exists already; drop it and rebuild.
            self.eserver.delete_index(self._index)
            self.eserver.create_index(index=self._index, settings=settings)

        for chunk in bulk_chunks(self._opLoader(datacsv, confDir),
                                 docs_per_chunk=1000):
            self.eserver.bulk(chunk, index=self._index, doc_type=self._doctype)
            print "..",

        self.eserver.refresh(self._index)

    def _opLoader(self, datacsv, confDir):
        ere = re.compile(r"[^\sa-zA-Z0-9]")
        with DataReader(datacsv, os.path.join(confDir,
                                              'geonames.conf')) as reader:
            cnt = 0
            for row in reader:
                try:
                    row['coordinates'] = [
                        float(row['longitude']),
                        float(row['latitude'])
                    ]
                    try:
                        row['population'] = int(row["population"])
                    except (TypeError, ValueError):
                        row['population'] = -1

                    try:
                        row['elevation'] = int(row['elevation'])
                    except (TypeError, ValueError):
                        row['elevation'] = -1

                    del row['latitude']
                    del row['longitude']
                    row['alternatenames'] = row['alternatenames'].lower(
                    ).split(",")
                    # Strip punctuation and collapse whitespace for the
                    # normalized name field.
                    row['normalized_asciiname'] = re.sub(
                        r'\s+', ' ', ere.sub("", row['asciiname'])).strip()
                    cnt += 1
                    yield self.eserver.index_op(row,
                                                index=self._index,
                                                doc_type=self._doctype)
                except Exception:
                    # Skip malformed rows, but log them for inspection.
                    print json.dumps(row)
                    continue

    def remove_dynamic_stopwords(self, term):
        # Drop words that occur so often in the index (document frequency
        # >= 20000) that they behave like stopwords.
        words = [w for t in term.split("-") for w in t.split() if len(w) > 1]

        if len(words) == 1:
            return term

        stopword_removed = ""
        for word in words:
            try:
                t = self.eserver.count(word)['count']
                if t >= 20000:
                    continue
            except Exception:
                pass

            stopword_removed += (word + " ")

        return stopword_removed.strip()
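
# A hedged end-to-end sketch; file names, index name and doc_type are
# illustrative, while es_settings.json and geonames.conf are expected in the
# project's data directory as in create() and _opLoader() above:
#
#   wrapper = ESWrapper(index_name='geonames', doc_type='places')
#   wrapper.create('allCountries.txt', confDir='../data/')  # build the index
#   print wrapper.query(u"Paris", min_popln=5000)           # ranked GeoPoints
#   print wrapper.remove_dynamic_stopwords(u"city of london")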