def load_pie(filename, index_name, type_name, category, name, zone="France",
             sep=";", display="pie", source="", description=""):
    f = open(filename, mode="r")
    es = ElasticSearch(CONTEXT["datahub-store"])
    categories = {}
    for line in f:
        # Split only on the first separator so JSON values that contain
        # the separator stay intact.
        key, string_value = line.split(sep, 1)
        value = cjson.decode(string_value)
        categories[key] = value
    serie = {
        "name": name,
        "owner": "public",
        "display": display,
        "zone": zone,
        "category": category,
        "source": source,
        "description": description % (key),  # uses the key of the last line parsed
        "data": {
            "categories": categories.keys(),
            "series": [{"data": categories.values()}],
        },
    }
    es.index(index_name, display, serie)
    es.refresh(index_name)
    f.close()
def main():
    # Load the trained Naive Bayes classifier.
    f = open('./data_set/naivebayes_trained_model.pickle', 'rb')
    NBClassifier = pickle.load(f)

    # ElasticSearch: call es_indexer to create the 'sentiment_analysis'
    # index and store the contents of the tweet file in that index.
    es = ElasticSearch('http://localhost:9200/')
    es_indexer()

    # Index each classified tweet into Elasticsearch.
    i = 0
    for testTweet in tweet_data():
        i += 1
        processedTestTweet = process_tweet(testTweet)
        sentiment = NBClassifier.classify(
            extract_features(build_feature_vector(processedTestTweet)))
        es.index("sentiment_analysis", "document",
                 {"text": testTweet, "sentiment": sentiment}, id=i)
    print "Indexing completed."
    es.refresh(index="sentiment_analysis")
    print "Index refreshed."
    f.close()
class Indexer(object):
    def __init__(self, input):
        self.input = input
        self.es = ElasticSearch()
        self.index_name = "psim"
        self.doc_type = 'book'

    def delete_index(self):
        # Delete the index if one already exists.
        try:
            self.es.delete_index(index=self.index_name)
        except Exception:
            pass

    def create_index(self):
        self.es.create_index(index=self.index_name,
                             settings=self.get_index_settings())

    def get_index_settings(self):
        settings = {
            "mappings": {
                "book": {
                    "_all": {"enabled": "false"},
                    "properties": {
                        "codes": {"type": "string", "term_vector": "yes",
                                  "store": "true"},
                        "pid": {"type": "string"},
                        "embedding": {"type": "float", "store": "true"},
                        "magnitude": {"type": "float", "store": "true"}
                    }
                }
            }
        }
        return settings

    def documents(self):
        with open(self.input) as input_file:
            for line in input_file:
                json_doc = json.loads(line)
                yield self.es.index_op(json_doc, doc_type=self.doc_type)

    def index(self):
        self.delete_index()
        self.create_index()
        for chunk in bulk_chunks(self.documents(), docs_per_chunk=1000):
            self.es.bulk(chunk, index=self.index_name, doc_type=self.doc_type)
        self.es.refresh(self.index_name)
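# A minimal driver for the Indexer above, assuming a newline-delimited JSON
# input file (one book document per line); 'books.jsonl' is a hypothetical
# path, not part of the original snippet.
if __name__ == '__main__':
    indexer = Indexer('books.jsonl')
    indexer.index()  # drops, recreates and bulk-fills the 'psim' index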
def resource_create(request, index):
    data = json.loads(request.POST['data'])
    data['new'] = True
    # Random identifier.
    data['uri'] = ''.join(random.choice('0123456789ABCDEF') for i in range(32))
    # Store data in Elasticsearch.
    es = ElasticSearch(settings.ELASTICSEARCH_URL)
    if index == 'persons':
        es.index(index, "person", data)
    elif index == 'institutes':
        es.index(index, "institute", data)
    es.refresh(index)
    return JsonResponse(data)
def test_cluster_size_3(self):
    cluster = self._make_one(size=3)
    cluster.start()
    self.assertEqual(len(cluster), 3)
    self.assertEqual(len(cluster.hosts), 3)
    self.assertEqual(len(os.listdir(cluster.working_path)), 3)
    self.assertEqual(len(cluster.urls), 3)
    client = ElasticSearch(cluster.urls, max_retries=2)
    self.assertEqual(client.health()['number_of_nodes'], 3)
    # Test that routing works and data is actually distributed across nodes.
    client.create_index('test_shards', settings={
        'number_of_shards': 1,
        'number_of_replicas': 2,
    })
    client.index('test_shards', 'spam', {'eggs': 'bacon'})
    client.refresh('test_shards')
    shard_info = client.status()['indices']['test_shards']['shards']['0']
    nodes = set(s['routing']['node'] for s in shard_info)
    self.assertTrue(len(nodes) > 1)
def load_wordcloud(filename, index_name, type_name, category, name,
                   zone="France", sep=";", display="wordcloud", source="",
                   description=""):
    f = open(filename, mode="r")
    es = ElasticSearch(CONTEXT["datahub-store"])
    categories = []
    for line in f:
        # Split only on the first separator so JSON values that contain
        # the separator stay intact.
        key, string_value = line.split(sep, 1)
        value = cjson.decode(string_value)
        categories.append((value["label"], value["norm_count"] / 100.0))
    serie = {
        "name": name,
        "owner": "public",
        "display": display,
        "zone": zone,
        "category": category,
        "source": source,
        "description": unicode(description) % (name),
        "data": {
            "categories": map(lambda item: item[0], categories),
            "series": [{"data": map(lambda item: item[1], categories)}],
        },
    }
    es.index(index_name, display, serie)
    es.refresh(index_name)
    f.close()
"coordinates" : coords, # 4, 5 "feature_class" : row[6], "feature_code" : row[7], "country_code2" : row[8], "country_code3" : country_code3, "cc2" : row[9], "admin1_code" : row[10], "admin2_code" : row[11], "admin3_code" : row[12], "admin4_code" : row[13], "population" : row[14], "elevation" : row[15], "dem" : row[16], "timzeone" : row[17], "modification_date" : "2014-01-01" } yield es.index_op(doc, index='geonames', doc_type='geoname') except: count += 1 print 'Exception count:', count chunk_count = 0 for chunk in bulk_chunks(documents(reader, es), docs_per_chunk=500): es.bulk(chunk) chunk_count += 1 print 'Chunk count:', chunk_count es.refresh('geonames')
    }
}

es.health(wait_for_status='yellow')
es.delete_index('write-ads')
es.create_index('write-ads', settings={'mappings': ad_mapping})

dateYMD = args["date"]
prepareDataFromDB(dateYMD)
dir = DATA_FILES_JSON + '/' + dateYMD
for filename in os.listdir(dir):
    if filename.endswith('.json'):
        with open(dir + '/' + filename) as open_file:
            json_docs = json.load(open_file)
            es.bulk((es.index_op(doc) for doc in json_docs),
                    index='write-ads', doc_type='ad')

es.refresh("write-ads")

res = es.search('website:com', index='write-ads')
print("Got %d Hits for .com websites" % res['hits']['total'])
for hit in res['hits']['hits']:
    print(hit["_source"])

res = es.search('website:in', index='write-ads')
print("Got %d Hits for .in websites" % res['hits']['total'])

res = es.search('category:entertainment', index='write-ads')
print("Got %d Hits for category:Entertainment" % res['hits']['total'])
def save(self, force_insert=False, force_update=False, **kwargs):
    es = ElasticSearch(ELASTIC_SEARCH_URL)
    if self.id:
        location = self.get_location()
        location_es = "{0},{1}".format(location.y, location.x)
        es.update(
            'glamazer', 'modelresult',
            'listings.listing.{0}'.format(self.id),
            script="ctx._source.listing_id = listing;"
                   "ctx._source.artist_id = artist;"
                   "ctx._source.artist_avatar = artist_avatar;"
                   "ctx._source.title = title;"
                   "ctx._source.location = location;"
                   "ctx._source.description = description;"
                   "ctx._source.get_picture = get_picture;"
                   "ctx._source.metadata = metadata;"
                   "ctx._source.price = price;"
                   "ctx._source.likes = likes;"
                   "ctx._source.comments = comments;"
                   "ctx._source.tags = tags;"
                   "ctx._source.status = status;"
                   "ctx._source.style = style;"
                   "ctx._source.rating = rating",
            params={
                'listing': self.id,
                'artist': self.get_artist_id(),
                'artist_avatar': self.get_artist_avatar(),
                'title': self.title,
                'location': location_es,
                'description': self.description,
                'get_picture': self.get_picture(),
                'metadata': self.metadata,
                'price': self.price,
                'likes': self.likes,
                'comments': self.comments,
                'tags': self.get_tags(),
                'status': self.status,
                'style': self.get_style(),
                'rating': self.get_rating()
            })
        super(Listing, self).save(force_insert, force_update)
    else:
        super(Listing, self).save(force_insert, force_update)
        artist_user = self.artist.user
        artist_name = artist_user.first_name
        followers = Followers.objects.select_related().filter(artist=self.artist)
        for follower in followers:
            Notification.objects.create(
                sender=artist_user,
                receiver=follower.user,
                time=current_time(),
                short_text=NOTIFICATIONS_SHORT[10].format(artist=artist_name),
                long_text=NOTIFICATIONS_LONG[10].format(
                    artist=artist_name, listing=self.title,
                    user_id=self.artist_id, metadata=self.id),
            )
        location = self.get_location()
        location_es = "{0},{1}".format(location.y, location.x)
        es.index('glamazer', 'modelresult', {
            'listing_id': self.id,
            'artist_id': self.artist_id,
            'artist_avatar': self.get_artist_avatar(),
            'title': self.title,
            'location': location_es,
            'description': self.description,
            'get_picture': self.get_picture(),
            'metadata': self.metadata,
            'price': self.price,
            'likes': self.likes,
            'comments': self.comments,
            'tags': self.get_tags(),
            'status': self.status,
            'style': self.get_style(),
            'rating': self.get_rating()
        }, id='listings.listing.{0}'.format(self.id))
    es.refresh('glamazer')
"coordinates": coords, # 4, 5 "feature_class": row[6], "feature_code": row[7], "country_code2": row[8], "country_code3": country_code3, "cc2": row[9], "admin1_code": row[10], "admin2_code": row[11], "admin3_code": row[12], "admin4_code": row[13], "population": row[14], "elevation": row[15], "dem": row[16], "timzeone": row[17], "modification_date": "2014-01-01" } yield es.index_op(doc, index='geonames', doc_type='geoname') except: count += 1 print 'Exception count:', count chunk_count = 0 for chunk in bulk_chunks(documents(reader, es), docs_per_chunk=500): es.bulk(chunk) chunk_count += 1 print 'Chunk count:', chunk_count es.refresh('geonames')
    'id': 2,
    'name': 'Jessica Coder',
    'age': 31,
    'title': 'Programmer'
}, {
    'id': 3,
    'name': 'Freddy Coder抽',
    'age': 29,
    'title': 'Office Assistant'
}]

es.bulk((es.index_op(doc, id=doc.pop('id')) for doc in docs),
        index='test', doc_type='test')
es.refresh('test')

res1 = es.get('test', 'test', 1)

# Full-text match; note that Chinese and English are tokenized differently.
# https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html
res8 = es.search(index='test', size=2, query={
    "query": {"query_string": {"query": "抽"}}})

# Prefix match query; accepts lowercase terms only.
def main():
    # Open the below file for writing.
    # fhand = open('./data_set/full_feature_list.txt', 'w')

    # Below is the list of (tweet words, sentiment) tuples.
    # tweets = []
    # # featureList = []
    # input_tweets = csv.reader(open('./data_set/full_training_dataset _main.csv', 'rb'), delimiter=',')  # , quotechar='|')
    # # initial_tweets = open('./data_set/train_data.csv', 'rU')
    # # input_tweets = csv.reader((line.replace('\0', '') for line in initial_tweets), delimiter=",")
    # start6 = time.clock()
    # print 'start-time', '->', start6
    # for row in input_tweets:
    #     sentiment = row[0]
    #     tweet = row[1]
    #     # Call process_tweet to pre-process the tweet.
    #     processed_tweet = process_tweet(tweet)
    #     # Call build_feature_vector, which returns the feature vector.
    #     featureVector = build_feature_vector(processed_tweet)
    #     # featureList.append(featureVector)
    #     # for each in featureVector:
    #     #     fhand.write(each)
    #     #     fhand.write('\n')
    #     tweets.append((featureVector, sentiment))
    # print 'end-time', '->', time.clock() - start6
    # # fhand.close()

    # Extract the feature vector for all tweets: apply_features takes the
    # extract_features function and all tweets, picks one tweet at a time
    # from the list and creates its feature vector.
    # start7 = time.clock()
    # print 'start-time-training-time', '->', start7
    # training_set = apply_features(extract_features, tweets)
    # print type(training_set)
    # print 'end-time-training-time', '->', time.clock() - start7

    # Load the trained Naive Bayes classifier.
    start8 = time.clock()
    print 'start-time-NB', '->', start8
    f = open('./data_set/naivebayes_trained_model.pickle', 'rb')
    NBClassifier = pickle.load(f)
    print 'end-time-NB', '->', time.clock() - start8
    # NBClassifier = nltk.NaiveBayesClassifier.train(training_set)
    # f = open('./data_set/NBClassifier_trained_21k_tweets.pickle', 'wb')
    # pickle.dump(NBClassifier, f)
    # print 'end-time-NB', '->', time.clock() - start8

    # Print the most informative features of the classifier.
    # print NBClassifier.show_most_informative_features(10)

    # Test the classifier.
    # ElasticSearch: call es_indexer to create the 'sentiment_analysis' index.
    # es_indexer()
    es = ElasticSearch('http://localhost:9200/')
    i = 0
    # testTweet = 'I will get internship happy!!!'
    for testTweet in tweet_data():
        i += 1
        processedTestTweet = process_tweet(testTweet)
        sentiment = NBClassifier.classify(
            extract_features(build_feature_vector(processedTestTweet)))
        # print sentiment
        # Index into Elasticsearch.
        es.index("sentiment_analysis", "document",
                 {"text": testTweet, "sentiment": sentiment}, id=i)
    print "Indexing completed."
    es.refresh(index="sentiment_analysis")
    print "Index refreshed."
    # print extract_features(tweets[0][0])
    f.close()
class TestClient(unittest.TestCase):

    def setUp(self):
        super(TestClient, self).setUp()
        docs = []
        self.es_host = os.environ.get('ES_HOST', 'http://*****:*****')

    @mock.patch('monolith.client.util.iterweeks')
    def test_datetime_ranges(self, _mock):
        "Test datetime ranges get converted to dates."
        client = self._make_one()
        start = datetime.datetime(2012, 1, 1, 12, 34, 56)
        end = datetime.datetime(2012, 1, 31, 12, 34, 56)
        list(client('downloads_count', start, end, interval='week'))
        self.assertEqual(_mock.call_args[0][0], datetime.date(2012, 1, 1))
        assert not isinstance(_mock.call_args[0][0], datetime.datetime)
        self.assertEqual(_mock.call_args[0][1], datetime.date(2012, 1, 31))
        assert not isinstance(_mock.call_args[0][1], datetime.datetime)

    def test_date_order(self):
        # Ensure fill doesn't change date ordering.
        client = self._make_one()
        prev_date = datetime.date(2000, 1, 1)
        # Addon 1 doesn't have downloads for every month and the client will
        # fill zeroes for the missing dates.
        hits = list(client('downloads_count', START, '2012-05-01',
                           interval='month', add_on='1'))
        for hit in hits:
            d = hit['date']
            assert prev_date < d
            prev_date = d
    return data


es = ElasticSearch('http://localhost:9200/')
es.delete_index('pet')
spider = Spider()
breeds = spider.getPetBreeds()
p = Pinyin()
for breed in breeds:
    flg = 1
    page = 1
    pet_list = []
    while flg:
        pets = spider.getPets(breed, (page - 1) * spider.limit)
        if not pets:
            flg = 0
        else:
            page = page + 1
            for pet in pets:
                pet_obj = {}
                pet_obj['name'] = pet['name']
                pet_obj['img'] = pet['img']
                pet_obj['type'] = breed['ename']
                pet_list.append(pet_obj)
                # print pet['name'] + '\t' + p.get_pinyin(pet['name'], '')
    print breed['ename'] + '\n'
    if not pet_list:
        continue
    # Strip the '宠物' ("pet") prefix and romanize the breed name for use
    # as the doc type.
    doc_type = p.get_pinyin(breed['ename'].replace('宠物', ''), '')
    es.bulk((es.index_op(pet_obj) for pet_obj in pet_list),
            doc_type=doc_type, index='pet')
es.refresh('pet')
def scrape(url):
    """ Hit the page, save the contents """
    if USE_ES:
        es = ElasticSearch(ES_URL)
    page_num = 1
    while True:
        # Replace the wildcard with the current page number.
        r = requests.get(
            url.replace('*', str(page_num)),
            # Be a good net citizen.
            headers={'User-Agent': 'python-wbawsearch'}
        )
        # We have hit the last page; stop paging so the final refresh
        # below still runs.
        if r.status_code == 404:
            break
        # Show where we are.
        print r.url
        # Save text for processing.
        soup = BeautifulSoup(r.text)
        tables = soup.find_all(width=625)
        # Go through each table.
        for table in tables:
            # Find all <td>.
            cols = table.find_all('td')
            # Treat the list of <td> as a matrix and pluck out the cells we want.
            for ii in range(5):
                # First cell.
                page_url = str(cols[ii + 0].find('a').get('href')).strip()
                if page_url == 'indpages/wbawnowatch.html':
                    # Empty cell.
                    break
                image_thumb_url = str(cols[ii + 0].find('img').get('src')).strip()
                image_full_url = image_thumb_url.replace('th', '').strip()
                # Second cell.
                titles = cols[ii + 5].find_all('font')
                title = str(titles[0].find(text=True)).strip()
                short_description = str(titles[3].string).strip()
                # Third cell.
                stk = str(cols[ii + 10].find(text=True)).strip()
                # Fourth cell.
                price = str(cols[ii + 15].find('font').string).strip()
                # Fifth cell.
                status = str(cols[ii + 20].find('font').string).strip()
                # Skip anything already sold.
                if price.lower() != 'sold' and status.lower() != 'sold':
                    print page_url
                    # Get the long description.
                    long_description = scrape_page(BASE_URL + '/' + page_url)
                    if USE_ES:
                        # Get the raw stock number.
                        id = stk.split('#')[1].strip()
                        # Convert images to data URIs.
                        # image_full_data = image_to_uri(image_full_url)
                        # image_thumb_data = image_to_uri(image_thumb_url)
                        # Add the document to the index.
                        es.index(
                            ES_INDEX,
                            "watch",
                            dict(
                                id=id,
                                page_url=page_url,
                                image_thumb_url=image_thumb_url,
                                image_full_url=image_full_url,
                                title=title,
                                short_description=short_description,
                                long_description=long_description,
                                stk=stk,
                                price=price,
                                status=status,
                                # image_full_data=image_full_data,
                                # image_thumb_data=image_thumb_data
                            ),
                            id=id
                        )
                    else:
                        # Create a new watch object.
                        watch = Watch(
                            page_url=page_url,
                            image_thumb_url=image_thumb_url,
                            image_full_url=image_full_url,
                            title=title,
                            short_description=short_description,
                            long_description=long_description,
                            stk=stk,
                            price=price,
                            status=status
                        )
                        # Add and commit.
                        session.add(watch)
                        session.commit()
        # Next page.
        page_num += 1
        # Sleep so this doesn't hammer the server.
        time.sleep(1)
    if USE_ES:
        es.refresh(ES_INDEX)
for term in terms:
    print term
    words = iterativeChildren(terms[term]['data'])
    for word in words:
        terms[term]['closure'].append(word)
    d = iterativeDev(terms[term]['data_with_develops_from'])
    for dd in d:
        terms[term]['closure_with_develops_from'].append(dd)
    terms[term]['closure'] = list(set(terms[term]['closure']))
    terms[term]['closure'].append(term)
    terms[term]['closure_with_develops_from'] = list(
        set(terms[term]['closure_with_develops_from']))
    terms[term]['closure_with_develops_from'].append(term)
    terms[term]['systems'] = getSystemSlims(term)
    terms[term]['organs'] = getOrganSlims(term)
    terms[term]['developmental'] = getDevelopmentSlims(term)
    # Index the data in Elasticsearch.
    connection.index(index_name, doc_type_name, terms[term], id=term)
    if count % 1000 == 0:
        connection.flush(index=index_name)
        connection.refresh()
    count = count + 1

print
print "Total GO Terms indexed " + str(count)
class SearchIndex(object):
    def __init__(self, model):
        self.es = ElasticSearch()
        self.model = model

    def put_mapping(self, index, doc_type):
        mapping = {
            doc_type: {
                "properties": {
                    "location": {"type": "geo_point"},
                }
            }
        }
        self.es.put_mapping(index=index, doc_type=doc_type, mapping=mapping)

    def bulk_items(self, index, doc_type):
        for m in self.model.objects.all():
            self.es.bulk([
                self.es.index_op({
                    "pk": m.pk,
                    "name": m.name,
                    "rating": m.rating,
                    "address": m.address,
                    "description": m.description,
                    "location": {
                        "lon": m.longitude,
                        "lat": m.latitude
                    }
                }),
            ], doc_type=doc_type, index=index)

    def search(self, index, question, longitude, latitude, size=10):
        # self.es.delete_index(index)
        try:
            self.es.create_index(index)
            self.put_mapping(index, "place")
            self.bulk_items(index, "place")
        except IndexAlreadyExistsError:
            pass
        query = {
            "query": {
                "function_score": {
                    "query": {
                        "bool": {
                            "should": [
                                {"match": {"name": question}},
                                {"match": {"_all": {
                                    "query": question,
                                    "operator": "or",
                                    "fuzziness": "auto",
                                    "zero_terms_query": "all"
                                }}}
                            ]
                        }
                    },
                    "functions": [
                        {"exp": {"rating": {"origin": 5, "scale": 1,
                                            "offset": 0.1}}},
                    ]
                }
            }
        }
        if longitude is not None:
            query['query']['function_score']['functions'] = [
                {'gauss': {
                    "location": {"origin": {"lat": latitude, "lon": longitude},
                                 "offset": "550m", "scale": "1km"}
                }},
                {'gauss': {
                    "location": {"origin": {"lat": latitude, "lon": longitude},
                                 "offset": "500m", "scale": "2km"}
                }},
            ]
        results = self.es.search(query, index=index, size=size)
        self.es.refresh()
        return results
def documents_from_mails(mails):
    """Build documents from mails."""
    for mail in mails:
        if 'Date' in mail.headers:  # Some mails seem broken.
            yield {
                '@source': 'stuff://',
                '@type': 'mailadmin',
                '@tags': [mail.headers['From']],
                '@fields': mail.headers,
                '@timestamp': parse_date(mail.headers['Date']),
                '@source_host': 'localhost',
                '@source_path': 'mail/admin',
                '@message': mail.body,
                'id': mail.headers['Message-Id']
            }


if __name__ == '__main__':
    # Instantiate it with an URL.
    es = ElasticSearch(sys.argv[1])
    # Kibana needs this kind of name.
    NAME = 'logstash-2013.06.13'
    try:
        es.delete_index(NAME)
    except ElasticHttpNotFoundError:
        pass  # Nobody cares.
    emails = mbox(sys.argv[2])
    for n, docs in enumerate(bulk_iterate(documents_from_mails(emails), 100)):
        es.bulk_index(NAME, 'mailadmin', docs)
        print(n)
    print es.refresh(NAME)
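# The snippet above relies on a bulk_iterate helper that is not shown. Based
# on the call site, it presumably batches an iterable into lists of at most
# `size` items; this is a sketch under that assumption, not the original
# implementation.
from itertools import islice


def bulk_iterate(iterable, size):
    # Yield successive lists of at most `size` items from `iterable`.
    it = iter(iterable)
    while True:
        batch = list(islice(it, size))
        if not batch:
            return
        yield batch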
def getFeeds():
    print "getting feeds"
    es = ElasticSearch('http://fisensee.ddns.net:9200/')
    query = {"query": {"range": {"date": {"lte": "now-1w/w"}}}}
    oldFeeds = es.search(query, size=300, index='feeds')
    if len(oldFeeds['hits']['hits']) != 0:
        es.bulk(es.delete_op(id=feed['_id'], index='feeds', doc_type='feed')
                for feed in oldFeeds['hits']['hits'])

    feedSources = FeedSource.objects.all()
    feeds = []
    defaultText = 'undefined'
    defaultDate = datetime.datetime.now().isoformat()
    utc = pytz.utc
    berlin = pytz.timezone('Europe/Berlin')
    now = datetime.datetime.today()
    dateThreshold = now - datetime.timedelta(weeks=2)

    allUrls = []
    for feedSource in feedSources:
        allUrls.append(feedSource.sourceUrl)
    urls = set(allUrls)
    for url in urls:
        source = feedparser.parse(url)
        for entry in source['items']:
            feed = {
                'title': defaultText,
                'description': defaultText,
                'link': defaultText,
                'date': defaultDate,
                'url': defaultText
            }
            if 'title' in entry:
                feed['title'] = entry['title']
            if 'description' in entry:
                feed['description'] = entry['description']
            if 'link' in entry:
                feed['link'] = entry['link']
            if 'published_parsed' in entry:
                date = datetime.datetime.fromtimestamp(
                    time.mktime(entry['published_parsed']))
                if date < dateThreshold:
                    break
                utcDate = utc.localize(date)
                feed['date'] = utcDate.astimezone(berlin).isoformat()
            # id creation should be enough for now, but it's made to fail.
            if 'title' in entry or 'published_parsed' in entry:
                feed['id'] = base64.urlsafe_b64encode(hashlib.sha256(
                    (feed['title'] + feed['date']).encode('utf8')).hexdigest())
            else:
                feed['id'] = base64.urlsafe_b64encode(hashlib.sha256(
                    (feed['title']).encode('utf8')).hexdigest())
            feed['url'] = url
            feeds.append(feed)

    es.bulk((es.index_op(feed, **{'id': feed.pop('id')}) for feed in feeds),
            index='feeds', doc_type='feed')
    print es.refresh('feeds')
class Elastic(DataLayer):
    """ElasticSearch data layer."""

    serializers = {
        'integer': int,
        'datetime': parse_date
    }

    def init_app(self, app):
        app.config.setdefault('ELASTICSEARCH_URL', 'http://localhost:9200/')
        app.config.setdefault('ELASTICSEARCH_INDEX', 'eve')
        self.es = ElasticSearch(app.config['ELASTICSEARCH_URL'])
        self.index = app.config['ELASTICSEARCH_INDEX']

    def _get_field_mapping(self, schema):
        """Get mapping for given field schema."""
        if schema['type'] == 'datetime':
            return {'type': 'date'}
        elif schema['type'] == 'string' and schema.get('unique'):
            return {'type': 'string', 'index': 'not_analyzed'}
        elif schema['type'] == 'string':
            return {'type': 'string'}

    def put_mapping(self, app):
        """Put mapping for elasticsearch for current schema.

        It's not called automatically now, but rather left for the user to
        call whenever it makes sense.
        """
        for resource, resource_config in app.config['DOMAIN'].items():
            properties = {}
            properties[config.DATE_CREATED] = self._get_field_mapping({'type': 'datetime'})
            properties[config.LAST_UPDATED] = self._get_field_mapping({'type': 'datetime'})

            for field, schema in resource_config['schema'].items():
                field_mapping = self._get_field_mapping(schema)
                if field_mapping:
                    properties[field] = field_mapping

            datasource = (resource, )  # TODO: config.SOURCES not available yet (self._datasource_ex(resource))
            mapping = {}
            mapping[datasource[0]] = {'properties': properties}
            self.es.put_mapping(self.index, datasource[0], mapping)

    def find(self, resource, req, sub_resource_lookup):
        """
        TODO: implement sub_resource_lookup
        """
        query = {
            'query': {
                'query_string': {
                    'query': request.args.get('q', '*'),
                    'default_field': request.args.get('df', '_all'),
                    'default_operator': 'AND'
                }
            }
        }

        if not req.sort and self._default_sort(resource):
            req.sort = self._default_sort(resource)

        # Skip sorting when there is a query, so the score is used.
        if req.sort and 'q' not in request.args:
            query['sort'] = []
            sort = ast.literal_eval(req.sort)
            for (key, sortdir) in sort:
                sort_dict = dict([(key, 'asc' if sortdir > 0 else 'desc')])
                query['sort'].append(sort_dict)

        if req.where:
            where = json.loads(req.where)
            if where:
                query['filter'] = {
                    'term': where
                }

        if req.max_results:
            query['size'] = req.max_results

        if req.page > 1:
            query['from'] = (req.page - 1) * req.max_results

        source_config = config.SOURCES[resource]
        if 'facets' in source_config:
            query['facets'] = source_config['facets']

        try:
            args = self._es_args(resource)
            args['es_fields'] = self._fields(resource)
            return self._parse_hits(self.es.search(query, **args), resource)
        except es_exceptions.ElasticHttpError:
            return ElasticCursor()

    def find_one(self, resource, **lookup):
        args = self._es_args(resource)
        args['es_fields'] = self._fields(resource)

        if config.ID_FIELD in lookup:
            try:
                hit = self.es.get(id=lookup[config.ID_FIELD], **args)
            except es_exceptions.ElasticHttpNotFoundError:
                return

            if not hit['exists']:
                return

            doc = hit.get('fields', hit.get('_source', {}))
            doc['_id'] = hit.get('_id')
            convert_dates(doc, self._dates(resource))
            return doc
        else:
            query = {
                'query': {
                    'constant_score': {
                        'filter': {
                            'term': lookup
                        }
                    }
                }
            }
            try:
                args['size'] = 1
                docs = self._parse_hits(self.es.search(query, **args), resource)
                return docs.first()
            except es_exceptions.ElasticHttpNotFoundError:
                return None

    def find_list_of_ids(self, resource, ids, client_projection=None):
        args = self._es_args(resource)
        args['es_fields'] = self._fields(resource)
        return self._parse_hits(self.es.multi_get(ids, **args), resource)

    def insert(self, resource, doc_or_docs, **kwargs):
        ids = []
        kwargs.update(self._es_args(resource))
        for doc in doc_or_docs:
            doc.update(self.es.index(doc=doc, id=doc.get('_id'), **kwargs))
            ids.append(doc['_id'])
        self.es.refresh(self.index)
        return ids

    def update(self, resource, id_, updates):
        args = self._es_args(resource, refresh=True)
        return self.es.update(id=id_, doc=updates, **args)

    def replace(self, resource, id_, document):
        args = self._es_args(resource, refresh=True)
        args['overwrite_existing'] = True
        return self.es.index(document=document, id=id_, **args)

    def remove(self, resource, id_=None):
        args = self._es_args(resource, refresh=True)
        if id_:
            return self.es.delete(id=id_, **args)
        else:
            try:
                return self.es.delete_all(**args)
            except es_exceptions.ElasticHttpNotFoundError:
                return

    def _parse_hits(self, hits, resource):
        """Parse hits response into documents."""
        return ElasticCursor(hits, self._dates(resource))

    def _es_args(self, resource, refresh=None):
        """Get index and doctype args."""
        datasource = self._datasource(resource)
        args = {
            'index': self.index,
            'doc_type': datasource[0],
        }
        if refresh:
            args['refresh'] = refresh
        return args

    def _fields(self, resource):
        """Get projection fields for given resource."""
        datasource = self._datasource(resource)
        keys = datasource[2].keys()
        return ','.join(keys)

    def _default_sort(self, resource):
        datasource = self._datasource(resource)
        return datasource[3]

    def _dates(self, resource):
        dates = [config.LAST_UPDATED, config.DATE_CREATED]
        datasource = self._datasource(resource)
        schema = config.DOMAIN[datasource[0]]['schema']
        for field, field_schema in schema.items():
            if field_schema['type'] == 'datetime':
                dates.append(field)
        return dates
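# A minimal sketch of wiring the data layer above into an Eve application.
# Assumptions: the `eve` package is available and accepts a custom data
# layer class via the `data=` keyword, and an Eve settings file defines the
# usual DOMAIN; this is not the project's actual bootstrap code.
from eve import Eve

app = Eve(data=Elastic)

with app.app_context():
    # Push the mappings once (e.g. at deploy time); the class above leaves
    # this to the user rather than calling it automatically.
    app.data.put_mapping(app)

if __name__ == '__main__':
    app.run()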
class ElasticSearchProvider(SearchProvider):
    def __init__(self, config, db=None, authnz_wrapper=None, io_loop=None):
        self.debug = False
        self.config = config
        if db is not None:
            self.db = db
        self.syncES = ElasticSearch(
            '%(ELASTIC_SEARCH_PROTOCOL)s://%(ELASTIC_SEARCH_HOST)s:%(ELASTIC_SEARCH_PORT)s' % config
        )
        self.asyncES = ESConnection(
            host=config.get('ELASTIC_SEARCH_HOST'),
            port=config.get('ELASTIC_SEARCH_PORT'),
            io_loop=io_loop,
            protocol=config.get('ELASTIC_SEARCH_PROTOCOL'),
        )
        self.index = config.get('ELASTIC_SEARCH_INDEX')
        self.max_retries = config.get('ELASTIC_SEARCH_MAX_RETRIES')

    def activate_debug(self):
        self.debug = True

    def connect_to_db(self):
        from sqlalchemy import create_engine
        from sqlalchemy.orm import scoped_session, sessionmaker
        conn_string = self.config.get('SQLALCHEMY_CONNECTION_STRING')
        engine = create_engine(
            conn_string,
            convert_unicode=True,
            pool_size=1,
            max_overflow=0,
            echo=self.debug
        )
        maker = sessionmaker(bind=engine, autoflush=True)
        self.db = scoped_session(maker)

    def _assemble_inner_query(self, domain=None, page_filter=None):
        if page_filter and domain:
            page_prefix = '%s/%s' % (domain.url, page_filter)
        else:
            page_prefix = None

        if page_prefix:
            return {
                'prefix': {
                    'page_url': page_prefix
                }
            }
        else:
            return {
                'match_all': {}
            }

    def _assemble_outer_query(self, inner_query, filter_terms):
        return {
            'filtered': {
                'query': inner_query,
                'filter': {
                    'and': [{
                        'term': filter_term
                    } for filter_term in filter_terms]
                }
            }
        }

    def _assemble_filter_terms(self, key_id=None, domain=None):
        filter_terms = []
        if key_id:
            filter_terms.append({'keys.id': key_id})
        if domain:
            filter_terms.append({'domain_id': domain.id})
        return filter_terms

    def gen_doc(self, review):
        return {
            'keys': [{'id': violation.key_id} for violation in review.violations],
            'uuid': str(review.uuid),
            'completed_date': review.completed_date,
            'violation_count': review.violation_count,
            'page_id': review.page_id,
            'page_uuid': str(review.page.uuid),
            'page_url': review.page.url,
            'page_last_review_date': review.page.last_review_date,
            'domain_id': review.domain_id,
            'domain_name': review.domain.name,
        }

    def index_review(self, review):
        for attempt in range(self.max_retries):
            try:
                self.syncES.send_request(
                    method='POST',
                    path_components=[self.index, 'review', review.page_id],
                    body=dumps(self.gen_doc(review)),
                    encode_body=False
                )
                break
            except (Timeout, ConnectionError, ElasticHttpError,
                    InvalidJsonResponseError) as e:
                values = review.id, review.page_id, str(e)
                logging.error('Could not index review (review_id:{0}, page_id:{1}): {2}'.format(*values))
                time.sleep(1)
                if attempt >= self.max_retries - 1:
                    raise
        else:
            raise

    def index_reviews(self, reviewed_pages, reviews_count, batch_size):
        action = {'index': {'_type': 'review'}}

        for i in range(0, reviews_count, batch_size):
            body_bits = []

            for page in reviewed_pages[i:i + batch_size]:
                doc = self.gen_doc(page.last_review)
                action['index']['_id'] = doc['page_id']
                body_bits.append(dumps(action))
                body_bits.append(dumps(doc))

            # Yes, that trailing newline IS necessary.
            body = '\n'.join(body_bits) + '\n'
            self.syncES.send_request(
                method='POST',
                path_components=[self.index, '_bulk'],
                body=body,
                encode_body=False
            )

        logging.info('Done!')

    @return_future
    def get_by_violation_key_name(self, key_id, current_page=1, page_size=10,
                                  domain=None, page_filter=None, callback=None):
        def treat_response(response):
            if response.error is None:
                try:
                    hits = loads(response.body).get('hits', {'hits': []})
                    reviews_data = []
                    for hit in hits['hits']:
                        completedAt = datetime.utcfromtimestamp(hit['_source']['completed_date'])
                        reviews_data.append({
                            'uuid': hit['_source']['uuid'],
                            'page': {
                                'uuid': hit['_source']['page_uuid'],
                                'url': hit['_source']['page_url'],
                                'completedAt': completedAt
                            },
                            'domain': hit['_source']['domain_name']
                        })
                    reviews_count = hits.get('total', 0)
                    callback({
                        'reviews': reviews_data,
                        'reviewsCount': reviews_count
                    })
                except Exception as e:
                    reason = 'ElasticSearchProvider: invalid response (%s [%s])' % (type(e), e.message)
                    logging.error(reason)
                    callback({'error': {'status_code': 500, 'reason': reason}})
            else:
                reason = 'ElasticSearchProvider: erroneous response (%s [%s])' % (response.error.message, response.body)
                logging.error(reason)
                callback({'error': {'status_code': 500, 'reason': reason}})

        inner_query = self._assemble_inner_query(domain, page_filter)
        filter_terms = self._assemble_filter_terms(key_id, domain)
        query = self._assemble_outer_query(inner_query, filter_terms)

        sort_ = [{
            'completed_date': {'order': 'desc'}
        }, {
            'violation_count': {'order': 'desc'}
        }]

        source = {'query': query, 'sort': sort_}

        self.asyncES.search(
            callback=treat_response,
            index=self.index,
            type='review',
            source=source,
            page=current_page,
            size=page_size,
        )

    @return_future
    def get_domain_active_reviews(self, domain, current_page=1, page_size=10,
                                  page_filter=None, callback=None):
        def treat_response(response):
            if response.error is None:
                try:
                    hits = loads(response.body).get('hits', {'hits': []})
                    pages = []
                    for hit in hits['hits']:
                        completedAt = datetime.utcfromtimestamp(hit['_source']['completed_date'])
                        pages.append({
                            'url': hit['_source']['page_url'],
                            'uuid': hit['_source']['page_uuid'],
                            'violationCount': len(hit['_source']['keys']),
                            'completedAt': completedAt,
                            'reviewId': hit['_source']['uuid']
                        })
                    reviews_count = hits.get('total', 0)
                    callback({
                        'reviewsCount': reviews_count,
                        'pages': pages
                    })
                except Exception as e:
                    reason = 'ElasticSearchProvider: invalid response (%s [%s])' % (type(e), e.message)
                    logging.error(reason)
                    callback({'error': {'status_code': 500, 'reason': reason}})
            else:
                reason = 'ElasticSearchProvider: erroneous response (%s [%s])' % (response.error.message, response.body)
                logging.error(reason)
                callback({'error': {'status_code': 500, 'reason': reason}})

        inner_query = self._assemble_inner_query(domain=domain, page_filter=page_filter)
        filter_terms = self._assemble_filter_terms(domain=domain)
        query = self._assemble_outer_query(inner_query, filter_terms)

        sort_ = [{
            'violation_count': {'order': 'desc'}
        }, {
            'completed_date': {'order': 'desc'}
        }]

        source = {'query': query, 'sort': sort_}

        self.asyncES.search(
            callback=treat_response,
            index=self.index,
            type='review',
            source=source,
            page=current_page,
            size=page_size,
        )

    def refresh(self):
        try:
            self.syncES.refresh(index=self.index)
        except Exception as e:
            logging.error('Could not refresh index (%s)' % e)

    def get_index_settings(cls):
        return {
            'index': {
                'number_of_shards': 4
            }
        }

    def get_index_mapping(cls):
        return {
            'review': {
                'properties': {
                    'keys': {
                        'properties': {
                            'id': {'type': 'integer'}
                        }
                    },
                    'uuid': {'type': 'string', 'index': 'not_analyzed'},
                    'completed_date': {'type': 'integer'},
                    'violation_count': {'type': 'float'},
                    'page_id': {'type': 'integer'},
                    'page_uuid': {'type': 'string', 'index': 'not_analyzed'},
                    'page_url': {'type': 'string', 'index': 'not_analyzed'},
                    'page_last_review_date': {'type': 'integer'},
                    'domain_id': {'type': 'integer'},
                    'domain_name': {'type': 'string', 'index': 'not_analyzed'}
                }
            }
        }

    def setup_index(self):
        try:
            settings = self.get_index_settings()
            self.syncES.create_index(index=self.index, settings=settings)
            mapping = self.get_index_mapping()
            self.syncES.put_mapping(index=self.index, doc_type='review', mapping=mapping)
            logging.info('Index %s created.' % self.index)
        except Exception as e:
            raise e

    def delete_index(self):
        try:
            self.syncES.delete_index(index=self.index)
            logging.info('Index %s deleted.' % self.index)
        except Exception as e:
            raise e

    def _get_max_page_id_from_index(self, must_have_domain_name=False):
        if must_have_domain_name:
            inner_query = {
                'constant_score': {
                    'filter': {
                        'not': {
                            'missing': {
                                'field': 'domain_name'
                            }
                        }
                    }
                }
            }
        else:
            inner_query = {
                'match_all': {}
            }

        query = {
            'query': inner_query,
            'sort': [{
                'page_id': {'order': 'desc'}
            }]
        }

        results = self.syncES.search(query, index=self.index, doc_type='review')
        if results['hits']['total'] > 0:
            return results['hits']['hits'][0]['_id'] or 0
        return 0

    def index_all_reviews(self, keys=None, batch_size=200, replace=False):
        logging.info('Querying database...')
        self.connect_to_db()

        if keys is not None:
            keys = [k.id for k in self.db.query(Key.id).filter(Key.name.in_(keys)).all()]

        try:
            max_page_id = self._get_max_page_id_from_index(must_have_domain_name=True)
        except Exception:
            logging.error('Could not retrieve max page_id! Use with --replace (with caution)')
            return

        def apply_filters(query):
            if keys is not None:
                query = query \
                    .filter(Violation.review_id == Page.last_review_id) \
                    .filter(Violation.key_id.in_(keys))
            if not replace:
                query = query.filter(Page.id > max_page_id)
            return query.filter(Page.last_review_id != None)

        reviews_count = apply_filters(self.db.query(func.count(Page))).scalar()
        query = self.db.query(Page).options(joinedload('last_review'))
        reviewed_pages = apply_filters(query).order_by(Page.id.asc())

        logging.info('Indexing %d reviews...' % reviews_count)
        self.index_reviews(reviewed_pages, reviews_count, batch_size)

    @classmethod
    def new_instance(cls, config):
        return ElasticSearchProvider(config)

    @classmethod
    def main(cls):
        import sys

        parser = cls.argparser()
        args = parser.parse_args()

        config = {}
        host = None
        port = None
        index = None
        es = None

        levels = ['ERROR', 'WARNING', 'INFO', 'DEBUG']
        log_level = levels[args.verbose]
        logging.basicConfig(level=getattr(logging, log_level),
                            format='%(levelname)s - %(message)s')

        if not (args.create or args.recreate or args.delete or args.keys or args.all_keys):
            parser.print_help()
            sys.exit(1)

        if args.conf:
            from derpconf.config import ConfigurationError
            from holmes.config import Config
            try:
                config = Config().load(args.conf[0])
                host = config['ELASTIC_SEARCH_HOST']
                port = config['ELASTIC_SEARCH_PORT']
                index = config['ELASTIC_SEARCH_INDEX']
            except ConfigurationError:
                logging.error('Could not load config! Use --conf conf_file')
                sys.exit(1)
            except KeyError:
                logging.error('Could not parse config! Check its contents')
                sys.exit(1)

        if args.server:
            try:
                host, port = args.server[0].split(':')
                config['ELASTIC_SEARCH_HOST'] = host
                config['ELASTIC_SEARCH_PORT'] = port
            except Exception:
                logging.error('Could not parse server host and port! Use --server host:port')
                sys.exit(1)

        if args.index:
            index = args.index[0]
            config['ELASTIC_SEARCH_INDEX'] = index

        from pyelasticsearch.exceptions import IndexAlreadyExistsError, \
            ElasticHttpNotFoundError, InvalidJsonResponseError
        from requests.exceptions import ConnectionError

        try:
            if args.create or args.recreate or args.delete:
                if host is None or port is None:
                    logging.error('Need either a host and port or a config file to perform such operation!')
                    sys.exit(1)
                if index is None:
                    logging.error('Need either an index name or a config file to perform such operation!')
                    sys.exit(1)
                else:
                    es = cls.new_instance(config)
                    if args.recreate or args.delete:
                        try:
                            es.delete_index()
                        except ElasticHttpNotFoundError:
                            pass
                        except InvalidJsonResponseError as e:
                            logging.error('Invalid response! Reason: %s' % e)
                            sys.exit(1)
                    if args.create or args.recreate:
                        es.setup_index()

            if args.keys or args.all_keys:
                if config is None:
                    logging.error('Need a config file to perform such operation! Use --conf conf_file')
                else:
                    batch_size = args.batch_size[0] if args.batch_size else 200
                    es = cls.new_instance(config) if not es else es
                    try:
                        if args.verbose > 2:
                            es.activate_debug()
                        if args.keys:
                            es.index_all_reviews(args.keys, replace=args.replace, batch_size=batch_size)
                        elif args.all_keys:
                            es.index_all_reviews(replace=args.replace, batch_size=batch_size)
                    except InvalidJsonResponseError as e:
                        logging.error('Invalid response! Reason: %s' % e)
                        sys.exit(1)
        except IndexAlreadyExistsError:
            logging.error('Index %s already exists! Use --recreate (with caution) to recreate' % index)
        except ConnectionError:
            logging.error('Could not connect to server at %s:%s' % (host, port))
        except KeyError:
            logging.error('Could not get host nor port! Use either -conf or --server')
            sys.exit(1)
def save(self, force_insert=False, force_update=False, **kwargs):
    es = ElasticSearch(ELASTIC_SEARCH_URL)
    if self.id:
        location = self.get_location()
        location_es = "{0},{1}".format(location.y, location.x)
        es.update(
            'glamazer', 'modelresult',
            'listings.listing.{0}'.format(self.id),
            script="ctx._source.listing_id = listing;"
                   "ctx._source.artist_id = artist;"
                   "ctx._source.artist_avatar = artist_avatar;"
                   "ctx._source.artist_name = artist_name;"
                   "ctx._source.salon_id = salon;"
                   "ctx._source.salon_avatar = salon_avatar;"
                   "ctx._source.salon_name = salon_name;"
                   "ctx._source.title = title;"
                   "ctx._source.location = location;"
                   "ctx._source.description = description;"
                   "ctx._source.get_picture = get_picture;"
                   "ctx._source.metadata = metadata;"
                   "ctx._source.gender = gender;"
                   "ctx._source.price = price;"
                   "ctx._source.currency = currency;"
                   "ctx._source.likes = likes;"
                   "ctx._source.comments = comments;"
                   "ctx._source.tags = tags;"
                   "ctx._source.status = status;"
                   "ctx._source.style = style;"
                   "ctx._source.rating = rating",
            params={
                'listing': self.id,
                'artist': self.get_artist_id(),
                'artist_avatar': self.get_artist_avatar(),
                'artist_name': self.get_artist_name(),
                'salon': self.get_salon_id(),
                'salon_avatar': self.get_salon_avatar(),
                'salon_name': self.get_salon_name(),
                'title': self.title,
                'location': location_es,
                'description': self.description,
                'get_picture': self.get_picture(),
                'metadata': self.metadata,
                'gender': self.gender,
                'price': self.price,
                'currency': self.currency,
                'likes': self.likes,
                'comments': self.comments,
                'tags': self.get_tags(),
                'status': self.status,
                'style': self.get_style(),
                'rating': self.get_rating()
            })
        super(Listing, self).save(force_insert, force_update)
    else:
        super(Listing, self).save(force_insert, force_update)
        location = self.get_location()
        location_es = "{0},{1}".format(location.y, location.x)
        es.index('glamazer', 'modelresult', {
            'listing_id': self.id,
            'artist_id': self.artist_id,
            'artist_avatar': self.get_artist_avatar(),
            'artist_name': self.get_artist_name(),
            'salon_id': self.get_salon_id(),
            'salon_avatar': self.get_salon_avatar(),
            'salon_name': self.get_salon_name(),
            'title': self.title,
            'location': location_es,
            'description': self.description,
            'get_picture': self.get_picture(),
            'metadata': self.metadata,
            'gender': self.gender,
            'price': self.price,
            'currency': self.currency,
            'likes': self.likes,
            'comments': self.comments,
            'tags': self.get_tags(),
            'status': self.status,
            'style': self.get_style(),
            'rating': self.get_rating()
        }, id='listings.listing.{0}'.format(self.id))
    es.refresh('glamazer')
class ESWrapper(BaseDB):
    def __init__(self, index_name, doc_type, host='http://localhost', port=9200):
        self.eserver = ElasticSearch(urls=host, port=port,
                                     timeout=60, max_retries=3)
        # self._base_query = {"query": {"bool": {"must": {"match": {}}}}}
        # self._base_query = {"query": {"bool": {}}}
        self._geo_filter = {"distance": "20km", "coordinates": {}}
        self._population_filter = {'population': {'gte': 5000}}
        self._index = index_name
        self._doctype = doc_type

    def getByid(self, geonameId):
        maincondition = {"match": {"id": geonameId}}
        q = {"query": {"bool": {"must": maincondition}}}
        return self.eserver.search(
            q, index=self._index,
            doc_type=self._doctype)['hits']['hits'][0]['_source']

    def _query(self, qkey, **kwargs):
        q = {"query": {"bool": {}}}
        query_name = "should"
        q["query"]["bool"]["minimum_number_should_match"] = 1
        kwargs.pop("qtype", "")

        placetokens = [
            l.strip() for l in tokenizer.split(qkey)
            if l and l not in STOP_WORDS and l[-1] != '.'
        ]
        if placetokens:
            reduced_placename = u" ".join(placetokens[0:])
            if len(placetokens[0]) < 3 and len(placetokens) > 1 \
                    and 3.0 / len(placetokens) >= .5:
                reduced_placename = u" ".join(placetokens[1:])
        else:
            reduced_placename = qkey

        # print "qkey", qkey, "reduced", reduced_placename

        maincondition = [
            {
                "bool": {
                    "must": [{
                        "multi_match": {
                            "query": qkey,
                            "fields": ["name.raw^5", "asciiname^5",
                                       "alternatenames"],
                            "operator": "and"
                        }
                    }, {
                        "terms": {"featureClass": ["a", "p"]}
                    }],
                }
            },
            {"term": {"name.raw": {"value": qkey}}},
            {"term": {"asciiname.raw": {"value": qkey}}},
            {"term": {"normalized_asciiname": {"value": qkey}}},
            # {"term": {"alternatenames": {"value": qkey[1:]}}},
            {"term": {"alternatenames": {"value": qkey}}},
            {
                "multi_match": {
                    "query": reduced_placename if 'fuzzy' in kwargs
                             else unicode(unidecode(reduced_placename)),
                    'fuzziness': kwargs.pop("fuzzy", 0),
                    "max_expansions": kwargs.pop("max_expansion", 10),
                    "prefix_length": kwargs.pop("prefix_length", 1),
                    'operator': kwargs.pop("operator", "and"),
                    "fields": ["name^3", "asciiname^3", "alternatenames",
                               "normalized_asciiname^3"]
                }
            }
        ]

        q["query"]["bool"][query_name] = maincondition

        if kwargs:
            filter_cond = []
            if 'min_popln' in kwargs:
                popln = kwargs.pop("min_popln")
                if popln is not None:
                    filter_cond.append(
                        {"range": {"population": {"gte": popln}}})
            for key, val in kwargs.viewitems():
                if not isinstance(val, basestring):
                    val = list([(v) for v in val])
                    filter_cond.append({"terms": {key: val}})
                else:
                    filter_cond.append({"term": {key: (val)}})
            q["query"]["bool"]["filter"] = {"bool": {"must": filter_cond}}

        q['from'] = 0
        q['size'] = 50
        return self.eserver.search(q, index=self._index,
                                   doc_type=self._doctype)

    def query(self, qkey, min_popln=None, **kwargs):
        res = self._query(qkey, min_popln=min_popln, **kwargs)['hits']
        max_score = res['max_score']
        gps = []
        if max_score == 0.0:
            # No real results were obtained by Elasticsearch; it returned
            # only a random, very low scoring one.
            res['hits'] = []
        for t in res['hits']:
            t['_source']['geonameid'] = t["_source"]["id"]
            t['_source']['_score'] = t['_score'] / max_score
            pt = GeoPoint(**t["_source"])
            if t['_source']['featureCode'].lower() == "cont":
                gps = [pt]
                break
            gps.append(pt)

        if len(gps) == 1:
            gps[0]._score = (min(float(len(gps[0].name)), float(len(qkey))) /
                             max(float(len(gps[0].name)), float(len(qkey))))
        return gps

    def _oldquery(self, qkey, qtype="exact", analyzer=None, min_popln=None,
                  size=10, **kwargs):
        """qtype values are exact, relaxed or geo_distance.

        Always limit results to 10.
        """
        q = {"query": {"bool": {}}}
        query_name = kwargs.pop('query_name', 'must')
        query_name = "should"
        if query_name == "should":
            q["query"]["bool"]["minimum_number_should_match"] = 1

        maincondition = {}
        if qtype == "exact":
            maincondition = [
                {"term": {"name.raw": {"value": qkey}}},
                {"term": {"asciiname.raw": {"value": qkey}}},
                {"term": {"alternatenames": {"value": qkey}}}
            ]
            if analyzer:
                # Note: this assignment assumes the earlier dict form of
                # maincondition, not the list built just above.
                maincondition["match"]["name.raw"]["analyzer"] = analyzer
        elif qtype == "relaxed":
            maincondition["match"] = {"alternatenames": {"query": qkey}}
            if analyzer:
                maincondition["match"]["alternatenames"]["analyzer"] = analyzer
            # q["query"]["bool"][query_name]["match"].pop("name.raw", "")
        elif qtype == "combined":
            maincondition = [
                {
                    "bool": {
                        "must": {
                            "multi_match": {
                                "query": qkey,
                                "fields": ["name.raw", "asciiname",
                                           "alternatenames"]
                            }
                        },
                        "filter": {
                            "bool": {
                                "should": [
                                    {"range": {"population": {"gte": 5000}}},
                                    {"terms": {
                                        "featureCode": [
                                            "pcla", "pcli", "cont", "rgn",
                                            "admd", "adm1", "adm2"
                                        ]
                                    }}
                                ]
                            }
                        }
                    }
                },
                {"term": {"name.raw": {"value": qkey}}},
                {"term": {"asciiname.raw": {"value": qkey}}},
                {"term": {"alternatenames": {"value": qkey[1:]}}},
                {
                    "match": {
                        "alternatenames": {
                            "query": qkey,
                            'fuzziness': kwargs.pop("fuzzy", 0),
                            "max_expansions": kwargs.pop("max_expansion", 5),
                            "prefix_length": kwargs.pop("prefix_length", 1)
                        }
                    }
                }
            ]

        if maincondition:
            q["query"]["bool"][query_name] = maincondition

            if min_popln:
                filter_cond = [{"range": {"population": {"gte": min_popln}}}]
            else:
                filter_cond = []

            if kwargs:
                filter_cond += [{"term": {key: val}}
                                for key, val in kwargs.viewitems()]
                q["query"]["bool"]["filter"] = {"bool": {"must": filter_cond}}
            elif min_popln:
                filter_cond = [
                    {"range": {"population": {"gte": min_popln}}},
                    {"terms": {"featureCode": ["ppla", "pplx"]}}
                ]
                q["query"]["bool"]["filter"] = {"bool": {"should": filter_cond}}

        return self.eserver.search(q, index=self._index,
                                   doc_type=self._doctype)

    def oldquery(self, qkey, min_popln=None, **kwargs):
        res = self._query(qkey, min_popln=min_popln, **kwargs)['hits']
        max_score = res['max_score']
        gps = []
        if max_score == 0.0:
            # No real results were obtained by Elasticsearch; it returned
            # only a random, very low scoring one.
            res['hits'] = []
        for t in res['hits']:
            t['_source']['geonameid'] = t["_source"]["id"]
            t['_source']['_score'] = t['_score'] / max_score
            pt = GeoPoint(**t["_source"])
            if t['_source']['featureCode'].lower() == "cont":
                gps = [pt]
                break
            gps.append(pt)

        if len(gps) == 1:
            gps[0]._score = (min(float(len(gps[0].name)), float(len(qkey))) /
                             max(float(len(gps[0].name)), float(len(qkey))))
        return gps

    def near_geo(self, geo_point, min_popln=5000, **kwargs):
        q2 = {
            "query": {
                "bool": {
                    "must": {"match_all": {}},
                    "filter": [
                        {
                            "geo_distance": {
                                "distance": "30km",
                                "coordinates": geo_point
                            }
                        },
                        {
                            # {"featureCode": ["pcli", "ppl", "ppla2", "adm3"]}
                            "terms": {
                                "featureClass": ["a", "h", "l", "t", "p", "v"]
                            }
                        }
                    ]
                }
            },
            "sort": {"population": "desc"}
        }
        if kwargs:
            for key in kwargs:
                q2['query']['bool']['filter'].append(
                    {"term": {key: kwargs[key]}})

        res = self.eserver.search(
            q2, index=self._index,
            doc_type=self._doctype)['hits']['hits'][0]['_source']
        res['confidence'] = 1.0
        return [GeoPoint(**res)]

    def create(self, datacsv, confDir="../data/"):
        with open(os.path.join(confDir, "es_settings.json")) as jf:
            settings = json.load(jf)
            settings['mappings'][self._doctype] = settings['mappings'].pop('places')

        try:
            self.eserver.create_index(index=self._index, settings=settings)
        except Exception:
            self.eserver.delete_index(self._index)
            self.eserver.create_index(index=self._index, settings=settings)

        for chunk in bulk_chunks(self._opLoader(datacsv, confDir),
                                 docs_per_chunk=1000):
            self.eserver.bulk(chunk, index=self._index, doc_type=self._doctype)
            print "..",

        self.eserver.refresh(self._index)

    def _opLoader(self, datacsv, confDir):
        ere = re.compile(r"[^\sa-zA-Z0-9]")
        with DataReader(datacsv, os.path.join(confDir, 'geonames.conf')) as reader:
            cnt = 0
            for row in reader:
                try:
                    row['coordinates'] = [float(row['longitude']),
                                          float(row['latitude'])]
                    try:
                        row['population'] = int(row["population"])
                    except ValueError:
                        row['population'] = -1
                    try:
                        row['elevation'] = int(row['elevation'])
                    except ValueError:
                        row['elevation'] = -1
                    del row['latitude']
                    del row['longitude']
                    # print row['name']
                    row['alternatenames'] = row['alternatenames'].lower().split(",")
                    row['normalized_asciiname'] = (re.sub(
                        r'\s+', r' ', ere.sub("", row['asciiname']))).strip()
                    cnt += 1
                    yield self.eserver.index_op(row, index=self._index,
                                                doc_type=self._doctype)
                except Exception:
                    print json.dumps(row)
                    continue

    def remove_dynamic_stopwords(self, term):
        # cc = {}
        # ttl = 0
        words = [w for t in term.split("-") for w in t.split() if len(w) > 1]
        if len(words) == 1:
            return term

        stopword_removed = ""
        for word in words:
            try:
                t = self.eserver.count(word)['count']
                if t >= 20000:
                    continue
            except Exception:
                pass
            stopword_removed += (word + " ")
            # else:
            #     print(term, "stopword ", word)
        return stopword_removed.strip()
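# A hedged usage sketch for ESWrapper above. The index and doc-type names
# and the printed GeoPoint attributes are assumptions based on the class
# internals (it reads `.name` and sets `._score`), not part of the original.
if __name__ == '__main__':
    gndb = ESWrapper(index_name='geonames', doc_type='places')
    for match in gndb.query(u'new delhi', min_popln=5000):
        print match.name, match._score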
class ElasticSearch(object):
    conn = None
    url = settings.ELASTICSEARCH_URL
    index_name = settings.ELASTICSEARCH_INDEX_NAME
    stdout = None
    stderr = None

    def __init__(self, index_name=None, stdout=None, stderr=None):
        self.conn = PyElasticSearch()
        if index_name:
            self.index_name = index_name
        if stdout:
            self.stdout = stdout
        if stderr:
            self.stderr = stderr

    def create_index(self, delete=True):
        if delete:
            try:
                self.conn.delete_index(self.index_name)
            except ElasticHttpNotFoundError:
                pass
        mappings = dict(
            (k, v) for k, v in get_elasticsearch_properties().items())
        self.conn.create_index(self.index_name,
                               settings={'mappings': mappings})

    def index_activity_by_id(self, activity_id):
        activity = HistoricalActivity.objects.get(pk=activity_id)
        return self.index_activity(activity)

    def delete_activity_by_id(self, activity_id):
        activity = HistoricalActivity.objects.get(pk=activity_id)
        return self.delete_activity(activity)

    def index_activity(self, activity):
        for doc_type in DOC_TYPES_ACTIVITY:
            docs = self.get_activity_documents(activity, doc_type=doc_type)
            if len(docs) > 0:
                try:
                    self.conn.bulk(
                        (self.conn.index_op(doc, id=doc.pop('id'),
                                            parent=doc.pop('_parent', None))
                         for doc in docs),
                        index=self.index_name, doc_type=doc_type)
                except BulkError as e:
                    for error in e.errors:
                        msg = '%s: %s on ID %s' % (
                            error['index']['error']['type'],
                            error['index']['error']['reason'],
                            error['index']['_id'])
                        if 'caused_by' in error['index']['error']:
                            msg += ' (%s: %s)' % (
                                error['index']['error']['caused_by']['type'],
                                error['index']['error']['caused_by']['reason'])
                        self.stderr and self.stderr.write(msg)

    def index_investor(self, investor):
        for doc_type in DOC_TYPES_INVESTOR:
            docs = self.get_investor_documents(investor, doc_type=doc_type)
            if len(docs) > 0:
                try:
                    self.conn.bulk(
                        (self.conn.index_op(doc, id=doc.pop('id'))
                         for doc in docs),
                        index=self.index_name, doc_type=doc_type)
                except BulkError as e:
                    for error in e.errors:
                        msg = '%s: %s on ID %s' % (
                            error['index']['error']['type'],
                            error['index']['error']['reason'],
                            error['index']['_id'])
                        if 'caused_by' in error['index']['error']:
                            msg += ' (%s: %s)' % (
                                error['index']['error']['caused_by']['type'],
                                error['index']['error']['caused_by']['reason'])
                        self.stderr and self.stderr.write(msg)

    def index_activity_documents(self, activity_identifiers=[]):
        activity_identifiers = activity_identifiers or \
            HistoricalActivity.objects.filter(fk_status__in=(
                HistoricalActivity.STATUS_ACTIVE,
                HistoricalActivity.STATUS_PENDING,
                HistoricalActivity.STATUS_OVERWRITTEN,
                HistoricalActivity.STATUS_DELETED)).distinct() \
            .values_list('activity_identifier', flat=True).distinct()

        for doc_type in DOC_TYPES_ACTIVITY:
            docs = []
            # Collect documents.
            self.stdout and self.stdout.write(
                'Collect %ss for %i deals...'
                % (doc_type, len(activity_identifiers)))
            for activity_identifier in activity_identifiers:
                for activity in self.get_activity_versions(activity_identifier):
                    docs.extend(
                        self.get_activity_documents(activity, doc_type=doc_type))
            # Bulk index documents.
            self.stdout and self.stdout.write(
                'Index %i %ss...' % (len(docs), doc_type))
            if len(docs) > 0:
                paginator = Paginator(docs, 1000)
                for page in paginator.page_range:
                    try:
                        self.conn.bulk(
                            (self.conn.index_op(doc, id=doc.pop('id'),
                                                parent=doc.pop('_parent', None))
                             for doc in paginator.page(page)),
                            index=self.index_name, doc_type=doc_type)
                    except BulkError as e:
                        for error in e.errors:
                            msg = '%s: %s on ID %s' % (
                                error['index']['error']['type'],
                                error['index']['error']['reason'],
                                error['index']['_id'])
                            if 'caused_by' in error['index']['error']:
                                msg += ' (%s: %s)' % (
                                    error['index']['error']['caused_by']['type'],
                                    error['index']['error']['caused_by']['reason'])
                            self.stderr and self.stderr.write(msg)
            self.conn.refresh()

    def index_investor_documents(self):
        investors = Investor.objects.public().order_by(
            'investor_identifier', '-id').distinct('investor_identifier')

        for doc_type in DOC_TYPES_INVESTOR:
            docs = []
            # Collect documents.
            self.stdout and self.stdout.write(
                'Collect %ss for %i investors...'
                % (doc_type, investors.count()))
            for investor in investors:
                docs.extend(
                    self.get_investor_documents(investor, doc_type=doc_type))
            # Bulk index documents.
            self.stdout and self.stdout.write(
                'Index %i %ss...' % (len(docs), doc_type))
            if len(docs) > 0:
                try:
                    self.conn.bulk(
                        (self.conn.index_op(doc, id=doc.pop('id'))
                         for doc in docs),
                        index=self.index_name, doc_type=doc_type)
                except BulkError as e:
                    for error in e.errors:
                        msg = '%s: %s on ID %s' % (
                            error['index']['error']['type'],
                            error['index']['error']['reason'],
                            error['index']['_id'])
                        if 'caused_by' in error['index']['error']:
                            msg += ' (%s: %s)' % (
                                error['index']['error']['caused_by']['type'],
                                error['index']['error']['caused_by']['reason'])
                        self.stderr and self.stderr.write(msg)

    # def index_activity_by_version(self, activity_identifier):
    #     for doc_type in get_elasticsearch_properties().keys():
    #         docs = self.get_documents_for_activity_version(activity_identifier, doc_type=doc_type)
    #         if len(docs) > 0:
    #             try:
    #                 self.conn.bulk((self.conn.index_op(doc, id=doc.pop('id')) for doc in docs),
    #                                index=self.index_name,
    #                                doc_type=doc_type)
    #             except BulkError as e:
    #                 for error in e.errors:
    #                     stderr and stderr.write('%s: %s (caused by %s: %s, ID: %s)' % (
    #                         error['index']['error']['type'],
    #                         error['index']['error']['reason'],
    #                         error['index']['error']['caused_by']['type'],
    #                         error['index']['error']['caused_by']['reason'],
    #                         error['index']['_id']
    #                     ))

    def get_activity_versions(self, activity_identifier):
        versions = []
        # Get the newest non-pending, readable historic version.
        try:
            newest = HistoricalActivity.objects.filter(
                activity_identifier=activity_identifier,
                fk_status__in=(
                    HistoricalActivity.STATUS_ACTIVE,
                    HistoricalActivity.STATUS_OVERWRITTEN,
                    HistoricalActivity.STATUS_DELETED)).distinct().latest()
            if newest and not newest.fk_status_id == HistoricalActivity.STATUS_DELETED:
                versions.append(newest)
        except HistoricalActivity.DoesNotExist:
            newest = None

        # Get newer pendings. Note that QuerySet.filter returns a new
        # queryset, so the result has to be reassigned.
        pendings = HistoricalActivity.objects.filter(
            activity_identifier=activity_identifier,
            fk_status_id=HistoricalActivity.STATUS_PENDING).distinct()
        if newest:
            pendings = pendings.filter(history_date__gt=newest.history_date)
        versions.extend(pendings)

        return versions

    def get_activity_documents(self, activity, doc_type='deal'):
        docs = []
        deal_attrs = {
            'id': activity.id,
            'activity_identifier': activity.activity_identifier,
            'historical_activity_id': activity.id,
            'status': activity.fk_status_id,
        }

        # Todo: Is there a nice way to prevent this extra Activity query?
        # e.g. if we save is_public/deal_scope as ActivityAttributes
        public_activity = Activity.objects.filter(
            activity_identifier=activity.activity_identifier).order_by(
            '-id').first()
        if public_activity:
            deal_attrs.update({
                'is_public': public_activity.is_public,
                'deal_scope': public_activity.deal_scope,
                'deal_size': public_activity.deal_size,
                'current_negotiation_status': public_activity.negotiation_status,
                'top_investors': public_activity.top_investors,
                'fully_updated_date': public_activity.fully_updated_date,
            })
        else:
            # Fixme: This should not happen
            self.stderr and self.stderr.write(
                _('Missing activity for historical activity %i (Activity identifier: #%i)'
                  % (activity.id, activity.activity_identifier)))
        # except Activity.MultipleObjectsReturned:
        #     # Fixme: This should not happen
        #     self.stderr and self.stderr.write(_('Too much activities for historical activity %i (Activity identifier: #%i)' % (
        #         activity.id,
        #         activity.activity_identifier
        #     )))

        for a in activity.attributes.select_related('fk_group__name').order_by('fk_group__name'):
            # Do not include the django object id.
            if a.name == 'id':
                continue
            attribute = None
            attribute_key = '%s_attr' % a.name
            if attribute_key in get_elasticsearch_properties()['deal']['properties'].keys():
                attribute = {
                    'value': a.value,
                    'value2': a.value2,
                    'date': a.date,
                    'is_current': a.is_current,
                }
            value = a.value

            # Area field?
            if a.name and 'area' in a.name and a.polygon is not None:
                # Get polygon
                # value = json.loads(a.polygon.json)
                # Apparently this is case sensitive: MultiPolygon as provided
                # by the GeoJSON does not work
                # value['type'] = 'multipolygon'
                value = a.polygon.json or ''
            # Do not include empty values.
            if value is None or value == '':
                continue

            # Doc types: location, data_source or contract
            group_match = a.fk_group and a.fk_group.name or ''
            group_match = re.match(
                r'(?P<doc_type>location|data_source|contract)_(?P<count>\d+)',
                group_match)
            if group_match:
                dt, count = (group_match.groupdict()['doc_type'],
                             int(group_match.groupdict()['count']))
                if doc_type == dt:
                    while len(docs) < count:
                        docs.append({
                            '_parent': activity.activity_identifier,
                            'id': a.id,  # '%i_%i' % (a.id, count),
                        })
                    docs[count - 1][a.name] = [value, ]
                # Set doc type counter within deal doc type
                # (for location/data_source/contract)
                elif doc_type == 'deal':
                    # Set counter
                    key = '%s_count' % dt
                    if key not in deal_attrs.keys():
                        deal_attrs[key] = count
                    elif deal_attrs[key] < count:
                        deal_attrs[key] = count

                    # Create list with correct length to ensure formset
                    # values have the same index
                    if not a.name in deal_attrs:
                        deal_attrs[a.name] = [''] * count
                        if attribute:
                            deal_attrs[attribute_key] = [''] * count
                    else:
                        while len(deal_attrs[a.name]) < count:
                            deal_attrs[a.name].append('')
                            if attribute:
                                deal_attrs[attribute_key].append('')
                    deal_attrs[a.name][count - 1] = value
                    if attribute:
                        deal_attrs['%s_attr' % a.name][count - 1] = attribute

            # Doc type: deal and not formset
            elif doc_type == 'deal':
                if a.name in deal_attrs:
                    deal_attrs[a.name].append(value)
                    if '%s_attr' % a.name in get_elasticsearch_properties()['deal']['properties'].keys():
                        deal_attrs['%s_attr' % a.name].append(attribute)
                else:
                    deal_attrs[a.name] = [value, ]
                    if '%s_attr' % a.name in get_elasticsearch_properties()['deal']['properties'].keys():
                        deal_attrs['%s_attr' % a.name] = [attribute, ]

        if doc_type == 'deal':
            # Additionally save operational company attributes.
            oc = Investor.objects.filter(
                investoractivityinvolvement__fk_activity__activity_identifier=activity.activity_identifier)
            if oc.count() > 0:
                oc = oc.first()
                for field in Investor._meta.fields:
if isinstance(field, ForeignKey): deal_attrs['operational_company_%s' % field.name] = getattr( oc, '%s_id' % field.name) else: deal_attrs['operational_company_%s' % field.name] = getattr(oc, field.name) else: pass #self.stderr and self.stderr.write("Missing operational company for deal #%i" % activity.activity_identifier) # Create single document for each location # FIXME: Saving single deals for each location might be deprecated since we have doc_type location now? spatial_names = list(get_spatial_properties()) for i in range(deal_attrs.get('location_count', 0)): doc = deal_attrs.copy() for name in spatial_names: if not name in doc: continue if len(deal_attrs[name]) > i: doc[name] = deal_attrs[name][i] else: doc[name] = '' # Set unique ID for location (deals can have multiple locations) doc['id'] = '%s_%i' % (doc['id'], i) point_lat = doc.get('point_lat', None) point_lon = doc.get('point_lon', None) if point_lat and point_lon: # Parse values try: parsed_lat, parsed_lon = float(point_lat), float(point_lon) doc['geo_point'] = '%s,%s' % (point_lat, point_lon) except ValueError: doc['geo_point'] = '0,0' else: doc['point_lat'] = '0' doc['point_lon'] = '0' doc['geo_point'] = '0,0' # FIXME: we dont really need 'point_lat' and 'point_lon' here, # so we should pop them from doc when adding 'geo_point' docs.append(doc) # Update docs with export values for doc in docs: doc.update(self.get_export_properties(doc, doc_type=doc_type)) return docs def get_export_properties(self, doc, doc_type='deal'): if doc_type == 'investor': return ExportInvestorForm.export(doc) elif doc_type == 'involvement': return InvestorVentureInvolvementForm.export(doc) else: properties = { 'deal_scope_export': doc.get('deal_scope', ''), 'is_public_export': doc.get('is_public', False) and str(_('Yes')) or str(_('No')), 'deal_size_export': doc.get('deal_size', ''), 'current_negotiation_status_export': doc.get('current_negotiation_status', ''), 'top_investors_export': doc.get('top_investors', ''), 'fully_updated_date_export': doc.get('fully_updated_date', ''), } # Doc types: deal, location, contract and data_source for form in ChangeDealView.FORMS: formset_name = hasattr(form, "form") and form.Meta.name or None form = formset_name and form.form or form properties.update(form.export(doc, formset=formset_name)) properties.update( ExportInvestorForm.export(doc, prefix='operational_company_')) return properties def get_investor_documents(self, investor, doc_type='investor'): docs = [] # Doc types: involvement and investor if doc_type == 'involvement': ivis = InvestorVentureInvolvement.objects.filter( Q(fk_venture=investor) | Q(fk_investor=investor)) for ivi in ivis: doc = {} for field in ivi._meta.local_fields: if isinstance(field, ForeignKey): doc[field.name] = getattr(ivi, '%s_id' % field.name) else: doc[field.name] = getattr(ivi, field.name) docs.append(doc) elif doc_type == 'investor': doc = {} for field in investor._meta.local_fields: if isinstance(field, ForeignKey): doc[field.name] = getattr(investor, '%s_id' % field.name) else: doc[field.name] = getattr(investor, field.name) docs.append(doc) # Update docs with export values for doc in docs: doc.update(self.get_export_properties(doc, doc_type=doc_type)) return docs def refresh_index(self): self.conn.refresh(self.index_name) def search(self, elasticsearch_query, doc_type='deal', sort=[]): """ Executes paginated queries until all results have been retrieved. @return: The full list of hits. 
""" start = 0 size = 10000 # 10000 is the default elasticsearch max_window_size (pagination is cheap, so more is not necessarily better) raw_result_list = [] done = False while not done: query = { 'query': elasticsearch_query, 'from': start, 'size': size, } if sort: query['sort'] = sort query_result = self.conn.search(query, index=self.index_name, doc_type=doc_type) raw_result_list.extend(query_result['hits']['hits']) results_total = query_result['hits']['total'] if len(raw_result_list) >= results_total: done = True else: start = len(raw_result_list) print('\nElasticsearch returned %i documents from a total of %i \n\n' % (len(raw_result_list), query_result['hits']['total'])) return raw_result_list def delete_activity(self, activity): for doc_type in DOC_TYPES_ACTIVITY: try: if doc_type == 'deal': self.conn.delete(id=activity.activity_identifier, index=self.index_name, doc_type=doc_type) else: self.conn.delete_by_query(query={ "parent_id": { "type": "deal", "id": str(activity.activity_identifier), } }, index=self.index_name, doc_type=doc_type) except ElasticHttpNotFoundError as e: pass def get_deals_by_activity_identifier(self, activity_identifier, doc_type='deal'): return self.search({ "constant_score": { "filter": { "term": { "activity_identifier": activity_identifier } } } })
def save(self, force_insert=False, force_update=False, **kwargs):
    es = ElasticSearch(ELASTIC_SEARCH_URL)
    if self.id:
        # Existing listing: update the search document in place.
        location = self.get_location()
        location_es = "{0},{1}".format(location.y, location.x)
        es.update(
            'glamazer', 'modelresult', 'listings.listing.{0}'.format(self.id),
            script=(
                "ctx._source.listing_id = listing;"
                "ctx._source.artist_id = artist;"
                "ctx._source.artist_avatar = artist_avatar;"
                "ctx._source.artist_name = artist_name;"
                "ctx._source.salon_id = salon;"
                "ctx._source.salon_avatar = salon_avatar;"
                "ctx._source.salon_name = salon_name;"
                "ctx._source.title = title;"
                "ctx._source.location = location;"
                "ctx._source.description = description;"
                "ctx._source.get_picture = get_picture;"
                "ctx._source.metadata = metadata;"
                "ctx._source.gender = gender;"
                "ctx._source.price = price;"
                "ctx._source.currency = currency;"
                "ctx._source.likes = likes;"
                "ctx._source.comments = comments;"
                "ctx._source.tags = tags;"
                "ctx._source.status = status;"
                "ctx._source.style = style;"
                "ctx._source.rating = rating"
            ),
            params={
                'listing': self.id,
                'artist': self.get_artist_id(),
                'artist_avatar': self.get_artist_avatar(),
                'artist_name': self.get_artist_name(),
                'salon': self.get_salon_id(),
                'salon_avatar': self.get_salon_avatar(),
                'salon_name': self.get_salon_name(),
                'title': self.title,
                'location': location_es,
                'description': self.description,
                'get_picture': self.get_picture(),
                'metadata': self.metadata,
                'gender': self.gender,
                'price': self.price,
                'currency': self.currency,
                'likes': self.likes,
                'comments': self.comments,
                'tags': self.get_tags(),
                'status': self.status,
                'style': self.get_style(),
                'rating': self.get_rating(),
            })
        super(Listing, self).save(force_insert, force_update, **kwargs)
    else:
        # New listing: save first so the database assigns an id,
        # then index the fresh document.
        super(Listing, self).save(force_insert, force_update, **kwargs)
        location = self.get_location()
        location_es = "{0},{1}".format(location.y, location.x)
        es.index(
            'glamazer', 'modelresult', {
                'listing_id': self.id,
                'artist_id': self.artist_id,
                'artist_avatar': self.get_artist_avatar(),
                'artist_name': self.get_artist_name(),
                'salon_id': self.get_salon_id(),
                'salon_avatar': self.get_salon_avatar(),
                'salon_name': self.get_salon_name(),
                'title': self.title,
                'location': location_es,
                'description': self.description,
                'get_picture': self.get_picture(),
                'metadata': self.metadata,
                'gender': self.gender,
                'price': self.price,
                'currency': self.currency,
                'likes': self.likes,
                'comments': self.comments,
                'tags': self.get_tags(),
                'status': self.status,
                'style': self.get_style(),
                'rating': self.get_rating(),
            },
            id='listings.listing.{0}'.format(self.id))
    es.refresh('glamazer')
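
# The update branch above relies on pyelasticsearch's scripted partial
# updates. A stripped-down sketch of the same pattern (hypothetical id and
# values; assumes the cluster allows dynamic scripting, as the code above does):
from pyelasticsearch import ElasticSearch

es = ElasticSearch('http://localhost:9200/')  # assumed local node
es.update(
    'glamazer', 'modelresult', 'listings.listing.42',  # hypothetical doc id
    # params keeps the changing values out of the script source
    script="ctx._source.title = title; ctx._source.price = price",
    params={'title': 'Balayage', 'price': 120},
)
es.refresh('glamazer')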
class ESWrapper(BaseDB):

    def __init__(self, index_name, doc_type, host='http://localhost', port=9200):
        self.eserver = ElasticSearch(urls=host, port=port,
                                     timeout=60, max_retries=3)
        self._geo_filter = {"distance": "20km", "coordinates": {}}
        self._population_filter = {'population': {'gte': 5000}}
        self._index = index_name
        self._doctype = doc_type

    def getByid(self, geonameId):
        maincondition = {"match": {"id": geonameId}}
        q = {"query": {"bool": {"must": maincondition}}}
        return self.eserver.search(
            q, index=self._index,
            doc_type=self._doctype)['hits']['hits'][0]['_source']

    def _query(self, qkey, qtype="exact", analyzer=None, min_popln=None,
               size=10, **kwargs):
        """qtype values are exact, relaxed or geo_distance.

        Always limit results to 10.
        """
        q = {"query": {"bool": {}}}
        # The original popped 'query_name' and then overwrote it with
        # "should" unconditionally; defaulting to "should" keeps that
        # behaviour while letting callers actually override it.
        query_name = kwargs.pop('query_name', 'should')
        if query_name == "should":
            q["query"]["bool"]["minimum_number_should_match"] = 1
        maincondition = {}
        if qtype == "exact":
            maincondition = [
                {"term": {"name.raw": {"value": qkey}}},
                {"term": {"asciiname.raw": {"value": qkey}}},
                {"term": {"alternatenames": {"value": qkey}}},
            ]
            # NOTE: term queries are not analyzed, so the analyzer argument
            # is ignored here (the original tried to set it on a "match"
            # clause that had been commented out, which would have crashed).
        elif qtype == "relaxed":
            maincondition["match"] = {"alternatenames": {"query": qkey}}
            if analyzer:
                maincondition["match"]["alternatenames"]["analyzer"] = analyzer
        elif qtype == "combined":
            maincondition = [
                {
                    "bool": {
                        "must": {
                            "multi_match": {
                                "query": qkey,
                                "fields": ["name.raw", "asciiname", "alternatenames"]
                            }
                        },
                        "filter": {
                            "bool": {
                                "should": [
                                    {"range": {"population": {"gte": 5000}}},
                                    {"terms": {
                                        "featureCode": ["pcla", "pcli", "cont",
                                                        "rgn", "admd", "adm1", "adm2"]
                                    }},
                                ]
                            }
                        }
                    }
                },
                {"term": {"name.raw": {"value": qkey}}},
                {"term": {"asciiname.raw": {"value": qkey}}},
                {"term": {"alternatenames": {"value": qkey[1:]}}},
                {
                    "match": {
                        "alternatenames": {
                            "query": qkey,
                            "fuzziness": kwargs.pop("fuzzy", 0),
                            "max_expansions": kwargs.pop("max_expansion", 5),
                            "prefix_length": kwargs.pop("prefix_length", 1),
                        }
                    }
                },
            ]

        if maincondition:
            q["query"]["bool"][query_name] = maincondition

        if min_popln:
            filter_cond = [{"range": {"population": {"gte": min_popln}}}]
        else:
            filter_cond = []
        if kwargs:
            # remaining kwargs become exact term filters
            filter_cond += [{"term": {key: val}}
                            for key, val in kwargs.items()]
            q["query"]["bool"]["filter"] = {"bool": {"must": filter_cond}}
        elif min_popln:
            filter_cond = [
                {"range": {"population": {"gte": min_popln}}},
                {"terms": {"featureCode": ["ppla", "pplx"]}},
            ]
            q["query"]["bool"]["filter"] = {"bool": {"should": filter_cond}}
        return self.eserver.search(q, index=self._index, doc_type=self._doctype)

    def query(self, qkey, min_popln=None, **kwargs):
        res = self._query(qkey, min_popln=min_popln, **kwargs)['hits']
        max_score = res['max_score']
        gps = []
        if max_score == 0.0:
            # no results were obtained by elasticsearch; instead it returned
            # a random/very low scoring one
            res['hits'] = []
        for t in res['hits']:
            t['_source']['geonameid'] = t["_source"]["id"]
            t['_source']['_score'] = t['_score'] / max_score
            pt = GeoPoint(**t["_source"])
            if t['_source']['featureCode'].lower() == "cont":
                # a continent match supersedes everything else
                gps = [pt]
                break
            gps.append(pt)
        if len(gps) == 1:
            # single hit: score by name-length similarity to the query
            gps[0]._score = (min(float(len(gps[0].name)), float(len(qkey))) /
                             max(float(len(gps[0].name)), float(len(qkey))))
        return gps

    def near_geo(self, geo_point, min_popln=5000, **kwargs):
        q2 = {
            "query": {
                "bool": {
                    "must": {"match_all": {}},
                    "filter": [
                        {"geo_distance": {"distance": "30km",
                                          "coordinates": geo_point}},
                        {"terms": {"featureCode": ["pcli", "ppl", "ppla2", "adm3"]}},
                    ]
                }
            },
            "sort": {"population": "desc"}
        }
        res = self.eserver.search(q2, index=self._index,
                                  doc_type=self._doctype,
                                  **kwargs)['hits']['hits'][0]['_source']
        res['confidence'] = 1.0
        return [GeoPoint(**res)]

    def create(self, datacsv, confDir="../data/"):
        with open(os.path.join(confDir, "es_settings.json")) as jf:
            settings = json.load(jf)
            settings['mappings'][self._doctype] = settings['mappings'].pop('places')
        try:
            self.eserver.create_index(index=self._index, settings=settings)
        except Exception:
            # index already exists: drop and recreate it
            self.eserver.delete_index(self._index)
            self.eserver.create_index(index=self._index, settings=settings)
        for chunk in bulk_chunks(self._opLoader(datacsv, confDir),
                                 docs_per_chunk=1000):
            self.eserver.bulk(chunk, index=self._index, doc_type=self._doctype)
            print("..", end="")
        self.eserver.refresh(self._index)

    def _opLoader(self, datacsv, confDir):
        with DataReader(datacsv, os.path.join(confDir, 'geonames.conf')) as reader:
            cnt = 0
            for row in reader:
                try:
                    row['coordinates'] = [float(row['longitude']),
                                          float(row['latitude'])]
                    try:
                        row['population'] = int(row["population"])
                    except ValueError:
                        row['population'] = -1
                    try:
                        row['elevation'] = int(row['elevation'])
                    except ValueError:
                        row['elevation'] = -1
                    del row['latitude']
                    del row['longitude']
                    row['alternatenames'] = row['alternatenames'].split(",")
                    cnt += 1
                    yield self.eserver.index_op(row, index=self._index,
                                                doc_type=self._doctype)
                except Exception:
                    # skip rows that cannot be parsed, but log them
                    print(json.dumps(row))
                    continue
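
# Possible usage of the wrapper above -- illustrative only; it assumes an
# index built by create() from the GeoNames CSV, and the GeoPoint attribute
# names (name, _score) are assumptions read off the code above:
db = ESWrapper(index_name='geonames', doc_type='places')
points = db.query('Paris', qtype='combined', min_popln=5000)
for pt in points:
    print(pt.name, pt._score)           # assumed GeoPoint attributes
nearby = db.near_geo([2.35, 48.85])     # [lon, lat], matching the indexed order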
class ESWrapper(BaseDB):

    def __init__(self, index_name, host='http://localhost', port=9200):
        self.eserver = ElasticSearch(urls=host, port=port,
                                     timeout=60, max_retries=3)
        self._geo_filter = {"geo_distance": {"distance": "20km",
                                             "coordinates": {}}}
        self._index = index_name
        self._doctype = "places"

    def query(self, qkey, qtype="exact"):
        """qtype values are exact, relaxed or geo_distance"""
        # Build a fresh query on every call. The original copied a shared
        # _base_query with dict.copy(); that copy is shallow, so the nested
        # "match" dict was shared and mutations leaked between calls.
        if qtype == "exact":
            q = {"query": {"bool": {"must": {"match": {"name.raw": qkey}}}}}
        elif qtype == "relaxed":
            q = {"query": {"bool": {"must": {"match": {"name": qkey}}}}}
        elif qtype == "geo_distance":
            q = {
                "query": {"bool": {"must": {"match_all": {}}}},
                "filter": {"geo_distance": {"distance": "20km",
                                            "coordinates": qkey}},
            }
        return self.eserver.search(q, index=self._index, doc_type=self._doctype)

    def near_geo(self, geo_point):
        # The original wrote the coordinates to q["query"]["bool"]["geo_distance"],
        # a path that does not exist; they belong inside the geo_distance filter.
        geo_filter = {"geo_distance": {"distance": "20km",
                                       "coordinates": geo_point}}
        q = {"query": {"bool": {"must": {"match_all": {}}}},
             "filter": geo_filter}
        return self.eserver.search(q, index=self._index, doc_type=self._doctype)

    def create(self, datacsv, confDir="../data/"):
        with open(os.path.join(confDir, "es_settings.json")) as jf:
            settings = json.load(jf)
        self.eserver.create_index(index='geonames', settings=settings)
        for chunk in bulk_chunks(self._opLoader(datacsv, confDir),
                                 docs_per_chunk=1000):
            self.eserver.bulk(chunk, index='geonames', doc_type='places')
            print("..", end="")
        self.eserver.refresh('geonames')

    def _opLoader(self, datacsv, confDir):
        with DataReader(datacsv, os.path.join(confDir, 'geonames.conf')) as reader:
            cnt = 0
            for row in reader:
                row['coordinates'] = [float(row['longitude']),
                                      float(row['latitude'])]
                del row['latitude']
                del row['longitude']
                row['alternatenames'] = row['alternatenames'].split(",")
                cnt += 1
                # if cnt > 100: break  (debugging limit, left disabled)
                yield self.eserver.index_op(row, index="geonames",
                                            doc_type="places")
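
# Usage sketch for the simpler wrapper above (illustrative values; note that
# for qtype="geo_distance" the key is a coordinate pair, not a place name):
db = ESWrapper(index_name='geonames')
exact = db.query('Berlin')                               # match on name.raw
relaxed = db.query('Berlin', qtype='relaxed')            # analyzed match on name
nearby = db.query([13.40, 52.52], qtype='geo_distance')  # 20 km radius
hits = exact['hits']['hits']    # raw pyelasticsearch search response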