def save_chambana():
    """Export Urbana and Champaign businesses as bulk-format JSON files.

    Searches the ``business`` index for documents whose ``city`` matches
    "Urbana" (at most 300 hits) and "Champaign" (at most 400 hits), prints
    the total hit count reported for each query, then writes ``urban.json``
    and ``champ.json``. Each hit becomes two lines: a ``create`` action
    targeting the ``urban`` / ``champ`` index, followed by the hit's
    ``_source``.
    """
    es = Elasticsearch()

    # (city to match, max hits to fetch, output file, index named in the action)
    exports = (
        ("Urbana", 300, 'urban.json', "urban"),
        ("Champaign", 400, 'champ.json', "champ"),
    )

    # Run every search before touching the filesystem, so a failed query
    # does not leave truncated output files behind.
    fetched = []
    for city, size, filename, target_index in exports:
        query = {"query": {"match": {"city": city}}}
        res = es.search(index="business", body=query, size=size)
        fetched.append((res, filename, target_index))

    for res, _, _ in fetched:
        print("Got %d Hits:" % res['hits']['total'])

    for res, filename, target_index in fetched:
        action = {"create": {"_index": target_index, "_type": "doc"}}
        # The context manager guarantees the file is flushed and closed.
        with open(filename, 'w') as out:
            for hit in res['hits']['hits']:
                json.dump(action, out)
                out.write("\n")
                json.dump(hit['_source'], out)
                out.write('\n')
def get_judge_res(judge_image_dir):
    """Return one page of file names from a ``judgeresult:<dir>`` index as JSON.

    Paging is controlled by the optional ``offset`` and ``size`` request
    arguments (defaults 0 and 20).

    Returns:
        A JSON string with ``total``, ``file_list``, ``from_`` and ``size``
        keys, or the plain text ``'Error: index do not exist\\n'`` when
        parsing the arguments or running the search fails.
    """
    es = Elasticsearch(esport)
    index_name = 'judgeresult:' + judge_image_dir
    search_size = 20
    search_offset = 0
    print(request.args)
    try:
        if 'offset' in request.args:
            search_offset = int(request.args.get('offset'))
        if 'size' in request.args:
            search_size = int(request.args.get('size'))
        res_index = es.search(
            index=index_name,
            size=search_size,
            from_=search_offset,
        )
    except Exception:
        # Narrowed from a bare ``except`` so KeyboardInterrupt/SystemExit
        # still propagate. Non-numeric paging arguments also land here and
        # get the same message, as before.
        return 'Error: index do not exist\n'

    hits = res_index['hits']['hits']
    res_dict = {
        'total': res_index['hits']['total'],
        'file_list': [item['_source']['file'] for item in hits],
        'from_': search_offset,
        'size': len(hits),
    }
    return json.dumps(res_dict)
def reindex(old_index, new_index, s):
    '''Copy every document from one index to another with scan/scroll + bulk.

    old_index is the index to take docs from, new_index is the one the docs
    go to. s is the size of each scroll page (and therefore of each bulk
    insert) - set it as high as the RAM of the machine running this allows;
    500-1000 seems reasonable for a t2.medium.

    Prints the number of documents inserted when finished. Errors are
    reported on stdout rather than raised.
    '''
    def create_bulk_insert_string(hits, index):
        # One "create" action line plus one source line per hit. The id
        # lives on the hit itself, not inside its ``_source``.
        lines = []
        for hit in hits:
            action = {"create": {"_index": index,
                                 "_type": "variant",
                                 "_id": hit['_id']}}
            lines.append(json.dumps(action))
            lines.append(json.dumps(hit['_source']))
        return ''.join(line + '\n' for line in lines)

    es = Elasticsearch('localhost:9200')
    scan = es.search(index=old_index,
                     body='{"query": {"match_all": {}}}',
                     search_type='scan',
                     scroll='5m',
                     size=s)
    scroll_id = scan['_scroll_id']
    curr_done = 0
    try:
        while True:
            page = es.scroll(scroll_id=scroll_id, scroll='5m')
            hits = page['hits']['hits']
            if not hits:
                # An empty page means the scroll is exhausted.
                break
            es.bulk(body=create_bulk_insert_string(hits, new_index),
                    index=new_index,
                    doc_type='variant')
            curr_done += len(hits)
            # Always continue from the most recent scroll id.
            scroll_id = page.get('_scroll_id', scroll_id)
    except Exception as err:
        # Best effort: report the failure and fall through to the summary.
        print('reindex stopped early: {}'.format(err))
    print('{} documents inserted'.format(curr_done))
def es_search(self, p_host, p_port, p_index, p_query):
    """
    Returns a query result from elastic search

    The result is the response from elasticsearch as a dictionary.

    {p_host} Elasticsearch server\n
    {p_port} Port of the es server\n
    {p_index} Name of the index to query\n
    {p_query} Query to run\n

    | ${res} = | es search | localhost | 9200 | myIndex | {"query":{"query_string":{"query": "searched value"}}} |

    Raises AssertionError when the client cannot be built (including a
    non-numeric port) or when the search call fails.
    """
    # Es client
    try:
        param = [{'host': p_host, 'port': int(p_port)}]
        es = Elasticsearch(param)
    except Exception:
        # Format with %s and the raw port: converting the port again here
        # would raise a second error when the port itself is the problem.
        raise AssertionError("Connexion error on %s:%s" % (p_host, p_port))

    try:
        documents = es.search(body=p_query, index=p_index)
    except Exception:
        raise AssertionError(
            "Search error on %s:%s/%s for query : %s"
            % (p_host, p_port, p_index, p_query))

    return documents
def count(self, p_index, p_query=None):
    """Gets the number of docs for a query

    p_index:    elasticsearch index where to query
    p_query:    the query to process; ``None`` (the default) is sent as an
                empty body ``{}``

    return the number of docs from the index p_index and the query p_query

    Exits the process with EXIT_IO_ERROR when the client cannot be created.
    A failing count request is logged and the original exception re-raised.
    """
    if p_query is None:
        # Fresh dict per call instead of a shared mutable default.
        p_query = {}

    param = [{'host': self.host, 'port': self.port}]
    try:
        es = Elasticsearch(param)
        logger.info('Connected to ES Server: %s', json.dumps(param))
    except Exception as e:
        logger.error('Connection failed to ES Server : %s', json.dumps(param))
        logger.error(e)
        sys.exit(EXIT_IO_ERROR)

    try:
        result = es.count(index=p_index, body=p_query)
        logger.info('Count the number of items from %s for the query %s', p_index, p_query)
    except Exception as e:
        logger.error('Error querying the index %s with query %s', p_index, p_query)
        logger.error(e)
        # Without this, ``result`` would be unbound below and the real
        # failure would be hidden behind an UnboundLocalError.
        raise

    return result['count']
class ElasticStorage(BaseStorage):
    """Storage backend that keeps key/value records in the ``sift`` index."""

    def __init__(self, config):
        """Create the client from ``config`` (passed as keyword arguments).

        Raises ImportError when either client library name is falsy.
        """
        # presumably these names are set to None by a guarded import
        # elsewhere in the module -- TODO confirm
        if not Elasticsearch:
            raise ImportError("elasticsearch-py is required to use Elasticsearch as storage.")
        if not Search:
            raise ImportError("elasticsearch_dsl is required to use Elasticsearch as storage.")

        self.name = 'elasticsearch'
        self.storage = Elasticsearch(**config)

    def keys(self, pattern="*"):
        """Delegate to ``self.storage.keys(pattern)``."""
        # NOTE(review): verify that the client object really exposes
        # ``keys``; nothing visible here defines it.
        return self.storage.keys(pattern)

    def set_val(self, key, val):
        """Index one record: ``val[0]`` comma-joined, ``val[1]`` as a string."""
        joined_values = ','.join(str(item) for item in val[0])
        record = {
            'key': key,
            'val': joined_values,
            'extra': str(val[1]),
        }
        self.storage.index(index='sift', doc_type='sift', body=record)

    def get_val(self, key):
        """Return the raw hits whose ``key`` term equals ``key``."""
        search = Search(using=self.storage, index='sift')
        response = search.filter('term', key=key).execute()
        return response.hits.hits

    def append_val(self, key, val):
        """Appending is the same as setting: a new record is indexed."""
        self.set_val(key, val)

    def get_list(self, key):
        """Alias of :meth:`get_val`."""
        return self.get_val(key)
def show(ctx, path, order):
    """Print the documents of the route matching ``path``, one per line.

    The route supplies the Elasticsearch URL, the index, optional ``sort``
    and ``query`` sections and an optional ``format`` template (default
    ``"{{ __at_timestamp }} {{ message }}"``).

    Args:
        ctx: context object whose ``obj['CONFIG']`` is the config file path.
        path: path handed to the router.
        order: ``'asc'`` or ``'desc'`` to sort on ``@timestamp``; a falsy
            value leaves the route's own sort untouched.

    Returns:
        ``1`` for an unknown ``order`` value, otherwise ``None``.
    """
    # Close the config file as soon as the route has been resolved instead
    # of leaking the handle.
    with open(ctx.obj['CONFIG']) as config_file:
        router = Router(config_file)
        route = router.match(path)
    logging.debug("Matched route: %s" % route)
    if not route:
        print('No queries matched')
        return

    es = Elasticsearch(hosts=route.get('elasticsearch_url'))

    request_body = {}
    for non_mandatory_key in ('sort', 'query'):
        value = route.get(non_mandatory_key)
        if value:
            request_body[non_mandatory_key] = value

    # An explicit order overrides whatever sort the route defined.
    if order in ('asc', 'desc'):
        request_body['sort'] = {'@timestamp': order}
    elif order:
        click.echo("Unknown order format: %s" % order, err=True)
        return 1

    logging.debug("Query: %s" % (request_body,))
    result = es.search(index=route.get('index'), doc_type=None, body=request_body)

    template = Template(route.get("format", "{{ __at_timestamp }} {{ message }}"))
    for hit in result['hits']['hits']:
        doc = hit['_source']
        # Expose '@timestamp' under a second key for use in templates.
        doc['__at_timestamp'] = doc.get('@timestamp')
        print(template.render(doc))
def iter_elastic_query(instance, index, field, subfield=None):
    """Yield one field value from every document of an index via scrolling.

    Args:
        instance: connection argument passed to ``Elasticsearch``.
        index: index to read.
        field: key looked up in each hit's ``_source``.
        subfield: optional key looked up inside ``_source[field]``.

    Yields:
        ``_source[field]`` or ``_source[field][subfield]`` for each hit.
        Hits where the lookup fails are logged and skipped.
    """
    es = Elasticsearch(instance)

    # initial search
    resp = es.search(index=index, body={"query": {"match_all": {}}}, scroll='5m')
    scroll_id = resp.get('_scroll_id')
    if scroll_id is None:
        return

    while True:
        hits = resp['hits']['hits']
        if not hits:
            # end of scroll
            break

        for hit in hits:
            source = hit['_source']
            try:
                value = source[field] if subfield is None else source[field][subfield]
            except (KeyError, TypeError, ValueError):
                logging.warning("Unable to process row: %s" % str(hit))
                continue
            if subfield is not None:
                # Was a bare print(); kept only as debug-level logging.
                logging.debug("%s", value)
            yield value

        scroll_id = resp.get('_scroll_id')
        if scroll_id is None:
            break
        # Fetch the next page. Without this call the first page would be
        # yielded over and over.
        resp = es.scroll(scroll_id=scroll_id, scroll='5m')
def main():
    """Consume beanstalk jobs forever and index each body into ``grmoto``.

    Every reserved job body is parsed as JSON, indexed with doc type
    ``native_objects``, and the job is deleted afterwards. Any ordinary
    error hands control to ``again()``.
    """
    queue = beanstalkc.Connection(host=MYHOST, port=11301)
    es = Elasticsearch()

    # ignore 400 caused by IndexAlreadyExistsException when creating an index
    es.indices.create(index='grmoto', ignore=400)
    # ignore 404 and 400
    # NOTE(review): the index is deleted right after being created --
    # confirm this order is intended.
    es.indices.delete(index='grmoto', ignore=[400, 404])

    try:
        while True:
            # To receive a job:
            job = queue.reserve()

            # Work with the job:
            payload = json.loads(job.body)
            outcome = es.index(index='grmoto', doc_type='native_objects', body=payload)
            print(outcome['created'])

            # Release the job
            job.delete()
    except Exception:
        # Narrowed from a bare ``except`` so Ctrl-C and SystemExit stop the
        # worker instead of triggering again().
        again()
def GET(self):
    """Run a full-text search and render ``search.html`` with the results.

    A truthy ``wildcard_query`` input selects a wildcard query on ``_all``;
    otherwise a ``multi_match`` (operator ``and``) over ``text``,
    ``pageName`` and ``tags`` is used. Each hit is turned into a row
    holding its heading, path and chapters.
    """
    es = Elasticsearch(conf['fulltext']['serviceUrl'])
    params = web.input(wildcard_query=None)
    use_wildcard = bool(params.wildcard_query)
    user_query = params.query

    if use_wildcard:
        query = {"wildcard": {"_all": user_query}}
    else:
        query = {
            "multi_match": {
                "query": user_query,
                "operator": "and",
                "fields": ["text", "pageName", "tags"]
            }
        }
    self.set_wildcard_query(use_wildcard)

    search_body = {"query": query,
                   "fields": ["pageName", "path", "fsPath", "text"]}
    res = es.search(index=conf['fulltext']['indexName'], body=search_body)

    rows = []
    for hit in res['hits']['hits']:
        page_path = hit['fields']['path'][0]
        fs_path = os.path.normpath('%s/%s.md' % (self.data_dir, page_path))
        page_chapters, h1 = extract_description(fs_path)
        rows.append({
            # Fall back to the path when no heading was extracted.
            'h1': h1 or page_path,
            'file': page_path,
            'chapters': page_chapters
        })

    return self._render('search.html', dict(query=user_query, ans=rows))
def search(query='', field='q1', _operator='and',
           sort=[('_score', 'desc'), ('quoted_by', 'desc')],
           _filter={}, size=1000, _id=False):
    """Search the ``qwerty`` index and return a generator over the hits.

    A non-empty ``query`` becomes a ``match`` on ``field`` with the given
    operator and ``minimum_should_match`` of 85%; an empty one matches
    everything. The query is wrapped in a ``filtered`` query together with
    ``_filter``.

    Returns:
        A generator of whole hits when ``_id`` is truthy, otherwise a
        generator of each hit's ``_source``.
    """
    # NOTE(review): ``sort`` and ``_filter`` are shared mutable defaults;
    # this function does not mutate them itself.
    es = Elasticsearch([elasticsearch_setting])

    if not query:
        es_query = {"match_all": {}}
    else:
        es_query = {
            'match': {
                field: {
                    'query': query,
                    'operator': _operator,
                    'minimum_should_match': '85%'
                }
            }
        }

    body = {
        "query": {"filtered": {"query": es_query, "filter": _filter}},
        'size': size
    }

    sort_item = _build_sort(sort)
    if sort_item:
        body['sort'] = sort_item

    logger.debug(body)
    result = es.search(index='qwerty', body=body, _source=True, timeout=55)
    hits = result['hits']['hits']

    if _id:
        return (hit for hit in hits)
    return (hit['_source'] for hit in hits)
def search_index(index, searchdict, start=0, host='127.0.0.1', port=9200):
    """Run ``searchdict`` against ``index`` and return all matching hits.

    The first page (up to 10000 hits) comes from the search call; the
    scroll is then followed until it returns an empty page, and every
    page's hits are appended to ``res['hits']['hits']``.

    Args:
        index: index to search.
        searchdict: request body for the search.
        start: accepted for compatibility; currently unused.
        host: Elasticsearch host.
        port: Elasticsearch port.

    Returns:
        The first search response, with its hit list extended by the hits
        of all following scroll pages.
    """
    # https://www.elastic.co/guide/en/elasticsearch/reference/current/search-search.html
    # https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-body.html
    # https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-scroll.html
    maxcount = 10000

    # Honour the host/port arguments; they used to be ignored silently.
    es = Elasticsearch([{'host': host, 'port': port}])
    res = es.search(index=index, body=searchdict, size=maxcount, scroll='1m')

    # hits.total is the total count of matches, but not the amount
    # returned, so keep scrolling until a page comes back empty.
    scroll_id = res.get('_scroll_id')
    while scroll_id:
        page = es.scroll(scroll_id=scroll_id, scroll='1m')
        page_hits = page['hits']['hits']
        if not page_hits:
            break
        res['hits']['hits'] += page_hits
        scroll_id = page.get('_scroll_id', scroll_id)

    return res
def do_POST(self):
    """Store a posted article in Elasticsearch and append it to the CSV file.

    The request body must be JSON with ``url`` and ``text`` keys. Replies
    with ``{"result": true}`` and status 200 on success, or
    ``{"result": false}`` and status 500 after printing the error details.
    """
    global csvPath

    def reply(status, succeeded):
        # Both outcomes share the same JSON response shape.
        self.send_response(status)
        self.send_header("Content-type", "application/json")
        self.end_headers()
        self.wfile.write(json.dumps({"result": succeeded}))

    try:
        content_len = int(self.headers.getheader('content-length', 0))
        payload = json.loads(self.rfile.read(content_len))
        article = {"url": payload['url'], "text": payload['text']}

        Elasticsearch().index(index="articles", doc_type="article", body=article)

        with open(csvPath, 'ab') as fout:
            UnicodeWriter(fout, quoting=csv.QUOTE_ALL).writerow(article.values())

        reply(200, True)
    except Exception as e:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        print(" Type: %s | File: %s | Line number: %s " % (exc_type, os.path.abspath(__file__), exc_tb.tb_lineno))
        print(e.message)
        reply(500, False)
def hit_es(threadNum, times):
    """Fire ``hits_per_thread`` searches and record each reported duration.

    Each query is retried until it succeeds. The ``took`` value of every
    successful response is appended to ``times``.

    Args:
        threadNum: label printed when the thread finishes.
        times: list that receives the ``took`` values.
    """
    # NOTE(review): assumes ``time_outs`` is a module-level counter. Without
    # this declaration the increment below raised UnboundLocalError on the
    # first failure. The increment is not synchronised across threads.
    global time_outs

    # connect to our cluster
    es = Elasticsearch([{'host': host_es, 'port': 9200}])

    for i in range(hits_per_thread):
        if i % report_time == 0:
            print("On the way! " + str(i) + " queries done!")

        while True:
            try:
                result = es.search(index=index_name, body=query,
                                   analyze_wildcard='true',
                                   timeout=timeout_value)
            except Exception:
                # Narrowed from a bare ``except`` so Ctrl-C can still break
                # out of the retry loop.
                print("Connection time-out occured. Consider a bigger time-out limit")
                time_outs = time_outs + 1
                continue
            break

        times.append(result['took'])

    print("Thread " + str(threadNum) + " finished... \n\n\n")
class IndexTalks:
    """Fetch talk pages and index them into Elasticsearch.

    Documents go to index ``gc`` with doc type ``talk`` and receive
    sequential integer ids starting at 0.
    """

    index_name = 'gc'
    doc_type = 'talk'

    def __init__(self):
        self.ft = FetchTalks()
        self.es = Elasticsearch()
        self.es_id_seq = 0  # next document id to hand out
        self.confId = ''

    def _FetchIndividualTalk(self, url):
        """Open ``url`` and return the response handle."""
        return urllib.request.urlopen(url)

    def FetchTalksAndIndexThem(self, weekendUrl):
        """Fetch every talk URL listed for ``weekendUrl`` and index each one."""
        self.confId, talkUrls = self.ft.FetchTalks(weekendUrl)
        print('confId: {}, num talk urls: {}'.format(self.confId, len(talkUrls)))
        for url in talkUrls:
            handle = self._FetchIndividualTalk(url)
            try:
                self._InsertOneTalkIntoES(handle, url)
            finally:
                # Release the HTTP connection even when indexing fails.
                handle.close()

    def _GetNextId(self):
        """Return the current sequence value and advance the counter."""
        result = self.es_id_seq
        self.es_id_seq = self.es_id_seq + 1
        return result

    def _GetTitleAndAuthor(self, line, tag, tagIndex):
        """Split the tag contents on '-' into ``(title, author)``.

        A leading ``By`` is stripped from the author. When the contents
        hold no '-', the author is returned as an empty string.
        """
        titleString = HtmlTagParser.GetTagContents(tag, line, tagIndex)
        print('title string: ' + titleString)
        titleSegments = titleString.split('-')
        title = titleSegments[0].strip()
        # Guard against a missing author part, which previously raised
        # IndexError and aborted the whole run.
        author = titleSegments[1].strip() if len(titleSegments) > 1 else ''
        if author.startswith('By'):
            author = author[3:].strip()
        return (title, author)

    def _GetTitleAuthorContent(self, talkHandle):
        """Read all lines of ``talkHandle``; return ``(title, author, content)``.

        Lines are decoded with ``bytes.decode()``. Title and author are
        taken from the first line containing ``<title>``; both stay empty
        when no such line exists.
        """
        title = ''
        author = ''
        titleOpenTag = '<title>'
        titleFound = False
        contentParts = []  # joined once at the end
        for line in talkHandle:
            strLine = line.decode()
            contentParts.append(strLine)
            if not titleFound:
                titleIndex = strLine.find(titleOpenTag)
                if titleIndex != -1:
                    title, author = self._GetTitleAndAuthor(strLine, titleOpenTag, titleIndex)
                    titleFound = True
        return (title, author, ''.join(contentParts))

    def _InsertOneTalkIntoES(self, talkHandle, talkUrl):
        """Index one talk document under the next sequential id."""
        title, author, talkContent = self._GetTitleAuthorContent(talkHandle)
        idnum = self._GetNextId()
        idNumStr = str(idnum)
        print('indexing doc num: ' + idNumStr)
        json_body = json.dumps({'talkSortId': idNumStr,
                                'title': title,
                                'author': author,
                                'confid': self.confId,
                                'content': talkContent,
                                'url': talkUrl})
        self.es.index(index=self.index_name, doc_type=self.doc_type, id=idnum, body=json_body)
def query_elastic(string):
    """Return every ``articles`` hit matching ``string``, newest first.

    A first search reads the total hit count; a second search repeats the
    same ``_all`` match with ``size`` set to that total and sorts on
    ``date`` descending.
    """
    es = Elasticsearch()
    match_clause = {"match": {"_all": string}}

    probe = es.search(index="documents_analyzed", doc_type="articles",
                      body={"query": match_clause})
    total = probe['hits']['total']

    full = es.search(index="documents_analyzed", doc_type="articles",
                     body={"size": total,
                           "query": match_clause,
                           "sort": {"date": {"order": "desc"}}})
    # This is the JSON list holding entries 0..n.
    return full['hits']['hits']
def query_and_dump_reults(args):
    """Dump the selected fields of every matching document to a CSV file.

    Reads ``hostname``, ``port``, ``index``, ``query``, ``doc_type``,
    ``target`` and ``fields`` from ``args``. The query defaults to a
    match-all and the target to ``output.csv``. A progress bar sized by the
    count of matching documents is updated after each row.
    """
    es = Elasticsearch([args.hostname + ':' + str(args.port)])

    query = args.query if args.query is not None else '{"query":{"match_all":{}}}'
    doc_type = args.doc_type
    target = args.target if args.target is not None else "output.csv"

    nhits = es.count(index=args.index, body=query)['count']
    bar = progressbar.ProgressBar(max_value=nhits)

    documents = helpers.scan(es, index=args.index, query=query, doc_type=doc_type)
    fields = args.fields.split(',')

    with open(target, 'w') as csvfile:
        datawriter = csv.writer(csvfile, delimiter=',', quotechar='"',
                                quoting=csv.QUOTE_NONNUMERIC)
        # Header row first, then one row per document.
        datawriter.writerow(fields)
        for written, document in enumerate(documents, 1):
            source = document['_source']
            datawriter.writerow([get_var(source, field) for field in fields])
            bar.update(written)
        bar.finish()
def loadNerOutputs(anndir):
    """Load NER annotations from ``anndir`` into Elasticsearch.

    Each annotation loaded by ``loadJsonFromDir`` is parsed and converted
    with ``buildAnnotation``. Non-empty results are indexed into ``domeo``
    under their ``mongo_uuid`` and passed to ``insert_annotation``; empty
    results are reported and skipped.
    """
    ann_ner = loadJsonFromDir(anndir)

    # One client for the whole run instead of a new one per annotation.
    es = Elasticsearch()

    for ann in ann_ner:
        dict_paras = parseSingleResource(ann)
        ann_domeo = buildAnnotation(dict_paras, SAMPLE_DOMEO)

        if not ann_domeo:
            print("[ERROR] annotation empty")
            continue

        es.index(index="domeo", doc_type=COLLECTION,
                 id=dict_paras["mongo_uuid"], body=json.dumps(ann_domeo))
        insert_annotation(dict_paras)
        print("[INFO] load annotations:" + str(ann["setId"]))
def loadDatainES(filename, index, doctype, dataFileType, hostname="localhost", port=9200,
                 mappingFilePath=None, username="", password="", protocol="http"):
    """Index the documents of ``filename`` into Elasticsearch.

    Args:
        filename: data file to load.
        index: target index.
        doctype: document type used for every document.
        dataFileType: ``"1"`` for a single JSON array (documents are indexed
            with ``id`` set to their ``uri``), ``"0"`` for one JSON object
            per line (indexed without an explicit id).
        hostname, port, protocol: where to connect.
        mappingFilePath: optional file whose contents are used as the body
            when creating the index (an HTTP 400 response is ignored).
        username, password: used as URL credentials when both are non-empty.

    Any exception is reported on stderr and not re-raised.
    """
    try:
        print("Connecting to " + hostname + " at port:" + str(port))
        if username != "" and password != "":
            url = protocol + '://' + username + ':' + password + '@' + hostname + ":" + str(port)
        else:
            url = protocol + '://' + hostname + ":" + str(port)
        es = Elasticsearch([url], show_ssl_warnings=False)

        if mappingFilePath:
            with open(mappingFilePath) as m:
                mapping = m.read()
            es.indices.create(index=index, body=mapping, ignore=400)

        if dataFileType == "1":
            with open(filename) as f:
                d = json.load(f)
            for wp in d:
                res = es.index(index=index, doc_type=doctype, body=wp, id=wp["uri"])
                print("indexing id: " + res["_id"] + " for uri: " + wp["uri"])
        elif dataFileType == "0":
            with open(filename) as f:
                # Stream the file instead of loading every line up front.
                for line in f:
                    stripped = line.strip()
                    if stripped == "":
                        continue
                    objkey = json.loads(stripped)['uri']
                    # NOTE(review): unlike mode "1", no id is passed here --
                    # confirm that auto-generated ids are intended.
                    res = es.index(index=index, doc_type=doctype, body=line)
                    print("indexing id: " + res["_id"] + " for uri: " + objkey)
    except Exception as e:
        # Write straight to stderr; the old ``print >> stderr.write(...)``
        # also printed a stray blank line to stdout.
        stderr.write('ERROR: %s\n' % str(e))
def shipCurToElastic(es_tag, out_dict, deltaSecs):
    """Index ``out_dict`` as a ``stats`` document on host ``elastic2``.

    The index name comes from ``curIndexName('eventstore')`` and the
    document id is ``out_dict['@timestamp']``.

    Args:
        es_tag: label included in the success message.
        out_dict: document body; must contain ``@timestamp``.
        deltaSecs: value included in the success message.

    A connection timeout is reported on stdout and not re-raised.
    """
    host = 'elastic2'
    es = Elasticsearch([{'host': host}])

    ix = curIndexName('eventstore')
    doc_id = out_dict['@timestamp']  # use the timestamp as the document id

    # The former ``if 0:`` debug branch could never run and was removed.
    try:
        w = es.index(index=ix, doc_type="stats", id=doc_id, body=out_dict)
    except elasticsearch.exceptions.ConnectionTimeout as e:
        print("couldnt ship", e)
    else:
        print(es_tag, 'delta:', deltaSecs, 'write:', w)
class ElasticSearchManager(object):
    """Small helper bound to one index and doc type."""

    def __init__(self, index=None, doc_type=None, *args, **kwargs):
        """Remember the index and doc type and create a default client."""
        self.index = index
        self.doc_type = doc_type
        self.obj_es = Elasticsearch()

    def search(self, query=None, *args, **kwargs):
        """Run a ``match`` query and return ``fetch_source`` of the hits."""
        body = {"query": {"match": query}}
        response = self.obj_es.search(index=self.index,
                                      doc_type=self.doc_type,
                                      body=body)
        return fetch_source(response['hits']['hits'])

    def get(self, *args, **kwargs):
        """Return the ``_source`` of the document whose id is ``kwargs['id']``."""
        document = self.obj_es.get(index=self.index,
                                   doc_type=self.doc_type,
                                   id=kwargs['id'])
        return document['_source']

    def get_list(self, *args, **kwargs):
        """Run a match-all query on the index and return the hit sources."""
        match_all = {"query": {"match_all": {}}}
        response = self.obj_es.search(index=self.index, body=match_all)
        return fetch_source(response['hits']['hits'])

    def insert(self, data=None):
        """Parse the JSON string ``data``, flatten it and index it by ``id``.

        ``user.screen_name`` is copied to ``user_name``; the ``user`` and
        ``entities`` keys are then removed.
        """
        record = json.loads(data)
        record['user_name'] = record['user']['screen_name']
        for dropped_key in ('user', 'entities'):
            del record[dropped_key]
        outcome = self.obj_es.index(index=self.index,
                                    doc_type=self.doc_type,
                                    id=record['id'],
                                    body=record)
        logger.info("Getting stream:{0}".format(outcome))

    def delete(self, data=None):
        """Not implemented."""
        pass

    def update(self, data=None):
        """Not implemented."""
        pass
def search(request):
    """Search ``listing_index`` and return the result as JSON.

    POST: runs a ``query_string`` search for the posted ``query`` and
    returns a list of ``{name, pk, description}`` dictionaries, one per
    hit. An empty body or a missing/empty ``query`` yields the error
    response.

    Any other method: returns the raw search response for the fixed query
    ``calculus`` (10 hits).
    """
    if request.method == 'POST':
        data = request.POST
        # ``.get`` routes a missing 'query' key to the existing error
        # response instead of raising.
        query = data.get('query') if data else None
        if not query:
            return _error_response(request, "Failed. No query received")

        es = Elasticsearch(['es'])
        result = es.search(index='listing_index',
                           body={'query': {'query_string': {'query': query}}})

        # return a list of dictionaries (each dictionary is a course)
        courses_list = [
            {
                'name': hit['_source']['name'],
                'pk': hit['_source']['pk'],
                'description': hit['_source']['description'],
            }
            for hit in result['hits']['hits']
        ]
        return JsonResponse(courses_list, safe=False)

    es = Elasticsearch(['es'])
    result = es.search(index='listing_index',
                       body={'query': {'query_string': {'query': 'calculus'}}, 'size': 10})
    # NOTE(review): this branch has always returned the raw search response
    # rather than a course list -- confirm that is intended. The unused
    # course list and the unreachable trailing return were removed.
    return JsonResponse(result, safe=False)
def searchThroughSearchBar(hotel_id, query):
    '''Search the reviews of one hotel from the review search bar.

    hotel_id is the hotel_id in hotels.db
    query is the input query from the review search bar.

    Returns the ``_source`` of every hit. The hotel id must match; the
    query text is matched against ``content`` and ``title`` as ``should``
    clauses. Highlighting is requested for ``content``.
    '''
    es = Elasticsearch()
    indexName = "reviews_es_index"

    query_body = {
        "query": {
            "bool": {
                "must": [
                    {"match": {"hotel_id": hotel_id}},
                ],
                "should": [
                    {"match": {"content": query}},
                    {"match": {"title": query}},
                ]
            }
        },
        "highlight": {
            "pre_tags": ['<em style="background-color:yellow">'],
            "post_tags": ["</em>"],
            "fields": {"content": {"fragment_size": 500}}
        }
    }

    hits = es.search(indexName, body=query_body)["hits"]["hits"]
    return [hit["_source"] for hit in hits]
def searchDocument(id):
    """Look up one document by ``_id``.

    Returns the first hit when there is one, otherwise the empty hit list.
    """
    # NOTE(review): credentials are hard-coded in source; they should be
    # rotated and loaded from configuration instead.
    es = Elasticsearch(
        ['https://cdr-es.istresearch.com:9200/memex-qpr-cp4-2'],
        http_auth=('cdr-memex', '5OaYUNBhjO68O7Pn'),
        port=9200,
        use_ssl=True,
        verify_certs=True,
        ca_certs=certifi.where(),
    )

    query_body = {"query": {"bool": {"must": {"match": {"_id": id}}}}}
    matches = es.search(body=query_body, request_timeout=60)["hits"]["hits"]
    return matches[0] if matches else matches
def get_event_location(placeid):
    """Return ``(lat, lon)`` of the first ``place`` document matching ``placeid``.

    Returns:
        The ``lat`` and ``lon`` values of the first hit's ``location``, or
        ``(-1.0, -1.0)`` when nothing matches or the location cannot be
        read.
    """
    _query = {
        "query": {
            "bool": {
                "must": [{"match": {"placeid": "%s" % placeid}}]
            }
        }
    }
    search_query = json.dumps(_query)

    es = Elasticsearch(hosts=[ES_HOST])
    res = es.search(index="place", size=10, body=search_query)
    sources = [hit["_source"] for hit in res['hits']['hits']]

    lat = -1.0
    lon = -1.0
    if sources:
        try:
            location = sources[0].get('location')
            # Assign both together so a failure cannot leave a real lat
            # paired with the -1.0 placeholder lon.
            lat, lon = location.get('lat'), location.get('lon')
        except AttributeError:
            # ``location`` is missing or is not a mapping.
            print("failed to get lat, lon")
    return lat, lon
def api_get_all(request, page):
    """Return one page of ``bibliotheca`` items as a JSON HTTP response.

    Args:
        request: HTTP request; an optional ``media_types`` GET parameter
            (comma-separated codes such as ``"BK,AT,"``) restricts the
            media types returned.
        page: 1-based page number (anything ``int()`` accepts).

    Returns:
        ``HttpResponse`` holding ``prepare_search_response(res, page)``
        serialised as JSON.
    """
    page = int(page)
    es = Elasticsearch()

    allowed_media_types = [Media.BOOK, Media.AUDIOTALK, Media.VIDEOTALK,
                           Media.PODCAST, Media.MOVIE, Media.MAGAZINE]
    if 'media_types' in request.GET:
        # Split "BK,AT," at "," and drop empty items. A list comprehension
        # is used because ``filter`` returns a non-serialisable iterator on
        # Python 3.
        allowed_media_types = [
            code for code in request.GET['media_types'].split(',') if code
        ]

    body = {
        "from": (page - 1) * settings.MAX_ITEM_COUNT,
        "size": settings.MAX_ITEM_COUNT,
        "query": {
            "filtered": {
                "query": {"match_all": {}},
                "filter": {"terms": {"media_type": allowed_media_types}}
            }
        }
    }

    res = es.search(index='bibliotheca', body=body)
    response_data = prepare_search_response(res, page)
    return HttpResponse(json.dumps(response_data), content_type="application/json")
def report_to_elastic(self, file: FileArchive):
    """Index an ``event`` document describing an archived file.

    The document is sent to ``cameraarchive-<month id>`` on localhost:9200
    with the id ``<camera>@<timestamp>``.
    """
    config = file.config
    archived = file.to

    # Drop the "\\diskstation" prefix and switch to forward slashes.
    fullfilename_ftp = archived.path.replace("\\\\diskstation", '').replace('\\', '/')

    event = {
        "ext": archived.get_extension(),  # 'jpg'
        "volume": "/volume2",
        # "/Camera/Foscam/FI9805W_C4D6553DECE1/snap/MDAlarm_20190201-124005.jpg"
        "path": fullfilename_ftp,
        "@timestamp": archived.get_timestamp_utc(),  # "2019-02-01T11:40:05.000Z"
        "doc": "event",
        "sensor": config.sensor,
        "position": config.position,
        "camera": config.camera,
        "value": archived.size(),
        "tags": [
            "synology_cameraarchive",
            "python_camera_archiver"
        ]
    }
    json_data = json.dumps(event, indent=4, sort_keys=True)

    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    index_name = "cameraarchive-" + archived.get_month_id_utc()
    document_id = '{}@{}'.format(config.camera, archived.get_timestamp_utc())
    es.index(index=index_name, doc_type='doc', body=json_data, id=document_id)
def searchThroughTags(hotel_id, only_overall=True,
                      tags=("cleanliness", "service", "value", "location", "sleep_quality", "rooms")):
    '''Search the reviews of a hotel by rating tags.

    Returns two lists of review sources: high-score reviews and low-score
    reviews.

    If only_overall is True, the split uses ``ratings.overall`` (above 3.0
    versus below 3.0). Otherwise each tag in ``tags`` adds a ``should``
    range clause on ``ratings.<tag>`` and a ``must_not`` clause for the
    opposite side, so the first list prefers scores above 3.0 and excludes
    any score below 3.0, and vice versa for the second list.
    '''
    es = Elasticsearch()
    indexName = "reviews_es_index"

    high_query = getQueryTemplate(hotel_id)
    low_query = getQueryTemplate(hotel_id)
    high_bool = high_query["query"]["bool"]
    low_bool = low_query["query"]["bool"]

    if only_overall:
        high_bool["must"].append({"range": {"ratings.overall": {"gt": 3.0}}})
        low_bool["must"].append({"range": {"ratings.overall": {"lt": 3.0}}})
    else:
        # A ``range`` query accepts exactly one field, so ``must_not`` is a
        # list of single-field clauses rather than one clause carrying
        # every tag.
        high_bool["must_not"] = []
        low_bool["must_not"] = []
        for tag in tags:
            rating_field = "ratings." + tag
            high_bool["should"].append({"range": {rating_field: {"gt": 3.0}}})
            high_bool["must_not"].append({"range": {rating_field: {"lt": 3.0}}})
            low_bool["should"].append({"range": {rating_field: {"lt": 3.0}}})
            low_bool["must_not"].append({"range": {rating_field: {"gt": 3.0}}})

    high_hits = es.search(index=indexName, body=high_query)["hits"]["hits"]
    low_hits = es.search(index=indexName, body=low_query)["hits"]["hits"]

    res1 = [hit["_source"] for hit in high_hits]
    res2 = [hit["_source"] for hit in low_hits]
    return (res1, res2)
def es_count(self, p_host, p_port, p_index, p_query=None):
    """
    Returns the number of documents that match a query

    The result is the "count" field of the elastic search response.

    {p_host} Elasticsearch server\n
    {p_port} Port of the es server\n
    {p_index} Name of the index to query\n
    {p_query} Query to run\n

    | ${res} = | es count | localhost | 9200 | myIndex | {"query":{"query_string":{"query": "searched value"}}} |

    ${res} contains the number of docs

    Raises AssertionError when the client cannot be built (including a
    non-numeric port) or when the count call fails.
    """
    # Es client
    try:
        param = [{'host': p_host, 'port': int(p_port)}]
        es = Elasticsearch(param)
    except Exception:
        # Format with %s and the raw port: converting the port again here
        # would raise a second error when the port itself is the problem.
        raise AssertionError("Connexion error on %s:%s" % (p_host, p_port))

    try:
        result = es.count(index=p_index, body=p_query)
    except Exception:
        raise AssertionError(
            "Count error on %s:%s/%s for query : %s"
            % (p_host, p_port, p_index, p_query))

    return result['count']
def createIndex():
    """This endpoint should be used to index pages of a Mouchak installation.

    The index is named after the network location of the posted ``url``.
    Every entry of the JSON list served at ``<url>/pages`` is indexed with
    doc type ``html`` under its own ``id``.

    Responses:
        default status, body "Website indexed." -- all pages were indexed.
        204 -- fetching or indexing failed.
        409 -- the index already exists (JSON body with a ``reason``).

    FIXME:
    - Endpoint is only accessible from the index page of search service.
    - Does not support cross origin requests.
    - Better name for the function.
    """
    import json  # local import: used only for the 409 body

    es = Elasticsearch()
    site_url = request.form['url']
    index_name = urlparse(site_url).netloc

    if es.indices.exists(index=index_name):
        response = make_response()
        response.status_code = 409
        # Serialise explicitly: the response body must be text/bytes, not
        # a dict.
        response.data = json.dumps({"reason": "Index already exists"})
        return response

    base_url = site_url if site_url.endswith('/') else site_url + '/'
    try:
        contents = requests.get(base_url + "pages").json()
        for content in contents:
            es.index(index=index_name, doc_type="html", body=content, id=content['id'])
    except Exception:
        # Narrowed from a bare ``except``.
        # NOTE(review): 204 is kept for compatibility although it signals
        # success to HTTP clients.
        response = make_response()
        response.status_code = 204
        return response

    response = make_response()
    response.data = "Website indexed."
    return response
def main(args):
    """Print when a job ran, using the ``cast-allocation`` index.

    Command-line options are always read from ``sys.argv``; the ``args``
    parameter is accepted but overwritten by the parsed namespace.

    The job is identified either by allocation id (``-a``) or by primary
    and secondary job id (``-j`` / ``-s``). The Elasticsearch target comes
    from ``-t`` or, failing that, from the environment variable named by
    ``TARGET_ENV``.

    Returns:
        2 when no target is available, 3 when the query does not report
        exactly one hit, otherwise ``None``.
    """
    # Specify the arguments.
    parser = argparse.ArgumentParser(
        description='''A tool for finding when a job was running through use of the big data store.''')

    # ``type=int`` keeps the numeric comparison below valid when a value is
    # given on the command line (it would otherwise arrive as a string).
    parser.add_argument('-a', '--allocationid', metavar='int', dest='allocation_id', type=int, default=-1,
                        help='The allocation ID of the job.')
    parser.add_argument('-j', '--jobid', metavar='int', dest='job_id', type=int, default=-1,
                        help='The job ID of the job.')
    parser.add_argument('-s', '--jobidsecondary', metavar='int', dest='job_id_secondary', type=int, default=0,
                        help='The secondary job ID of the job (default : 0).')
    parser.add_argument('-t', '--target', metavar='hostname:port', dest='target', default=None,
                        help='An Elasticsearch server to be queried. This defaults to the contents of environment variable "CAST_ELASTIC".')
    parser.add_argument('-H', '--hostnames', metavar='host', dest='hosts', nargs='*', default=None,
                        help='A list of hostnames to filter the results to ')

    args = parser.parse_args()

    # If the target wasn't specified check the environment for the target
    # value, printing help on failure.
    if args.target is None:
        if TARGET_ENV in os.environ:
            args.target = os.environ[TARGET_ENV]
        else:
            parser.print_help()
            print("Missing target, '%s' was not set." % TARGET_ENV)
            return 2

    # Open a connection to the elastic cluster.
    es = Elasticsearch(
        args.target,
        sniff_on_start=True,
        sniff_on_connection_fail=True,
        sniffer_timeout=60
    )

    # Build the query to get the time range.
    should_query = '{{"query":{{"bool":{{ "should":[{0}] {1} }} }} }}'
    match_clause = '{{"match":{{"{0}":{1} }} }}'

    if args.allocation_id > 0:
        tr_query = should_query.format(
            match_clause.format("data.allocation_id", args.allocation_id), "")
    else:
        # Both job ids must match, hence minimum_should_match = 2.
        tr_query = should_query.format(
            "{0},{1}".format(
                match_clause.format("data.primary_job_id", args.job_id),
                match_clause.format("data.secondary_job_id", args.job_id_secondary)),
            ',"minimum_should_match" : 2')

    # Execute the query on the cast-allocation index.
    tr_res = es.search(
        index="cast-allocation",
        body=tr_query
    )
    total_hits = tr_res["hits"]["total"]

    print("Found {0} matches for specified the job.".format(total_hits))
    if total_hits != 1:
        print("This implementation only supports queries where the hit count is equal to 1.")
        return 3

    # TODO make this code more fault tolerant
    hits = deep_get(tr_res, "hits", "hits")
    if len(hits) > 0:
        tr_data = deep_get(hits[0], "_source", "data")

        date_format = '%Y-%m-%d %H:%M:%S.%f'
        print_format = '%Y-%m-%d.%H:%M:%S:%f'

        # [:-3] trims the microseconds to milliseconds.
        start_time = datetime.strptime(tr_data["begin_time"], date_format)
        start_time = '{0}'.format(start_time.strftime(print_format)[:-3])

        # If a history is present end_time is end_time, otherwise the job
        # is still running.
        if "history" in tr_data:
            end_time = datetime.strptime(tr_data["history"]["end_time"], date_format)
            end_time = '{0}'.format(end_time.strftime(print_format)[:-3])
        else:
            end_time = "Still Running"

        print("\nAllocation ID: {0}".format(tr_data["allocation_id"]))
        print("Job ID: {0} - {1}".format(tr_data["primary_job_id"], tr_data["secondary_job_id"]))
        print("Start Time: {0} \n End Time: {1}\n".format(start_time, end_time))
class GetTbCon(object):
    """Move ``tb_con`` rows named by a Redis queue from HBase to Elasticsearch.

    Row keys are taken from Redis, the row data is read from HBase, and
    bulk actions are buffered and committed in batches.
    """

    def __init__(self):
        self.hbase_con = HbaseInfoTask()
        self.redis_con = RedisTools()
        self.es = Elasticsearch(ES_ADDR, timeout=30)

    def es_ping(self):
        """Recreate the client when the current one does not answer a ping."""
        if not self.es.ping():
            self.es = Elasticsearch(ES_ADDR, timeout=30)

    def run(self):
        """Loop forever: drain row keys, buffer actions, commit in batches.

        A batch is committed when the queue is empty, when more than
        COUNT_NUM rows were buffered, or when more than 30 seconds passed
        since the last commit. Rows already present in the index become
        ``update`` actions; others become plain index actions.
        """
        action_list = []
        count = 0
        start = int(time.time())
        cunzai = 0  # number of rows that already existed in the index
        while True:
            rowkey = self.redis_con.get_rowkey("tb_con")
            if rowkey is None:
                # Queue is empty: flush whatever is buffered, then wait.
                if action_list:
                    # Log text: "%d records stored into elasticsearch again"
                    logging.warning("重复存入elasticsearch当中%d条数据" % cunzai)
                    cunzai = 0
                    self.commit(action_list)
                    action_list.clear()
                    start = int(time.time())
                    count = 0
                time.sleep(10)
                continue

            # A row key may carry a comma-separated parameter list after
            # the "|||||" separator.
            param = None
            if "|||||" in rowkey:
                pieces = rowkey.split("|||||")
                param = pieces[1].split(",")
                rowkey = pieces[0]

            already_indexed = self.es.exists("tb_con", "sino", rowkey)
            if already_indexed:
                cunzai = cunzai + 1
                row_data = self.hbase_con.getResultByRowkey("TB_CON_TABLE", rowkey, "tb_con", param)
                if not row_data:
                    continue
                action_list.append({
                    "_op_type": "update",
                    "_index": "tb_con",
                    "_type": "sino",
                    "_id": rowkey,
                    "doc": row_data,
                })
            else:
                row_data = self.hbase_con.getResultByRowkey("TB_CON_TABLE", rowkey, "tb_con")
                if not row_data:
                    continue
                action_list.append({
                    "_index": "tb_con",
                    "_type": "sino",
                    "_id": rowkey,
                    "_source": row_data,
                })

            end = int(time.time())
            count = count + 1
            if count > COUNT_NUM or (end - start) > 30:
                self.es_ping()
                # Log text: "%d records stored into elasticsearch again"
                logging.warning("重复存入elasticsearch当中%d条数据" % cunzai)
                cunzai = 0
                if action_list:
                    self.commit(action_list)
                start = int(time.time())
                action_list.clear()
                count = 0

    def commit(self, action_list):
        """Bulk-send ``action_list``, retrying once after a failure."""
        try:
            helpers.bulk(self.es, action_list)
        except Exception as e:
            logging.error("index:tb_con,\terror:" + str(e))
            # Single retry; a second failure propagates to the caller.
            helpers.bulk(self.es, action_list)
        # Log text: "commit succeeded: %d records"
        logging.warning("提交成功:%d条数据" % len(action_list))
def __init__(self, host, cloud_id, login, password, api_key, posts):
    """Keep ``posts`` as ``self.df`` and build the Elasticsearch client.

    The client receives ``host`` positionally together with ``cloud_id``,
    a ``(login, password)`` pair as ``http_auth``, and ``api_key``.
    """
    self.df = posts
    credentials = (login, password)
    self.es = Elasticsearch(
        host,
        cloud_id=cloud_id,
        http_auth=credentials,
        api_key=api_key,
    )
from elasticsearch import Elasticsearch from flask import Flask, jsonify es = Elasticsearch("192.168.59.129:9200") app = Flask(__name__) @app.route('/', methods=['get', 'post']) def index(): # 1,#查看所有的index # indexs = es.indices.get("*") # print(indexs) # print(indexs.keys()) # 2,添加index body = { "settings": { "number_of_shards": 3, "number_of_replicas": 1 }, "mappings": { "_doc": { 'properties': { 'tno': { 'type': 'keyword' }, 'tname': { 'type': "keyword" }, 'tsex': {
class Test(BaseTest):
    """Run Filebeat module filesets on sample log files and check the events.

    Each test case starts the Filebeat test binary on one log file, reads
    the resulting events back from Elasticsearch and compares them with the
    ``<test_file>-expected.json`` file next to the log.
    """

    def init(self):
        """Create the Elasticsearch client and set paths and index name."""
        self.elasticsearch_url = self.get_elasticsearch_url()
        print("Using elasticsearch: {}".format(self.elasticsearch_url))
        self.es = Elasticsearch([self.elasticsearch_url])
        logging.getLogger("urllib3").setLevel(logging.WARNING)
        logging.getLogger("elasticsearch").setLevel(logging.ERROR)

        self.modules_path = os.path.abspath(self.working_dir +
                                            "/../../../../module")

        self.filebeat = os.path.abspath(self.working_dir +
                                        "/../../../../filebeat.test")

        self.index_name = "test-filebeat-modules"

        body = {"transient": {"script.max_compilations_rate": "1000/1m"}}
        self.es.transport.perform_request('PUT', "/_cluster/settings",
                                          body=body)

    @parameterized.expand(load_fileset_test_cases)
    @unittest.skipIf(
        not INTEGRATION_TESTS,
        "integration tests are disabled, run with INTEGRATION_TESTS=1 to enable them."
    )
    @unittest.skipIf(
        os.getenv("TESTING_ENVIRONMENT") == "2x",
        "integration test not available on 2.x")
    def test_fileset_file(self, module, fileset, test_file):
        """Render a minimal Filebeat config and run one fileset test case."""
        self.init()

        # generate a minimal configuration
        cfgfile = os.path.join(self.working_dir, "filebeat.yml")
        self.render_config_template(
            template_name="filebeat_modules",
            output=cfgfile,
            index_name=self.index_name,
            elasticsearch_url=self.elasticsearch_url,
        )

        self.run_on_file(module=module,
                         fileset=fileset,
                         test_file=test_file,
                         cfgfile=cfgfile)

    def run_on_file(self, module, fileset, test_file, cfgfile):
        """Run Filebeat once on ``test_file`` and validate the indexed events.

        :param module: Filebeat module name
        :param fileset: fileset of ``module`` to enable
        :param test_file: path of the log file to ingest
        :param cfgfile: path of the rendered Filebeat configuration
        """
        print("Testing {}/{} on {}".format(module, fileset, test_file))

        # Best effort: the index may not exist yet. Only ordinary errors are
        # ignored so that KeyboardInterrupt/SystemExit still propagate.
        try:
            self.es.indices.delete(index=self.index_name)
        except Exception:
            pass
        self.wait_until(lambda: not self.es.indices.exists(self.index_name))

        cmd = [
            self.filebeat,
            "-systemTest",
            "-e",
            "-d",
            "*",
            "-once",
            "-c",
            cfgfile,
            "-E",
            "setup.ilm.enabled=false",
            "-modules={}".format(module),
            "-M",
            "{module}.*.enabled=false".format(module=module),
            "-M",
            "{module}.{fileset}.enabled=true".format(module=module,
                                                     fileset=fileset),
            "-M",
            "{module}.{fileset}.var.input=file".format(module=module,
                                                       fileset=fileset),
            "-M",
            "{module}.{fileset}.var.paths=[{test_file}]".format(
                module=module, fileset=fileset, test_file=test_file),
            "-M",
            "*.*.input.close_eof=true",
        ]

        # Based on the convention that if a name contains -json the json
        # format is needed. Currently used for LS.
        if "-json" in test_file:
            cmd.append("-M")
            cmd.append("{module}.{fileset}.var.format=json".format(
                module=module, fileset=fileset))

        local_env = os.environ.copy()
        local_env["TZ"] = 'Etc/UTC'

        output_path = os.path.join(self.working_dir)
        # The context manager closes the log file even if Filebeat fails to
        # start.
        with open(os.path.join(output_path, "output.log"), "ab") as output:
            output.write(" ".join(cmd) + "\n")
            # Flush so the command line is written before the subprocess
            # starts appending to the same file.
            output.flush()
            subprocess.Popen(cmd,
                             env=local_env,
                             stdin=None,
                             stdout=output,
                             stderr=subprocess.STDOUT,
                             bufsize=0).wait()

        # Make sure index exists
        self.wait_until(lambda: self.es.indices.exists(self.index_name))

        self.es.indices.refresh(index=self.index_name)
        # Loads the first 100 events to be checked
        res = self.es.search(index=self.index_name,
                             body={
                                 "query": {
                                     "match_all": {}
                                 },
                                 "size": 100,
                                 "sort": {
                                     "log.offset": {
                                         "order": "asc"
                                     }
                                 }
                             })
        objects = [o["_source"] for o in res["hits"]["hits"]]
        assert len(objects) > 0
        for obj in objects:
            assert obj["event"][
                "module"] == module, "expected event.module={} but got {}".format(
                    module, obj["event"]["module"])

            assert "error" not in obj, "not error expected but got: {}".format(
                obj)

            if (module == "auditd" and fileset == "log") \
                    or (module == "osquery" and fileset == "result"):
                # There are dynamic fields that are not documented.
                pass
            else:
                self.assert_fields_are_documented(obj)

        self._test_expected_events(test_file, objects)

    def _test_expected_events(self, test_file, objects):
        """Compare ``objects`` with the events in ``<test_file>-expected.json``.

        When the GENERATE environment variable is set, the expected file is
        first rewritten from ``objects`` (which are flattened and cleaned in
        place).
        """
        # Generate expected files if GENERATE env variable is set
        if os.getenv("GENERATE"):
            with open(test_file + "-expected.json", 'w') as f:
                # Flatten and clean up objects.
                # This makes sure when generated on different machines /
                # version the expected.json stays the same.
                for k, obj in enumerate(objects):
                    objects[k] = self.flatten_object(obj, {}, "")
                    clean_keys(objects[k])
                json.dump(objects,
                          f,
                          indent=4,
                          separators=(',', ': '),
                          sort_keys=True)

        with open(test_file + "-expected.json", "r") as f:
            expected = json.load(f)

        assert len(expected) == len(
            objects), "expected {} events to compare but got {}".format(
                len(expected), len(objects))

        for ev in expected:
            found = False
            for obj in objects:
                # Flatten objects for easier comparing
                obj = self.flatten_object(obj, {}, "")
                clean_keys(obj)

                if ev == obj:
                    found = True
                    break

            assert found, "The following expected object was not found:\n {}\nSearched in: \n{}".format(
                pretty_json(ev), pretty_json(objects))
from elasticsearch import Elasticsearch

es = Elasticsearch('helk-elasticsearch:9200')

# constant_score filter that matches a document when its event_id equals
# any of the listed ids; one match_phrase clause is generated per id.
doc = {
    "query": {
        "constant_score": {
            "filter": {
                "bool": {
                    "should": [
                        {"match_phrase": {"event_id": event_id}}
                        for event_id in ("19", "20", "21")
                    ]
                }
            }
        }
    }
}

res = es.search(index="logs-endpoint-winevent-*", body=doc)
# Number of matching documents as reported in the search response.
count = res['hits']['total']['value']
import requests import json from elasticsearch import Elasticsearch es = Elasticsearch([{'host': 'elasticsearch', 'port': 9200}]) s = requests.Session() def write_mongo(data_dict): try: data_id = es.index(index="wiki", doc_type='wiki', body=data_dict) print(data_id) except Exception as exception: print(exception) def streaming(): #get funciton api wikimedia req = requests.Request("GET",'https://stream.wikimedia.org/v2/stream/recentchange').prepare() resp = s.send(req, stream=True) for line in resp.iter_lines(): if line: yield str(line, 'utf-8') def read_stream(): for line in streaming(): if line.startswith('data'): data_dict = json.loads(line[6:]) # only show non-bot
#from hmmlearn import hmm from sklearn.externals import joblib token_list = [ 'Comparison', 'Punctuation', 'Whitespace', 'Keyword', 'IdentifierList', 'DML', 'Multiline', 'Wildcard', 'Parenthesis', 'Identifier', 'Where', 'Function', 'Single', 'Operator', 'Integer' ] log_likelihoods = [] es_host = "127.0.0.1" es_port = "9200" logs_index = "logs" attack_query = [] model = joblib.load("sqli-hmm.pkl") es = Elasticsearch([{'host': es_host, 'port': es_port}]) sql_log_query = { "query": { "bool": { "must": [{ "match_all": {} }, { "range": { "@timestamp": { "lte": "now", "gte": "now-1m" } } }] }
'django.contrib.auth.password_validation.MinimumLengthValidator', }, { 'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator', }, { 'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator', }, ] # Internationalization # https://docs.djangoproject.com/en/1.9/topics/i18n/ LANGUAGE_CODE = 'en-us' TIME_ZONE = 'UTC' USE_I18N = True USE_L10N = True USE_TZ = True from elasticsearch import Elasticsearch, RequestsHttpConnection ES_CLIENT = Elasticsearch(['http://localhost:9200/'], connection_class=RequestsHttpConnection) ES_AUTOREFRESH = True
GenericTransformer, ) es_host = os.getenv('CREDENTIALS_ELASTICSEARCH_PROXY_HOST', 'localhost') neo_host = os.getenv('CREDENTIALS_NEO4J_PROXY_HOST', 'localhost') es_port = os.getenv('CREDENTIALS_ELASTICSEARCH_PROXY_PORT', 9200) neo_port = os.getenv('CREDENTIALS_NEO4J_PROXY_PORT', 7687) if len(sys.argv) > 1: es_host = sys.argv[1] if len(sys.argv) > 2: neo_host = sys.argv[2] es = Elasticsearch([ { 'host': es_host, 'port': es_port }, ]) Base = declarative_base() NEO4J_ENDPOINT = f'bolt://{neo_host}:{neo_port}' neo4j_endpoint = NEO4J_ENDPOINT neo4j_user = '******' neo4j_password = '******' LOGGER = logging.getLogger(__name__)
def dashboard_system_bandwidth(runner_config):
    """Load the system-bandwidth dashboard definitions into the .kibana index.

    Rebinds the module-level ``es`` client to the host part of
    ``runner_config['dashboard_ip']``, then indexes each JSON definition
    found under ``dashboard_dir`` and logs whether the document was newly
    created or already existed.

    :param runner_config: dict with a 'dashboard_ip' entry of the form
        "host[:port]"; only the host part is used.
    """
    global es
    es_ip = runner_config['dashboard_ip'].split(':')
    es = Elasticsearch([{'host': es_ip[0]}])

    # (file name suffix, doc_type, document id, label used in log messages),
    # processed in this order.
    kibana_objects = [
        # Create bottlenecks index
        ('index_pattern', 'index-pattern', 'bottlenecks',
         'bottlenecks index-pattern'),
        ('config', 'config', '4.6.1',
         'bottlenecks config'),
        # Configure discover panel
        ('discover', 'search', 'system_bandwidth',
         'system_bandwidth search'),
        # Create testing data in line graph
        ('line_data', 'visualization', 'system_bandwidth_line-date',
         'system_bandwidth_line-date visualization'),
        # Create comparison results in line chart
        ('line_char', 'visualization', 'system_bandwidth_line-char',
         'system_bandwidth_line-char visualization'),
        # Create local cpu results in line chart
        ('local_cpu', 'visualization', 'system_bandwidth_local_cpu',
         'system_bandwidth_local_cpu visualization'),
        # Create monitoring data in table
        ('terms_data', 'visualization', 'system_bandwidth_terms_data',
         'system_bandwidth_terms_data visualization'),
        # Create dashboard
        ('dashboard', 'dashboard', 'system_bandwidth_dashboard',
         'system_bandwidth dashboard'),
    ]

    for suffix, doc_type, doc_id, label in kibana_objects:
        path = dashboard_dir + 'posca_system_bandwidth_' + suffix + '.json'
        with open(path) as definition:
            doc = json.load(definition)
        res = es.index(index=".kibana",
                       doc_type=doc_type,
                       id=doc_id,
                       body=doc)
        # 'created' may be a JSON boolean; comparing only against the string
        # "True" never matches a boolean, so accept both forms.
        if res['created'] in (True, "True"):
            LOG.info(label + " has created")
        else:
            LOG.info(label + " has existed")
#!/usr/bin/env python
#coding: utf-8

import sys
reload(sys)
sys.setdefaultencoding('utf-8')
from elasticsearch import Elasticsearch
from time_utils import *
import pymysql as mysql
import pymysql.cursors
import time
from config import *
from db import get_stock

es214 = Elasticsearch([{'host': ES_HOST, 'port': ES_PORT}])
es216 = Elasticsearch([{'host': ES_HOST_WEB0, 'port': ES_PORT_WEB0}])


def defaultDatabase():
    """Open a new autocommit MySQL connection and return a dict cursor on it.

    Every call opens a new connection; the connection stays reachable
    through the returned cursor.
    """
    conn = mysql.connect(host=SQL_HOST, user=SQL_USER, password=SQL_PASSWD,
                         db=DEFAULT_DB, charset=SQL_CHARSET,
                         cursorclass=pymysql.cursors.DictCursor)
    conn.autocommit(True)
    cur = conn.cursor()
    return cur


# NOTE: this definition replaces the ``get_stock`` imported from ``db`` above.
def get_stock(id):
    """Return selected columns of the TABLE_DAY row whose DAY_ID equals ``id``.

    :param id: value compared against the DAY_ID column
    :returns: dict with the DAY_STOCK_ID, DAY_START_DATE, DAY_END_DATE and
        DAY_INDUSTRY_CODE entries of the first matching row
    :raises TypeError: when no row matches (``fetchone()`` returns None)
    """
    cur = defaultDatabase()
    # Grab the connection now: closing the cursor drops its reference.
    conn = cur.connection
    # Table and column names come from config and cannot be bound as
    # parameters; the id value is passed to the driver so it is escaped
    # instead of being spliced into the SQL text.
    stocksql = "SELECT * FROM %s WHERE %s = %%s" % (TABLE_DAY, DAY_ID)
    try:
        cur.execute(stocksql, (id,))
        thing = cur.fetchone()
    finally:
        # defaultDatabase() opens one connection per call; release it here.
        cur.close()
        conn.close()
    dic = {
        DAY_STOCK_ID: thing[DAY_STOCK_ID],
        DAY_START_DATE: thing[DAY_START_DATE],
        DAY_END_DATE: thing[DAY_END_DATE],
        DAY_INDUSTRY_CODE: thing[DAY_INDUSTRY_CODE],
    }
    return dic
#Refer to README.md file for a detailed code and execution steps

import elasticsearch
import eland as ed
from elasticsearch import Elasticsearch

# Replace the placeholder host and credentials with real values.
es = Elasticsearch(['host_server_name'],
                   http_auth=('YOUR_USERNAME', 'YOUR_PASSWORD'),
                   scheme="https",
                   port=443)

#Following syntax is used to create an index into your Elasticsearch
# Index names must be lowercase: a mixed-case name is rejected with HTTP 400,
# which ignore=400 would hide. ignore=400 is kept so that re-running the
# script does not fail when the index already exists.
es.indices.create(index="my_first_index", ignore=400)

#Check or Fetch the created index
# NOTE(review): this reads the "mydata" index pattern, not the index created
# above - confirm which index is intended.
df = ed.DataFrame(es, es_index_pattern="mydata")
df

#Deleting the index
# A missing index is reported as 404, so tolerate that as well.
es.indices.delete(index="my_first_index", ignore=[400, 404])
# -*- coding: utf-8 -*- from __future__ import (absolute_import, division, print_function, unicode_literals) from elasticsearch import Elasticsearch, RequestsHttpConnection es_client = Elasticsearch( hosts=['localhost:9200/'], connection_class=RequestsHttpConnection )
"elementkey": {"type": "integer"}, "transactionyear": {"type": "year"}, "transactionmonth": {"type": "month"}, "vendor": {"type": "string"} } ES_HOST = {"host": "localhost", "port": 9200} INDEX_NAME = "sdotparking" TYPE_NAME = "transaction" ID_FIELD = "dataid" es_cred_file = open("/home/chase/.escreds", 'r') user = es_cred_file.readline().strip() pswd = es_cred_file.readline().strip() es = Elasticsearch(hosts = [ES_HOST], http_auth=(user, pswd)) datapath = "/home/chase/projects/sdot_data/data/parking_data" fname = sys.argv[1] with open(datapath + "/" + fname, 'r') as f: header = f.readline().strip().split(",") header = [ token.lower() for token in header ] data = [ token.strip().split(",") for token in f.readlines() ] bulk_data = [] for row in data: data_dict = {} for i in range(len(row)): try:
#
# All rights reserved. This program and the accompanying materials
# are made available under the terms of the Apache License, Version 2.0
# which accompanies this distribution, and is available at
# http://www.apache.org/licenses/LICENSE-2.0
##############################################################################

import ConfigParser
from elasticsearch import Elasticsearch
import json
import os
import utils.logger as log
from utils.parser import Parser as conf_parser

LOG = log.Logger(__name__).getLogger()
config = ConfigParser.ConfigParser()
es = Elasticsearch()
dashboard_path = os.path.join(conf_parser.test_dir,
                              "posca",
                              "testcase_dashboard")
dashboard_dir = dashboard_path + "/"


def dashboard_send_data(runner_config, test_data):
    """Index one test result document into the "bottlenecks" index.

    Rebinds the module-level ``es`` client to the host part of
    ``runner_config['dashboard_ip']`` before indexing.

    :param runner_config: dict with a 'dashboard_ip' entry of the form
        "host[:port]"; only the host part is used.
    :param test_data: dict with a "testcase" entry (used as the doc_type)
        and a "data_body" entry (the document to index).
    """
    global es
    es_ip = runner_config['dashboard_ip'].split(':')
    es = Elasticsearch([{'host': es_ip[0]}])
    res = es.index(index="bottlenecks",
                   doc_type=test_data["testcase"],
                   body=test_data["data_body"])
    # 'created' may be a JSON boolean; comparing only against the string
    # "False" never matches a boolean, so accept both forms.
    if res['created'] in (False, "False"):
        # The message needs a placeholder for the extra argument; without
        # one the logging call itself fails while formatting the record.
        LOG.error("date send to kibana have errors %s",
                  test_data["data_body"])
#encoding:utf-8 from datetime import datetime from image_signature import generate_signature from collections import Counter from elasticsearch import Elasticsearch es_index = "facerecognition" es = Elasticsearch("0.0.0.0", port=9200) class SignatureES(object): """Elasticsearch driver for image-match """ size = 5 def __init__(self, es, index='face', doc_type='face', timeout='10s', size=size, distance_low=0.5, distance_high=0.8): """Extra setup for Elasticsearch Args: es (elasticsearch): an instance of the elasticsearch python driver index (Optional[string]): a name for the Elasticsearch index (default 'images') doc_type (Optional[string]): a name for the document time (default 'image') timeout (Optional[int]): how long to wait on an Elasticsearch query, in seconds (default 10)
#!/usr/bin/env python
import json
from elasticsearch import Elasticsearch

es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

# The context manager closes the input file once every line is processed.
with open("masscan.json") as f:
    for line in f:
        # The last two characters of each line are dropped before decoding.
        # NOTE(review): presumably a trailing "," plus newline - confirm
        # against the masscan output format.
        record_json = line[:-2]
        try:
            # Decode once and reuse the address for both the message and
            # the document id.
            ip = json.loads(record_json)['ip']
            print("inserting " + ip)
        except (ValueError, KeyError, TypeError):
            # Not valid JSON, not an object with an "ip" key, or the "ip"
            # value is not a string: skip the line.
            print("whoops")
            continue
        # ip is unique for "hosts", and "services" indexes all ports
        es.index(index='masscan_hosts', doc_type="_doc", id=ip,
                 body=record_json)
        es.index(index='masscan_services', doc_type="_doc", body=record_json)
import os import random import textwrap from databuilder.extractor.neo4j_dashboard_search_data_extractor import Neo4jDashboardSearchDataExtractor from databuilder.job.job import DefaultJob from databuilder.loader.file_system_elasticsearch_json_loader import FSElasticsearchJSONLoader from databuilder.extractor.neo4j_extractor import Neo4jExtractor from databuilder.publisher.elasticsearch_publisher import ElasticsearchPublisher from databuilder.task.task import DefaultTask from elasticsearch import Elasticsearch # set env ES_HOST to override localhost es = Elasticsearch([ { 'host': os.getenv('ES_HOST', 'localhost') }, ]) # set env NEO4J_HOST to override localhost NEO4J_ENDPOINT = 'bolt://{}:7687'.format(os.getenv('NEO4J_HOST', 'localhost')) neo4j_endpoint = NEO4J_ENDPOINT neo4j_user = '******' neo4j_password = '******' DASHBOARD_ES_MAP = textwrap.dedent(""" { "mappings":{ "dashboard":{ "properties": {
#Init Albert pipeline qa_pipeline = pipeline('question-answering', model="ktrapeznikov/albert-xlarge-v2-squad-v2", tokenizer="albert-xlarge-v2", device=0) question = "What is the capital of the Netherlands?" context = r"The four largest cities in the Netherlands are Amsterdam, Rotterdam, The Hague and Utrecht.[17] Amsterdam is the country's most populous city and nominal capital,[18] while The Hague holds the seat of the States General, Cabinet and Supreme Court.[19] The Port of Rotterdam is the busiest seaport in Europe, and the busiest in any country outside East Asia and Southeast Asia, behind only China and Singapore." #initQA = qa_pipeline(question=question, context=context) #print(initQA) # Grab Elasticsearch instance config = {'host': 'mc.ocbe.de', 'port': 9200} es = Elasticsearch([config]) # test connection es.ping() app = Flask(__name__) workingDir = 'C:\\Users\\Chris\\Documents\\GitHub\\ElasticAlbertFrontend' ftpFolder = 'C:\\Users\\Chris\\Documents\\GitHub\\ElasticAlbertFrontend\\uploads' @app.route("/") def serve_app(): return render_template('index.html') @app.route('/upload', methods=['GET', 'POST'])
'wind_degrees', 'wind_gust_kph', 'wind_gust_mph', 'wind_kph', 'wind_mph', ] for entry in floats: try: esObject['_source']['current_observation'][entry] = float(esObject['_source']['current_observation'][entry]) except: pass try: esObject['_source']['current_observation']['observation_location']['longitude'] = float(esObject['_source']['current_observation']['observation_location']['longitude']) esObject['_source']['current_observation']['observation_location']['latitude'] = float(esObject['_source']['current_observation']['observation_location']['latitude']) except: pass #making a bulkObject list so it will be easier to do multiple cities later on bulkObject = [] bulkObject.append(esObject) es = Elasticsearch([esHost], sniff_on_start=True) es.indices.create(index=esIndex, body=esIndexSettings, ignore=400) if len(bulkObject) > 0: helpers.bulk(es, bulkObject) exit()
class ElasticRetrieval(BaseRetrieval):
    """
        Interfaces with the Elasticsearch API
    """

    def __init__(self, index_name, method, logger=None,
                 use_default_similarity=True, max_results=None,
                 es_instance=None, save_terms=False, multi_match_type=None):
        """
            Stores the settings and picks the Elasticsearch client to use.

            The client is `es_instance` when given, else the corpus' own
            client when the corpus is an ElasticCorpus, else a new client.
            The index is opened if the corpus reports it as not open.

            :param index_name: name of the index to query
            :param method: stored as self.method, not used by this class
            :param logger: stored as self.logger
            :param use_default_similarity: accepted but not used here
            :param max_results: default number of hits to request; falls
                back to MAX_RESULTS_RECALL when not given
            :param es_instance: existing Elasticsearch client to reuse
            :param save_terms: passed on when building a StoredFormula
            :param multi_match_type: "type" of the multi_match queries,
                "best_fields" when not given
        """
        self.index_name = index_name
        if es_instance:
            self.es = es_instance
        else:
            if cp.Corpus.__class__.__name__ == "ElasticCorpus":
                self.es = cp.Corpus.es
            else:
                self.es = Elasticsearch(timeout=QUERY_TIMEOUT)

        if not cp.Corpus.isIndexOpen(self.index_name):
            try:
                self.es.indices.open(self.index_name)
                # Fixed pause after the open request before the index is
                # queried.
                time.sleep(10)
            except TransportError as e:
                print(e)

        if max_results:
            self.max_results = max_results
        else:
            self.max_results = MAX_RESULTS_RECALL

        self.method = method  # never used!
        self.logger = logger
        self.last_query = {}
        self.save_terms = save_terms
        self.default_field = "text"
        self.tie_breaker = 0
        if not multi_match_type:
            self.multi_match_type = "best_fields"
        else:
            self.multi_match_type = multi_match_type

    def rewriteQueryAsDSL1(self, structured_query, parameters):
        """
            Creates a multi_match DSL query for elasticsearch.

            :param structured_query: a StructuredQuery dict, optionally under the
                key "structured_query"
            :param parameters: dict of [field]=weight to replace in the query
            :returns: the multi_match query dict, or None for an empty query
        """
        if "structured_query" in structured_query:
            structured_query = structured_query["structured_query"]

        if not isinstance(structured_query, StructuredQuery):
            structured_query = StructuredQuery(structured_query)

        if not structured_query or len(structured_query) == 0:
            return None

        self.last_query = structured_query

        lucene_query = ""

        for token in structured_query:
            # TODO proper computing of the boost formula. Different methods?
            t_boost = token.boost
            t_count = token.count

            # Missing boost/count values are reported and treated as 0.
            if t_boost is None:
                print("NULL! ")
                print(token, token.boost, token.count)
                t_boost = 0
            if t_count is None:
                print("NULL! ")
                print(token, token.boost, token.count)
                t_count = 0

            boost = t_boost * t_count

            if boost == 0.0:
                continue

            bool_val = token.bool or ""

            token_text = token.token
            if " " in token_text:  # if token is a phrase
                token_text = "\"" + token_text + "\""

            lucene_query += "%s%s " % (bool_val, token_text)

            # Boosting is emulated by repeating the token boost-1 more times.
            if boost != 1:
                token_str = token_text + " "
                lucene_query += bool_val + (token_str * int(boost - 1))

            lucene_query = lucene_query.strip()
            lucene_query += " "

        # NOTE(review): as written this replaces a single space with a single
        # space, which changes nothing - confirm whether collapsing double
        # spaces was intended.
        lucene_query = lucene_query.replace(" ", " ")

        fields = []
        for param in parameters:
            fields.append(param + "^" + str(parameters[param]))

        dsl_query = {
            "multi_match": {
                "query": lucene_query,
                "type": self.multi_match_type,
                "fields": fields,
                "operator": "or",
            }
        }

        if self.tie_breaker:
            dsl_query["multi_match"]["tie_breaker"] = self.tie_breaker

        return dsl_query

    def rewriteQueryAsDSL2(self, structured_query, parameters):
        """
            Creates a multi_match DSL query for elasticsearch. Version 2

            Unlike version 1, tokens with a missing or zero boost are not
            treated specially.

            :param structured_query: a StructuredQuery dict, optionally under the
                key "structured_query"
            :param parameters: dict of [field]=weight to replace in the query
            :returns: the multi_match query dict, or None for an empty query
        """
        if "structured_query" in structured_query:
            structured_query = structured_query["structured_query"]

        if not isinstance(structured_query, StructuredQuery):
            structured_query = StructuredQuery(structured_query)

        if not structured_query or len(structured_query) == 0:
            return None

        self.last_query = structured_query

        lucene_query = ""

        for token in structured_query:
            boost = token.boost * token.count
            bool_val = token.bool or ""

            token_text = token.token
            if " " in token_text:  # if token is a phrase
                token_text = "\"" + token_text + "\""

            lucene_query += "%s%s " % (bool_val, token_text)

            # Boosting is emulated by repeating the token boost-1 more times.
            if boost != 1:
                token_str = token_text + " "
                lucene_query += bool_val + (token_str * int(boost - 1))

            lucene_query = lucene_query.strip()
            lucene_query += " "

        fields = []
        for param in parameters:
            fields.append(param + "^" + str(parameters[param]))

        dsl_query = {
            "multi_match": {
                "query": lucene_query,
                "type": self.multi_match_type,
                "fields": fields,
                "operator": "or",
            }
        }

        if self.tie_breaker:
            dsl_query["multi_match"]["tie_breaker"] = self.tie_breaker

        return dsl_query

    def rewriteQueryAsDSL(self, structured_query, parameters):
        """
            Creates a DSL query for elasticsearch. Version 3, uses individual
            "term" and "match" queries

            :param structured_query: a StructuredQuery dict, optionally under the
                key "structured_query"
            :param parameters: dict of [field]=weight to replace in the query
            :returns: a bool/should query dict, or None for an empty query
        """
        if isinstance(structured_query, dict) and "structured_query" in structured_query:
            structured_query = structured_query["structured_query"]

        if not isinstance(structured_query, StructuredQuery):
            structured_query = StructuredQuery(structured_query)

        if not structured_query or len(structured_query) == 0:
            return None

        self.last_query = structured_query

        field_dicts = []

        for token in structured_query:
            # TODO proper computing of the boost formula. Different methods?
            boost = token.boost * token.count
            token_text = token.token

            if boost == 0.0:
                continue

            # One clause per (token, field): phrases become match_phrase
            # clauses, single words become term clauses.
            for field in parameters:
                if " " in token_text:
                    new_dict = {
                        "match_phrase": {
                            field: {
                                "query": token_text,
                                "boost": parameters[field] * boost
                            },
                        }
                    }
                else:
                    new_dict = {
                        "term": {
                            field: {
                                "value": token_text,
                                "boost": parameters[field] * boost
                            },
                        }
                    }

                field_dicts.append(new_dict)

        dsl_query = {"bool": {"should": field_dicts}}

        return dsl_query

    def runQuery(self, structured_query, max_results=None):
        """
            Interfaces with the elasticsearch query API

            :param structured_query: dict with a "structured_query" key; the
                generated DSL query is stored back under "dsl_query"
            :param max_results: number of hits to request, defaults to
                self.max_results
            :returns: list of (score, metadata) tuples, one per hit
        """
        if not structured_query or len(structured_query) == 0:
            return []

        if not max_results:
            max_results = self.max_results

        self.last_query = dict(structured_query)
        # self.default_field is passed inside a list here; iterating it
        # yields the field name, and indexing it by that name in
        # rewriteQueryAsDSL is not valid for a list.
        # NOTE(review): confirm whether a {field: weight} dict was intended.
        dsl_query = self.rewriteQueryAsDSL(
            structured_query["structured_query"], [self.default_field])

        res = self.es.search(
            body={"query": dsl_query},
            size=max_results,
            index=self.index_name,
            doc_type=ES_TYPE_DOC,
            request_timeout=QUERY_TIMEOUT,
        )

        structured_query["dsl_query"] = dsl_query
        hits = res["hits"]["hits"]

        result = []
        for hit in hits:
            metadata = hit["_source"]["metadata"]
            result.append((hit["_score"], metadata))
        return result

    def formulaFromExplanation(self, query, doc_id):
        """
            Runs .explain() for one query/doc pair, generates and returns a \
            StoredFormula instance from it

            The request is attempted once; a failure is logged and an empty
            StoredFormula is returned.

            :param query: StructuredQuery dict, with a "dsl_query" key
            :param doc_id: id of document to run .explain() for
            :returns: StoredFormula built from the explanation, or an empty
                one when the request failed
        """
        explanation = None
        try:
            explanation = self.es.explain(
                index=self.index_name,
                doc_type=ES_TYPE_DOC,
                body={"query": query["dsl_query"]},
                id=doc_id,
                request_timeout=QUERY_TIMEOUT,
            )
        except Exception as e:
            # Report the real cause rather than a retry count that was
            # never reached.
            logging.error(
                "Failed to retrieve explanation for doc %s: %s", doc_id, e)

        formula = StoredFormula()
        if explanation:
            formula.fromElasticExplanation(explanation, self.save_terms)
        return formula
from requests_aws4auth import AWS4Auth from watson_developer_cloud import NaturalLanguageUnderstandingV1 from watson_developer_cloud.natural_language_understanding_v1 import Features, SentimentOptions print('TweetProcessing Lambda Function Running') cred = boto3.session.Session().get_credentials() host = '' awsauth = AWS4Auth(cred.access_key, cred.secret_key, 'us-east-2', 'es', session_token=cred.token) es = Elasticsearch( hosts=[{'host': host, 'port': 443}], http_auth=awsauth, use_ssl=True, verify_certs=True, connection_class=RequestsHttpConnection ) natural_language_understanding = NaturalLanguageUnderstandingV1( version='2017-02-27', username='******', password='******') def lambda_handler(event, context): message = event['Records'][0]['Sns']['Message'] tweets = json.loads(message) try: es_count = es.count(index="tweet-index")['count'] except:
#!/usr/bin/env python import re import os from urllib.parse import urlparse from urllib.parse import urljoin import hashlib import json import sys from elasticsearch import Elasticsearch es= Elasticsearch() title_re = re.compile(r'([=*]{3,})\n([^\n]+)\n\1\n') def find_title(rst): matches = title_re.findall(rst)] if not matches: return '' else: return matches[0][1] def walk_documentation(path='.', base_url=''): path = os.path.realpath(path) for dirpath, dirnames, filenames in os.walk(path): for filename in filenames: if filename.endswith('.txt'): filepath = os.path.join(dirpath, filename)
def __init__(self, host, port):
    """Create the Elasticsearch client for ``host``:``port``.

    The client is built with ``send_get_body_as="POST"``. If construction
    fails, the error is logged and ``self.client`` is left as ``None``.
    """
    url = "%s:%s" % (host, port)
    # Define the attribute up front so it exists even if construction fails.
    self.client = None
    try:
        self.client = Elasticsearch([url], send_get_body_as="POST")
    except Exception as e:
        # Only ordinary errors are caught, and the cause is kept in the log.
        logger.error('elasticsearch cannot connect: ' + str(e))
while os.access(lock_file,os.F_OK): # logging.write("Waiting a second ...\n") time.sleep(1) # __main__ # We need to have two command-line args: # sys.argv[1]: The node name or "cluster" # sys.argv[2]: The "key" (status, filter_size_in_bytes, etc) if len(sys.argv) < 3: zbx_fail() # Try to establish a connection to elasticsearch try: conn = Elasticsearch('localhost:9200', sniff_on_start=False) except Exception, e: zbx_fail() if sys.argv[1] == 'cluster' and sys.argv[2] in clusterkeys_direct: nodestats = None # now=time.strftime("%Y%m%d-%H:%M:%S") if use_cache(clustercache_file): # logging.write(str(now) + ": Using cluster cache\n") nodestats = shelve.open(clustercache_file) nodestats = nodestats['stats'] else: # logging.write(str(now) + ": Generate lockfile and cluster cache\n") lock=open (lock_file, "w") try: nodestats = conn.cluster.stats()
# coding:utf-8
import datetime

from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search

from .example1 import Article

# hosts must be a list of node definitions: a bare dict is iterated by its
# keys, so "host" and "port" would be taken as two node names.
client = Elasticsearch([{"host": "localhost", "port": 9200}])

s = Search(using=client)


def add_article(id_, title, body, tags):
    """Create and save an Article, then return it.

    ``published_from`` and ``created_at`` are both set to the current UTC
    time.

    :param id_: document id, passed as the article's meta id
    :param title: article title
    :param body: article body text
    :param tags: tags for the article
    :returns: the saved Article instance
    """
    now = datetime.datetime.utcnow()
    article = Article(meta={'id': id_}, title=title, tags=tags)
    article.body = body
    article.published_from = now
    article.created_at = now
    article.save()
    return article


def init_test_data():
    """Save four sample articles (ids 2 to 5)."""
    add_article(2, 'Python is good!', 'Python is good!', ['python'])
    add_article(3, 'Elasticsearch', 'Distributed, open source search and analytics engine', ['elasticsearch'])
    add_article(4, 'Python very quickly', 'Python very quickly', ['python'])
    add_article(5, 'Django', 'Python Web framework', ['python', 'django'])
from elasticsearch import Elasticsearch, helpers sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from server.config import ConfigClass logging.basicConfig( level=logging.INFO, filename=ConfigClass.tag_indexer_logfile, format='%(asctime)s - %(levelname)s - %(module)s %(funcName)s : %(message)s' ) NEXT_FILE = "next_file" es = Elasticsearch(ConfigClass.es_nodes) def save_next(next, apikey): try: with open(NEXT_FILE + apikey, 'w') as f: f.write(next) except Exception as e: logging.error("Unable to write to disk: {0}".format(e)) def get_next(apikey): ''' Retrieves the 'next' variable received from VT on the last run''' lastrun = None if os.path.exists(NEXT_FILE): with open(NEXT_FILE) as f:
from elasticsearch import Elasticsearch
from pprint import pprint
import sys
import numpy as np

# Client with a 10 second timeout that retries a timed-out request once.
es = Elasticsearch('http://172.27.125.139:9200/', timeout=10, retry_on_timeout=True, max_retries=1)

# Fetch the source of one bill and print the length of its
# 'bill_document_last' field.
doc = es.get_source(index="state_bills", id='az_49th-3rd-special_SB1010', doc_type="_all")
print(len(doc['bill_document_last']))
# The script stops here; nothing below this line runs.
sys.exit()

# Unreachable after sys.exit(): reads one bill id per line from bill_ids.txt.
with open('bill_ids.txt') as infile:
    ids = [x.strip('\n') for x in infile]

#
#o = np.zeros((len(ids)))
#for i, id_ in enumerate(ids):
#    doc = None
#    s = 'failed'
#    doc = es.get_source(index="state_bills", id=id_, doc_type="_all")
#    if doc is not None:
#        o[i] = 1
#        s = 'worked'
#
#    print('{}: {}, {}'.format(s, i, id_))
#
from elasticsearch import Elasticsearch from training.train_d2v import TrainDoc2Vec from training.training_prefix import makeTrainingPrefix import os es_url = os.environ['AC_SIM_ES_URL'] if os.environ.get( 'AC_SIM_ES_URL') != None else 'localhost:9200' es = Elasticsearch(es_url) class TrainingManager: def __init__(self, indexName, docType, object): self.indexName = indexName self.docType = docType self.object = object if (object.get("domain_id") != None): self.searchTerms = {"domain_id": int(object["domain_id"])} self.collectionIndexName = "domains_" + object["cluster_id"] self.colletionIndexDocType = "domain" self.collectionIndexSearchId = int(object["domain_id"]) elif (object.get("community_id") != None): self.searchTerms = {"community_id": int(object["community_id"])} self.collectionIndexName = "communities_" + object["cluster_id"] self.colletionIndexDocType = "community" self.collectionIndexSearchId = int(object["community_id"]) elif (object.get("group_id")): self.searchTerms = {"group_id": int(object["group_id"])} self.collectionIndexName = "groups_" + object["cluster_id"] self.colletionIndexDocType = "group" self.collectionIndexSearchId = int(object["group_id"]) elif (object.get("post_id")):