Example #1
def save_chambana():
    es = Elasticsearch()

    query1 = {"query": {"match": {"city": "Urbana" } } }
    query2 = {"query": {"match": {"city": "Champaign" } } }
    res1 = es.search(index="business", body=query1, size=300)
    res2 = es.search(index="business", body=query2, size=400)

    print("Got %d Hits:" % res1['hits']['total'])
    print("Got %d Hits:" % res2['hits']['total'])

    urban = open('urban.json','w')
    champ = open('champ.json','w')

    template = { "create": { "_index": "urban", "_type": "doc"} }
    for hit in res1['hits']['hits']:
        json.dump(template,urban)
        urban.write("\n")

        json.dump(hit['_source'],urban)
        urban.write('\n')

    template = { "create": { "_index": "champ", "_type": "doc"} }
    for hit in res2['hits']['hits']:
        json.dump(template,champ)
        champ.write("\n")

        json.dump(hit['_source'],champ)
        champ.write('\n')
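The urban.json and champ.json files written above follow the bulk-API action/source line format, so they can be replayed into a cluster directly. A minimal sketch, assuming a local cluster and the urban.json file produced by the function above:

from elasticsearch import Elasticsearch

es = Elasticsearch()
with open('urban.json') as f:
    # the file already alternates action lines and document lines and ends with a newline
    es.bulk(body=f.read())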
Example #2
def get_judge_res(judge_image_dir):
    es = Elasticsearch(esport)
    judge_image_dir = 'judgeresult:' + judge_image_dir
    search_size = 20
    search_offset = 0
    print request.args
    try:
        if 'offset' in request.args:
            search_offset = int(request.args.get('offset'))
        if 'size' in request.args:
            search_size = int(request.args.get('size'))
        res_index = es.search(
            index = judge_image_dir, 
            size = search_size, 
            from_=search_offset
        )
    except Exception:
        del es
        return 'Error: index does not exist\n'
    res_lst = []
    for item in res_index['hits']['hits']:
        res_lst.append(item['_source']['file'])
    res_dict = {
        'total' : res_index['hits']['total'],
        'file_list' : res_lst,
        'from_' : search_offset,
        'size' : len(res_index['hits']['hits'])
    }
    json_res = json.dumps(res_dict)
    del(es)
    return json_res
Example #3
def reindex(old_index, new_index, s):
    ''' Function to reindex by scan and scroll combined with a bulk insert.
    old_index is the index to take docs from, new_index is the one the docs go to.
    s is the size of each bulk insert - should set this as high as the RAM
    on the machine you run it on allows.  500-1000 seems reasonable for t2.medium '''
    def create_bulk_insert_string(results, index):
        ret_str = ''
        for hit in results:
            ret_str += '{"create":{"_index":"' + index + '","_type":"variant","_id":"' + hit['_id'] + '"}}\n'
            ret_str += json.dumps(hit['_source']) + '\n'
        return ret_str

    es = Elasticsearch('localhost:9200')
    s = es.search(index=old_index, body='{"query": {"match_all": {}}}', search_type='scan', scroll='5m', size=s)
    curr_done = 0

    try:
        while True:  # loop until the scroll is exhausted or a request fails
            r = es.scroll(s['_scroll_id'], scroll='5m')
            this_l = r['hits']['hits']
            if not this_l:
                break
            this_str = create_bulk_insert_string(this_l, new_index)
            es.bulk(body=this_str, index=new_index, doc_type='variant')
            curr_done += len(this_l)
    except Exception:
        pass
    print('{} documents inserted'.format(curr_done))
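The same copy can also be expressed with the scan and bulk helpers from elasticsearch.helpers, which drive the scroll loop and batching internally. A minimal sketch under the same assumptions as above (localhost cluster, 'variant' doc type); chunk_size plays the role of the s parameter:

from elasticsearch import Elasticsearch, helpers

def reindex_with_helpers(old_index, new_index, chunk_size=500):
    es = Elasticsearch('localhost:9200')
    # helpers.scan drives the scroll loop and yields one hit at a time
    actions = (
        {
            "_op_type": "create",
            "_index": new_index,
            "_type": "variant",
            "_id": hit["_id"],
            "_source": hit["_source"],
        }
        for hit in helpers.scan(es, index=old_index, query={"query": {"match_all": {}}})
    )
    # helpers.bulk groups the actions into chunk_size-sized bulk requests
    helpers.bulk(es, actions, chunk_size=chunk_size)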
Example #4
    def es_search(self, p_host, p_port, p_index, p_query):
        """
        Returns a query result from elastic search
        The result is the response from Elasticsearch as a dictionary.

        {p_host}   Elasticsearch server\n
        {p_port}   Port of the es server\n
        {p_index}  Name of the index to query\n
        {p_query}  Query to run\n

        | ${res} = | es search | localhost | 9200 | myIndex |  {"query":{"query_string":{"query": "searched value"}}} |
        """
        
        # Es client
        try:
            param = [{'host':p_host,'port':int(p_port)}]
            es = Elasticsearch(param)
        except Exception:
            raise AssertionError("Connexion error on %s:%i",p_host,int(p_port))

        try:
            documents = es.search(body=p_query, index=p_index)
        except Exception:
            raise AssertionError("Search error on %s:%i/%s for query : %s",p_host,int(p_port),p_index,p_query)

        return documents
Example #5
    def count(self, p_index, p_query={}):
        """Gets the number of docs for a query

            p_index:    elasticsearch index where to query
            p_query:    the query to process

            return the number of docs from the index p_index and the query p_query
        """
        try:
            param = [{'host': self.host, 'port': self.port}]
            es = Elasticsearch(param)
            logger.info('Connected to ES Server: %s', json.dumps(param))
        except Exception as e:
            logger.error('Connection failed to ES Server : %s', json.dumps(param))
            logger.error(e)
            sys.exit(EXIT_IO_ERROR)

        try:
            result = es.count(index=p_index, body=p_query)
            logger.info('Count the number of items from %s for the query %s', p_index, p_query)
        except Exception as e:
            logger.error('Error querying the index %s with query %s', p_index, p_query)
            logger.error(e)
            raise
        return result['count']
Example #6
class ElasticStorage(BaseStorage):
    def __init__(self, config):
        if not Elasticsearch:
            raise ImportError("elasticsearch-py is required to use Elasticsearch as storage.")
        if not Search:
            raise ImportError("elasticsearch_dsl is required to use Elasticsearch as storage.")

        self.name = 'elasticsearch'
        self.storage = Elasticsearch(**config)

    def keys(self, pattern="*"):
        return self.storage.keys(pattern)

    def set_val(self, key, val):
        body = {
            'key': key,
            'val': ','.join(map(str, val[0])),
            'extra': str(val[1])
        }
        self.storage.index(index='sift', doc_type='sift', body=body)

    def get_val(self, key):
        s = Search(using=self.storage, index='sift')
        return s.filter('term', key=key).execute().hits.hits

    def append_val(self, key, val):
        self.set_val(key, val)

    def get_list(self, key):
        return self.get_val(key)
Example #7
def show(ctx, path, order):
    router = Router(open(ctx.obj['CONFIG']))
    route = router.match(path)
    logging.debug("Matched route: %s" % route)
    if not route:
        print 'No queries matched'
        return
    es = Elasticsearch(hosts=route.get('elasticsearch_url'))
    request_body = {}
    for non_mandatory_key in ['sort', 'query']:
        value = route.get(non_mandatory_key)
        if value:
            request_body[non_mandatory_key] = value
    if order == 'asc':
        request_body['sort'] = {'@timestamp': 'asc'}
    elif order == 'desc':
        request_body['sort'] = {'@timestamp': 'desc'}
    elif order:
        click.echo("Unknown order format: %s" % order, err=True)
        return 1
    logging.debug("Query: %s" % (request_body,))
    result = es.search(index=route.get('index'), doc_type=None, body=request_body)
    hits = result['hits']['hits']
    template = Template(route.get("format", "{{ __at_timestamp }} {{ message }}"))
    for hit in hits:
        doc = hit['_source']
        doc['__at_timestamp'] = doc.get('@timestamp')
        print template.render(doc)
Example #8
def iter_elastic_query(instance, index, field, subfield=None):
    es = Elasticsearch(instance)

    # initial search
    resp = es.search(index, body={"query": {"match_all": {}}}, scroll='5m')

    scroll_id = resp.get('_scroll_id')
    if scroll_id is None:
        return

    while True:
        for hit in resp['hits']['hits']:
            s = hit['_source']
            try:
                if subfield is not None:
                    print(s[field][subfield])
                    yield s[field][subfield]
                else:
                    yield s[field]
            except (KeyError, ValueError):
                logging.warning("Unable to process row: %s" % str(hit))

        # stop when the scroll is exhausted, otherwise fetch the next batch
        scroll_id = resp.get('_scroll_id')
        if scroll_id is None or not resp['hits']['hits']:
            break
        resp = es.scroll(scroll_id=scroll_id, scroll='5m')
Example #9
def main():
    beanstalk = beanstalkc.Connection(host=MYHOST, port=11301)
    es = Elasticsearch()

    # reset the index: delete any old copy (ignore 404/400), then create it
    # (ignore 400 caused by IndexAlreadyExistsException)
    es.indices.delete(index='grmoto', ignore=[400, 404])
    es.indices.create(index='grmoto', ignore=400)


    try:
        while True:
            # To receive a job:
            job = beanstalk.reserve()
        
#           if job.body == 'quit':
#               print 'The agent shutting down'
#               break


            # Work with the job:
            obj = json.loads(job.body)
            
            #print json.dumps(obj, sort_keys=True, indent=4, separators=(',',': '))

            #use elastic search 
            res = es.index(index='grmoto', doc_type='native_objects', body = obj)
            print(res['created'])
            
            #Release the job
            job.delete()
    except:
        again()
Example #10
    def GET(self):
        es = Elasticsearch(conf['fulltext']['serviceUrl'])
        if web.input(wildcard_query=None).wildcard_query:
            query = {
                "wildcard": {
                    "_all": web.input().query
                }
            }
            self.set_wildcard_query(True)
        else:
            query = {
                "multi_match": {
                    "query": web.input().query,
                    "operator": "and",
                    "fields": ["text", "pageName", "tags"]
                }
            }
            self.set_wildcard_query(False)
        res = es.search(index=conf['fulltext']['indexName'],
                        body={"query": query,
                              "fields": ["pageName", "path", "fsPath", "text"]})
        rows = []
        for a in res['hits']['hits']:
            fields = a['fields']

            fs_path = os.path.normpath('%s/%s.md' % (self.data_dir, fields['path'][0]))
            page_chapters, h1 = extract_description(fs_path)
            rows.append({
                'h1': h1 if h1 else fields['path'][0],
                'file': fields['path'][0],
                'chapters': page_chapters
            })
        values = dict(query=web.input().query, ans=rows)
        return self._render('search.html', values)
Example #11
def search(query='', field='q1', _operator='and', sort=[('_score', 'desc'), ('quoted_by', 'desc')],
           _filter={}, size=1000, _id=False):
    es = Elasticsearch([elasticsearch_setting])
    if query:
        es_query = {
            'match': {
                field: {
                    'query': query,
                    'operator': _operator,
                    'minimum_should_match': '85%'
                }
            }
        }
    else:
        es_query = {"match_all": {}}
    body = {
        "query": {
            "filtered": {
                "query": es_query,
                "filter": _filter
            }
        },
        'size': size
    }
    sort_item = _build_sort(sort)
    if sort_item:
        body.update({'sort': sort_item})
    logger.debug(body)
    result = es.search(index='qwerty', body=body, _source=True, timeout=55)
    if _id:
        return (x for x in result['hits']['hits'])
    return (x['_source'] for x in result['hits']['hits'])
Example #12
def search_index(index, searchdict, start=0, host='127.0.0.1', port=9200):
    # https://www.elastic.co/guide/en/elasticsearch/reference/current/search-search.html
    # https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-body.html

    # https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-scroll.html

    '''
    #print searchdict
    pprint(searchdict)
    #import pdb; pdb.set_trace()
    thisurl = 'http://%s:%s/%s/_search' % (host, port, path)
    r = requests.get(thisurl, data=json.dumps(searchdict), verify=False)
    print r.reason
    '''

    maxcount = 10000
    es = Elasticsearch()
    res = es.search(index=index, body=searchdict, size=maxcount, scroll='1m')

    # hits.total is the total count of matches, but not the amount returned
    #total = res['hits']['total']
    # keep scrolling until no more hits come back, accumulating everything on res
    scroll = es.scroll(scroll_id=res['_scroll_id'], scroll='1m')
    while scroll['hits']['hits']:
        res['hits']['hits'] += scroll['hits']['hits']
        scroll = es.scroll(scroll_id=scroll['_scroll_id'], scroll='1m')

    return res
Example #13
    def do_POST(self):
        global  csvPath

        try:
            content_len = int(self.headers.getheader('content-length', 0))
            body = json.loads(self.rfile.read(content_len))

            dict = {"url" : body['url'], "text" : body['text']}

            es = Elasticsearch()
            es.index(index="articles", doc_type="article", body=dict)

            with open(csvPath,'ab') as fout:
                writer = UnicodeWriter(fout,quoting=csv.QUOTE_ALL)
                writer.writerow(dict.values())


            self.send_response(200)
            self.send_header("Content-type", "application/json")
            self.end_headers()
            self.wfile.write( json.dumps({"result":True}) )

        except Exception as e:

            exc_type, exc_obj, exc_tb = sys.exc_info()
            print(" Type: %s | File: %s | Line number: %s " % (exc_type, os.path.abspath(__file__), exc_tb.tb_lineno))
            print(str(e))

            self.send_response(500)
            self.send_header("Content-type", "application/json")
            self.end_headers()
            self.wfile.write( json.dumps({"result":False}) )
Example #14
def hit_es(threadNum, times):
    # connect to our cluster
    es = Elasticsearch([{'host': host_es, 'port': 9200}])
    time_outs = 0  # count of queries that timed out
    for i in range(hits_per_thread):
        if i % report_time == 0:
            print("On the way! " + str(i) + " queries done!")

        while True:
            try:
                result = es.search(index=index_name,
                                   body=query,
                                   analyze_wildcard='true',
                                   timeout=timeout_value)
            except Exception:
                print("Connection time-out occurred. Consider a bigger time-out limit")
                time_outs = time_outs + 1
                continue
            break

        #print finish_time
        real_time = result['took']
        #print real_time
        times.append(real_time)
        #print result['hits']['total']
    print("Thread " + str(threadNum) + " finished... \n\n\n")
Example #15
class IndexTalks:

    index_name = 'gc'
    doc_type = 'talk'

    def __init__(self):
        self.ft = FetchTalks()
        self.es = Elasticsearch()
        self.es_id_seq = 0
        self.confId = ''

    def _FetchIndividualTalk(self, url):
        return urllib.request.urlopen(url)

    def FetchTalksAndIndexThem(self, weekendUrl):
        self.confId, talkUrls = self.ft.FetchTalks(weekendUrl)
        print(str.format('confId: {}, num talk urls: {}', self.confId, len(talkUrls)))
        for url in talkUrls:
            handle = self._FetchIndividualTalk(url)
            self._InsertOneTalkIntoES(handle, url)

    def _GetNextId(self):
        result = self.es_id_seq
        self.es_id_seq = self.es_id_seq + 1
        return result

    def _GetTitleAndAuthor(self, line, tag, tagIndex):
        titleString = HtmlTagParser.GetTagContents(tag, line, tagIndex)
        print('title string: ' + titleString)
        titleSegments = titleString.split('-')
        title = titleSegments[0].strip()
        author = titleSegments[1].strip()
        if author.find('By') == 0:
            author = author[3:].strip()
        return ( title, author )

    def _GetTitleAuthorContent(self, talkHandle):
        title = ''
        author = ''
        titleOpenTag = '<title>'
        titleFound = False
        talkContent = ''
        for line in talkHandle:
            #strLine = str(line)
            strLine = line.decode()
            talkContent = talkContent + strLine
            if titleFound == False:
                titleIndex = strLine.find(titleOpenTag)
                if titleIndex != -1:
                    title, author = self._GetTitleAndAuthor(strLine, titleOpenTag, titleIndex)
                    titleFound = True
        return ( title, author, talkContent )

    def _InsertOneTalkIntoES(self, talkHandle, talkUrl):
        title, author, talkContent = self._GetTitleAuthorContent(talkHandle)
        idnum = self._GetNextId()
        idNumStr = str(idnum)
        print('indexing doc num: ' + idNumStr)
        json_body = json.dumps({'talkSortId': idNumStr, 'title': title, 'author': author, 'confid': self.confId, 'content': talkContent, 'url': talkUrl})
        self.es.index(index=self.index_name, doc_type=self.doc_type, id=idnum, body=json_body)
Example #16
def query_elastic(string):
    es = Elasticsearch()
    res = es.search(index="documents_analyzed", doc_type="articles", body={"query": {"match": {"_all": string}}})
    tamano = res['hits']['total']
    res = es.search(index="documents_analyzed", doc_type="articles", body={"size": tamano, "query": {"match": {"_all": string}}, "sort": {"date": {"order": "desc"}}})
    # res['hits']['hits'] is a JSON list with the matching documents, from 0 to n
    return res['hits']['hits']
Example #17
def query_and_dump_reults(args):
    es = Elasticsearch([args.hostname + ':' + str(args.port)])

    query = '{"query":{"match_all":{}}}'
    if args.query is not None:
        query = args.query

    doc_type = None
    if args.doc_type is not None:
        doc_type = args.doc_type

    target = "output.csv"
    if args.target is not None:
        target = args.target

    res = es.count(index=args.index, body=query)
    nhits = res['count']

    counter = 0
    bar = progressbar.ProgressBar(max_value=nhits)

    res = helpers.scan(es, index=args.index, query=query, doc_type=doc_type)
    fields = args.fields.split(',')
    with open(target, 'w') as csvfile:
        datawriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
        datawriter.writerow(fields)
        for item in res:
            item = item['_source']
            datawriter.writerow([get_var(item, field) for field in fields])

            counter += 1
            bar.update(counter)
        bar.finish()
Example #18
def loadNerOutputs(anndir):

    #setIds = ["Citalopram-4259d9b1-de34-43a4-85a8-41dd214e9177","Escitalopram-13bb8267-1cab-43e5-acae-55a4d957630a","Fluoxetine-5f356c1b-96bd-4ef1-960c-91cf4905e6b1"]
    #setIds = ["55816042-946d-4bec-9461-bd998628ff45","c00d1607-ac36-457b-a34b-75ad74f9cf0a","70b079e2-a1f7-4a93-8685-d60a4d7c1280","38642D80-AAA6-4196-A033-3977FF35B48A"]

    ann_ner = loadJsonFromDir(anndir)
    #print len(ann_ner)

    #idx = 1
    for ann in ann_ner:
        
        #if ann["setId"] in setIds:
        dict_paras = parseSingleResource(ann)
        ann_domeo = buildAnnotation(dict_paras, SAMPLE_DOMEO)

        # load all annotations
        if ann_domeo:

            # load 11 - 208
            #if ann_domeo and (int(dict_paras["fileId"]) > 10):

            es = Elasticsearch()
            es.index(index="domeo", doc_type=COLLECTION, id=dict_paras["mongo_uuid"], body=json.dumps(ann_domeo))

            insert_annotation(dict_paras)
            print "[INFO] load annotations:" +str(ann["setId"]) 
            #print "load annotations for " + dict_paras["annotates_url"]

            #idx = idx + 1
        else:
            print "[ERROR] annotation empty"
Example #19
def loadDatainES(filename, index, doctype, dataFileType, hostname="localhost", port=9200, mappingFilePath=None, username="", password="", protocol="http"):
    try:
        print "Connecting to " + hostname + " at port:" + str(port) 
       # es = Elasticsearch([{'host': hostname, 'port': port}])

        if username != "" and password != "":
            es = Elasticsearch([protocol + '://' + username + ':' + password + '@'+hostname + ":" + str(port)],show_ssl_warnings=False)
        else:
            es = Elasticsearch([protocol + '://'+hostname + ":" + str(port)],show_ssl_warnings=False)
        
        if mappingFilePath:
            with open(mappingFilePath) as m:
                mapping = m.read()
                #print "Mapping file:" + mapping
                es.indices.create(index=index,  body=mapping,ignore=400)
                
        if dataFileType=="1":
            with open(filename) as f:   
                d = json.load(f)
                for wp in d:
                  res = es.index(index=index,doc_type=doctype,body=wp,id=wp["uri"])
                  print "indexing id: " + res["_id"] + " for uri: " + wp["uri"]
        elif dataFileType == "0":
            with open(filename) as f:
                lines = f.readlines()

                for line in lines:
                    if line.strip() != "":
                        jsonurlobj = json.loads(line.strip())
                        objkey = jsonurlobj['uri']
                        res = es.index(index=index,doc_type=doctype,body=line)
                        print "indexing id: " + res["_id"] + " for uri: " + objkey
    except Exception, e:
        stderr.write('ERROR: %s\n' % str(e))
Example #20
def shipCurToElastic(es_tag, out_dict, deltaSecs):
  host = 'elastic2' 
  es = Elasticsearch([{'host': host}])
  indexBase = 'eventstore'
  ix = curIndexName(indexBase)

  id = out_dict['@timestamp']

  if 0:
    print(
      ix,
      "stats", 
      id, #42, # use timestamp.
      out_dict #{"any": "data", "timestamp": datetime.datetime.now()}
      )
  else:
    try:
      w = es.index(
        index=ix,
        doc_type="stats", 
        id=id, #42, # use timestamp.
        body=out_dict #{"any": "data", "timestamp": datetime.datetime.now()}
      )
      print(es_tag, 'delta:', deltaSecs, 'write:', w)
    except elasticsearch.exceptions.ConnectionTimeout as e:
      print("couldnt ship", e)
Example #21
class ElasticSearchManager(object):

	def __init__(self, index=None, doc_type=None, *args, **kwargs):
		self.index = index
		self.doc_type = doc_type
		self.obj_es = Elasticsearch()

	def search(self, query = None, *args, **kwargs):
		data = self.obj_es.search(index=self.index, doc_type=self.doc_type, body={"query":{"match":query}})
		return fetch_source(data['hits']['hits'])

	def get(self, *args, **kwargs):
		data=self.obj_es.get(index=self.index, doc_type=self.doc_type, id=kwargs['id'])
		return data['_source']

	def get_list(self, *args, **kwargs):
		data = self.obj_es.search(index=self.index, body={"query": {"match_all": {}}})
		return fetch_source(data['hits']['hits'])

	def insert(self, data = None):
		data = json.loads(data)
		data['user_name'] = data['user']['screen_name']
		del data['user']
		del data['entities']
		res = self.obj_es.index(index=self.index, doc_type=self.doc_type, id=data['id'], body=data)
		logger.info("Getting stream:{0}".format(res))

	def delete(self, data = None):
		pass

	def update(self, data = None):
		pass
Example #22
def search(request):
	if request.method == 'POST':
		data = request.POST
		if not data:
			return _error_response(request, "Failed.  No query received")
		query = data['query']
		es = Elasticsearch(['es'])
		result = es.search(index='listing_index', body={'query': {'query_string': {'query': query}}})

		courses_data = result['hits']['hits']
		courses_list = []
		for c in courses_data:
			course = {}
			course['name'] = c['_source']['name']
			course['pk'] = c['_source']['pk']
			course['description'] = c['_source']['description']
			courses_list.append(course)
		#return a list dictionary (each dictionary is a course)
		return JsonResponse(courses_list, safe=False)
	else:
		es = Elasticsearch(['es'])
		result = es.search(index='listing_index', body={'query': {'query_string': {'query': 'calculus'}}, 'size': 10})
		courses_data = result['hits']['hits']
		courses_list = []
		for c in courses_data:
			course = {}
			course['name'] = c['_source']['name']
			course['pk'] = c['_source']['pk']
			course['description'] = c['_source']['description']
			courses_list.append(course)
		return JsonResponse(result, safe=False)

Example #23
def searchThroughSearchBar(hotel_id, query):
    '''This function is used to search the reviews of a hotel by query.

       hotel_id is the hotel_id in hotels.db
       query is the input query from the review search bar.

    '''
    es = Elasticsearch()
    indexName = "reviews_es_index"
    doc_type = "review"
    query_body = {
                   "query": {
                        "bool": {
                            "must": [],
                            "should":[]
                         }
                    },
                   "highlight":{
                        "pre_tags":['<em style="background-color:yellow">'],
                        "post_tags":["</em>"],
                        "fields":{"content":{"fragment_size": 500}}
                 }}
    query_body["query"]["bool"]["must"].append({"match":{"hotel_id":hotel_id}})
    query_body["query"]["bool"]["should"].append({"match":{"content":query}})
    query_body["query"]["bool"]["should"].append({"match":{"title":query}})
    res = es.search(indexName, body=query_body)
    res = res["hits"]["hits"]
    for i in range(len(res)):
        res[i] = res[i]["_source"]
    return res
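A hypothetical call for illustration; the hotel id and query text are made up, and the title/content fields are assumed from the query body above:

reviews = searchThroughSearchBar(42, "quiet room near the elevator")
for review in reviews:
    # each item is the _source dict of a matching review
    print(review.get("title"), review.get("content", "")[:80])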
Example #24
def searchDocument(id):
    es = Elasticsearch(
        ['https://cdr-es.istresearch.com:9200/memex-qpr-cp4-2'],
        http_auth=('cdr-memex', '5OaYUNBhjO68O7Pn'),
        port=9200,
        use_ssl=True,
        verify_certs = True,
        ca_certs=certifi.where(),
    )
    query_body = {
        "query":{
          "bool": {
              "must": {
                 "match": {
                     "_id":id
                 }
                }
           }
       }
    }
    response = es.search(body=query_body,request_timeout=60)
    document = response["hits"]["hits"]
    if document:
        return document[0]
    else:
        return document
Example #25
def get_event_location(placeid):
    _query = {
            "query": {
                "bool": {
                    "must": [
                            ]
                    }
                }
            }

    _query["query"]["bool"]["must"].append({"match": {"placeid": "%s" % placeid}})

    search_query = json.dumps(_query)
    es = Elasticsearch(hosts = [ES_HOST])
    res = es.search(index="place", size=10, body=search_query)

    resp = []
    for hit in res['hits']['hits']:
        resp.append(hit["_source"])

    lat=-1.0
    lon=-1.0

    if resp and len(resp) >= 1:
        try:
            lat = resp[0].get('location').get('lat')
            lon = resp[0].get('location').get('lon')
        except:
            print "failed to get lat, lon"

    return lat, lon
Example #26
def api_get_all(request, page):
    page = int(page)

    es = Elasticsearch()
    allowed_media_types = [Media.BOOK, Media.AUDIOTALK, Media.VIDEOTALK, Media.PODCAST, Media.MOVIE, Media.MAGAZINE]
    if 'media_types' in request.GET:
        # split the string "BK,AT," at "," and drop empty items
        allowed_media_types = list(filter(None, request.GET['media_types'].split(',')))
    body = {
        "from" : (page - 1) * settings.MAX_ITEM_COUNT,
        "size" : settings.MAX_ITEM_COUNT,
        # "query": {
        #     "match_all": {}
        # }
        "query" : {
            "filtered" : {
                "query" : {
                    "match_all" : {}
                },
                "filter" : {
                        "terms" : { "media_type" : allowed_media_types}
                }
            }
        }
    }
    res = es.search(index='bibliotheca', body=body)
    response_data = prepare_search_response(res, page)


    return HttpResponse(json.dumps(response_data), content_type="application/json")
Example #27
    def report_to_elastic(self, file: FileArchive):
        config = file.config
        fullfilename_ftp = file.to.path.replace("\\\\diskstation", '').replace('\\', '/')

        dict = {
            "ext": file.to.get_extension(),  # 'jpg'
            "volume": "/volume2",
            # "/Camera/Foscam/FI9805W_C4D6553DECE1/snap/MDAlarm_20190201-124005.jpg",
            "path": fullfilename_ftp,
            "@timestamp": file.to.get_timestamp_utc(),  # "2019-02-01T11:40:05.000Z",
            "doc": "event",
            "sensor": config.sensor,
            "position": config.position,
            "camera": config.camera,
            "value": file.to.size(),
            "tags": [
                "synology_cameraarchive",
                "python_camera_archiver"
            ]
        }
        json_data = json.dumps(dict, indent=4, sort_keys=True)
        #print('{}@{}'.format(config.camera, file.to.get_timestamp_utc()), json_data)
        es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
        res = es.index(index="cameraarchive-" + file.to.get_month_id_utc(),
                       doc_type='doc',
                       body=json_data,
                       id='{}@{}'.format(config.camera, file.to.get_timestamp_utc()))
Example #28
def searchThroughTags(hotel_id, only_overall=True, tags=["cleanliness", "service", "value", "location", "sleep_quality", "rooms"]):
    '''This function is used to search reviews by clicking tags. It returns two classes of reviews: one with high scores and one with low scores.

       If the only_overall flag is set to True, the function returns high overall score reviews and low overall score reviews.
       If the flag is set to False, the function returns the two classes of results based on the scores of each field in the tags.
    '''
    es = Elasticsearch()
    indexName = "reviews_es_index"
    doc_type = "review"
    query_body1 = getQueryTemplate(hotel_id)
    query_body2 = getQueryTemplate(hotel_id)        
    if only_overall:
        query_body1["query"]["bool"]["must"].append({"range":{"ratings.overall":{"gt":3.0}}})
        query_body2["query"]["bool"]["must"].append({"range":{"ratings.overall":{"lt":3.0}}})
    else:
        query_body1["query"]["bool"]["must_not"] = {}
        query_body2["query"]["bool"]["must_not"] = {}
        query_body1["query"]["bool"]["must_not"]["range"] = {}
        query_body2["query"]["bool"]["must_not"]["range"] = {}
        for tag in tags:
            query_body1["query"]["bool"]["should"].append({"range":{"ratings."+tag:{"gt":3.0}}})
            query_body1["query"]["bool"]["must_not"]["range"]["ratings."+tag] = {"lt":3.0}
            query_body2["query"]["bool"]["should"].append({"range":{"ratings."+tag:{"lt":3.0}}})
            query_body2["query"]["bool"]["must_not"]["range"]["ratings."+tag] = {"gt":3.0}
    res1 = es.search(indexName, body=query_body1)["hits"]["hits"]
    res2 = es.search(indexName, body=query_body2)["hits"]["hits"]
    for i in range(len(res1)):
        res1[i] = res1[i]["_source"]
    for i in range(len(res2)):
        res2[i] = res2[i]["_source"]
    return (res1, res2)
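For reference, with only_overall=True the first body sent above comes out roughly as sketched below (this assumes getQueryTemplate(hotel_id) returns a bool query whose must list already matches on hotel_id, as the search-bar example earlier suggests):

hotel_id = 42  # illustrative value
query_body1 = {
    "query": {
        "bool": {
            "must": [
                {"match": {"hotel_id": hotel_id}},           # from getQueryTemplate (assumed)
                {"range": {"ratings.overall": {"gt": 3.0}}}  # appended for the high-score class
            ],
            "should": []
        }
    }
}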
Example #29
    def es_count(self, p_host, p_port, p_index, p_query=None):
        """
        Returns the number of documents that match a query
        The result is the response from elastic search. The value is in the "count" field of the response.

        {p_host}   Elasticsearch server\n
        {p_port}   Port of the es server\n
        {p_index}  Name of the index to query\n
        {p_query}  Query to run\n

        | ${res} = | es count | localhost | 9200 | myIndex |  {"query":{"query_string":{"query": "searched value"}}} |

        ${res} contains the number of docs
        """

        # Es client
        try:
            param = [{'host':p_host,'port':int(p_port)}]
            es = Elasticsearch(param)
        except Exception:
            raise AssertionError("Connexion error on %s:%i",p_host,int(p_port))

        try:
            result = es.count(index=p_index, body=p_query)
        except Exception:
            raise AssertionError("Count error on %s:%i/%s for query : %s",p_host,int(p_port),p_index,p_query)

        return result['count']
Example #30
def createIndex():
    """This endpoint should be used to index pages of a Mouchak installation.
    FIXME:
    - Endpoint is only accessible from the index page of search service.
    - Does not support cross origin requests.
    - Better name for the function.

    """
    es = Elasticsearch()
    if not es.indices.exists(urlparse(request.form['url']).netloc):
        url = request.form['url']
        if not request.form['url'].endswith('/'):
            url = request.form['url'] + '/'
        try:
            contents = requests.get(url + "pages").json()
            for content in contents:
                es.index(index=urlparse(request.form['url']).netloc,
                         doc_type="html", body=content, id=content['id'])
            response = make_response()
            response.data = "Website indexed."
            return response
        except:
            response = make_response()
            response.status_code = 204
            return response
    else:
        response = make_response()
        response.status_code = 409
        response.data = {"reason": "Index already exists"}
        return response
Example #31
def main(args):

    # Specify the arguments.
    parser = argparse.ArgumentParser(
        description='''A tool for finding when a job was running through use of the big data store.''')
    
    parser.add_argument( '-a', '--allocationid', metavar='int', dest='allocation_id', type=int, default=-1,
        help='The allocation ID of the job.')
    parser.add_argument( '-j', '--jobid', metavar='int', dest='job_id', type=int, default=-1,
        help='The job ID of the job.')
    parser.add_argument( '-s', '--jobidsecondary', metavar='int', dest='job_id_secondary', type=int, default=0,
        help='The secondary job ID of the job (default : 0).')
    parser.add_argument( '-t', '--target', metavar='hostname:port', dest='target', default=None, 
        help='An Elasticsearch server to be queried. This defaults to the contents of environment variable "CAST_ELASTIC".')
    parser.add_argument( '-H', '--hostnames', metavar='host', dest='hosts', nargs='*', default=None,
        help='A list of hostnames to filter the results to ')

    args = parser.parse_args()

    # If the target wasn't specified check the environment for the target value, printing help on failure.
    if args.target == None:
        if TARGET_ENV in os.environ:
            args.target = os.environ[TARGET_ENV]
        else:
            parser.print_help()
            print("Missing target, '%s' was not set." % TARGET_ENV)
            return 2

    
    # Open a connection to the elastic cluster; if this fails, something is wrong on the server.
    es = Elasticsearch(
        args.target, 
        sniff_on_start=True,
        sniff_on_connection_fail=True,
        sniffer_timeout=60
    )

    # Build the query to get the time range.
    should_query='{{"query":{{"bool":{{ "should":[{0}] {1} }} }} }}'
    match_clause= '{{"match":{{"{0}":{1} }} }}'

    if args.allocation_id > 0 :
        tr_query = should_query.format(
            match_clause.format("data.allocation_id", args.allocation_id), "")
    else : 
        tr_query = should_query.format(
            "{0},{1}".format(
                match_clause.format("data.primary_job_id", args.job_id ),
                match_clause.format("data.secondary_job_id", args.job_id_secondary )), 
            ',"minimum_should_match" : 2' )
            
    # Execute the query on the cast-allocation index.
    tr_res = es.search(
        index="cast-allocation",
        body=tr_query
    )
    total_hits = tr_res["hits"]["total"]

    print("Found {0} matches for specified the job.".format(total_hits))
    if total_hits != 1:
        print("This implementation only supports queries where the hit count is equal to 1.")
        return 3


    # TODO make this code more fault tolerant
    hits= deep_get(tr_res, "hits", "hits")
    if len(hits) > 0 :
        tr_data = deep_get( hits[0], "_source", "data")

        date_format= '%Y-%m-%d %H:%M:%S.%f'
        print_format='%Y-%m-%d.%H:%M:%S:%f'
        search_format='"yyyy-MM-dd HH:mm:ss:SSS"'

        start_time=datetime.strptime(tr_data["begin_time"], '%Y-%m-%d %H:%M:%S.%f')
        start_time='{0}'.format(start_time.strftime(print_format)[:-3])

        # If a history is present end_time is end_time, otherwise it's now.
        if "history" in tr_data:
            end_time=datetime.strptime(tr_data["history"]["end_time"], date_format)
            end_time='{0}'.format(end_time.strftime(print_format)[:-3])
        else:
            end_time="Still Running"
        
        print( "\nAllocation ID: {0}".format(tr_data["allocation_id"]))
        print( "Job ID: {0} - {1}".format(tr_data["primary_job_id"], tr_data["secondary_job_id"]))
        print( "Start Time: {0} \n  End Time: {1}\n".format(start_time, end_time))
Example #32
class GetTbCon(object):
    def __init__(self):
        self.hbase_con = HbaseInfoTask()
        self.redis_con = RedisTools()
        self.es = Elasticsearch(ES_ADDR, timeout=30)

    def es_ping(self):
        if not self.es.ping():
            self.es = Elasticsearch(ES_ADDR, timeout=30)

    def run(self):
        action_list = []
        count = 0
        start = int(time.time())
        cunzai = 0
        while True:
            rowkey = self.redis_con.get_rowkey("tb_con")
            if rowkey == None:
                if len(action_list) > 0:
                    logging.warning("重复存入elasticsearch当中%d条数据" % cunzai)
                    cunzai = 0
                    self.commit(action_list)
                    action_list.clear()
                    start = int(time.time())
                    count = 0
                time.sleep(10)
                continue
            param = None
            if "|||||" in rowkey:
                params = rowkey.split("|||||")[1]
                param = params.split(",")
                rowkey = rowkey.split("|||||")[0]
            # _id = trans_md5(rowkey)
            boo = self.es.exists("tb_con", "sino", rowkey)
            if boo:
                cunzai = cunzai + 1
                map = self.hbase_con.getResultByRowkey("TB_CON_TABLE", rowkey,
                                                       "tb_con", param)
                if not map:
                    continue
                action_list.append({
                    "_op_type": "update",
                    "_index": "tb_con",
                    "_type": "sino",
                    "_id": rowkey,
                    "doc": map,
                })
            else:
                map = self.hbase_con.getResultByRowkey("TB_CON_TABLE", rowkey,
                                                       "tb_con")
                if not map:
                    continue
                action_list.append({
                    "_index": "tb_con",
                    "_type": "sino",
                    "_id": rowkey,
                    "_source": map,
                })
            end = int(time.time())
            count = count + 1
            if count > COUNT_NUM or (end - start) > 30:
                self.es_ping()
                logging.warning("重复存入elasticsearch当中%d条数据" % cunzai)
                cunzai = 0
                if len(action_list) > 0:
                    self.commit(action_list)
                start = int(time.time())
                action_list.clear()
                count = 0

    def commit(self, action_list):
        try:
            helpers.bulk(self.es, action_list)
        except Exception as e:
            log_info = "index:tb_con,\terror:" + str(e)
            logging.error(log_info)
            helpers.bulk(self.es, action_list)
        logging.warning("提交成功:%d条数据" % len(action_list))
Example #33
    def __init__(self, host, cloud_id, login, password, api_key, posts):
        self.df = posts
        self.es = Elasticsearch(host, cloud_id=cloud_id,
                                http_auth=(login, password),
                                api_key=api_key,
                                )
Example #34
from elasticsearch import Elasticsearch
from flask import Flask, jsonify

es = Elasticsearch("192.168.59.129:9200")

app = Flask(__name__)


@app.route('/', methods=['get', 'post'])
def index():
    # 1. list all existing indices
    # indexs = es.indices.get("*")
    # print(indexs)
    # print(indexs.keys())

    # 2. create an index
    body = {
        "settings": {
            "number_of_shards": 3,
            "number_of_replicas": 1
        },
        "mappings": {
            "_doc": {
                'properties': {
                    'tno': {
                        'type': 'keyword'
                    },
                    'tname': {
                        'type': "keyword"
                    },
                    'tsex': {
Example #35
class Test(BaseTest):
    def init(self):
        self.elasticsearch_url = self.get_elasticsearch_url()
        print("Using elasticsearch: {}".format(self.elasticsearch_url))
        self.es = Elasticsearch([self.elasticsearch_url])
        logging.getLogger("urllib3").setLevel(logging.WARNING)
        logging.getLogger("elasticsearch").setLevel(logging.ERROR)

        self.modules_path = os.path.abspath(self.working_dir +
                                            "/../../../../module")

        self.filebeat = os.path.abspath(self.working_dir +
                                        "/../../../../filebeat.test")

        self.index_name = "test-filebeat-modules"

        body = {"transient": {"script.max_compilations_rate": "1000/1m"}}

        self.es.transport.perform_request('PUT',
                                          "/_cluster/settings",
                                          body=body)

    @parameterized.expand(load_fileset_test_cases)
    @unittest.skipIf(
        not INTEGRATION_TESTS,
        "integration tests are disabled, run with INTEGRATION_TESTS=1 to enable them."
    )
    @unittest.skipIf(
        os.getenv("TESTING_ENVIRONMENT") == "2x",
        "integration test not available on 2.x")
    def test_fileset_file(self, module, fileset, test_file):
        self.init()

        # generate a minimal configuration
        cfgfile = os.path.join(self.working_dir, "filebeat.yml")
        self.render_config_template(
            template_name="filebeat_modules",
            output=cfgfile,
            index_name=self.index_name,
            elasticsearch_url=self.elasticsearch_url,
        )

        self.run_on_file(module=module,
                         fileset=fileset,
                         test_file=test_file,
                         cfgfile=cfgfile)

    def run_on_file(self, module, fileset, test_file, cfgfile):
        print("Testing {}/{} on {}".format(module, fileset, test_file))

        try:
            self.es.indices.delete(index=self.index_name)
        except:
            pass
        self.wait_until(lambda: not self.es.indices.exists(self.index_name))

        cmd = [
            self.filebeat,
            "-systemTest",
            "-e",
            "-d",
            "*",
            "-once",
            "-c",
            cfgfile,
            "-E",
            "setup.ilm.enabled=false",
            "-modules={}".format(module),
            "-M",
            "{module}.*.enabled=false".format(module=module),
            "-M",
            "{module}.{fileset}.enabled=true".format(module=module,
                                                     fileset=fileset),
            "-M",
            "{module}.{fileset}.var.input=file".format(module=module,
                                                       fileset=fileset),
            "-M",
            "{module}.{fileset}.var.paths=[{test_file}]".format(
                module=module, fileset=fileset, test_file=test_file),
            "-M",
            "*.*.input.close_eof=true",
        ]

        # Based on the convention that if a name contains -json the json format is needed. Currently used for LS.
        if "-json" in test_file:
            cmd.append("-M")
            cmd.append("{module}.{fileset}.var.format=json".format(
                module=module, fileset=fileset))

        output_path = os.path.join(self.working_dir)
        output = open(os.path.join(output_path, "output.log"), "ab")
        output.write(" ".join(cmd) + "\n")

        local_env = os.environ.copy()
        local_env["TZ"] = 'Etc/UTC'

        subprocess.Popen(cmd,
                         env=local_env,
                         stdin=None,
                         stdout=output,
                         stderr=subprocess.STDOUT,
                         bufsize=0).wait()

        # Make sure index exists
        self.wait_until(lambda: self.es.indices.exists(self.index_name))

        self.es.indices.refresh(index=self.index_name)
        # Loads the first 100 events to be checked
        res = self.es.search(index=self.index_name,
                             body={
                                 "query": {
                                     "match_all": {}
                                 },
                                 "size": 100,
                                 "sort": {
                                     "log.offset": {
                                         "order": "asc"
                                     }
                                 }
                             })
        objects = [o["_source"] for o in res["hits"]["hits"]]
        assert len(objects) > 0
        for obj in objects:
            assert obj["event"][
                "module"] == module, "expected event.module={} but got {}".format(
                    module, obj["event"]["module"])

            assert "error" not in obj, "not error expected but got: {}".format(
                obj)

            if (module == "auditd" and fileset == "log") \
                    or (module == "osquery" and fileset == "result"):
                # There are dynamic fields that are not documented.
                pass
            else:
                self.assert_fields_are_documented(obj)

        self._test_expected_events(test_file, objects)

    def _test_expected_events(self, test_file, objects):

        # Generate expected files if GENERATE env variable is set
        if os.getenv("GENERATE"):
            with open(test_file + "-expected.json", 'w') as f:
                # Flatten and clean up objects
                # This makes sure when generated on different machines / version the expected.json stays the same.
                for k, obj in enumerate(objects):
                    objects[k] = self.flatten_object(obj, {}, "")
                    clean_keys(objects[k])

                json.dump(objects,
                          f,
                          indent=4,
                          separators=(',', ': '),
                          sort_keys=True)

        with open(test_file + "-expected.json", "r") as f:
            expected = json.load(f)

        assert len(expected) == len(
            objects), "expected {} events to compare but got {}".format(
                len(expected), len(objects))

        for ev in expected:
            found = False
            for obj in objects:

                # Flatten objects for easier comparing
                obj = self.flatten_object(obj, {}, "")
                clean_keys(obj)

                if ev == obj:
                    found = True
                    break

            assert found, "The following expected object was not found:\n {}\nSearched in: \n{}".format(
                pretty_json(ev), pretty_json(objects))
Example #36
from elasticsearch import Elasticsearch

es = Elasticsearch('helk-elasticsearch:9200')

doc = {
    "query": {
        "constant_score": {
            "filter": {
                "bool": {
                    "should": [{
                        "match_phrase": {
                            "event_id": "19"
                        }
                    }, {
                        "match_phrase": {
                            "event_id": "20"
                        }
                    }, {
                        "match_phrase": {
                            "event_id": "21"
                        }
                    }]
                }
            }
        }
    }
}

res = es.search(index="logs-endpoint-winevent-*", body=doc)

count = res['hits']['total']['value']
Example #37
import requests
import json
from elasticsearch import Elasticsearch
es = Elasticsearch([{'host': 'elasticsearch', 'port': 9200}])

s = requests.Session()

def write_mongo(data_dict):
  try:
    data_id = es.index(index="wiki", doc_type='wiki', body=data_dict)
    print(data_id)
  except Exception as exception:
    print(exception)  

def streaming():
    # prepare a GET request to the wikimedia recentchange stream API
    req = requests.Request("GET",'https://stream.wikimedia.org/v2/stream/recentchange').prepare()
    
    resp = s.send(req, stream=True)

    for line in resp.iter_lines():
        if line:
            yield str(line, 'utf-8')


def read_stream():

    for line in streaming():
        if line.startswith('data'):
          data_dict = json.loads(line[6:])
          # only show non-bot
Example #38
#from hmmlearn import hmm
from sklearn.externals import joblib

token_list = [
    'Comparison', 'Punctuation', 'Whitespace', 'Keyword', 'IdentifierList',
    'DML', 'Multiline', 'Wildcard', 'Parenthesis', 'Identifier', 'Where',
    'Function', 'Single', 'Operator', 'Integer'
]
log_likelihoods = []
es_host = "127.0.0.1"
es_port = "9200"
logs_index = "logs"
attack_query = []
model = joblib.load("sqli-hmm.pkl")

es = Elasticsearch([{'host': es_host, 'port': es_port}])

sql_log_query = {
    "query": {
        "bool": {
            "must": [{
                "match_all": {}
            }, {
                "range": {
                    "@timestamp": {
                        "lte": "now",
                        "gte": "now-1m"
                    }
                }
            }]
        }
Example #39
        'django.contrib.auth.password_validation.MinimumLengthValidator',
    },
    {
        'NAME':
        'django.contrib.auth.password_validation.CommonPasswordValidator',
    },
    {
        'NAME':
        'django.contrib.auth.password_validation.NumericPasswordValidator',
    },
]

# Internationalization
# https://docs.djangoproject.com/en/1.9/topics/i18n/

LANGUAGE_CODE = 'en-us'

TIME_ZONE = 'UTC'

USE_I18N = True

USE_L10N = True

USE_TZ = True

from elasticsearch import Elasticsearch, RequestsHttpConnection

ES_CLIENT = Elasticsearch(['http://localhost:9200/'],
                          connection_class=RequestsHttpConnection)
ES_AUTOREFRESH = True
Example #40
    GenericTransformer,
)

es_host = os.getenv('CREDENTIALS_ELASTICSEARCH_PROXY_HOST', 'localhost')
neo_host = os.getenv('CREDENTIALS_NEO4J_PROXY_HOST', 'localhost')

es_port = os.getenv('CREDENTIALS_ELASTICSEARCH_PROXY_PORT', 9200)
neo_port = os.getenv('CREDENTIALS_NEO4J_PROXY_PORT', 7687)
if len(sys.argv) > 1:
    es_host = sys.argv[1]
if len(sys.argv) > 2:
    neo_host = sys.argv[2]

es = Elasticsearch([
    {
        'host': es_host,
        'port': es_port
    },
])

Base = declarative_base()

NEO4J_ENDPOINT = f'bolt://{neo_host}:{neo_port}'

neo4j_endpoint = NEO4J_ENDPOINT

neo4j_user = '******'
neo4j_password = '******'

LOGGER = logging.getLogger(__name__)

Example #41
def dashboard_system_bandwidth(runner_config):
    global es
    es_ip = runner_config['dashboard_ip'].split(':')
    es = Elasticsearch([{'host': es_ip[0]}])
    # Create bottlenecks index
    with open(dashboard_dir + 'posca_system_bandwidth_index_pattern.json')\
            as index_pattern:
        doc = json.load(index_pattern)
    res = es.index(index=".kibana",
                   doc_type="index-pattern",
                   id="bottlenecks",
                   body=doc)
    if res['created']:
        LOG.info("bottlenecks index-pattern has been created")
    else:
        LOG.info("bottlenecks index-pattern already existed")

    with open(dashboard_dir + 'posca_system_bandwidth_config.json')\
            as index_config:
        doc = json.load(index_config)
    res = es.index(index=".kibana", doc_type="config", id="4.6.1", body=doc)
    if res['created']:
        LOG.info("bottlenecks config has been created")
    else:
        LOG.info("bottlenecks config already existed")

    # Configure discover panel
    with open(dashboard_dir + 'posca_system_bandwidth_discover.json')\
            as index_discover:
        doc = json.load(index_discover)
    res = es.index(index=".kibana",
                   doc_type="search",
                   id="system_bandwidth",
                   body=doc)
    if res['created']:
        LOG.info("system_bandwidth search has been created")
    else:
        LOG.info("system_bandwidth search already existed")

    # Create testing data in line graph
    with open(dashboard_dir + 'posca_system_bandwidth_line_data.json')\
            as line_data:
        doc = json.load(line_data)
    res = es.index(index=".kibana",
                   doc_type="visualization",
                   id="system_bandwidth_line-date",
                   body=doc)
    if res['created']:
        LOG.info("system_bandwidth_line-date visualization has been created")
    else:
        LOG.info("system_bandwidth_line-date visualization already existed")

    # Create comparison results in line chart
    with open(dashboard_dir + 'posca_system_bandwidth_line_char.json')\
            as line_char:
        doc = json.load(line_char)
    res = es.index(index=".kibana",
                   doc_type="visualization",
                   id="system_bandwidth_line-char",
                   body=doc)
    if res['created']:
        LOG.info("system_bandwidth_line-char visualization has been created")
    else:
        LOG.info("system_bandwidth_line-char visualization already existed")

    # Create local cpu results in line chart
    with open(dashboard_dir + 'posca_system_bandwidth_local_cpu.json')\
            as line_cpu:
        doc = json.load(line_cpu)
    res = es.index(index=".kibana",
                   doc_type="visualization",
                   id="system_bandwidth_local_cpu",
                   body=doc)
    if res['created']:
        LOG.info("system_bandwidth_local_cpu visualization has been created")
    else:
        LOG.info("system_bandwidth_local_cpu visualization already existed")

    # Create monitoring data in table
    with open(dashboard_dir + 'posca_system_bandwidth_terms_data.json')\
            as terms_char:
        doc = json.load(terms_char)
    res = es.index(index=".kibana",
                   doc_type="visualization",
                   id="system_bandwidth_terms_data",
                   body=doc)
    if res['created']:
        LOG.info("system_bandwidth_terms_data visualization has been created")
    else:
        LOG.info("system_bandwidth_terms_data visualization already existed")

    # Create dashboard
    with open(dashboard_dir + 'posca_system_bandwidth_dashboard.json')\
            as dashboard:
        doc = json.load(dashboard)
    res = es.index(index=".kibana",
                   doc_type="dashboard",
                   id="system_bandwidth_dashboard",
                   body=doc)
    if res['created']:
        LOG.info("system_bandwidth dashboard has been created")
    else:
        LOG.info("system_bandwidth dashboard already existed")
Example #42
File: es.py Project: SwoJa/ruman
#!/usr/bin/env python
#coding: utf-8
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
from elasticsearch import Elasticsearch
from time_utils import *
import pymysql as mysql
import pymysql.cursors
import time

from config import *
from db import get_stock

es214 = Elasticsearch([{'host':ES_HOST,'port':ES_PORT}])
es216 = Elasticsearch([{'host': ES_HOST_WEB0, 'port': ES_PORT_WEB0}])

def defaultDatabase():
	conn = mysql.connect(host=SQL_HOST,user=SQL_USER,password=SQL_PASSWD,db=DEFAULT_DB,charset=SQL_CHARSET,cursorclass=pymysql.cursors.DictCursor)
	conn.autocommit(True)
	cur = conn.cursor()
	return cur

def get_stock(id):
	cur = defaultDatabase()
	stocksql = "SELECT * FROM %s WHERE %s = '%s'" %(TABLE_DAY,DAY_ID,id)
	cur.execute(stocksql)
	thing = cur.fetchone()
	dic = {DAY_STOCK_ID:thing[DAY_STOCK_ID],DAY_START_DATE:thing[DAY_START_DATE],DAY_END_DATE:thing[DAY_END_DATE],DAY_INDUSTRY_CODE:thing[DAY_INDUSTRY_CODE]}
	return dic
Example #43
#Refer to README.md file for a detailed code and execution steps
import elasticsearch
import eland as ed
from elasticsearch import Elasticsearch

es = Elasticsearch(['host_server_name'],
                   http_auth=('YOUR_USERNAME', 'YOUR_PASSWORD'),
                   scheme="https",
                   port=443)

#The following syntax is used to create an index in your Elasticsearch cluster
es.indices.create(index="My_First_Index", ignore=400)

#Check or Fetch the created index
df = ed.DataFrame(es, es_index_pattern="mydata")
df

#Deleting the index
es.indices.delete(index="My_First_Index", ignore=400)
Example #44
# -*- coding: utf-8 -*-
from __future__ import (absolute_import, division, print_function, unicode_literals)

from elasticsearch import Elasticsearch, RequestsHttpConnection

es_client = Elasticsearch(
    hosts=['localhost:9200/'],
    connection_class=RequestsHttpConnection
)
Example #45
0
        "elementkey": {"type": "integer"},
        "transactionyear": {"type": "year"},
        "transactionmonth": {"type": "month"},
        "vendor": {"type": "string"}
    }

ES_HOST = {"host": "localhost", "port": 9200}
INDEX_NAME = "sdotparking"
TYPE_NAME = "transaction"
ID_FIELD = "dataid"

es_cred_file = open("/home/chase/.escreds", 'r')
user = es_cred_file.readline().strip()
pswd = es_cred_file.readline().strip()

es = Elasticsearch(hosts = [ES_HOST], http_auth=(user, pswd))


datapath = "/home/chase/projects/sdot_data/data/parking_data"
fname = sys.argv[1]

with open(datapath + "/" + fname, 'r') as f:
    header = f.readline().strip().split(",")
    header = [ token.lower() for token in header ]
    data = [ token.strip().split(",") for token in f.readlines() ]

bulk_data = []
for row in data:
    data_dict = {}
    for i in range(len(row)):
        try:
Example #46
0
#
# All rights reserved. This program and the accompanying materials
# are made available under the terms of the Apache License, Version 2.0
# which accompanies this distribution, and is available at
# http://www.apache.org/licenses/LICENSE-2.0
##############################################################################
import ConfigParser
from elasticsearch import Elasticsearch
import json
import os
import utils.logger as log
from utils.parser import Parser as conf_parser

LOG = log.Logger(__name__).getLogger()
config = ConfigParser.ConfigParser()
es = Elasticsearch()
dashboard_path = os.path.join(conf_parser.test_dir, "posca",
                              "testcase_dashboard")
dashboard_dir = dashboard_path + "/"


def dashboard_send_data(runner_config, test_data):
    global es
    es_ip = runner_config['dashboard_ip'].split(':')
    es = Elasticsearch([{'host': es_ip[0]}])
    res = es.index(index="bottlenecks",
                   doc_type=test_data["testcase"],
                   body=test_data["data_body"])
    if not res['created']:
        LOG.error("data sent to kibana has errors: %s", test_data["data_body"])
#encoding:utf-8
from datetime import datetime
from image_signature import generate_signature
from collections import Counter
from elasticsearch import Elasticsearch

es_index = "facerecognition"
es = Elasticsearch("0.0.0.0", port=9200)


class SignatureES(object):
    """Elasticsearch driver for image-match

    """
    size = 5

    def __init__(self,
                 es,
                 index='face',
                 doc_type='face',
                 timeout='10s',
                 size=size,
                 distance_low=0.5,
                 distance_high=0.8):
        """Extra setup for Elasticsearch

        Args:
            es (elasticsearch): an instance of the elasticsearch python driver
            index (Optional[string]): a name for the Elasticsearch index (default 'images')
            doc_type (Optional[string]): a name for the document time (default 'image')
            timeout (Optional[int]): how long to wait on an Elasticsearch query, in seconds (default 10)
Example #48
0
#!/usr/bin/env python
import json
from elasticsearch import Elasticsearch
es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

f = open("masscan.json")

for line in f:
    try:
        print("inserting " + json.loads(line[:-2])['ip'])
    except:
        print("whoops")
        continue
    # ip is unique for "hosts", and "services" indexes all ports
    es.index(index='masscan_hosts',
             doc_type="_doc",
             id=json.loads(line[:-2])['ip'],
             body=line[:-2])
    es.index(index='masscan_services', doc_type="_doc", body=line[:-2])
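# A hedged alternative sketch: the same masscan records can be sent in one
# request with the bulk helper instead of two es.index() calls per line
# (bulk_insert is an assumed helper name; es and json come from the code above).
from elasticsearch import helpers

def bulk_insert(path="masscan.json"):
    actions = []
    with open(path) as fh:
        for raw in fh:
            try:
                doc = json.loads(raw[:-2])   # strip the trailing ",\n" as above
            except ValueError:
                continue
            if 'ip' not in doc:
                continue
            actions.append({"_index": "masscan_hosts", "_type": "_doc",
                            "_id": doc['ip'], "_source": doc})
            actions.append({"_index": "masscan_services", "_type": "_doc",
                            "_source": doc})
    helpers.bulk(es, actions)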
Example #49
0
import os
import random
import textwrap

from databuilder.extractor.neo4j_dashboard_search_data_extractor import Neo4jDashboardSearchDataExtractor
from databuilder.job.job import DefaultJob
from databuilder.loader.file_system_elasticsearch_json_loader import FSElasticsearchJSONLoader
from databuilder.extractor.neo4j_extractor import Neo4jExtractor
from databuilder.publisher.elasticsearch_publisher import ElasticsearchPublisher
from databuilder.task.task import DefaultTask
from elasticsearch import Elasticsearch

# set env ES_HOST to override localhost
es = Elasticsearch([
    {
        'host': os.getenv('ES_HOST', 'localhost')
    },
])

# set env NEO4J_HOST to override localhost
NEO4J_ENDPOINT = 'bolt://{}:7687'.format(os.getenv('NEO4J_HOST', 'localhost'))
neo4j_endpoint = NEO4J_ENDPOINT

neo4j_user = '******'
neo4j_password = '******'

DASHBOARD_ES_MAP = textwrap.dedent("""
    {
    "mappings":{
        "dashboard":{
          "properties": {
#Assumed imports for this truncated snippet
from flask import Flask, render_template
from transformers import pipeline
from elasticsearch import Elasticsearch

#Init Albert pipeline
qa_pipeline = pipeline('question-answering',
                       model="ktrapeznikov/albert-xlarge-v2-squad-v2",
                       tokenizer="albert-xlarge-v2",
                       device=0)
question = "What is the capital of the Netherlands?"
context = r"The four largest cities in the Netherlands are Amsterdam, Rotterdam, The Hague and Utrecht.[17] Amsterdam is the country's most populous city and nominal capital,[18] while The Hague holds the seat of the States General, Cabinet and Supreme Court.[19] The Port of Rotterdam is the busiest seaport in Europe, and the busiest in any country outside East Asia and Southeast Asia, behind only China and Singapore."

#initQA = qa_pipeline(question=question, context=context)

#print(initQA)

# Grab Elasticsearch instance
config = {'host': 'mc.ocbe.de', 'port': 9200}
es = Elasticsearch([config])

# test connection
es.ping()

app = Flask(__name__)
workingDir = 'C:\\Users\\Chris\\Documents\\GitHub\\ElasticAlbertFrontend'
ftpFolder = 'C:\\Users\\Chris\\Documents\\GitHub\\ElasticAlbertFrontend\\uploads'


@app.route("/")
def serve_app():
    return render_template('index.html')


@app.route('/upload', methods=['GET', 'POST'])
Example #51
0
          'wind_degrees',
          'wind_gust_kph',
          'wind_gust_mph',
          'wind_kph',
          'wind_mph',
]

for entry in floats:
  try:
    esObject['_source']['current_observation'][entry] = float(esObject['_source']['current_observation'][entry])
  except:
    pass

try:
  esObject['_source']['current_observation']['observation_location']['longitude'] = float(esObject['_source']['current_observation']['observation_location']['longitude'])
  esObject['_source']['current_observation']['observation_location']['latitude']  = float(esObject['_source']['current_observation']['observation_location']['latitude'])
except:
  pass

#making a bulkObject list so it will be easier to do multiple cities later on
bulkObject = []
bulkObject.append(esObject)

es = Elasticsearch([esHost], sniff_on_start=True)
es.indices.create(index=esIndex, body=esIndexSettings, ignore=400)

if len(bulkObject) > 0:
  helpers.bulk(es, bulkObject)

exit()
Example #52
0
class ElasticRetrieval(BaseRetrieval):
    """
        Interfaces with the Elasticsearch API
    """
    def __init__(self,
                 index_name,
                 method,
                 logger=None,
                 use_default_similarity=True,
                 max_results=None,
                 es_instance=None,
                 save_terms=False,
                 multi_match_type=None):
        self.index_name = index_name
        if es_instance:
            self.es = es_instance
        else:
            if cp.Corpus.__class__.__name__ == "ElasticCorpus":
                self.es = cp.Corpus.es
            else:
                self.es = Elasticsearch(timeout=QUERY_TIMEOUT)

        if not cp.Corpus.isIndexOpen(self.index_name):
            try:
                self.es.indices.open(self.index_name)
                time.sleep(10)
            except TransportError as e:
                print(e)

        if max_results:
            self.max_results = max_results
        else:
            self.max_results = MAX_RESULTS_RECALL

        self.method = method  # never used!
        self.logger = logger
        self.last_query = {}
        self.save_terms = save_terms
        self.default_field = "text"
        self.tie_breaker = 0
        if not multi_match_type:
            self.multi_match_type = "best_fields"
        else:
            self.multi_match_type = multi_match_type

    def rewriteQueryAsDSL1(self, structured_query, parameters):
        """
            Creates a multi_match DSL query for elasticsearch.

            :param structured_query: a StructuredQuery dict, optionally under the
                key "structured_query"
            :param parameters: dict of [field]=weight to replace in the query
        """
        if "structured_query" in structured_query:
            structured_query = structured_query["structured_query"]

        if not isinstance(structured_query, StructuredQuery):
            structured_query = StructuredQuery(structured_query)

        if not structured_query or len(structured_query) == 0:
            return None

        self.last_query = structured_query

        lucene_query = ""

        for token in structured_query:
            # TODO proper computing of the boost formula. Different methods?
            t_boost = token.boost
            t_count = token.count

            if t_boost is None:
                print("NULL! ")
                print(token, token.boost, token.count)
                t_boost = 0
            if t_count is None:
                print("NULL! ")
                print(token, token.boost, token.count)
                t_count = 0

            boost = t_boost * t_count

            if boost == 0.0:
                continue

            bool_val = token.bool or ""

            token_text = token.token
            if " " in token_text:  # if token is a phrase
                token_text = "\"" + token_text + "\""

            lucene_query += "%s%s " % (bool_val, token_text)
            ##            if boost != 1:
            ##                lucene_query+="^%s" %str(boost)

            if boost != 1:
                token_str = token_text + " "
                lucene_query += bool_val + (token_str * int(boost - 1))

            lucene_query = lucene_query.strip()
            lucene_query += " "

        lucene_query = lucene_query.replace("  ", " ")

        fields = []
        for param in parameters:
            fields.append(param + "^" + str(parameters[param]))

        dsl_query = {
            "multi_match": {
                "query": lucene_query,
                "type": self.multi_match_type,
                "fields": fields,
                "operator": "or",
            }
        }

        ##        print(dsl_query)

        if self.tie_breaker:
            dsl_query["multi_match"]["tie_breaker"] = self.tie_breaker

        return dsl_query

    def rewriteQueryAsDSL2(self, structured_query, parameters):
        """
            Creates a multi_match DSL query for elasticsearch. Version 2

            :param structured_query: a StructuredQuery dict, optionally under the
                key "structured_query"
            :param parameters: dict of [field]=weight to replace in the query
        """
        if "structured_query" in structured_query:
            structured_query = structured_query["structured_query"]

        if not isinstance(structured_query, StructuredQuery):
            structured_query = StructuredQuery(structured_query)

        if not structured_query or len(structured_query) == 0:
            return None

        self.last_query = structured_query

        lucene_query = ""

        for token in structured_query:
            boost = token.boost * token.count
            bool_val = token.bool or ""

            token_text = token.token
            if " " in token_text:  # if token is a phrase
                token_text = "\"" + token_text + "\""

            lucene_query += "%s%s " % (bool_val, token_text)

            if boost != 1:
                token_str = token_text + " "
                lucene_query += bool_val + (token_str * int(boost - 1))

            lucene_query = lucene_query.strip()
            lucene_query += " "

        elastic_query = {"bool": {"should": []}}

        fields = []
        for param in parameters:
            fields.append(param + "^" + str(parameters[param]))

        dsl_query = {
            "multi_match": {
                "query": lucene_query,
                "type": self.multi_match_type,
                "fields": fields,
                "operator": "or",
            }
        }

        ##        print(dsl_query)

        if self.tie_breaker:
            dsl_query["multi_match"]["tie_breaker"] = self.tie_breaker

        return dsl_query

    def rewriteQueryAsDSL(self, structured_query, parameters):
        """
            Creates a DSL query for elasticsearch. Version 3, uses individual "term" and "match" queries

            :param structured_query: a StructuredQuery dict, optionally under the
                key "structured_query"
            :param parameters: dict of [field]=weight to replace in the query
        """
        if isinstance(structured_query,
                      dict) and "structured_query" in structured_query:
            structured_query = structured_query["structured_query"]

        if not isinstance(structured_query, StructuredQuery):
            structured_query = StructuredQuery(structured_query)

        if not structured_query or len(structured_query) == 0:
            return None

        self.last_query = structured_query

        field_dicts = []

        for token in structured_query:
            # TODO proper computing of the boost formula. Different methods?
            boost = token.boost * token.count
            bool_val = token.bool or ""

            token_text = token.token
            # if " " in token_text:  # if token is a phrase
            #     token_text = "\"" + token_text + "\""

            if boost == 0.0:
                continue

            for field in parameters:
                if " " in token_text:
                    new_dict = {
                        "match_phrase": {
                            field: {
                                "query": token_text,
                                "boost": parameters[field] * boost
                            },
                        }
                    }

                else:
                    new_dict = {
                        "term": {
                            field: {
                                "value": token_text,
                                "boost": parameters[field] * boost
                            },
                        }
                    }

                field_dicts.append(new_dict)

        fields = []
        for param in parameters:
            fields.append(param + "^" + str(parameters[param]))

        dsl_query = {"bool": {"should": field_dicts}}

        return dsl_query
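    # Illustrative example (not part of the original code) of what this method
    # returns: with parameters={"text": 2} and a single non-phrase token
    # ("ranking", boost=1, count=1), the resulting dsl_query would be
    #     {"bool": {"should": [
    #         {"term": {"text": {"value": "ranking", "boost": 2}}}
    #     ]}}
    # A phrase token such as "learning to rank" would instead produce a
    # "match_phrase" clause with the same boost arithmetic.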

    def runQuery(self, structured_query, max_results=None):
        """
            Interfaces with the elasticsearch query API
        """
        if not structured_query or len(structured_query) == 0:
            return []

        if not max_results:
            max_results = self.max_results

        self.last_query = dict(structured_query)
        dsl_query = self.rewriteQueryAsDSL(
            structured_query["structured_query"], {self.default_field: 1})

        res = self.es.search(
            body={"query": dsl_query},
            size=max_results,
            index=self.index_name,
            doc_type=ES_TYPE_DOC,
            request_timeout=QUERY_TIMEOUT,
        )

        structured_query["dsl_query"] = dsl_query
        hits = res["hits"]["hits"]
        ##        print("Found %d document(s) that matched query '%s':" % (res['hits']['total'], query))

        ##        if len(hits.scoreDocs) ==0:
        ##            print "Original query:",original_query
        ##            print "Query:", query
        result = []
        for hit in hits:
            metadata = hit["_source"]["metadata"]
            result.append((hit["_score"], metadata))
        return result

    def formulaFromExplanation(self, query, doc_id):
        """
            Runs .explain() for one query/doc pair, generates and returns a \
            StoredFormula instance from it

            :param query: StructuredQuery dict, with a "dsl_query" key
            :param doc_id: id of document to run .explain() for
            :returns:
        """
        explanation = None
        retries = 0
        while retries < 1:
            try:
                explanation = self.es.explain(
                    index=self.index_name,
                    doc_type=ES_TYPE_DOC,
                    body={"query": query["dsl_query"]},
                    id=doc_id,
                    request_timeout=QUERY_TIMEOUT,
                )
                break
            except Exception as e:
                ##                logging.error("Exception, retrying...")
                retries += 1

        if retries > 0:
            if retries == 1:
                logging.error(
                    "Retried {} times, failed to retrieve.".format(retries +
                                                                   1))
            else:
                logging.warning("Retried %d times, retrieved successfuly." %
                                (retries + 1))

        formula = StoredFormula()
        if explanation:
            formula.fromElasticExplanation(explanation, self.save_terms)
        return formula
# Assumed imports for this truncated snippet (json, boto3 and the elasticsearch
# client are all used below)
import json
import boto3
from elasticsearch import Elasticsearch, RequestsHttpConnection
from requests_aws4auth import AWS4Auth
from watson_developer_cloud import NaturalLanguageUnderstandingV1
from watson_developer_cloud.natural_language_understanding_v1 import Features, SentimentOptions

print('TweetProcessing Lambda Function Running')

cred = boto3.session.Session().get_credentials()

host = ''

awsauth = AWS4Auth(cred.access_key, cred.secret_key, 'us-east-2', 'es', session_token=cred.token)

es = Elasticsearch(
	hosts=[{'host': host, 'port': 443}],
	http_auth=awsauth,
	use_ssl=True,
	verify_certs=True,
	connection_class=RequestsHttpConnection
)

natural_language_understanding = NaturalLanguageUnderstandingV1(
	version='2017-02-27',
	username='******',
	password='******')

def lambda_handler(event, context):
	message = event['Records'][0]['Sns']['Message']
	tweets = json.loads(message)
	try:
		es_count = es.count(index="tweet-index")['count']
	except:
#!/usr/bin/env python

import re
import os
from urllib.parse import urlparse
from urllib.parse import urljoin
import hashlib
import json
import sys

from elasticsearch import Elasticsearch

es = Elasticsearch()

title_re = re.compile(r'([=*]{3,})\n([^\n]+)\n\1\n')


def find_title(rst):
    matches = title_re.findall(rst)
    if not matches:
        return ''
    else:
        return matches[0][1]


def walk_documentation(path='.', base_url=''):
    path = os.path.realpath(path)
    for dirpath, dirnames, filenames in os.walk(path):
        for filename in filenames:
            if filename.endswith('.txt'):
                filepath = os.path.join(dirpath, filename)
Example #55
0
    def __init__(self, host, port):
        url = "%s:%s" % (host, port)
        try:
            self.client = Elasticsearch([url], send_get_body_as="POST")
        except:
            logger.error('elasticsearch cannot connect')
while os.access(lock_file,os.F_OK):
#    logging.write("Waiting a second ...\n")
    time.sleep(1)

# __main__

# We need to have two command-line args: 
# sys.argv[1]: The node name or "cluster"
# sys.argv[2]: The "key" (status, filter_size_in_bytes, etc)

if len(sys.argv) < 3:
    zbx_fail()

# Try to establish a connection to elasticsearch
try:
    conn = Elasticsearch('localhost:9200', sniff_on_start=False)
except Exception, e:
    zbx_fail()

if sys.argv[1] == 'cluster' and sys.argv[2] in clusterkeys_direct:
    nodestats = None
#    now=time.strftime("%Y%m%d-%H:%M:%S")
    if use_cache(clustercache_file):
#	logging.write(str(now) + ": Using cluster cache\n")
        nodestats = shelve.open(clustercache_file)
        nodestats = nodestats['stats']
    else:
#	logging.write(str(now) + ": Generate lockfile and cluster cache\n")
        lock=open (lock_file, "w")
        try:
            nodestats = conn.cluster.stats()
Example #57
0
# coding:utf-8

import datetime
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search

from .example1 import Article

client = Elasticsearch({"host": "localhost", "port": 9200})
s = Search(using=client)


def add_article(id_, title, body, tags):
    now = datetime.datetime.utcnow()
    article = Article(meta={'id': id_}, title=title, tags=tags)
    article.body = body
    article.published_from = now
    article.created_at = now
    article.save()
    return article


def init_test_data():
    add_article(2, 'Python is good!', 'Python is good!', ['python'])
    add_article(3, 'Elasticsearch',
                'Distributed, open source search and analytics engine',
                ['elasticsearch'])
    add_article(4, 'Python very quickly', 'Python very quickly', ['python'])
    add_article(5, 'Django', 'Python Web framework', ['python', 'django'])
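# A hedged follow-up sketch (find_python_articles is an assumed helper name):
# the Search object defined above can query the articles saved by init_test_data().
def find_python_articles():
    response = s.query("match", tags="python").execute()
    for hit in response:
        print(hit.meta.id, hit.title)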
Example #58
0
# Assumed imports for this truncated snippet (os, sys and logging are used below)
import os
import sys
import logging

from elasticsearch import Elasticsearch, helpers

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from server.config import ConfigClass

logging.basicConfig(
    level=logging.INFO,
    filename=ConfigClass.tag_indexer_logfile,
    format='%(asctime)s - %(levelname)s - %(module)s %(funcName)s : %(message)s'
)

NEXT_FILE = "next_file"

es = Elasticsearch(ConfigClass.es_nodes)


def save_next(next, apikey):
    try:
        with open(NEXT_FILE + apikey, 'w') as f:
            f.write(next)
    except Exception as e:
        logging.error("Unable to write to disk: {0}".format(e))


def get_next(apikey):
    ''' Retrieves the 'next' variable received from VT on the last run'''
    lastrun = None
    if os.path.exists(NEXT_FILE):
        with open(NEXT_FILE) as f:
Example #59
0
from elasticsearch import Elasticsearch
from pprint import pprint
import sys
import numpy as np

es = Elasticsearch('http://172.27.125.139:9200/',
                   timeout=10,
                   retry_on_timeout=True,
                   max_retries=1)

doc = es.get_source(index="state_bills",
                    id='az_49th-3rd-special_SB1010',
                    doc_type="_all")
print(len(doc['bill_document_last']))
sys.exit()

with open('bill_ids.txt') as infile:
    ids = [x.strip('\n') for x in infile]
#
#o = np.zeros((len(ids)))

#for i, id_ in enumerate(ids):
#    doc = None
#    s = 'failed'
#    doc = es.get_source(index="state_bills", id=id_, doc_type="_all")
#    if doc is not None:
#        o[i] = 1
#        s = 'worked'
#
#    print('{}: {}, {}'.format(s, i, id_))
#
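# A hedged sketch of the commented-out per-id loop above as a single batched
# mget call (same "state_bills" index as the get_source call; the batch size
# of 100 is arbitrary). Left commented out like the original, since sys.exit()
# above prevents this section from running.
#for start in range(0, len(ids), 100):
#    batch = ids[start:start + 100]
#    res = es.mget(index="state_bills", body={"ids": batch})
#    for item in res["docs"]:
#        status = 'worked' if item.get('found') else 'failed'
#        print('{}: {}'.format(status, item['_id']))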
from elasticsearch import Elasticsearch
from training.train_d2v import TrainDoc2Vec
from training.training_prefix import makeTrainingPrefix
import os

es_url = os.environ.get('AC_SIM_ES_URL', 'localhost:9200')
es = Elasticsearch(es_url)


class TrainingManager:
    def __init__(self, indexName, docType, object):
        self.indexName = indexName
        self.docType = docType
        self.object = object
        if (object.get("domain_id") != None):
            self.searchTerms = {"domain_id": int(object["domain_id"])}
            self.collectionIndexName = "domains_" + object["cluster_id"]
            self.colletionIndexDocType = "domain"
            self.collectionIndexSearchId = int(object["domain_id"])
        elif (object.get("community_id") != None):
            self.searchTerms = {"community_id": int(object["community_id"])}
            self.collectionIndexName = "communities_" + object["cluster_id"]
            self.colletionIndexDocType = "community"
            self.collectionIndexSearchId = int(object["community_id"])
        elif (object.get("group_id")):
            self.searchTerms = {"group_id": int(object["group_id"])}
            self.collectionIndexName = "groups_" + object["cluster_id"]
            self.colletionIndexDocType = "group"
            self.collectionIndexSearchId = int(object["group_id"])
        elif (object.get("post_id")):