def createFeatureMatrix(key,filemodel,tset):
	es = Elasticsearch()
	docmap = {}
	wordcount = -1

	for word in spamwords:
		#print word.rstrip()
		wordcount = wordcount+1
		#doc = {\"fields\": [\"_id\"], \"query\": {\"bool\": {\"must\": [{\"match_phrase\":{\"text\": \"viagra\" }},{\"match\": {\"split\": \"train\"}}]}}}
		if key=="train":
			res = es.search(index="email_index",doc_type="document",body={"fields": ["_id"], "query": {"bool": {"must": [{"match":{"text": word }},{"match": {"split": "train"}}]}}, "size" : 75000})
		else:
			res = es.search(index="email_index",doc_type="document",body={"fields": ["_id"], "query": {"bool": {"must": [{"match":{"text": word }},{"match": {"split": "test"}}]}}, "size" : 75000})
		total = res['hits']['total']
		if total!=0:
			for hit in res['hits']['hits']:
				docid = int(hit['_id'])
				tf = 1
				if docid in docmap:
					print docid
					fmap = docmap[docid]
				else:
					fmap = {}
				fmap[wordcount]=float(tf)
				docmap[int(docid)]=fmap
	createFile(docmap,wordcount,filemodel,tset)
Example #2
    def autoComplete(self, query, key, myindex, mysize):
        self.index = myindex
        values = []

        es = Elasticsearch(hosts = [{"host": self.host, "port": self.port}])

        query = '.*' + query + '.*'

        # UPPER case
        res = es.search(index = myindex, size = mysize, body = {"query": {"regexp": {key: query.upper()}}})
        for doc in res['hits']['hits']:
            values.append(doc['_source'][key])

        # lower case
        res = es.search(index = myindex, size = mysize, body = {"query": {"regexp": {key: query.lower()}}})
        for doc in res['hits']['hits']:
            values.append(doc['_source'][key])

        # Title case
        res = es.search(index = myindex, size = mysize, body = {"query": {"regexp": {key: query.title()}}})
        for doc in res['hits']['hits']:
            values.append(doc['_source'][key])

        values = sorted(set(values))    # Remove duplicates and sort

        return values
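
A short usage sketch for autoComplete; it assumes the enclosing class has already set self.host and self.port, and the "people" index and "name" field are hypothetical.

searcher = MySearchClass()  # hypothetical instance of the enclosing class
suggestions = searcher.autoComplete('joh', 'name', 'people', 10)
print(suggestions)  # sorted, de-duplicated matches across the UPPER/lower/Title variants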
Example #3
def get_all_vars(var_list):
    es = Elasticsearch('https://*****:*****@fbc3032a2a91be69517a70b3d75f4eaa.us-east-1.aws.found.io:9243')
    q = 'viglink'
    qq = {
        'from': 0,
        'size': 10000,
        'query': {
            'query_string': {'query': q}
        }
        , "_source": var_list
    }
    r = es.search(body=qq, scroll='1m', index='products', doc_type='product')
    total = r['hits']['total']
    batches = int(math.ceil(total * 1.0 / qq['size']))
    h = []
    for i in range(0, batches):
        if i == 0:
            r = es.search(body=qq, scroll='1m', index='products', doc_type='product')
            sid = r['_scroll_id']
            hits = r['hits']['hits']
            h.extend(hits)
            print(len(hits))
        else:
            r = es.scroll(scroll_id=sid, scroll='1m')
            sid = r['_scroll_id']
            hits = r['hits']['hits']
            h.extend(hits)
            print(len(hits))
        print("batch: " + str(i))
    hvar = {}
    df = pandas.DataFrame()
    for var in var_list:
        hvar[var] = [x['_source'][var] for x in h]
    df = pandas.DataFrame(hvar)
    return df
Example #4
def search(request):
	if request.method == 'POST':
		data = request.POST
		if not data:
			return _error_response(request, "Failed.  No query received")
		query = data['query']
		es = Elasticsearch(['es'])
		result = es.search(index='listing_index', body={'query': {'query_string': {'query': query}}})

		courses_data = result['hits']['hits']
		courses_list = []
		for c in courses_data:
			course = {}
			course['name'] = c['_source']['name']
			course['pk'] = c['_source']['pk']
			course['description'] = c['_source']['description']
			courses_list.append(course)
		#return a list dictionary (each dictionary is a course)
		return JsonResponse(courses_list, safe=False)
	else:
		es = Elasticsearch(['es'])
		result = es.search(index='listing_index', body={'query': {'query_string': {'query': 'calculus'}}, 'size': 10})
		courses_data = result['hits']['hits']
		courses_list = []
		for c in courses_data:
			course = {}
			course['name'] = c['_source']['name']
			course['pk'] = c['_source']['pk']
			course['description'] = c['_source']['description']
			courses_list.append(course)
		return JsonResponse(result, safe=False)

		return JsonResponse({'work': True, 'resp': courses_list}, safe=False)
class ElasticSearchManager(object):

	def __init__(self, index=None, doc_type=None, *args, **kwargs):
		self.index = index
		self.doc_type = doc_type
		self.obj_es = Elasticsearch()

	def search(self, query = None, *args, **kwargs):
		data = self.obj_es.search(index=self.index, doc_type=self.doc_type, body={"query":{"match":query}})
		return fetch_source(data['hits']['hits'])

	def get(self, *args, **kwargs):
		data=self.obj_es.get(index=self.index, doc_type=self.doc_type, id=kwargs['id'])
		return data['_source']

	def get_list(self, *args, **kwargs):
		data = self.obj_es.search(index=self.index, body={"query": {"match_all": {}}})
		return fetch_source(data['hits']['hits'])

	def insert(self, data = None):
		data = json.loads(data)
		data['user_name'] = data['user']['screen_name']
		del data['user']
		del data['entities']
		res = self.obj_es.index(index=self.index, doc_type=self.doc_type, id=data['id'], body=data)
		logger.info("Getting stream:{0}".format(res))

	def delete(self, data = None):
		pass

	def update(self, data = None):
		pass
def query_elastic(string):
    es = Elasticsearch()
    res = es.search(index="documents_analyzed", doc_type="articles", body={"query": {"match": {"_all": string}}})
    tamano=res['hits']['total']
    res = es.search(index="documents_analyzed", doc_type="articles", body={"size" : tamano,"query": {"match": {"_all": string}},"sort": { "date": { "order": "desc" }}})
    res['hits']['hits']  # this is a JSON list with the documents from 0 to n
    return res['hits']['hits']
Example #7
class ScoreMerge(object):

    def __init__(self):
        self.es = Elasticsearch()
        self.count = 0
        self.category = open("/home/eunsoo/Downloads/tutorial/tutorial/category.txt", "r").read().split()

    def scoreMerge(self):
        self.es.indices.delete(index='merge', ignore=[400, 404])
        for cat in self.category:
            self.count = self.count + 1
            acc_score = 0.0
            post_count = 0
            search_results = self.es.search(index="scoretest", doc_type='categorized', body={"query": { "match": {"category": cat}}})
            if('hits' in search_results):
                #print search_results['hits']['hits']
                for search_result in search_results['hits']['hits']:
                    acc_score = acc_score + search_result['_source']['score']
                    post_count = post_count + 1

                self.es.index(index="mergetest", doc_type="merge", id = cat,
                              body={"category": cat, "acc_score": acc_score, "post_count" : post_count })
                print cat+"successfully merged"

    def displaySortedCategory(self):
        search_results = self.es.search(index="mergetest", doc_type='merge',
                                        body={"sort": {"acc_score": {"order": "desc"}}})

        print search_results
        #print "category / acc_likes / acc_comments_count / acc_score / post_count"
        for search_result in search_results['hits']['hits']:
            print ("%s/%10f/%10f" %(search_result['_source']['category'],search_result['_source']['acc_score'],search_result['_source']['post_count']))
Example #8
File: scrap.py Project: yz-/ut
class ElasticCom(object):

    def __init__(self, index, doc_type, hosts='localhost:9200', **kwargs):
        self.index = index
        self.doc_type = doc_type
        self.es = Elasticsearch(hosts=hosts, **kwargs)

    def search_and_export_to_dict(self, *args, **kwargs):
        _id = kwargs.pop('_id', True)
        data_key = kwargs.pop('data_key', kwargs.get('fields')) or '_source'
        kwargs = dict({'index': self.index, 'doc_type': self.doc_type}, **kwargs)
        if kwargs.get('size', None) is None:
            kwargs['size'] = 1
            t = self.es.search(*args, **kwargs)
            kwargs['size'] = t['hits']['total']

        return get_search_hits(self.es.search(*args, **kwargs), _id=_id, data_key=data_key)

    def search_and_export_to_df(self, *args, **kwargs):
        convert_numeric = kwargs.pop('convert_numeric', True)
        convert_dates = kwargs.pop('convert_dates', 'coerce')
        df = pd.DataFrame(self.search_and_export_to_dict(*args, **kwargs))
        if convert_numeric:
            df = df.convert_objects(convert_numeric=convert_numeric, copy=True)
        if convert_dates:
            df = df.convert_objects(convert_dates=convert_dates, copy=True)
        return df
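
A hedged usage sketch for ElasticCom; the "tweets" index and "tweet" doc_type are hypothetical, and get_search_hits and pd come from the original project's imports.

ec = ElasticCom(index='tweets', doc_type='tweet')
df = ec.search_and_export_to_df(body={'query': {'match_all': {}}})  # fetch every hit, then build a DataFrame
print(df.head())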
Example #9
def search(query_word):
	result = []
	es = Elasticsearch()
	query1 = {"query": {"wildcard": {"name": {"value": "*" + query_word + "*" } } } }
	res = es.search(index="urban", body=query1)

	if res['hits']['total'] == 0:
		res = es.search(index="champ", body=query1)

	if res['hits']['total'] == 0:
		return 0

	ret = res['hits']['hits']

	temp = defaultdict(int)
	for item in ret:
		ids = item['_source']['business_id']
		query2 = {"query":  {"match": {"business_id": ids } } }
		res = es.search(index="my_data", body=query2)

		for item in res['hits']['hits'][0]['_source']['word_freq']:
			temp[item[0]] += item[1]

	words = []
	for item in temp:
		words.append((item,temp[item]))

	tags = make_tags(words, maxsize=80)

	create_tag_image(tags, 'static/cloud_large.jpg', size=(900, 600), fontname='Lobster')
class LogData(object):

    def __init__(self, host=None, port=None, index_name=None, start_time=None, end_time=None, add_query=None):
        """LogData is a class to collect log from Elasticsearch.

        Parameters
        ----------
        host: string, Elasticsearch IP
        index_name: string, index name in Elasticsearch
        start_time: date string, format looks like '2016-03-24T00:00:00'
        end_time: date string, format looks like '2016-03-24T00:00:00'
                  if empty this field, 'now' will use.

        Returns
        -------

        """
        if host:
            self.host = host
        else:
            raise ValueError("Elasticsearch host cannot be empty.")

        if port:
            self.port = port
        else:
            self.port = 9200

        if index_name:
            self.index_name = index_name
        else:
            raise ValueError("index name cannot be empty.")

        if start_time:
            self.start_time = start_time
        else:
            self.start_time = '1987-03-24T00:00:00'

        if end_time:
            self.end_time = end_time
        else:
            self.end_time = 'now'
        self.add_query = add_query

        self.es = Elasticsearch([{'host': self.host, 'port': self.port}])
        self.total, self.browser_ids = self._total_log_browser_ids()

    def _total_log_browser_ids(self):
        search_result = self.es.search(index=self.index_name, body=search_syntax(start=0, size=0,
                                                                                 start_time=self.start_time,
                                                                                 end_time=self.end_time,
                                                                                 add_query=self.add_query))
        return search_result['hits']['total'], [bucket['key'] for bucket in search_result['aggregations']['identity_tag']['buckets']]

    def result(self, start=0, size=0):
        search_result = self.es.search(index=self.index_name, body=search_syntax(start=start, size=size,
                                                                                 start_time=self.start_time,
                                                                                 end_time=self.end_time,
                                                                                 add_query=self.add_query))
        return search_result['hits']['hits']
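
A hedged usage sketch for LogData; the host and index name are placeholders, and search_syntax() is a helper defined elsewhere in the original project.

log_data = LogData(host='10.0.0.1', index_name='weblog-2016.03.24',
                   start_time='2016-03-24T00:00:00', end_time='2016-03-25T00:00:00')
print(log_data.total)            # total number of matching log entries
print(log_data.browser_ids[:5])  # first few identity_tag bucket keys
first_page = log_data.result(start=0, size=100)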
Example #11
class ElasticClient:
    def __init__(self, host: str, port: int):
        try:
            self.es = Elasticsearch(hosts=[
                {'host': host,
                 'port': port}])
            info = self.es.info()
            logger.info("Connected to Elasticsearch v. %s, name: %s" % (info['version']['number'], info['name']))

        except ElasticsearchException as e:
            logger.error("Elasticsearch is not available.", e)
            exit(0)

    def get_articles(self, index, doctype, batch_size):
        query = '{"query": { "bool": { "must_not": { "exists": { "field": "status" }}}}}'
        result = self.es.search(index=index, doc_type=doctype, size=batch_size, body=query)
        articles = result.get('hits').get('hits')
        return articles if articles is not None else []

    def count(self, index):
        return self.es.count(index=index)['count']

    def info(self):
        return self.es.info()

    def check_url(self, url: str, auth_index: str):
        """
        Private function to check if a URL appears in the database.

        Parameters
        ----------

        url: URL for the news stories to be scraped.

        auth_index: es index

        Returns
        -------

        found: Boolean.
                Indicates whether or not a URL was found in the database.
        """
        response = self.es.search(index=auth_index, doc_type=auth_index, body={
            "query":
                {
                    "match_phrase": {
                        "url": url
                    }
                }
        }, size=0, terminate_after=1, ignore_unavailable=True)

        return response["hits"]["total"] > 0

    def persist(self, index, doctype, payload):
        self.es.index(index=index, doc_type=doctype, body=payload)

    def update(self, index, doctype, doc_id, payload):
        self.es.update(index=index, doc_type=doctype, id=doc_id, body=payload)
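
A hedged usage sketch for ElasticClient; the host, port and "news_urls" index/doc_type names are assumptions, and the module-level logger must be configured elsewhere.

client = ElasticClient('localhost', 9200)
if not client.check_url('https://example.com/story', 'news_urls'):
    client.persist('news_urls', 'news_urls', {'url': 'https://example.com/story'})
print(client.count('news_urls'))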
Example #12
File: views.py Project: RDsgk/SGK
def search(request):
    # Get the search keyword
    text = request.POST.get('search_content')
    # Open the connection
    es = Elasticsearch([{'host':'10.10.20.26','port':9200}])
    # Call the es-py API to query by username/email/password, capping the number of results
    print text
    # Query type, taken from the front-end page
    querytype = request.POST.get('way')
    print querytype
    if querytype == 'username' or querytype == 'none':
        # Query by username
        searchResult = es.search(index='_all',body={"query":{"term":{'username':text}},"highlight":{"fields":{"username":{}}},"size":100})
    elif querytype == 'email':
        # Query by email
        searchResult = es.search(index='_all',body={"query":{"term":{'email':text}},"size":50})
    elif querytype == 'passwd':
        # Query by password
        searchResult = es.search(index='_all',body={"query":{"term":{'password':text}},"size":50})
    # elif querytype == 'multi':
    #     # Multi-field search
    #     searchResult = es.search(index='_all',body={"multi_match":{"query":text,"fields":["username","email"]}})
    # List of search results
    search_list = []
    # List of collections hit
    collection_list = []
    # List of data sources
    source_list = []
    # Collect the search results
    a = searchResult['hits']
    b = a['hits']
    for i in range(len(b)):
        c = b[i]['_source']
        print(c)
        t = b[i]['_index']
        print(t)
        print '************************'
        search_list.append(c)
        collection_list.append(t)
    # Look up the data source for each collection
    for element in collection_list:
        targetSource = es.search(index='collectionlist2',body={"query":{"match":{'collectionName':element}}})
        deal1 = targetSource['hits']
        deal2 = deal1['hits']
        for j in range(len(deal2)):
            deal3 = deal2[j]['_source']
            deal4 = deal3['source']
            source_list.append(deal4)
    # Total number of hits
    total = a['total']
    # Time the query took
    took = searchResult['took']

    ccc = search_count(user=request.user, content=text)
    ccc.save()
    # Return the search results and data source information
    return render_to_response("newmainpage.html",{"tplfile":"newsearchresult.html","username":request.user,"list":search_list,"source_list":source_list,"total":total,"took":took})
Example #13
class ElasticsearchServices:

    def __init__(self):
        self.es = Elasticsearch()
    
    # body should be in json format
    def feed_data(self, index, doc_type, body):
        res = None
        try:
            res = self.es.index(
                index = index,
                doc_type = doc_type,
                body = body
            )
        except:
            for e in sys.exc_info():
                print "Unexpected error:", e
            pass

        return res

    # body should be in json format
    def search(self, index, doc_type, body, size=20):
        res = self.es.search(
            index = index,
            doc_type = doc_type,
            body = body,
            size = size
        )

        return res

    def search_scroll(self, index, doc_type, body, scroll="2m", size=20):
        res = self.es.search(
            index = index,
            doc_type = doc_type,
            scroll = scroll,
            search_type = 'scan',
            size = size,
            body = body
        )

        return res

    def scroll(self, scroll_id, scroll="2m"):
        res = None
        try:
            res = self.es.scroll(scroll_id = scroll_id, scroll = scroll)
        except NotFoundError as e:
            print e
            res = {"hits": {"hits": []}}

        return res

    def get_total_hit(self, res):
        return res['hits']['total']
Example #14
class MorelikethisBolt(Bolt):

    connections = {}

    def initialize(self, conf, context):
        host = conf.get('zeit.recommend.elasticsearch.host', 'localhost')
        port = conf.get('zeit.recommend.elasticsearch.port', 9200)
        self.es = Elasticsearch(hosts=[{'host': host, 'port': port}])
        script_path = os.path.dirname(os.path.realpath(__file__))
        raw = open(script_path + '/stopwords.txt', 'r').read()
        self.stopwords = raw.decode('utf-8').split('\n')[3:]

    def recommend(self, paths, top_n=10):
        b = {
            'query': {
                'bool': {
                    'should': [
                        {'ids': {'values': paths, 'boost': 1}},
                        {'match_all': {'boost': 0}}
                        ]
                    }
                }
            }
        items = self.es.search(doc_type='item', body=b, size=len(paths) or 3)
        legacy_get = lambda i: i['_source'].get('body', i['_source']['teaser'])
        hits = [legacy_get(i) for i in items['hits']['hits']]

        b = {
            'query': {
                'more_like_this': {
                    'like_text': ' '.join(hits),
                    'stop_words': self.stopwords
                    }
                }
            }
        items = self.es.search(doc_type='item', body=b, fields='', size=top_n)
        return list(i['_id'] for i in items['hits']['hits'])

    def process(self, tup):
        if tup.stream == 'control':
            action, user = tup.values
            if action == 'connect':
                self.connections[user] = int(time.time())
            elif action == 'disconnect':
                del self.connections[user]

        elif tup.stream == 'default':
            user, paths = tup.values
            if user in self.connections:
                log('[MorelikethisBolt] Incoming: %s' % user)

                recommendations = self.recommend(paths)
                paths = list(set(paths))[:10]

                emit([user, paths, recommendations])
Example #15
def newsearch(query_word):
	result = []
	es = Elasticsearch()
	query1 = {"query": {"wildcard": {"name": {"value": "*" + query_word + "*" } } } }
	res = es.search(index="urban", body=query1)

	if res['hits']['total'] == 0:
		res = es.search(index="champ", body=query1)

	if res['hits']['total'] == 0:
		return 0

	ret = res['hits']['hits']

	temp = defaultdict(int)
	items = []
	for item in ret:
		ids = item['_source']['business_id']
		query2 = {"query":  {"match": {"business_id": ids } } }
		res = es.search(index="alchem", body=query2)

		for item in res['hits']['hits'][0]['_source']['word_freq']:
			items.append(item)
			temp[item['text'].encode('utf-8')] += 1

	words = []

	for item in items:
		t = {}
		scale = 1
		if 'sentiment' not in item:
			continue
		elif 'type' in item['sentiment']:
			if item['sentiment']['type'] == 'positive':
				scale = 1.75
				t['color'] = (0,255,0)
			elif item['sentiment']['type'] == 'negative':
				scale = 1.25
				t['color'] = (255,0,0)
		elif item['sentiment'] == 'positive':
			scale = 1.75
			t['color'] = (0,255,0)
		elif item['sentiment'] == 'negative':
			scale = 1.25
			t['color'] = (255,0,0)
		elif item['sentiment'] == 'neutral':
			t['color'] = (0,0,255)
		else:
			t['color'] = (128,128,128)
		t['tag'] = item['text'].encode('utf-8')
		t['size'] = int( math.ceil( temp[item['text']] * float(item['relevance']) * 30 * scale) )
		words.append(t)

	create_tag_image(words, 'static/cloud_large.jpg', size=(900, 600), fontname='Philosopher')
class ElasticWrapper:

    

    def __init__(self):
         self.es = Elasticsearch(["http://mixednode1:9200"], use_ssl=False)

    def insert(self, docs, index, docs_type):
          total = len(docs)
          i = 0
          index_doc = []
          for doc in docs:
              i +=1    
              index_doc.append({"_index":index, "_type":docs_type, "_id":doc["id"], "_source":doc})
              if len(index_doc)==1000:
                  logging.info("indexing %s/%s" % (i, total))
                  helpers.bulk(self.es, index_doc)
                  index_doc = []
          if len(index_doc)!=0:
              logging.info("indexing %s/%s" % (i, total))
              helpers.bulk(self.es, index_doc)

    def get_one_tweet(self):
        res = self.es.search(index=TWEETS, body={"query": {"match_all": {}}})
        result = res['hits']['hits'][0]["_source"]
        return result

    def get_day_tweets(self, day, offset):
        from_date = day.strftime("%Y-%m-%dT00:00:00Z")
        to_date = day.strftime("%Y-%m-%dT23:59:59Z")
        res = self.es.search(index=TWEETS, body={"query": {"range": {"created_at":{"gte":from_date, "lte":to_date}}}, "size":1000, "from":offset})
        result = res['hits']['hits']
        result = list(map((lambda x: x['_source']), result))
        return result

    def tweets_count_for_day(self, day): 
        from_date = day.strftime("%Y-%m-%dT00:00:00Z")
        to_date = day.strftime("%Y-%m-%dT23:59:59Z")
        res = self.es.search(index=TWEETS, body={"query": {"range": {"created_at":{"gte":from_date, "lte":to_date}}}, "size":0})
        result = res['hits']['total']
        return result

    def articles_count_from(self, day): 
        from_date = day.strftime("%Y-%m-%dT00:00:00Z")
        res = self.es.search(index=DW_NEWS, body={"query": {"range": {"created_at":{"gte":from_date}}}, "size":0})
        result = res['hits']['total']
        return result

    def get_articles_from(self, day, offset):
        from_date = day.strftime("%Y-%m-%dT00:00:00Z")
        res = self.es.search(index=DW_NEWS, body={"query": {"range": {"created_at":{"gte":from_date}}}, "size":1000, "from":offset})
        result = res['hits']['hits']
        result = list(map((lambda x: x['_source']), result))
        return result
Example #17
class ESHelper(object):
    def __init__(self, host, port):
        self.es = Elasticsearch(host=host, port=port)
        self.logger = logging.getLogger(__name__)

    @staticmethod
    def _build_agg_query(typ):
        """helper method to build aggregation query for sensors"""
        query = config.AVG_QUERY
        query['aggs']['per_day']['aggs'] = {
            'avg_' + config.INDEXES[typ]['avg_field']: {
                'avg': {
                    'field': config.INDEXES[typ]['avg_field']
                }
            }
        }
        return query

    @staticmethod
    def _build_latest_query(typ):
        """helper method to build latest record from an index based on timestamp field"""
        return  {
              "query": {
                "match_all": {}
              },
              "size": 1,
              "sort": [
                {
                  config.INDEXES[typ]['latest_field']: {
                    "order": "desc"
                  }
                }
              ]
            }


    def get_data(self):
        """get aggregated sensor reading data based on the request Data filter parameter"""
        typ = request.args['Data']
        results = self.es.search(index=config.INDEXES[typ]['index'], body=self._build_agg_query(typ))
        data = []
        for bucket in results['aggregations']['per_day']['buckets']:
            dt = datetime.strptime(bucket['key_as_string'], '%Y-%m-%dT%H:%M:%S.%fZ')
            fld = config.INDEXES[typ]['avg_field']
            if bucket['avg_' + fld]:
                data.append({'x': dt.strftime('%Y-%m-%d'),
                             'y': bucket['avg_' + fld]['value']})
        return jsonify({'result': [data, ], 'date': True})

    def get_latest_reading(self, typ):
        """return latest reading from given sensor based on timestamp field"""
        results = self.es.search(index=config.INDEXES[typ]['index'], body=self._build_latest_query(typ))
        return jsonify(results['hits']['hits'][0]['_source'])
Example #18
class ElasticSearching:
    def __init__(self):
        self.es = Elasticsearch([{'host':'210.107.192.201','port':9200}])

    def search(self,query,scheme,alpha,beta,gamma):
        #content = query.replace(r"/",',')
        content = query
        token = content.split(' ')
        content = [x for idx,x in enumerate(token) if not idx == 0]
        content = ' '.join(content)

        analyzer = 'my_DFR_analyzer'
        resTitle = self.es.search(index=scheme,doc_type='article',q='title:' + content,analyzer=analyzer,size=1000)
        resAbstract = self.es.search(index=scheme,doc_type='article',q='Abstract:' + content,analyzer=analyzer,size=1000)
        resBody = self.es.search(index=scheme,doc_type='article',q='body:' + content,analyzer=analyzer,size=1000)

        v = pd.DataFrame()
        l = pd.DataFrame()
        for entry in resTitle['hits']['hits']:
            pmcid = entry['_source']['pmcid']
            score = entry['_score']
            l = l.append(pd.DataFrame({'pmcid':[pmcid], 'title':[score]}))

        v = l
        l = pd.DataFrame()
        for entry in resAbstract['hits']['hits']:
            pmcid = entry['_source']['pmcid']
            score = entry['_score']
            l = l.append(pd.DataFrame({'pmcid':[pmcid], 'abstract':[score]}))

        v = pd.merge(v,l,how = 'outer', on =['pmcid'])
        l = pd.DataFrame()
        for entry in resBody['hits']['hits']:
            pmcid = entry['_source']['pmcid']
            score = entry['_score']
            l = l.append(pd.DataFrame({'pmcid':[pmcid], 'body':[score]}))

        v = pd.merge(v,l,how = 'outer', on = ['pmcid'])
        v = v.fillna(0)
        v['score'] = alpha*v['title']+beta*v['abstract']+gamma*v['body']
        return v.ix[:,['pmcid','score']]


    def test(self):
        with open('summary2.csv','rb') as csvfile:
            reader = csv.DictReader(csvfile)
            for idx,item in enumerate(reader):
                query = item['summary'].replace(r"/",',')
                abstractQuery = {"abstract" : query}
                res = self.es.search(index="trec",doc_type="BM25",q=query,analyzer="my_BM25_analyzer",size=100)
                for doc in res['hits']['hits']:
                    text = '{0},{1}'.format(doc['_source']['pmcid'],doc['_score'])
                    print text
Example #19
def elasticsearch(query_phrase,index_name):
    #Query Elasticsearch to see what URLs contain this query
    #result = es.search(index="movie_db", body={'query': {'match': {'description': 'CIA'}}})
    from elasticsearch import Elasticsearch
    es = Elasticsearch(['52.88.228.98'])
    #Query Elasticsearch to see what URLs contain this query
    res = es.search(index_name, q=query_phrase)
    #Calculate hits out of the total amount
    Hits=res['hits']['total']
    #Find total amount by query that is a false positive, 100%, wiki search
    total_Hits = (es.search(index_name, q='wiki'))['hits']['total'] #modify later to direct call to get total amount
    #Output
    print( ("Got %d Hits:" % res['hits']['total']) + ' out of total: ' + str(total_Hits))
    return res
Example #20
def index(request):
    q_dict =  request.GET
    if q_dict:
        es = Elasticsearch()
        response_list = []
        if 'color' in q_dict:
            color_to_parse = q_dict['color']
            colors = color_to_parse.split('_')
            
            for c in colors:
                try:
                    get_size = es.search(index = c)['hits']['total']
                    result_raw = es.search(index = c, filter_path = ['hits.hits._*'], size = get_size)
                    #print "result_raw"
                    #print result_raw
                    result_list = result_raw["hits"]["hits"]
                except:
                    result_list = []
                for i in result_list:
                    response_list.append(i['_source'])

            response = {"trashList": response_list}
            print JsonResponse(response)
            return JsonResponse(response)
        elif 'route' in q_dict:
            index = q_dict['route'] + 'routing'
            print index
            try:
                result_raw = es.search(index = index, filter_path = ['hits.hits._*'], size = 1)
                result = result_raw['hits']['hits'][0]['_source']
            except:
                result = {}
            #print JsonResponse(response)
            return JsonResponse(result)
        elif 'routelist' in q_dict:
            index = q_dict['routelist'] + 'routinglist'
            print index
            try:
                result_raw = es.search(index = index, filter_path = ['hits.hits._*'], size = 1)
                result = result_raw['hits']['hits'][0]['_source']
            except:
                result = {}
            #print JsonResponse(response)
            return JsonResponse(result)
    else:
        #return HttpResponse("Hello, world. You're at the polls index.")
        template = loader.get_template('part1/index.html')
        #print "view.index requested"
        return HttpResponse(template.render(request))
Example #21
def searchByParams(index, doc_type):
    es = Elasticsearch()
    start = request.args.get('from') or 0
    size = request.args.get('size') or 10
    if 'type' in request.args:
        query = es.search(index, doc_type, body={'query':
                                                 {'prefix':
                                                  {request.args['type']:
                                                   request.args['q']}}},
                          from_=start, size=size)
    else:
        query = es.search(index, doc_type, q=request.args.get('q'),
                          default_operator='AND', size=size, from_=start)

    return jsonify(query['hits'])
Example #22
def search():
    es = Elasticsearch()

    query1 = {"query": {"match": {"city": "Urbana" } } }
    query2 = {"query": {"match": {"city": "Champaign" } } }
    res1 = es.search(index="business", body=query1, size=300)
    res2 = es.search(index="business", body=query2, size=400)

    print("Got %d Hits:" % res1['hits']['total'])
    print("Got %d Hits:" % res2['hits']['total'])

    urban = []
    champ = []

    for hit in res1['hits']['hits']:
        urban.append(hit['_source']['business_id'])

    for hit in res2['hits']['hits']:
        champ.append(hit['_source']['business_id'])

    reviews = []

    for ids in urban:
        query = {"query": {"match": {"business_id": ids } } }
        res = es.search(index="reviews", body=query)
        count = res['hits']['total']
        #print("Got %d Hits:" % res['hits']['total'])
        res = es.search(index="reviews", body=query, size=count+1)

        for hit in res['hits']['hits']:
            reviews.append(hit['_source'])
    print len(reviews)
    for ids in champ:
        query = {"query": {"match": {"business_id": ids } } }
        res = es.search(index="reviews", body=query)
        count = res['hits']['total']
        #print("Got %d Hits:" % res['hits']['total'])
        res = es.search(index="reviews", body=query, size=count+1)

        for hit in res['hits']['hits']:
            reviews.append(hit['_source'])
    print len(reviews)

    temp = []
    for item in reviews:
        temp.append(item['business_id'])
    print len(temp),len(set(temp))
    save_reviews(reviews)
def get_judge_res(judge_image_dir):
    es = Elasticsearch(esport)
    judge_image_dir = 'judgeresult:' + judge_image_dir
    search_size = 20
    search_offset = 0
    print request.args
    try:
        if 'offset' in request.args:
            search_offset = int(request.args.get('offset'))
        if 'size' in request.args:
            search_size = int(request.args.get('size'))
        res_index = es.search(
            index = judge_image_dir, 
            size = search_size, 
            from_=search_offset
        )
    except:
        del(es)
        return 'Error: index does not exist\n'
    res_lst = []
    for item in res_index['hits']['hits']:
        res_lst.append(item['_source']['file'])
    res_dict = {
        'total' : res_index['hits']['total'],
        'file_list' : res_lst,
        'from_' : search_offset,
        'size' : len(res_index['hits']['hits'])
    }
    json_res = json.dumps(res_dict)
    del(es)
    return json_res
Example #24
def show(ctx, path, order):
    router = Router(open(ctx.obj['CONFIG']))
    route = router.match(path)
    logging.debug("Matched route: %s" % route)
    if not route:
        print 'No queries matched'
        return
    es = Elasticsearch(hosts=route.get('elasticsearch_url'))
    request_body = {}
    for non_mandatory_key in ['sort', 'query']:
        value = route.get(non_mandatory_key)
        if value:
            request_body[non_mandatory_key] = value
    if order == 'asc':
        request_body['sort'] = {'@timestamp': 'asc'}
    elif order == 'desc':
        request_body['sort'] = {'@timestamp': 'desc'}
    elif order:
        click.echo("Unknown order format: %s" % order, err=True)
        return 1
    logging.debug("Query: %s" % (request_body,))
    result = es.search(index=route.get('index'), doc_type=None, body=request_body)
    hits = result['hits']['hits']
    template = Template(route.get("format", "{{ __at_timestamp }} {{ message }}"))
    for hit in hits:
        doc = hit['_source']
        doc['__at_timestamp'] = doc.get('@timestamp')
        print template.render(doc)
Example #25
def reindex(old_index, new_index, s):
    ''' Function to reindex by scan and scroll combined with a bulk insert.
    old_index is the index to take docs from, new_index is the one the docs go to.
    s is the size of each bulk insert - should set this as high as the RAM
    on the machine you run it on allows.  500-1000 seems reasonable for t2.medium '''
    def create_bulk_insert_string(results, index):
        ret_str = ''
        for hit in results:
            ret_str += '{"create":{"_index":"' + index + '","_type":"variant","_id":"' + hit['_id'] + '"}}\n'
            ret_str += json.dumps(hit) + '\n'
        return ret_str

    es = Elasticsearch('localhost:9200')
    s = es.search(index=old_index, body='{"query": {"match_all": {}}}', search_type='scan', scroll='5m', size=s)
    curr_done = 0

    try:
        while True:  # do this loop until failure
            r = es.scroll(s['_scroll_id'], scroll='5m')
            this_l = [res['_source'] for res in r['hits']['hits']]
            this_str = create_bulk_insert_string(this_l, new_index)
            es.bulk(body=this_str, index=new_index, doc_type='variant')
            curr_done += len(this_l)
    except:
        print('{} documents inserted'.format(curr_done))
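
A minimal usage sketch for reindex, assuming a local Elasticsearch on port 9200 and hypothetical index names:

reindex('variants_v1', 'variants_v2', 500)  # copy documents across in bulk batches of 500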
Example #26
def search(query='', field='q1', _operator='and', sort=[('_score', 'desc'), ('quoted_by', 'desc')],
           _filter={}, size=1000, _id=False):
    es = Elasticsearch([elasticsearch_setting])
    if query:
        es_query = {
            'match': {
                field: {
                    'query': query,
                    'operator': _operator,
                    'minimum_should_match': '85%'
                }
            }
        }
    else:
        es_query = {"match_all": {}}
    body = {
        "query": {
            "filtered": {
                "query": es_query,
                "filter": _filter
            }
        },
        'size': size
    }
    sort_item = _build_sort(sort)
    if sort_item:
        body.update({'sort': sort_item})
    logger.debug(body)
    result = es.search(index='qwerty', body=body, _source=True, timeout=55)
    if _id:
        return (x for x in result['hits']['hits'])
    return (x['_source'] for x in result['hits']['hits'])
Example #27
def iter_elastic_query(instance, index, field, subfield=None):
    es = Elasticsearch(instance)

    # initial search
    resp = es.search(index, body={"query": {"match_all": {}}}, scroll='5m')

    scroll_id = resp.get('_scroll_id')
    if scroll_id is None:
        return

    first_run = True
    while True:
        for hit in resp['hits']['hits']:
            s = hit['_source']
            try:
                if subfield is not None:
                    print(s[field][subfield])
                    yield s[field][subfield]
                else:
                    yield s[field]
            except ValueError:
                logging.warning("Unable to process row: %s" % str(hit))

        scroll_id = resp.get('_scroll_id')
        # end of scroll
        if scroll_id is None or not resp['hits']['hits']:
            break
        # fetch the next page so the loop advances instead of repeating the same hits
        resp = es.scroll(scroll_id=scroll_id, scroll='5m')
Example #28
def search_index(index, searchdict, start=0, host='127.0.0.1', port=9200):
    # https://www.elastic.co/guide/en/elasticsearch/reference/current/search-search.html
    # https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-body.html

    # https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-scroll.html

    '''
    #print searchdict
    pprint(searchdict)
    #import pdb; pdb.set_trace()
    thisurl = 'http://%s:%s/%s/_search' % (host, port, path)
    r = requests.get(thisurl, data=json.dumps(searchdict), verify=False)
    print r.reason
    '''

    maxcount = 10000
    es = Elasticsearch()
    res = es.search(index=index, body=searchdict, size=maxcount, scroll='1m')

    # hits.total is the total count of matches, but not the amount returned
    #total = res['hits']['total']
    scroll = es.scroll(scroll_id=res['_scroll_id'])
    res['hits']['hits'] += scroll['hits']['hits']

    return res
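
Note that search_index above issues only a single follow-up scroll call, so result sets spanning more than two pages are silently truncated. A hedged alternative sketch that drains the whole scroll with elasticsearch.helpers.scan (the name search_index_all is mine):

from elasticsearch import Elasticsearch, helpers

def search_index_all(index, searchdict, host='127.0.0.1', port=9200):
    # helpers.scan keeps calling the scroll API until every matching document is returned
    es = Elasticsearch([{'host': host, 'port': port}])
    return list(helpers.scan(es, query=searchdict, index=index))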
class CategoryModule(object):

    def __init__(self, index, doc_type, host ='172.19.1.77', user = '******', passwd = '123456', port = 3306):
        self.time_str = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))
        self.redis_client = PyBfdRedis.newClient('192.168.40.37:26379', 'bfdopen')
        self.es_client = Elasticsearch([{'host':'192.168.61.89','port':9200}], timeout=1000)
        self.mysql_client = InserMysql(host = host , user = user , passwd = passwd, db = db, port = port)
        self.index = index
        self.doc_type = doc_type

    def __topn(self, items, n):
        result = {}
        total = 0
        for item in sorted(items, key=lambda x: x[1], reverse=True)[:n]:
            total += item[1]
            result.update({item[0]:item[1]})
        return (result,total)

  

    def __output2redis(self, key, value):
        if value['detail']:
            print key, PyBfdRedis.set(self.redis_client, key, json.dumps(value, ensure_ascii=False))

    def getNumWithTime(self, from_time, to_time):
        query = {"query":{"filtered":{"query":{"match_all":{}},"filter":{"range":{"hbase_time":{"from": from_time, "to": start_time}}}}},"size":0}

        rest = self.es_client.search(index='%s' %self.index, doc_type='%s' %self.doc_type, body=query, timeout=100000)
        return rest['hits']['total']
Example #30
    def GET(self):
        es = Elasticsearch(conf['fulltext']['serviceUrl'])
        if web.input(wildcard_query=None).wildcard_query:
            query = {
                "wildcard": {
                    "_all": web.input().query
                }
            }
            self.set_wildcard_query(True)
        else:
            query = {
                "multi_match": {
                    "query": web.input().query,
                    "operator": "and",
                    "fields": ["text", "pageName", "tags"]
                }
            }
            self.set_wildcard_query(False)
        res = es.search(index=conf['fulltext']['indexName'],
                        body={"query": query,
                              "fields": ["pageName", "path", "fsPath", "text"]})
        rows = []
        for a in res['hits']['hits']:
            fields = a['fields']

            fs_path = os.path.normpath('%s/%s.md' % (self.data_dir, fields['path'][0]))
            page_chapters, h1 = extract_description(fs_path)
            rows.append({
                'h1': h1 if h1 else fields['path'][0],
                'file': fields['path'][0],
                'chapters': page_chapters
            })
        values = dict(query=web.input().query, ans=rows)
        return self._render('search.html', values)
Example #31
    def collect(self, raw_output_file='data_notDienGiaDung.csv'):
        elastic_client = None
        elastic_client = Elasticsearch(hosts=[{
            'host': '10.3.70.221',
            'port': 9200
        }])
        if elastic_client.ping():
            print('Yay Connected')
        else:
            print('Awww it could not connect!')
        search_object = {
            "size": 10000,
            "query": {
                "bool": {
                    "must_not": [
                        {
                            "multi_match": {
                                "query": "điện tử",
                                "fields": ["_all"],
                                "type": "phrase",
                            }
                        },
                        {
                            "multi_match": {
                                "query": "đèn",
                                "fields": ["_all"],
                                "type": "phrase",
                            }
                        },
                        {
                            "multi_match": {
                                "query": "tivi",
                                "fields": ["_all"],
                                "type": "phrase",
                            }
                        },
                        {
                            "multi_match": {
                                "query": "tủ lạnh",
                                "fields": ["_all"],
                                "type": "phrase",
                            }
                        },
                        {
                            "multi_match": {
                                "query": "máy giặt",
                                "fields": ["_all"],
                                "type": "phrase",
                            }
                        },
                        {
                            "multi_match": {
                                "query": "phòng ngủ",
                                "fields": ["_all"],
                                "type": "phrase",
                            }
                        },
                        {
                            "multi_match": {
                                "query": "phòng bếp",
                                "fields": ["_all"],
                                "type": "phrase",
                            }
                        },
                        {
                            "multi_match": {
                                "query": "nhà bếp",
                                "fields": ["_all"],
                                "type": "phrase",
                            }
                        },
                        {
                            "multi_match": {
                                "query": "điều hoà",
                                "fields": ["_all"],
                                "type": "phrase",
                            }
                        },
                        {
                            "multi_match": {
                                "query": "quạt",
                                "fields": ["_all"],
                                "type": "phrase",
                            }
                        },
                        {
                            "multi_match": {
                                "query": "lò vi sóng",
                                "fields": ["_all"],
                                "type": "phrase",
                            }
                        },
                        {
                            "multi_match": {
                                "query": "samsung",
                                "fields": ["_all"],
                                "type": "phrase",
                            }
                        },
                        {
                            "multi_match": {
                                "query": "sony",
                                "fields": ["_all"],
                                "type": "phrase",
                            }
                        },
                        {
                            "multi_match": {
                                "query": "lg",
                                "fields": ["_all"],
                                "type": "phrase",
                            }
                        },
                        {
                            "multi_match": {
                                "query": "toshiba",
                                "fields": ["_all"],
                                "type": "phrase",
                            }
                        },
                        {
                            "multi_match": {
                                "query": "Sunhouse",
                                "fields": ["_all"],
                                "type": "phrase",
                            }
                        },
                        {
                            "multi_match": {
                                "query": "Kangaroo",
                                "fields": ["_all"],
                                "type": "phrase",
                            }
                        },
                        {
                            "multi_match": {
                                "query": "Bluestone",
                                "fields": ["_all"],
                                "type": "phrase",
                            }
                        },
                        {
                            "multi_match": {
                                "query": "Asanzo",
                                "fields": ["_all"],
                                "type": "phrase",
                            }
                        },
                        {
                            "multi_match": {
                                "query": "gia đình",
                                "fields": ["_all"],
                                "type": "phrase",
                            }
                        },
                        {
                            "multi_match": {
                                "query": "sinh hoạt",
                                "fields": ["_all"],
                                "type": "phrase",
                            }
                        },
                        {
                            "multi_match": {
                                "query": "tiêu dùng",
                                "fields": ["_all"],
                                "type": "phrase",
                            }
                        },
                    ]
                }
            }
        }
        search_object = json.dumps(search_object)
        # res = elastic_client.search(index=-7823841914345959386)
        res = elastic_client.search(index='urldata_2020_02',
                                    body=search_object)
        data = res['hits']['hits']
        result = pd.DataFrame()

        for item in data:
            result = result.append(item['_source'], ignore_index=True)

        result.to_csv(raw_output_file)
                        "gte": last30.isoformat(),
                        "lte": t.isoformat()
                    }
                }
            }],
            "must_not": [],
            "should": []
        }
    },
    "size": 0,
    "aggs": {
        "dh": {
            "date_histogram": {
                "field": "harvest_date",
                "interval": "day"
            }
        }
    }
}

rv = es.search(**{"index": "stats", "doc_type": "search", "body": stats_query})

min_days = 31

for b in rv["aggregations"]["dh"]["buckets"]:
    min_days = min(
        min_days,
        (t -
         datetime.strptime(b["key_as_string"], "%Y-%m-%dT%H:%M:%S.%fZ")).days)

print(min_days)
Example #33
class ElasticSearchSeqSource(base.DataSource):
    """
    Data source which executes arbitrary queries on ElasticSearch

    This is the tabular reader: will return dataframes. Nested return items
    will become dict-like objects in the output.

    Parameters
    ----------
    query: str
       Query to execute. Can either be in Lucene single-line format, or a
       JSON structured query (presented as text)
    qargs: dict
        Further parameters to pass to the query, such as set of indexes to
        consider, filtering, ordering. See
        http://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.Elasticsearch.search
    es_kwargs: dict
        Settings for the ES connection, e.g., a simple local connection may be
        ``{'host': 'localhost', 'port': 9200}``.
        Other keywords to the Plugin that end up here and are material:

        scroll: str
            how long the query is live for, default ``'100m'``
        size: int
            the paging size when downloading, default 1000.
    metadata: dict
        Extra information for this source.
    """

    container = 'python'

    def __init__(self, query, qargs, es_kwargs, metadata):
        self._query = query
        self._qargs = qargs
        self._scroll = es_kwargs.pop('scroll', '100m')
        self._size = es_kwargs.pop('size', 1000)  # default page size
        self._es_kwargs = es_kwargs
        self._dataframe = None
        self.es = Elasticsearch([es_kwargs])  # maybe should be (more) global?

        super(ElasticSearchSeqSource, self).__init__(container=self.container,
                                                     metadata=metadata)

    def _run_query(self, size=None):
        if size is None:
            size = self._size
        try:
            q = json.loads(self._query)
            if 'query' not in q:
                q = {'query': q}
            s = self.es.search(body=q,
                               size=size,
                               scroll=self._scroll,
                               **self._qargs)
        except (JSONDecodeError, TypeError):
            s = self.es.search(q=self._query,
                               size=size,
                               scroll=self._scroll,
                               **self._qargs)
        sid = s['_scroll_id']
        scroll_size = s['hits']['total']
        while scroll_size > len(s['hits']['hits']):
            page = self.es.scroll(scroll_id=sid, scroll=self._scroll)
            sid = page['_scroll_id']
            s['hits']['hits'].extend(page['hits']['hits'])
        self.es.clear_scroll(scroll_id=sid)
        return s

    def _get_schema(self, retry=2):
        """Get schema from first 10 hits or cached dataframe"""
        return base.Schema(datashape=None,
                           dtype=None,
                           shape=None,
                           npartitions=1,
                           extra_metadata={})

    def _get_partition(self, _):
        """Downloads all data

        ES has a hard maximum of 10000 items to fetch. Otherwise need to
        implement paging, known to ES as "scroll"
        https://stackoverflow.com/questions/41655913/elk-how-do-i-retrieve-more-than-10000-results-events-in-elastic-search
        """
        results = self._run_query()
        return [r['_source'] for r in results['hits']['hits']]
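
A hedged usage sketch for ElasticSearchSeqSource; the "logs" index is hypothetical, and in normal use intake builds the source from a catalog entry rather than instantiating it directly.

source = ElasticSearchSeqSource(query='{"match_all": {}}',
                                qargs={'index': 'logs'},
                                es_kwargs={'host': 'localhost', 'port': 9200},
                                metadata={})
records = source._get_partition(0)  # list of _source dicts, one per hit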
Example #34
#!/usr/bin/python

import codecs
import csv
import sys

from elasticsearch import Elasticsearch

reload(sys)
sys.setdefaultencoding("UTF-8")

keywords = []
with codecs.open(sys.argv[1], 'r', 'utf-8') as data_file:
	data = csv.reader(data_file, delimiter=";")

	for line in data:
		if line:
			keywords.append(line[0])

es = Elasticsearch("https://search-iprice-production-3-f3orjkipgmnoxt4qzf6zuervfu.ap-southeast-1.es.amazonaws.com:443")
query = '{"size": 0, "query" : { "term" : { "masterbrain" : "%s" } }}'

for k in keywords:
	ret = es.search("product_*_20160121", "product", query % k.lower())
	hits = ret['hits']['total']

	print k, ";" , hits

Example #35
def all_host(request):
    """
    List all hosts: query every host IP from ES, fetch abnormal host IPs from the database, and mark each host's status (normal or abnormal) based on those database IPs
    :param request:
    :return:
    """

    # Data to return: host list, total host count, abnormal host count
    data = []
    all_total = 0
    unhealth_total = 0
    sys_ips = []

    # Handle abnormal hosts
    ips = []
    items, total = get_unhealth_host()
    if total:
        for item in items:
            ips.append(item['ip'])

    # First try to fetch the data from redis
    # try:
    #     cache_data = red.get('all_host')
    #     if cache_data:
    #         data = eval(cache_data)[0]
    #         all_total = eval(cache_data)[1]
    #         unhealth_total = eval(cache_data)[2]
    #         for per_data in data:
    #             sys_ips.append(per_data['ip'])
    #
    #         # Present in the database but missing from syslog
    #         if total:
    #             for item in items:
    #                 if item['ip'] not in sys_ips:
    #                     data.append({'ip': item['ip'], 'name': item['hostname'], 'status': 1})
    #                     unhealth_total += 1
    #                     all_total += 1
    #
    #         return JsonResponse({'code': 200, 'message': 'OK', 'data': data,
    #                              'all_total': all_total, 'unhealth_total': unhealth_total})
    # except Exception as e:
    #     logger.error(e)
    # Filter indices by the syslog prefix and today's date
    today = datetime.now().strftime('%Y%m%d')
    try:
        # Get all indices
        global es_ip, es_port
        MAIN_URL = "http://" + es_ip + ":" + str(es_port)
        # MAIN_URL = "http://192.168.1.243:9200"
        rs = RequestSimulator(MAIN_URL)
        get_data = {'pretty': ''}
        resp = rs.get(url='/_cat/indices',
                      params=get_data,
                      ignore_http_error=True)
        res = resp.read().decode('utf-8')
        lines = res.split('\n')
        lines = lines[:-1]

        indexs = []
        for line in lines:
            temp_list = line.split()
            temp_str = temp_list[2]
            if temp_str.startswith('syslog') and temp_str.endswith(today):
                indexs.append(temp_str)

        # Total number of indices
        all_total = len(indexs)

        # Iterate over the indices and pull out each hostname
        # Connect to es
        es = Elasticsearch(es_server_ip_port)
        body = {"query": {"match_all": {}}, "size": 1}

        # Number of abnormal hosts
        unhealth_total = 0

        sys_ips = []

        for index in indexs:
            # Read from es
            result = es.search(index=index, body=body, ignore_unavailable=True)
            ip = result['hits']['hits'][0]['_source']['type']  # host ip
            sys_ips.append(ip)
            name = result['hits']['hits'][0]['_source']['host']
            status = 0  # 0 means the host is normal, 1 means abnormal
            if ip in ips:
                status = 1
                unhealth_total += 1
            data.append({'ip': ip, 'name': name, 'status': status})

        # Present in the database but missing from syslog
        if total:
            for item in items:
                if item['ip'] not in sys_ips:
                    data.append({
                        'ip': item['ip'],
                        'name': item['hostname'],
                        'status': 1
                    })
                    unhealth_total += 1
                    all_total += 1

        # Store the result in redis
        # red.setex('all_host', [data, all_total, unhealth_total], 30)

    except Exception as e:
        logger.error(e)

    for per_data in data:
        try:
            if per_data['status'] == 1:
                for per_item in items:
                    if per_item['ip'] == per_data['ip']:
                        info = eval(per_item['info'])
                        per_data['file_error_path'] = info.get(
                            'file_error_path', '')
                        per_data['file_error_hash'] = info.get(
                            'file_error_hash', '')
            obj = BlackboxHost.objects.filter(hostip=per_data['ip'])
            if obj.exists():
                is_protect = obj[0].status
            else:
                is_protect = 1
            per_data['is_protect'] = is_protect
        except Exception as e:
            logger.error(e)
            per_data['status'] = 1
        # is_block=1 means not blocked, 0 means blocked
        per_data['is_block'] = 1

    return JsonResponse({
        'code': 200,
        'message': 'OK',
        'data': data,
        'all_total': all_total,
        'unhealth_total': unhealth_total
    })
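
# Not part of the original example: the raw `_cat/indices` HTTP call above can
# also be issued through the elasticsearch-py cat API. A minimal sketch,
# assuming the same `es_server_ip_port` setting, a `syslog*` naming scheme that
# ends with the `%Y%m%d` date, and a client/server version where cat.indices
# accepts format='json':
from datetime import datetime

from elasticsearch import Elasticsearch

es = Elasticsearch(es_server_ip_port)
today = datetime.now().strftime('%Y%m%d')

# format='json' returns one dict per index; keep only today's syslog indices,
# mirroring the startswith/endswith filter used above.
rows = es.cat.indices(index='syslog*', format='json')
indexs = [row['index'] for row in rows
          if row['index'].startswith('syslog') and row['index'].endswith(today)]
all_total = len(indexs)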
Example #36
0
def syslog_incr_count(request):
    """
    每4小时产生了多少条syslog日志
    :param request:
    :return:
    """
    # Response data
    result_list = []

    es = Elasticsearch(jc.es_server_ip_port)  # Connect to ES

    now = datetime.now()
    now_str = datetime.strftime(now,
                                '%Y-%m-%d %H:%M:%S')  # 2018-03-19 17:28:40

    # Round the minute down to the nearest 10 (e.g. 18 -> 10, 29 -> 20, 40 -> 40)
    # minute = str((now.minute//10)*10)
    # if minute == '0':
    #     minute = '00'

    # Build the rounded time string (10-, 20-, 30-minute marks, ...)
    # new_time = now_str[:10] + 'T' + now_str[11:14] + '00:00.000Z'  # time format ES expects, e.g. 2018-03-20T14:00:00.000Z
    return_time = datetime.strptime(
        now_str[:-5] + '00:00',
        '%Y-%m-%d %H:%M:%S')  # parse 2018-03-20 14:00:00 back into a datetime

    # pre_time = new_time + '||-12h'  # the previous 12 hours, e.g. 2018-03-20T14:00:00.000Z||-10m

    # History has to be shown as well, so also fetch the five preceding windows
    for i in range(6):
        # End time of the query window, e.g. datetime.datetime(2018, 3, 22, 19, 0, 0)
        new_time = return_time - timedelta(hours=4 * i)

        # Start time of the query window, e.g. datetime.datetime(2018, 3, 22, 15, 0, 0)
        pre_time = return_time - timedelta(hours=4 * (i + 1))

        # Time format ES expects, e.g. 2018-03-22T19:00:00.000Z; timestamps in ES are stored 8 hours behind
        new_time_for = datetime.strftime((new_time - timedelta(hours=8)),
                                         "%Y-%m-%dT%H:%M:%S.000Z")

        # Time format ES expects, e.g. 2018-03-20T18:50:00.000Z; timestamps in ES are stored 8 hours behind
        pre_time_for = datetime.strftime(pre_time - timedelta(hours=8),
                                         "%Y-%m-%dT%H:%M:%S.000Z")

        body = {
            "query": {
                "bool": {
                    "must": [
                        {
                            "match_phrase": {
                                "_type": "sysLog"
                            }
                        },  # must-match clause
                    ],
                    "filter": {
                        "range": {
                            "@timestamp": {
                                "gte": pre_time_for,
                                "lt": new_time_for
                            }
                        }
                    }  # time range filter
                }
            },
        }
        try:
            result = es.search(
                index=['syslog*'], body=body,
                ignore_unavailable=True)['hits']['total']  # read from ES
        except Exception as e:
            logger.error(e)
            result = 0
        per_dict = {}
        per_dict[str(result)] = [
            datetime.strftime(pre_time, '%d日%H时'),
            datetime.strftime(new_time, '%d日%H时')
        ]
        result_list.append(per_dict)

    return JsonResponse({'code': 200, 'total': result_list})
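
# Not part of the original example: the six per-window searches above could be
# collapsed into a single request with a date_histogram aggregation. A minimal
# sketch, assuming the same `jc.es_server_ip_port` setting, `syslog*` indices
# with an `@timestamp` field, and an ES version that accepts `fixed_interval`
# (older clusters use `interval` instead):
from elasticsearch import Elasticsearch

es = Elasticsearch(jc.es_server_ip_port)
body = {
    "size": 0,
    "query": {"range": {"@timestamp": {"gte": "now-24h", "lt": "now"}}},
    "aggs": {
        "per_4h": {
            "date_histogram": {"field": "@timestamp", "fixed_interval": "4h"}
        }
    },
}
res = es.search(index=['syslog*'], body=body, ignore_unavailable=True)
for bucket in res["aggregations"]["per_4h"]["buckets"]:
    print("%s -> %s" % (bucket["key_as_string"], bucket["doc_count"]))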
Example #37
0
class Test(BaseTest):
    def init(self):
        self.elasticsearch_url = self.get_elasticsearch_url()
        print("Using elasticsearch: {}".format(self.elasticsearch_url))
        self.es = Elasticsearch([self.elasticsearch_url])
        logging.getLogger("urllib3").setLevel(logging.WARNING)
        logging.getLogger("elasticsearch").setLevel(logging.ERROR)

        self.modules_path = os.path.abspath(self.working_dir +
                                            "/../../../../module")

        self.filebeat = os.path.abspath(self.working_dir +
                                        "/../../../../filebeat.test")

        self.index_name = "test-filebeat-modules"

        body = {"transient": {"script.max_compilations_rate": "2000/1m"}}

        self.es.transport.perform_request('PUT',
                                          "/_cluster/settings",
                                          body=body)
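        # (the same transient setting could also be applied via
        # self.es.cluster.put_settings(body=body))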

    @parameterized.expand(load_fileset_test_cases)
    @unittest.skipIf(
        not INTEGRATION_TESTS,
        "integration tests are disabled, run with INTEGRATION_TESTS=1 to enable them."
    )
    @unittest.skipIf(
        os.getenv("TESTING_ENVIRONMENT") == "2x",
        "integration test not available on 2.x")
    def test_fileset_file(self, module, fileset, test_file):
        self.init()

        # generate a minimal configuration
        cfgfile = os.path.join(self.working_dir, "filebeat.yml")
        self.render_config_template(
            template_name="filebeat_modules",
            output=cfgfile,
            index_name=self.index_name,
            elasticsearch_url=self.elasticsearch_url,
        )

        self.run_on_file(module=module,
                         fileset=fileset,
                         test_file=test_file,
                         cfgfile=cfgfile)

    def run_on_file(self, module, fileset, test_file, cfgfile):
        print("Testing {}/{} on {}".format(module, fileset, test_file))

        try:
            self.es.indices.delete(index=self.index_name)
        except:
            pass
        self.wait_until(lambda: not self.es.indices.exists(self.index_name))

        cmd = [
            self.filebeat,
            "-systemTest",
            "-e",
            "-d",
            "*",
            "-once",
            "-c",
            cfgfile,
            "-E",
            "setup.ilm.enabled=false",
            "-modules={}".format(module),
            "-M",
            "{module}.*.enabled=false".format(module=module),
            "-M",
            "{module}.{fileset}.enabled=true".format(module=module,
                                                     fileset=fileset),
            "-M",
            "{module}.{fileset}.var.input=file".format(module=module,
                                                       fileset=fileset),
            "-M",
            "{module}.{fileset}.var.paths=[{test_file}]".format(
                module=module, fileset=fileset, test_file=test_file),
            "-M",
            "*.*.input.close_eof=true",
        ]

        # By convention, if the test file name contains -json, the JSON format is needed. Currently used for LS.
        if "-json" in test_file:
            cmd.append("-M")
            cmd.append("{module}.{fileset}.var.format=json".format(
                module=module, fileset=fileset))

        output_path = os.path.join(self.working_dir)
        output = open(os.path.join(output_path, "output.log"), "ab")
        output.write(bytes(" ".join(cmd) + "\n", "utf-8"))

        # Use a fixed timezone so results don't vary depending on the environment.
        # Don't use UTC, to avoid hiding cases where non-UTC timezones are not converted as needed;
        # this can happen because UTC tends to be the default timezone in date parsers when no other
        # timezone is specified.
        local_env = os.environ.copy()
        local_env["TZ"] = 'Etc/GMT+2'

        subprocess.Popen(cmd,
                         env=local_env,
                         stdin=None,
                         stdout=output,
                         stderr=subprocess.STDOUT,
                         bufsize=0).wait()

        # Make sure index exists
        self.wait_until(lambda: self.es.indices.exists(self.index_name))

        self.es.indices.refresh(index=self.index_name)
        # Loads the first 100 events to be checked
        res = self.es.search(index=self.index_name,
                             body={
                                 "query": {
                                     "match_all": {}
                                 },
                                 "size": 100,
                                 "sort": {
                                     "log.offset": {
                                         "order": "asc"
                                     }
                                 }
                             })
        objects = [o["_source"] for o in res["hits"]["hits"]]
        assert len(objects) > 0
        for obj in objects:
            assert obj["event"][
                "module"] == module, "expected event.module={} but got {}".format(
                    module, obj["event"]["module"])

            assert "error" not in obj, "not error expected but got: {}".format(
                obj)

            if (module == "auditd" and fileset == "log") \
                    or (module == "osquery" and fileset == "result"):
                # There are dynamic fields that are not documented.
                pass
            else:
                self.assert_fields_are_documented(obj)

        self._test_expected_events(test_file, objects)

    def _test_expected_events(self, test_file, objects):

        # Generate expected files if GENERATE env variable is set
        if os.getenv("GENERATE"):
            with open(test_file + "-expected.json", 'w') as f:
                # Flatten and clean up objects.
                # This makes sure the expected.json stays the same when generated on different machines / versions.
                for k, obj in enumerate(objects):
                    objects[k] = self.flatten_object(obj, {}, "")
                    clean_keys(objects[k])

                json.dump(objects,
                          f,
                          indent=4,
                          separators=(',', ': '),
                          sort_keys=True)

        with open(test_file + "-expected.json", "r") as f:
            expected = json.load(f)

        assert len(expected) == len(
            objects), "expected {} events to compare but got {}".format(
                len(expected), len(objects))

        for ev in expected:
            clean_keys(ev)
            found = False
            for obj in objects:

                # Flatten objects for easier comparing
                obj = self.flatten_object(obj, {}, "")
                clean_keys(obj)

                if ev == obj:
                    found = True
                    break

            assert found, "The following expected object was not found:\n {}\nSearched in: \n{}".format(
                pretty_json(ev), pretty_json(objects))
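
# Not part of the original test: clean_keys, pretty_json and
# load_fileset_test_cases are helpers defined elsewhere in the module. Purely
# for illustration, hypothetical minimal versions of the two helpers used in
# the comparison above might look like this (the real implementations may well
# strip a different set of fields):
import json


def pretty_json(obj):
    # Hypothetical: stable, readable rendering for assertion messages.
    return json.dumps(obj, indent=4, separators=(',', ': '), sort_keys=True)


def clean_keys(obj):
    # Hypothetical: drop flattened fields that vary between runs (timestamps,
    # host/agent metadata, file offsets) so generated and expected events can
    # be compared field by field.
    volatile_prefixes = ("agent.", "host.", "log.file.")
    for key in list(obj.keys()):
        if key.startswith(volatile_prefixes) or key in ("@timestamp",
                                                        "event.created",
                                                        "log.offset"):
            del obj[key]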