def autofocus(parse_test):
    # Assume that we have already parsed and have a good database.
    fields = ['tag_group', 'public_tag_name', 'tag_class', 'description', 'tag']
    # Try processing 10 times. That's probably enough.
    # for _ in range(10):
    process_domains()

    processed_search = Search(index=f"content_{parse_test.version}").query('match', processed=2)
    processed_search.execute()
    for hit in processed_search.scan():
        # Check that each of these specific cases has some information in the database.
        for field in fields:
            assert hit[field] is not None, f"Domain {hit['domain']} missing field {field}."
    # Search.count() returns a plain int, so take it once here rather than
    # accumulating a dict lookup per hit as the original did.
    num_processed = processed_search.count()

    partly_processed_search = Search(index=f"content_{parse_test.version}").query('match', processed=1)
    num_processed += partly_processed_search.count()

    # Count non-generic domains (the domains which should have been processed).
    non_generic_search = Search(index=f"content_{parse_test.version}").exclude('term', header__keyword='generic')
    num_non_generic = non_generic_search.count()

    # Check what percentage of the domains have been processed.
    logging.info(f"Processed {num_processed} out of {num_non_generic}.")
    percent_processed = float(num_processed) / float(num_non_generic)
    logging.info(f"Processed {percent_processed * 100}% of domains.")
    assert percent_processed >= parse_test.percent_processed, (
        f"Processed only {percent_processed * 100}% "
        f"of domains, not {parse_test.percent_processed * 100}%.")

def queryTopic(topic, days=None, sent=None):
    s = Search(using=client, index=collectionName)
    if days and days != "-1":
        days = int(days)
        now = int(time.time())
        last = now - days * day  # `day` = seconds per day, defined at module level
        s = s.filter('range', date={'gte': last, 'lte': now})
    if sent and sent != "-1":
        s = s.filter('term', class_code=int(sent))
    q = Q('query_string', query=topic, default_field="topics.keyword")
    s = s.query(q)
    # Cap the result window at 100 hits.
    total = min(s.count(), 100)
    s = s[0:total]
    results = s.execute()
    return results

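# The count-then-slice idiom above recurs throughout this collection: slicing
# a Search maps onto the `from`/`size` parameters of the request body, so
# `s[0:total]` fetches at most `total` hits in one round trip. A minimal,
# hedged sketch of the same idea as a reusable helper; `client` and the index
# name below are illustrative assumptions, not part of the original code.
def fetch_capped(search, cap=100):
    """Execute `search` and return at most `cap` hits as a list."""
    total = min(search.count(), cap)
    return list(search[0:total])

# Hypothetical usage:
# hits = fetch_capped(Search(using=client, index="news").query("match", topics="energy"))
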
def handle(self, *args, **options):
    # Initialise the Elasticsearch client.
    es = Elasticsearch(settings.ELASTIC_HOST, timeout=settings.ELASTIC_TIMEOUT)

    # Invention applications that have I_43.D set.
    query = Q(
        "match",
        Document__idObjType=1,
    ) & ~Q('query_string', query="_exists_:Claim.I_11") & Q(
        'query_string', query="_exists_:Claim.I_43.D AND NOT _exists_:Claim.I_43_bul_str")
    s = Search().using(es).query(query)
    c = s.count()
    i = 0
    for h in s.scan():
        body = h.to_dict()
        i += 1
        print(f"apps: {i}/{c} - {body['Claim']['I_21']} - {h.meta.id}")
        i_43_d = body['Claim']['I_43.D'][0]
        bulletin = ClListOfficialBulletinsIp.objects.get(bul_date=i_43_d)
        bull_str = f"{bulletin.bul_number}/{bulletin.bul_date.year}"
        body['Claim']['I_43_bul_str'] = bull_str
        es.index(index=settings.ELASTIC_INDEX_NAME,
                 doc_type='_doc',
                 id=h.meta.id,
                 body=body,
                 request_timeout=30)

    # Invention patents and utility models that have I_45.D set.
    query = Q(
        'query_string',
        query="_exists_:Patent.I_45.D AND NOT _exists_:Patent.I_45_bul_str"
    )
    s = Search().using(es).query(query)
    c = s.count()
    i = 0
    for h in s.scan():
        body = h.to_dict()
        i += 1
        print(f"pr. docs: {i}/{c} - {body['Patent']['I_21']} - {h.meta.id}")
        i_45_d = body['Patent']['I_45.D'][-1]  # the most recent publication date
        bulletin = ClListOfficialBulletinsIp.objects.get(bul_date=i_45_d)
        bull_str = f"{bulletin.bul_number}/{bulletin.bul_date.year}"
        body['Patent']['I_45_bul_str'] = bull_str
        es.index(index=settings.ELASTIC_INDEX_NAME,
                 doc_type='_doc',
                 id=h.meta.id,
                 body=body,
                 request_timeout=30)

    self.stdout.write(self.style.SUCCESS('Finished'))

def stats_processor(request):
    s = Search(index=CATALOG_INDICES)
    # res = s.params(search_type="count").aggs.metric(
    #     "distinct_names", "cardinality", field="full_name").execute()
    return {
        'total_declarations': s.count(),
        # Placeholder: duplicates the declaration count until the cardinality
        # aggregation above is re-enabled.
        'total_persons': s.count()  # res.aggregations.distinct_names.value
    }

def get(self, request):
    def dynamics_list(d):
        res = [0 if d[i] == 0 else int((d[i + 1] - d[i]) / d[i] * 100)
               for i in range(len(d) - 1)]
        res.insert(0, 0)
        return res

    date_range = [request.query_params.get('date_from') + '-01',
                  request.query_params.get('date_to') + '-01']
    date_range = [date(int(a[:4]), int(a[5:7]), 1) for a in date_range]
    category = request.query_params.get('category')  # renamed from `type` to avoid shadowing the builtin
    s = Search(index='statistic').params(request_timeout=100)
    tnved = request.query_params.get('tnved')
    start_tnved_list = int(request.query_params.get('start'))
    length_tnved_list = int(request.query_params.get('length'))
    label_list = []
    netto_list = []
    stoim_list = []
    if not tnved:
        two_tnved_list_request = StatisticDataDocument.search()
        two_tnved_list_request.aggs.bucket('a', 'terms', field='tnved_two', size=200)
        result = two_tnved_list_request.execute()
        tnved_two_distinct = [item.key for item in result.aggregations.a.buckets]
        tnved_two_distinct.reverse()
        for i in tnved_two_distinct[start_tnved_list:start_tnved_list + length_tnved_list]:
            s.query = Q('bool', must=[Q('match', napr=category),
                                      Q('match', tnved_two=i),
                                      Q('range', period={'gte': date_range[0],
                                                         'lt': date_range[1]})])
            s.aggs.metric('stoim', 'sum', field='stoim')
            s.aggs.metric('netto', 'sum', field='netto')
            result = s[:s.count()].execute().aggregations
            label_list.append(i)
            netto_list.append(result['netto']['value'])
            stoim_list.append(result['stoim']['value'])
    else:
        tnved_distinct = [i[tnved_dict[len(tnved) + 2]] for i in
                          StatisticData.objects.filter(**{tnved_dict[len(tnved)]: tnved}).values(
                              tnved_dict[len(tnved) + 2]).distinct()]
        for i in tnved_distinct[start_tnved_list:start_tnved_list + length_tnved_list]:
            tnved_query_field = {tnved_dict[len(tnved) + 2]: i}
            s.query = Q('bool', must=[Q('match', napr=category),
                                      Q('match', **tnved_query_field),
                                      Q('range', period={'gte': date_range[0],
                                                         'lt': date_range[1]})])
            s.aggs.metric('stoim', 'sum', field='stoim')
            s.aggs.metric('netto', 'sum', field='netto')
            result = s[:s.count()].execute().aggregations
            label_list.append(i)
            netto_list.append(result['netto']['value'])
            stoim_list.append(result['stoim']['value'])

    context = {
        'labels': label_list,
        'netto': [netto_list, dynamics_list(netto_list)],
        'cost': [stoim_list, dynamics_list(stoim_list)]
    }
    return JsonResponse(context)

def _get_es_conn(self):
    try:
        kwargs = dict(
            hosts=['localhost'],
            port=9200,
            use_ssl=False,
        )
        client = Elasticsearch(**kwargs)
        es_conn = Search(using=client, index="geonames")
        es_conn.count()  # cheap round trip to verify the connection works
        return es_conn
    except Exception as exc:
        # A bare `except:` would also swallow KeyboardInterrupt/SystemExit.
        raise ConnectionError("Error establishing connection with ES container") from exc

def admin_loc(lat, long):
    geonameid = id_lat_long(lat, long)
    request1 = ('SELECT admin1_code, admin2_code, admin3_code, admin4_code '
                'FROM geoname WHERE "geonameid"=%s')
    session1 = cluster.execute(request1, [int(geonameid)])
    colonne = session1.one()
    if colonne.admin4_code is not None:
        result = [colonne.admin1_code, colonne.admin2_code,
                  colonne.admin3_code, colonne.admin4_code]
    elif colonne.admin3_code is not None:
        result = [colonne.admin1_code, colonne.admin2_code, colonne.admin3_code]
    elif colonne.admin2_code is not None:
        result = [colonne.admin1_code, colonne.admin2_code]
    else:
        result = [colonne.admin1_code]

    search = Search(index="geoname").using(client)
    search = search.query('match', admin1_code=result[0])
    geoname_tab = []
    # Cap the result window at 1000 hits.
    it = min(search.count(), 1000)
    for hit in search[0:it]:
        coords_1 = (lat, long)
        coords_2 = (hit.latitude, hit.longitude)
        geoname_tab.append({
            'geonameid': hit.geonameid,
            'asciiname': hit.asciiname,
            'latitude': hit.latitude,
            'longitude': hit.longitude,
            'distance': distance.geodesic(coords_1, coords_2).km
        })
    geoname_tab = sorted(geoname_tab, key=lambda k: k['distance'])
    return geoname_tab

# for ti in admin_loc(43.82512, 1.72382)[0:10]:
#     print(ti)

def get_usernames_for_crawl():
    ms = MultiSearch(index='populars')

    q = Q({"bool": {"must_not": {"exists": {"field": "last_update"}}}})
    never_updated = Search().query(q)
    total = never_updated.count()
    never_updated = never_updated[0:total]

    old_updated = Search().query('range', last_update={"lte": "now-2d"})
    total = old_updated.count()
    old_updated = old_updated[0:total]

    ms = ms.add(never_updated)
    ms = ms.add(old_updated)
    responses = ms.execute()
    for res in responses:
        for hit in res:
            yield hit.username

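# MultiSearch, as used above, batches several Search objects into a single
# _msearch call and yields one response per added search, in the order they
# were added. A minimal sketch of the pattern, assuming a default connection
# has been registered via elasticsearch_dsl.connections; the field names here
# are hypothetical.
from elasticsearch_dsl import MultiSearch, Search

ms = MultiSearch(index='populars')
ms = ms.add(Search().filter('term', verified=True))        # -> responses[0]
ms = ms.add(Search().query('match', bio='photographer'))   # -> responses[1]
# responses = ms.execute()
# for response in responses:
#     for hit in response:
#         print(hit.meta.id)
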
def get_articles(w, keywords, index):
    """
    Get article details from ES; each returned hit carries a single DOI.

    :w: str, concept name
    :keywords: str, ontology keywords
    :index: str
    :return: list of dict
    """
    review_list = [("review", 1),
                   ("feature article", .5)
                   # ("survey", .5)
                   ]
    should_k = [
        Q('multi_match', query=rw[0], fields=['title', "abstract"], boost=rw[1])
        for rw in review_list if rw[0] != w
    ]
    if keywords:
        keywords = [kw for kw in keywords.split(',') if kw != w]
        should_k += [Q('multi_match', query=kw, boost=2) for kw in keywords]
    query = Q('bool',
              must=[Q('multi_match', query=w)],
              should=should_k,
              minimum_should_match=1 if keywords else None)
    request = Search(index=index)
    request = request.query(query)
    request = request.sort("_score")
    request = request.source([
        'DOI', 'title', 'URL', 'authors', 'abstract', 'provider',
        'provider_id', 'publication_date'
    ])
    request.execute()  # result discarded; the sliced searches below re-execute
    # Elasticsearch refuses result windows beyond 10,000 hits by default.
    if request.count() < 10000:
        results = [hit.to_dict() for hit in request[:request.count()]]
    else:
        results = [hit.to_dict() for hit in request[:9999]]
    for hit in results:
        hit['DOI'] = get_url(hit['DOI']) if hit.get('DOI') else hit.get('URL')
        hit['review'] = any(rw[0] in hit.get('title', '').lower()
                            for rw in review_list)
    return results

def documents_by_text(self, grouped_targets: dict, queries: list,
                      from_index: int, size: int) -> tuple:
    """ Paginated documents found by text. """
    # For pagination/score sorting to work, we need to query all the different
    # corpus indices in the same Elasticsearch query.
    # We are using the grouped target approach like search documents by
    # annotations, even though buckets are inconsequential for text search.
    indices = self.target_text_document_indices(grouped_targets)
    indices_argument = ','.join(indices)

    language_manager = get_language_manager()
    match_queries = [to_match_query(language_manager, query) for query in queries]
    grouped_queries = self.group_queries_by_operator(match_queries)

    # A query language restriction, if present, will work automatically via
    # the query text.<language> mapping.
    es = get_es_conn()
    search = Search(using=es, index=indices_argument)
    search = search.source(["title", "language", "source"])
    search.query = Q('bool',
                     must=grouped_queries["must"],
                     must_not=grouped_queries["must_not"],
                     should=grouped_queries["should"])
    search = search[from_index:from_index + size]
    count = search.count()
    documents = [self.map_hit_with_score(hit) for hit in search]
    return count, documents

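# The bool query assigned above combines clauses by operator: every `must`
# clause has to match, `must_not` excludes, and `should` only boosts scoring
# unless minimum_should_match is set. A small sketch with hypothetical field
# names showing the raw body elasticsearch-dsl produces:
from elasticsearch_dsl import Q

bool_query = Q('bool',
               must=[Q('match', title='climate')],
               must_not=[Q('term', language='de')],
               should=[Q('match', abstract='policy')])
# print(bool_query.to_dict())  # the dict sent as the "query" section
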
def get_company_by_paizhao_name(paizhao_name):
    companys_search = Search().using(es).index('paizhao').query('match', paizhao_name=paizhao_name)
    count_num = companys_search.count()
    print(count_num)
    companys = companys_search[0:count_num]
    for one_company in companys:
        print(one_company.jigouquancheng)

def _getCount(self):
    client = Elasticsearch()
    client.indices.refresh(index='gracc.osg.summary*')
    s = Search(using=client, index='gracc.osg.summary*') \
        .filter('range', **{'EndTime': {'from': '2016-01-01', 'to': '2017-01-01'}})
    return s.count()

def retrieve_gurus_tweets(self, current_year=True):
    """
    Returns a dictionary containing all tweets for each guru in elasticsearch
    for a time window.

    :return: dictionary containing all tweets for each guru in elasticsearch
        for a time window
    """
    gurus_dict = dict()
    gurus_entries = Search(
        using=self.client, index=self.GURUS_INDEX
    ).source(
        include=["body.text", "body.user.screen_name", "body.created_at"])
    if current_year:
        gurus_entries = gurus_entries.filter(
            'bool',
            must={
                'query_string': {
                    'default_field': 'body.created_at',
                    "query": f"""*{gmtime().tm_year}"""
                }
            })
    # scan() streams every matching document, so the slice is irrelevant here.
    for entry in gurus_entries[0:gurus_entries.count()].scan():
        user = entry.body["user"]["screen_name"]
        text = entry.body["text"]
        # Append in both branches; the original created an empty list for a
        # new user and never appended, silently dropping the first tweet.
        gurus_dict.setdefault(user, []).append(text)
    return gurus_dict

def load_filtered_top_ko_mutations_genes(filters, start=0, size=50):
    """Retrieves top genes according to number of KO mutations and filters them
    through the tickable options"""
    # First aggregate over associations.
    s = Search(using=es, doc_type='ko_associations')
    if 'chr' in filters and len(filters['chr']) > 0 and len(filters['chr']) < 5:
        s = s.filter(Q('bool', should=[
            Q({'nested': {'path': 'gene',
                          'query': {'match': {'gene.chr': chrom if len(chrom) > 3 else 'chr%s' % chrom}}}})
            for chrom in filters['chr']]))
    if 'significant' in filters:
        s = s.filter(Q('range', mac={'gte': 6}))
        s = s.filter('term', overBonferroni='T')
        # TODO: change this to permutation once the new indexed scores are in.
    agg = A("terms", field="gene.id", size=33341)  # need to check ALL genes for further lists
    # Needs to be a NESTED query.
    s.aggs.bucket('genes', 'nested', path='gene').bucket('gene_count', agg)
    top_genes = s.execute().aggregations.genes.gene_count.buckets
    # The KO associations are already retrieved, just need to assign them to the right gene.
    association_dict = defaultdict(list)
    for asso in s[0:s.count()].execute().to_dict()['hits']['hits']:
        association_dict[asso['_source']['gene']['name']].append(asso['_source'])
    genes = []
    for top in top_genes[start:start + size]:
        gene_id = top['key']  # renamed from `id` to avoid shadowing the builtin
        matches = GENE_ID_PATTERN.match(gene_id)
        if not matches:
            continue
        gene = load_gene_by_id(top['key'])
        gene['n_hits'] = top['doc_count']
        gene['ko_associations'] = association_dict[top['key']]
        genes.append(gene)
    return genes, len(top_genes)

def get_all():
    # This route is a temporary hack for the WMS service. API Key is not real
    # security, just something to try to prevent random bots from getting a response.
    s = Search(using=client, index='imagery', doc_type="metadata")
    total = s.count()
    s = s[0:total]
    response = s.execute()
    return response.to_dict()

def get_all():
    client = Elasticsearch()
    s = Search(using=client, index='courses')
    count = s.count()
    result = s[0:count].execute()['hits']['hits']
    result = sorted(result, key=lambda x: int(x["_id"]))
    return result

def search_all(es_conn, index):
    responses = Search(using=es_conn, index=INDEX_NAMES[index]).query(
        "match", code=DEFAULT_QUERIES[index])
    count = responses.count()
    result = responses[0:count].execute().to_dict()
    res = {'response': result['hits']['hits']}
    return JsonResponse(res)

def test_record_page(app, db, es, event_queues, full_record):
    """Test record page views."""
    full_record['conceptdoi'] = '10.1234/foo.concept'
    full_record['conceptrecid'] = 'foo.concept'
    r = Record.create(full_record)
    PersistentIdentifier.create(
        'recid', '12345', object_type='rec', object_uuid=r.id,
        status=PIDStatus.REGISTERED)
    db.session.commit()

    with app.test_client() as client:
        record_url = url_for('invenio_records_ui.recid', pid_value='12345')
        assert client.get(record_url).status_code == 200

    process_events(['record-view'])
    current_search.flush_and_refresh(index='events-stats-record-view')
    search = Search(using=es, index='events-stats-record-view')
    assert search.count() == 1
    doc = search.execute()[0]
    assert doc['doi'] == '10.1234/foo.bar'
    assert doc['conceptdoi'] == '10.1234/foo.concept'
    assert doc['recid'] == '12345'
    assert doc['conceptrecid'] == 'foo.concept'
    assert doc['resource_type'] == {'type': 'publication', 'subtype': 'book'}
    assert doc['access_right'] == 'open'
    assert doc['communities'] == ['zenodo']
    assert doc['owners'] == [1]

def test_file_download(app, db, es, event_queues, record_with_files_creation):
    """Test file download views."""
    recid, record, _ = record_with_files_creation
    record['conceptdoi'] = '10.1234/foo.concept'
    record['conceptrecid'] = 'foo.concept'
    record.commit()
    db.session.commit()

    with app.test_client() as client:
        file_url = url_for(
            'invenio_records_ui.recid_files',
            pid_value=recid.pid_value,
            filename='Test.pdf',
        )
        assert client.get(file_url).status_code == 200

    process_events(['file-download'])
    current_search.flush_and_refresh(index='events-stats-file-download')
    search = Search(using=es, index='events-stats-file-download')
    assert search.count() == 1
    doc = search.execute()[0]
    assert doc['doi'] == '10.1234/foo.bar'
    assert doc['conceptdoi'] == '10.1234/foo.concept'
    assert doc['recid'] == '12345'
    assert doc['conceptrecid'] == 'foo.concept'
    assert doc['resource_type'] == {'type': 'publication', 'subtype': 'book'}
    assert doc['access_right'] == 'open'
    assert doc['communities'] == ['zenodo']
    assert doc['owners'] == [1]

def get_stock_quotes_hist_from_es(symbol: str = 'LPG',
                                  es: elasticsearch.Elasticsearch = ELS_CLIENT):
    """
    Search Elasticsearch for the quotes.

    :param symbol:
    :param es:
    :return:
    """
    init_stock_quotes_hist_idx(es)
    s = Search(using=es, index="stock_quotes_hist") \
        .filter("term", symbol=symbol) \
        .sort({"as_of_date": {"order": "asc"}}) \
        .params(request_timeout=300)
    if s.count() == 0:
        return
    for hit in s.scan():
        yield {
            "symbol": hit.symbol,
            "as_of_date": hit.as_of_date,
            "close": hit.close,
            "volume": hit.volume
        }

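# Search.scan() above streams every matching hit through the scroll API, which
# avoids the default 10,000-hit window that plain slicing is subject to. One
# caveat: by default scan() does not honour the sort defined on the Search
# (elasticsearch-dsl exposes params(preserve_order=True) for that, at a
# performance cost), so a consumer may need to re-sort. A hedged usage sketch:
# quotes = list(get_stock_quotes_hist_from_es(symbol='LPG'))
# quotes.sort(key=lambda q: q["as_of_date"])  # restore ascending date order
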
def read_processed_events(client, start_dt, end_time, index):
    start_time = start_dt.strftime('%Y-%m-%dT%H:%M:%S')
    print('Querying for data starting with ' + start_time)
    s = Search(using=client, index=index) \
        .filter('range', **{'@timestamp': {'gt': start_time, 'lt': end_time}})
    s = s[:s.count()]
    try:
        response = s.execute()
        if not response.success():
            # A bare `raise` here would fail: there is no active exception.
            raise RuntimeError('Elasticsearch returned a partial or failed response')
    except Exception as e:
        print(e, 'Error accessing Elasticsearch')
        sys.exit(1)

    # Execute once and reuse the hits instead of re-running the search
    # via repeated list(s) calls.
    hits = response.to_dict()['hits']['hits']
    if not hits:
        return []
    print(len(hits))
    data = [entry['_source'] for entry in hits]
    return data

def retrieve_elastic(d_end, d_start, ap_mac="", ue_mac="", elastic_host="localhost:9200"):
    client = Elasticsearch(
        [elastic_host],
        scheme="http",
        port=9200,
    )
    s = Search(using=client, index="probe_clients")
    # querry_time = 1600
    # d_end = datetime.datetime.now()
    # d_start = d_end - datetime.timedelta(minutes=querry_time)
    if ap_mac != "":
        s = s.query("match", ap_mac=ap_mac)
    if ue_mac != "":
        s = s.query("match", mac=ue_mac)
    s = s.filter('range', timestamp={
        'gte': d_start,
        'lt': d_end
    }).sort('-timestamp')
    total = s.count()
    s = s[0:total]
    response = s.execute()
    # print(len(response))
    # for hit in response:
    #     print("ap_mac: {}, mac: {}, rssi: {}, time: {}".format(hit.ap_mac, hit.mac, hit.rssi, hit.timestamp))
    return response

async def check_es(request):
    """Check whether the Elasticsearch connection is configured correctly."""
    domain_name = request.json.get("domain")
    time_out = 300
    try:
        es = Elasticsearch(ES_HOST[0],
                           http_auth=(ES_HOST[1], ES_HOST[2]),
                           timeout=time_out)
        time_point = time.mktime(datetime.datetime.now().replace(
            second=0, microsecond=0).timetuple())
        lostashindextimestamp = int(time_point - 300)
        date_obj = datetime.datetime.fromtimestamp(
            lostashindextimestamp) - datetime.timedelta(hours=8)
        logstashindex = '%s%s' % ('domainip-', date_obj.strftime("%Y.%m.%d.%H"))
        filt = Q("match", msecRegion=lostashindextimestamp) & Q(
            "match", domain=domain_name)
        s = Search(using=es, index=logstashindex).query(filt)
        response = s.count()
        return_code = 0
        if response > 0:
            result = 0  # ES is already configured
        else:
            result = 1  # ES is not configured
    except Exception as e:
        logger.error(f'check es: {e}')
        return_code = -1
        result = -1  # system error
    ret = {"es_stat": result, "return_code": return_code}
    return json(ret)

def es_read(self, log_id, offset):
    """
    Returns the logs matching log_id in Elasticsearch and next offset.
    Returns '' if no log is found or there was an error.

    :param log_id: the log_id of the log to read.
    :type log_id: str
    :param offset: the offset start to read log from.
    :type offset: str
    """
    # Offset is the unique key for sorting logs given log_id.
    s = Search(using=self.client) \
        .query('match_phrase', log_id=log_id) \
        .sort('offset')
    s = s.filter('range', offset={'gt': offset})

    logs = []
    if s.count() != 0:
        try:
            logs = s[self.MAX_LINE_PER_PAGE * self.PAGE:self.MAX_LINE_PER_PAGE] \
                .execute()
        except Exception as e:
            msg = 'Could not read log with log_id: {}, ' \
                  'error: {}'.format(log_id, str(e))
            self.log.exception(msg)
    return logs

def search_by_date(time_begin, time_end, article_id=None, agent=None,
                   country=None, city=None, poll_id=None, widget=False):
    try:
        client = Elasticsearch([settings.ELASTICSEARCH_URL])
        s = Search(using=client, index="userstat")
        query_list = []
        query_list.append(
            Q('range', timestamp={
                'gte': time_begin,
                'lt': time_end
            }))
        if agent is not None:
            query_list.append(Q('match', agent=agent))
        if country:
            query_list.append(Q('match', country=country))
        if city:
            query_list.append(Q('match', city=city))
        if poll_id:
            query_list.append(Q('match', poll=poll_id))
        if article_id:
            query_list.append(Q('match', article=article_id))
        if widget:
            query_list.append(Q('match', widget=widget))
        query = reduce(operator.and_, query_list)

        poll_created_filter = Q('match', poll_created=True)
        poll_viewed_filter = Q('match', poll_viewed=True)
        poll_answered_filter = Q('match', poll_answered=True)
        user_created_filter = Q('match', user_created=True)

        poll_count = s.query(
            (query & Q('bool', filter=[poll_created_filter]))).count()
        view_count = s.query(
            (query & Q('bool', filter=[poll_viewed_filter]))).count()
        user_count = s.query(
            (query & Q('bool', filter=[user_created_filter]))).count()

        s = s.query((query & Q('bool', filter=[poll_answered_filter])))
        answer_count = s.count()
        a = A('cardinality', field='user')
        s.aggs.bucket('unique_users', a)
        s = s.execute()
        return {
            'poll_count': poll_count,
            'view_count': view_count,
            'answer_count': answer_count,
            'user_count': user_count,
            'respondents_count': s.aggregations.unique_users['value']
        }
    except Exception:
        return None

def es_read(self, log_id, offset, metadata):
    """
    Returns the logs matching log_id in Elasticsearch and next offset.
    Returns '' if no log is found or there was an error.

    :param log_id: the log_id of the log to read.
    :type log_id: str
    :param offset: the offset start to read log from.
    :type offset: str
    :param metadata: log metadata, used for streaming log download.
    :type metadata: dict
    """
    # Offset is the unique key for sorting logs given log_id.
    s = Search(using=self.client) \
        .query('match_phrase', log_id=log_id) \
        .sort('offset')
    s = s.filter('range', offset={'gt': int(offset)})
    max_log_line = s.count()
    if 'download_logs' in metadata and metadata['download_logs'] and 'max_offset' not in metadata:
        try:
            metadata['max_offset'] = s[max_log_line - 1].execute()[-1].offset if max_log_line > 0 else 0
        except Exception:
            self.log.exception('Could not get current log size with log_id: {}'.format(log_id))

    logs = []
    if max_log_line != 0:
        try:
            logs = s[self.MAX_LINE_PER_PAGE * self.PAGE:self.MAX_LINE_PER_PAGE] \
                .execute()
        except Exception as e:
            self.log.exception('Could not read log with log_id: %s, error: %s', log_id, str(e))
    return logs

def count(self):
    """ Returns the number of indexed tweets """
    search = Search(using=self.es, index='tweets')
    # count() issues its own _count request; the original executed the full
    # search first and threw the response away.
    return search.count()

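# Search.count() resolves to the _count API and returns an int without
# transferring any documents, whereas execute() runs the full search. When
# only a number is needed, count() is the cheaper call. A minimal sketch
# (index name assumed, execution commented out):
from elasticsearch_dsl import Search

s = Search(index='tweets')
# n = s.count()             # one _count round trip, no hits fetched
# response = s.execute()    # full search; hits.total carries the same number
#                           # (an object with .value on Elasticsearch 7+)
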
def query_event_ids(self):
    es_query = []
    es_query.append({'match': {'winlog.provider_name': MICROSOFT_WINDOWS_DNSCLIENT_PROVIDER_NAME}})
    query = Q({'bool': {'must': es_query}})

    s = Search(using=self.Client, index="winlogbeat-*").query(query)
    # source() returns a new Search, so the result must be reassigned;
    # the original discarded it.
    s = s.source(includes=['winlog.provider_name', 'winlog.event_id'])

    count = s.count()
    print("Count: %d" % (count))

    event_ids = {}
    i = 0
    try:
        for hit in s.scan():
            print('%d. %d' % (i, hit.winlog.event_id))
            if hit.winlog.event_id not in event_ids:
                event_ids[hit.winlog.event_id] = 1
                print("%s: %d" % (hit.winlog.provider_name, hit.winlog.event_id))
            else:
                event_ids[hit.winlog.event_id] += 1
            i += 1
    except Exception:
        traceback.print_exc()

def es_read(self, log_id, offset):
    """
    Returns the logs matching log_id in Elasticsearch and next offset.
    Returns '' if no log is found or there was an error.

    :param log_id: the log_id of the log to read.
    :type log_id: str
    :param offset: the offset start to read log from.
    :type offset: str
    """
    # Offset is the unique key for sorting logs given log_id.
    s = Search(using=self.client) \
        .query('match', log_id=log_id) \
        .sort('offset')
    s = s.filter('range', offset={'gt': offset})

    logs = []
    if s.count() != 0:
        try:
            logs = s[self.MAX_LINE_PER_PAGE * self.PAGE:self.MAX_LINE_PER_PAGE] \
                .execute()
        except Exception as e:
            msg = 'Could not read log with log_id: {}, ' \
                  'error: {}'.format(log_id, str(e))
            self.log.exception(msg)
    return logs

def listAllDontCrawled():
    linkscrawledList = []
    try:
        s = Search(using=client, index=INDEX).filter("term", crawled=False)
        count = s.count()
        results = s[0:count].execute()
        for link in results:
            print(link.meta.id, link.url, link.type, link.crawled)
            auxlink = Link()
            auxlink.id = link.meta.id
            auxlink.url = link.url
            auxlink.type = link.type
            auxlink.crawled = link.crawled
            auxlink.text = link.text
            linktmp = searchByIndex(auxlink.id)
            if not linktmp:
                # Skip missing documents; the original fell through with
                # `pass` and would have raised AttributeError on None.
                continue
            linktmp.crawled = True
            linktmp.save()
            linkscrawledList.append(auxlink)
        return linkscrawledList
    except Exception:
        return linkscrawledList

def get_bug_count(doc_type, key):
    payload = {'query': {'bool': {'must': [{'exists': {'field': 'bugs.{0}'.format(key)}}]}},
               'from': 0,
               'size': 20,
               'sort': {'published_from': {'order': 'desc'}}}
    s = Search(using=es, index='w12scan', doc_type=doc_type)
    # from_dict() is a classmethod that builds a fresh Search and would drop
    # the client and index bound above; update_from_dict() mutates in place.
    s.update_from_dict(payload)
    res = s.count()
    return res

def count_inbound_refs(
    es_client: Any,
    release_ident: Optional[str] = None,
    work_ident: Optional[str] = None,
    openlibrary_work: Optional[str] = None,
    url: Optional[str] = None,
    filter_stage: List[str] = [],
    es_index: str = "fatcat_ref",
) -> int:
    """
    Same parameters as get_inbound_refs(), but returns just a count
    """
    search = Search(using=es_client, index=es_index)

    if release_ident:
        search = search.filter("term", target_release_ident=release_ident)
    elif work_ident:
        search = search.filter("term", target_work_ident=work_ident)
    elif openlibrary_work:
        search = search.filter("term", target_openlibrary_work=openlibrary_work)
    else:
        raise ValueError("require a lookup key")

    if filter_stage:
        search = search.filter("term", source_stage=filter_stage)

    return search.count()

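# count_inbound_refs() never materialises documents: the term filters narrow
# the query and Search.count() resolves it with a single _count request. A
# hedged usage sketch; the host and ident value are placeholders, not values
# from the original code.
from elasticsearch import Elasticsearch

es_client = Elasticsearch("http://localhost:9200")
# n = count_inbound_refs(es_client, release_ident="some-release-ident")
# print(f"{n} inbound references")
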
def es_read(self, log_id: str, offset: str, metadata: dict) -> list:
    """
    Returns the logs matching log_id in Elasticsearch and next offset.
    Returns '' if no log is found or there was an error.

    :param log_id: the log_id of the log to read.
    :param offset: the offset start to read log from.
    :param metadata: log metadata, used for streaming log download.
    """
    # Offset is the unique key for sorting logs given log_id.
    search = Search(using=self.client).query('match_phrase', log_id=log_id).sort(self.offset_field)
    search = search.filter('range', **{self.offset_field: {'gt': int(offset)}})
    max_log_line = search.count()
    if 'download_logs' in metadata and metadata['download_logs'] and 'max_offset' not in metadata:
        try:
            if max_log_line > 0:
                metadata['max_offset'] = attrgetter(self.offset_field)(
                    search[max_log_line - 1].execute()[-1]
                )
            else:
                metadata['max_offset'] = 0
        except Exception:
            self.log.exception('Could not get current log size with log_id: %s', log_id)

    logs = []
    if max_log_line != 0:
        try:
            logs = search[self.MAX_LINE_PER_PAGE * self.PAGE : self.MAX_LINE_PER_PAGE].execute()
        except Exception:
            self.log.exception('Could not read log with log_id: %s', log_id)
    return logs

def search_list(request, kd=None):
    kd = request.GET.get("kd", None)
    if not kd:
        raise Http404
    else:
        kd = kd.strip()
    page = int(request.GET.get("page", "1"))
    if page > PAGE_MAC_SIZE:
        page = PAGE_MAC_SIZE
    elif page < 1:
        page = 1
    spc = request.GET.get("spc", "1")
    city = request.GET.get("city", u"全国")
    pn_count = (page - 1) * PER_COUNT
    tags = {"cities": [], "websites": []}
    page_size = 1
    response = []  # fallback so the context below never hits a NameError
    try:
        search = Search(using=es, index="tuangou", doc_type="meituan") \
            .query("match", title=kd) \
            .sort('-@timestamp')[pn_count:pn_count + PER_COUNT]
        # search.query(~Q("match", description="beta"))  # exclude docs whose description field contains "beta"
        search.aggs.bucket('per_city', 'terms', field='city')
        # .metric('max_lines', 'max', field='lines')
        search.aggs.bucket('per_website', 'terms', field='website')
        page_size = search.count() / PER_COUNT + 1
        response = search.execute()
        # print(search.count())
        # for hit in response:
        #     print(dir(hit.meta))  # ['doc_type', u'id', u'index', u'score', u'sort']
        #     print(dir(hit))
        for tag in response.aggregations.per_city.buckets:
            # print(tag.key, tag.doc_count)
            tags["cities"].append((tag.key, tag.doc_count))
        for tag in response.aggregations.per_website.buckets:
            # print(tag.key, tag.doc_count)
            tags["websites"].append((tag.key, tag.doc_count))
    except Exception:
        exception = traceback.format_exc()
        print(exception)
    host_search = [u"美食", u"酒店", u"机票", u"火车票", u"汽车票"]
    if page_size > PAGE_MAC_SIZE:
        page_size = PAGE_MAC_SIZE
    ct = dict({
        'kd': kd,
        'results': response,
        "spc": spc,
        "city": city,
        "page": page,
        "page_size": page_size,
        "host_search": host_search,
    })
    return render(request, 'search/list.html', ct)

def GetCountRecords(client, from_date, to_date, query=None):
    """
    Get the number of records (documents) from a date range

    :param elasticsearch.client client: The elasticsearch client to use for the search
    :param str from_date: The from date. It can be lucene date math, such as 'now-1d' for yesterday
    :param str to_date: The to date. Can also use lucene date math.
    :param str query: Query string to limit the documents searched.
    :return: The total documents
    """
    s = Search(using=client, index='gracc.osg.raw-*') \
        .filter('range', **{'@timestamp': {'from': from_date, 'to': to_date}})
    response = s.count()
    return response

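# The range filter above accepts Lucene date math, so callers can keep the
# window relative to query time: 'now-1d' means 24 hours ago and 'now/d'
# rounds down to the start of the current day. Hedged usage examples:
# yesterday_count = GetCountRecords(client, 'now-1d/d', 'now/d')
# last_week_count = GetCountRecords(client, 'now-7d', 'now')
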
def test_periodic_summarizer(self):
    # Check the database for new summary records.
    client = Elasticsearch()
    # Refresh the indexes.
    client.indices.refresh(index='gracc.osg.raw*')
    # Restart the graccsumperiodic service.
    subprocess.call("systemctl restart graccsumperiodic.service", shell=True)
    # Wait for a bit to make sure the summarizer actually does its thing.
    time.sleep(60)
    # Refresh the indexes.
    client.indices.refresh(index='gracc.osg.summary*')
    time.sleep(60)
    # Search for the summary records.
    s = Search(using=client, index='gracc.osg.summary*') \
        .filter('range', **{'EndTime': {'from': 'now-7d', 'to': 'now'}})
    num_sum = s.count()
    stats = client.cat.indices(index='_all')
    print(stats)
    self.assertGreater(num_sum, 0)
    # Search for the summary transfer records.
    s = Search(using=client, index='gracc.osg.transfer-summary*') \
        .filter('range', **{'StartTime': {'from': 'now-7d', 'to': 'now'}})
    num_sum = s.count()
    self.assertGreater(num_sum, 0)

def test_raw_transfer(self):
    """
    Testing the tester for raw summary data
    """
    # Check the raw indexes for records from the last 7 days.
    client = Elasticsearch()
    s = Search(using=client, index='gracc.osg-transfer.raw*') \
        .filter('range', **{'StartTime': {'from': 'now-7d', 'to': 'now'}})
    num_raw = s.count()
    stats = client.cat.indices(index='_all')
    print(stats)
    self.assertGreater(num_raw, 0)

def portalSearch(expression, start=0, end=25):
    client = Elasticsearch()
    ret = {'nodes': [], 'Counts': {}}
    q = Q("bool", must=[Q('match', _all=expression)])
    s = Search(using=client, index="neo4j-inquisite-node",
               doc_type="Repository,Data").query(q)
    q_total = s.count()
    s = s[0:q_total]
    s = s.highlight_options(require_field_match=False)
    s = s.highlight('*', fragment_size=45)
    res = s.execute()
    data = {}
    uuids = []
    pub_uuids = {}
    if res:
        for r in res:
            d = r.to_dict()
            if r.meta.doc_type == 'Repository':
                if int(d['published']) == 0:
                    continue
                repo_id = r.meta.id
                ret['nodes'].append({"id": r.meta.id, "type": "Repository",
                                     "name": d['name'], "description": d['readme']})
                repo_uuids = SearchManager._getDataUUIDsForRepo(repo_id)
                pub_uuids[repo_id] = repo_uuids
            else:
                hits = []
                highs = r.meta.highlight.to_dict()
                for high_field, high_value in highs.items():
                    hits.append({high_field: high_value})
                data[r.meta.id] = {'id': r.meta.id, "hits": hits}
                uuids.append(r.meta.id)
        qString = ("MATCH (r:Repository)--(t:SchemaType)--(d:Data) "
                   "WHERE d.uuid IN {uuids} AND r.published = '1' "
                   "RETURN d.uuid as uuid, r.name as repo_name, r.uuid as repo_id")
        pub_data = db.run(qString, {"uuids": uuids})
        data_max = 0
        for checked in pub_data:
            if data_max >= 32:
                break
            ret['nodes'].append({"id": checked['uuid'], "type": "Data",
                                 "repo_id": checked['repo_id'],
                                 "repo_name": checked['repo_name'],
                                 "hits": data[checked['uuid']]['hits']})
            data_max += 1
    return ret

def get(self, **kwargs):
    """Return a list of results and aggregations based on parameters.

    The list of accepted parameters (with types and default values) is in
    the database and can be accessed with the super_search_fields service.
    """
    # Require that the list of fields be passed.
    if not kwargs.get('_fields'):
        raise MissingArgumentError('_fields')
    self.all_fields = kwargs['_fields']
    self._build_fields()

    # Filter parameters and raise potential errors.
    params = self.get_parameters(**kwargs)

    # Find the indices to use to optimize the elasticsearch query.
    indices = self.get_indices(params['date'])

    # Create and configure the search object.
    search = Search(
        using=self.get_connection(),
        index=indices,
        doc_type=self.config.elasticsearch.elasticsearch_doctype,
    )

    # Create filters.
    filters = []
    histogram_intervals = {}

    for field, sub_params in params.items():
        sub_filters = None
        for param in sub_params:
            if param.name.startswith('_'):
                # By default, all param values are turned into lists,
                # even when they have and can have only one value.
                # For those we know there can only be one value,
                # so we just extract it from the made-up list.
                if param.name == '_results_offset':
                    results_from = param.value[0]
                elif param.name == '_results_number':
                    results_number = param.value[0]
                    if results_number > 1000:
                        raise BadArgumentError('_results_number too large')
                elif param.name == '_facets_size':
                    facets_size = param.value[0]

                for f in self.histogram_fields:
                    if param.name == '_histogram_interval.%s' % f:
                        histogram_intervals[f] = param.value[0]

                # Don't use meta parameters in the query.
                continue

            field_data = self.all_fields[param.name]
            name = '%s.%s' % (
                field_data['namespace'],
                field_data['in_database_name']
            )

            if param.data_type in ('date', 'datetime'):
                param.value = datetimeutil.date_to_string(param.value)
            elif param.data_type == 'enum':
                param.value = [x.lower() for x in param.value]
            elif param.data_type == 'str' and not param.operator:
                param.value = [x.lower() for x in param.value]

            # Operators needing wildcards, and the associated value
            # transformation with said wildcards.
            operator_wildcards = {
                '~': '*%s*',  # contains
                '$': '%s*',  # starts with
                '^': '*%s'  # ends with
            }
            # Operators needing ranges, and the associated Elasticsearch
            # comparison operator.
            operator_range = {
                '>': 'gt',
                '<': 'lt',
                '>=': 'gte',
                '<=': 'lte',
            }

            args = {}
            filter_type = 'term'
            filter_value = None

            if not param.operator:
                # contains one of the terms
                if len(param.value) == 1:
                    val = param.value[0]
                    if not isinstance(val, basestring) or ' ' not in val:
                        # There's only one term and no white space, this
                        # is a simple term filter.
                        filter_value = val
                    else:
                        # If the term contains white spaces, we want to
                        # perform a phrase query.
                        filter_type = 'query'
                        args = Q(
                            'simple_query_string',
                            query=param.value[0],
                            fields=[name],
                            default_operator='and',
                        ).to_dict()
                else:
                    # There are several terms, this is a terms filter.
                    filter_type = 'terms'
                    filter_value = param.value
            elif param.operator == '=':
                # is exactly
                if field_data['has_full_version']:
                    name = '%s.full' % name
                filter_value = param.value
            elif param.operator in operator_range:
                filter_type = 'range'
                filter_value = {
                    operator_range[param.operator]: param.value
                }
            elif param.operator == '__null__':
                filter_type = 'missing'
                args['field'] = name
            elif param.operator in operator_wildcards:
                filter_type = 'query'

                # Wildcard operations are better applied to a non-analyzed
                # field (called "full") if there is one.
                if field_data['has_full_version']:
                    name = '%s.full' % name

                q_args = {}
                q_args[name] = (
                    operator_wildcards[param.operator] % param.value
                )
                query = Q('wildcard', **q_args)
                args = query.to_dict()

            if filter_value is not None:
                args[name] = filter_value

            if args:
                new_filter = F(filter_type, **args)
                if param.operator_not:
                    new_filter = ~new_filter

                if sub_filters is None:
                    sub_filters = new_filter
                elif filter_type == 'range':
                    sub_filters &= new_filter
                else:
                    sub_filters |= new_filter

            continue

        if sub_filters is not None:
            filters.append(sub_filters)

    search = search.filter(F('bool', must=filters))

    # Restricting returned fields.
    fields = []
    for param in params['_columns']:
        for value in param.value:
            if not value:
                continue
            field_name = self.get_field_name(value, full=False)
            fields.append(field_name)
    search = search.fields(fields)

    # Sorting.
    sort_fields = []
    for param in params['_sort']:
        for value in param.value:
            if not value:
                continue

            # Values starting with a '-' are sorted in descending order.
            # In order to retrieve the database name of the field, we
            # must first remove the '-' part and add it back later.
            # Example: given ['product', '-version'], the results will be
            # sorted by ascending product and descending version.
            desc = False
            if value.startswith('-'):
                desc = True
                value = value[1:]

            field_name = self.get_field_name(value, full=False)

            if desc:
                # The underlying library understands that '-' means
                # sorting in descending order.
                field_name = '-' + field_name

            sort_fields.append(field_name)
    search = search.sort(*sort_fields)

    # Pagination.
    results_to = results_from + results_number
    search = search[results_from:results_to]

    # Create facets.
    for param in params['_facets']:
        for value in param.value:
            if not value:
                continue
            field_name = self.get_field_name(value)
            search.aggs.bucket(
                value,
                'terms',
                field=field_name,
                size=facets_size,
            )

    # Create signature aggregations.
    if params.get('_aggs.signature'):
        sig_bucket = A(
            'terms',
            field=self.get_field_name('signature'),
            size=facets_size,
        )
        for param in params['_aggs.signature']:
            for value in param.value:
                if not value:
                    continue

                if value.startswith('_histogram.'):
                    # This is a histogram aggregation we want to run,
                    # not a terms aggregation.
                    field_name = value[len('_histogram.'):]
                    if field_name not in self.histogram_fields:
                        continue

                    histogram_type = (
                        self.all_fields[field_name]['query_type'] == 'date'
                        and 'date_histogram' or 'histogram'
                    )
                    sig_bucket.bucket(
                        'histogram_%s' % field_name,
                        histogram_type,
                        field=self.get_field_name(field_name),
                        interval=histogram_intervals[field_name],
                    )
                else:
                    sig_bucket.bucket(
                        value,
                        'terms',
                        field=self.get_field_name(value),
                        size=facets_size,
                    )

        search.aggs.bucket('signature', sig_bucket)

    # Create histograms.
    for f in self.histogram_fields:
        if params.get('_histogram.%s' % f):
            histogram_type = (
                self.all_fields[f]['query_type'] == 'date'
                and 'date_histogram' or 'histogram'
            )
            date_bucket = A(
                histogram_type,
                field=self.get_field_name(f),
                interval=histogram_intervals[f],
            )
            for param in params['_histogram.%s' % f]:
                for value in param.value:
                    if not value:
                        continue

                    field_name = self.get_field_name(value)
                    val_bucket = A(
                        'terms',
                        field=field_name,
                        size=facets_size,
                    )
                    date_bucket.bucket(value, val_bucket)

            search.aggs.bucket('histogram_%s' % f, date_bucket)

    # Query and compute results.
    hits = []

    if params['_return_query'][0].value[0]:
        # Return only the JSON query that would be sent to elasticsearch.
        return {
            'query': search.to_dict(),
            'indices': indices,
        }

    # We call elasticsearch with a computed list of indices, based on
    # the date range. However, if that list contains indices that do not
    # exist in elasticsearch, an error will be raised. We thus want to
    # remove all failing indices until we either have a valid list, or
    # an empty list in which case we return no result.
    while True:
        try:
            results = search.execute()
            for hit in results:
                hits.append(self.format_fields(hit.to_dict()))

            total = search.count()
            aggregations = self.format_aggregations(results.aggregations)
            break  # Yay! Results!
        except NotFoundError as e:
            missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0]
            if missing_index in indices:
                del indices[indices.index(missing_index)]
            else:
                # Wait what? An error caused by an index that was not
                # in the request? That should never happen, but in case
                # it does, better know it.
                raise

            if indices:
                # Update the list of indices and try again.
                # Note: we need to first empty the list of indices before
                # updating it, otherwise the removed indices never get
                # actually removed.
                search = search.index().index(*indices)
            else:
                # There is no index left in the list, return an empty
                # result.
                hits = []
                total = 0
                aggregations = {}
                break

for greater, lesser in time_range:
    print(greater, " ", lesser)
    s = Search(using=client,
               index=index_name,
               extra={"size": 10000, "timeout": "20m"}) \
        .query("match", type="nginx-access") \
        .query(Q("match", tags="us-nginx-access")) \
        .query(Q("match", request_url=url))
    if lesser != 3600:
        s = s.filter('range', **{"response_time": {"gte": greater, "lte": lesser}})
    else:
        s = s.filter('range', **{"response_time": {"gte": greater}})
    count = s.count()
    print("url : ", url, " count : ", count)

    s.aggs.metric('response_time', 'avg', field='response_time')
    response = s.execute()
    avg = response.aggregations.response_time.value
    print("url : ", url, " average : ", response.aggregations)

    s.aggs.metric('response_time', 'max', field='response_time')
    response = s.execute(ignore_cache=True)
    max_time = response.aggregations.response_time.value
    print("url : ", url, " max_time : ", max_time)

    s.aggs.metric('response_time', 'min', field='response_time')
    response = s.execute(ignore_cache=True)
    min_time = response.aggregations.response_time.value

def get(self, **kwargs):
    """Return a list of results and aggregations based on parameters.

    The list of accepted parameters (with types and default values) is in
    the database and can be accessed with the super_search_fields service.
    """
    # Filter parameters and raise potential errors.
    params = self.get_parameters(**kwargs)

    # Find the indices to use to optimize the elasticsearch query.
    indices = self.get_indices(params['date'])

    # Create and configure the search object.
    search = Search(
        using=self.get_connection(),
        index=indices,
        doc_type=self.config.elasticsearch.elasticsearch_doctype,
    )

    # Create filters.
    filters = None

    for field, sub_params in params.items():
        sub_filters = None
        for param in sub_params:
            if param.name.startswith('_'):
                if param.name == '_results_offset':
                    results_from = param.value[0]
                elif param.name == '_results_number':
                    results_number = param.value[0]
                # Don't use meta parameters in the query.
                continue

            field_data = self.all_fields[param.name]

            name = '%s.%s' % (
                field_data['namespace'],
                field_data['in_database_name']
            )

            if param.data_type in ('date', 'datetime'):
                param.value = datetimeutil.date_to_string(param.value)
            elif param.data_type == 'enum':
                param.value = [x.lower() for x in param.value]
            elif param.data_type == 'str' and not param.operator:
                param.value = [x.lower() for x in param.value]

            args = {}
            filter_type = 'term'
            filter_value = None

            if not param.operator:
                # contains one of the terms
                if len(param.value) == 1:
                    val = param.value[0]
                    if not isinstance(val, basestring) or (
                        isinstance(val, basestring) and ' ' not in val
                    ):
                        filter_value = val

                    # If the term contains white spaces, we want to perform
                    # a phrase query. Thus we do nothing here and let this
                    # value be handled later.
                else:
                    filter_type = 'terms'
                    filter_value = param.value
            elif param.operator == '=':
                # is exactly
                if field_data['has_full_version']:
                    name = '%s.full' % name
                filter_value = param.value
            elif param.operator == '>':
                # greater than
                filter_type = 'range'
                filter_value = {
                    'gt': param.value
                }
            elif param.operator == '<':
                # lower than
                filter_type = 'range'
                filter_value = {
                    'lt': param.value
                }
            elif param.operator == '>=':
                # greater than or equal to
                filter_type = 'range'
                filter_value = {
                    'gte': param.value
                }
            elif param.operator == '<=':
                # lower than or equal to
                filter_type = 'range'
                filter_value = {
                    'lte': param.value
                }
            elif param.operator == '__null__':
                # is null
                filter_type = 'missing'
                args['field'] = name

            if filter_value is not None:
                args[name] = filter_value

            if args:
                if param.operator_not:
                    new_filter = ~F(filter_type, **args)
                else:
                    new_filter = F(filter_type, **args)

                if sub_filters is None:
                    sub_filters = new_filter
                elif param.data_type == 'enum':
                    sub_filters |= new_filter
                else:
                    sub_filters &= new_filter

                continue

            # These use a wildcard and thus need to be in a query
            # instead of a filter.
            operator_wildcards = {
                '~': '*%s*',  # contains
                '$': '%s*',  # starts with
                '^': '*%s'  # ends with
            }
            if param.operator in operator_wildcards:
                if field_data['has_full_version']:
                    name = '%s.full' % name

                query_type = 'wildcard'
                args[name] = (
                    operator_wildcards[param.operator] % param.value
                )
            elif not param.operator:
                # This is a phrase that was passed down.
                query_type = 'simple_query_string'
                args['query'] = param.value[0]
                args['fields'] = [name]
                args['default_operator'] = 'and'

            if args:
                query = Q(query_type, **args)
                if param.operator_not:
                    query = ~query
                search = search.query(query)
            else:
                # If we reach this point, that means the operator is
                # not supported, and we should raise an error about that.
                raise NotImplementedError(
                    'Operator %s is not supported' % param.operator
                )

        if filters is None:
            filters = sub_filters
        elif sub_filters is not None:
            filters &= sub_filters

    search = search.filter(filters)

    # Restricting returned fields.
    fields = []
    for param in params['_columns']:
        for value in param.value:
            if not value:
                continue

            try:
                field_ = self.all_fields[value]
            except KeyError:
                # That is not a known field, we can't restrict on it.
                raise BadArgumentError(
                    value,
                    msg='Unknown field "%s", cannot return it' % value
                )

            if not field_['is_returned']:
                # Returning this field is not allowed.
                raise BadArgumentError(
                    value,
                    msg='Field "%s" is not allowed to be returned' % value
                )

            field_name = '%s.%s' % (
                field_['namespace'],
                field_['in_database_name']
            )
            fields.append(field_name)

    search = search.fields(fields)

    # Sorting.
    sort_fields = []
    for param in params['_sort']:
        for value in param.value:
            if not value:
                continue

            # Values starting with a '-' are sorted in descending order.
            # In order to retrieve the database name of the field, we
            # must first remove the '-' part and add it back later.
            # Example: given ['product', '-version'], the results will be
            # sorted by ascending product and descending version.
            desc = False
            if value.startswith('-'):
                desc = True
                value = value[1:]

            try:
                field_ = self.all_fields[value]
            except KeyError:
                # That is not a known field, we can't sort on it.
                raise BadArgumentError(
                    value,
                    msg='Unknown field "%s", cannot sort on it' % value
                )

            field_name = '%s.%s' % (
                field_['namespace'],
                field_['in_database_name']
            )

            if desc:
                # The underlying library understands that '-' means
                # sorting in descending order.
                field_name = '-' + field_name

            sort_fields.append(field_name)

    search = search.sort(*sort_fields)

    # Pagination.
    results_to = results_from + results_number
    search = search[results_from:results_to]

    # Create facets.
    for param in params['_facets']:
        for value in param.value:
            try:
                field_ = self.all_fields[value]
            except KeyError:
                # That is not a known field, we can't facet on it.
                raise BadArgumentError(
                    value,
                    msg='Unknown field "%s", cannot facet on it' % value
                )

            field_name = '%s.%s' % (
                field_['namespace'],
                field_['in_database_name']
            )

            if field_['has_full_version']:
                # If the param has a full version, that means what matters
                # is the full string, and not its individual terms.
                field_name += '.full'

            search.aggs.bucket(
                value,
                'terms',
                field=field_name,
                size=self.config.facets_max_number
            )

    # Query and compute results.
    hits = []

    if params['_return_query'][0].value[0]:
        # Return only the JSON query that would be sent to elasticsearch.
        return {
            'query': search.to_dict(),
            'indices': indices,
        }

    # We call elasticsearch with a computed list of indices, based on
    # the date range. However, if that list contains indices that do not
    # exist in elasticsearch, an error will be raised. We thus want to
    # remove all failing indices until we either have a valid list, or
    # an empty list in which case we return no result.
    while True:
        try:
            results = search.execute()
            for hit in results:
                hits.append(self.format_fields(hit.to_dict()))

            total = search.count()
            aggregations = self.format_aggregations(results.aggregations)
            break  # Yay! Results!
        except NotFoundError as e:
            missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0]
            if missing_index in indices:
                del indices[indices.index(missing_index)]
            else:
                # Wait what? An error caused by an index that was not
                # in the request? That should never happen, but in case
                # it does, better know it.
                raise

            if indices:
                # Update the list of indices and try again.
                # Note: we need to first empty the list of indices before
                # updating it, otherwise the removed indices never get
                # actually removed.
                search = search.index().index(*indices)
            else:
                # There is no index left in the list, return an empty
                # result.
                hits = []
                total = 0
                aggregations = {}
                break

def get(self, **kwargs):
    """Return a list of results and aggregations based on parameters.

    The list of accepted parameters (with types and default values) is in
    the database and can be accessed with the super_search_fields service.
    """
    # Require that the list of fields be passed.
    if not kwargs.get('_fields'):
        raise MissingArgumentError('_fields')
    self.all_fields = kwargs['_fields']

    # Filter parameters and raise potential errors.
    params = self.get_parameters(**kwargs)

    # Find the indices to use to optimize the elasticsearch query.
    indices = self.get_indices(params['date'])

    # Create and configure the search object.
    search = Search(
        using=self.get_connection(),
        index=indices,
        doc_type=self.config.elasticsearch.elasticsearch_doctype,
    )

    # Create filters.
    filters = []
    histogram_intervals = {}

    for field, sub_params in params.items():
        sub_filters = None
        for param in sub_params:
            if param.name.startswith('_'):
                # By default, all param values are turned into lists,
                # even when they have and can have only one value.
                # For those we know there can only be one value,
                # so we just extract it from the made-up list.
                if param.name == '_results_offset':
                    results_from = param.value[0]
                elif param.name == '_results_number':
                    results_number = param.value[0]
                    if results_number > 1000:
                        raise BadArgumentError(
                            '_results_number',
                            msg=(
                                '_results_number cannot be greater '
                                'than 1,000'
                            )
                        )
                    if results_number < 0:
                        raise BadArgumentError(
                            '_results_number',
                            msg='_results_number cannot be negative'
                        )
                elif param.name == '_facets_size':
                    facets_size = param.value[0]
                    # Why cap it?
                    # Because if the query is covering a lot of different
                    # things you can get a really really large query
                    # which can hog resources excessively.
                    # Downloading, as an example, 100k facets (and 0 hits)
                    # when there is plenty of data yields a 11MB JSON
                    # file.
                    if facets_size > 10000:
                        raise BadArgumentError(
                            '_facets_size greater than 10,000'
                        )

                for f in self.histogram_fields:
                    if param.name == '_histogram_interval.%s' % f:
                        histogram_intervals[f] = param.value[0]

                # Don't use meta parameters in the query.
                continue

            field_data = self.all_fields[param.name]
            name = self.get_full_field_name(field_data)

            if param.data_type in ('date', 'datetime'):
                param.value = datetimeutil.date_to_string(param.value)
            elif param.data_type == 'enum':
                param.value = [x.lower() for x in param.value]
            elif param.data_type == 'str' and not param.operator:
                param.value = [x.lower() for x in param.value]

            # Operators needing wildcards, and the associated value
            # transformation with said wildcards.
            operator_wildcards = {
                '~': '*%s*',  # contains
                '^': '%s*',  # starts with
                '$': '*%s'  # ends with
            }
            # Operators needing ranges, and the associated Elasticsearch
            # comparison operator.
            operator_range = {
                '>': 'gt',
                '<': 'lt',
                '>=': 'gte',
                '<=': 'lte',
            }

            args = {}
            filter_type = 'term'
            filter_value = None

            if not param.operator:
                # contains one of the terms
                if len(param.value) == 1:
                    val = param.value[0]
                    if not isinstance(val, basestring) or ' ' not in val:
                        # There's only one term and no white space, this
                        # is a simple term filter.
                        filter_value = val
                    else:
                        # If the term contains white spaces, we want to
                        # perform a phrase query.
                        filter_type = 'query'
                        args = Q(
                            'simple_query_string',
                            query=param.value[0],
                            fields=[name],
                            default_operator='and',
                        ).to_dict()
                else:
                    # There are several terms, this is a terms filter.
                    filter_type = 'terms'
                    filter_value = param.value
            elif param.operator == '=':
                # is exactly
                if field_data['has_full_version']:
                    name = '%s.full' % name
                filter_value = param.value
            elif param.operator in operator_range:
                filter_type = 'range'
                filter_value = {
                    operator_range[param.operator]: param.value
                }
            elif param.operator == '__null__':
                filter_type = 'missing'
                args['field'] = name
            elif param.operator == '__true__':
                filter_type = 'term'
                filter_value = True
            elif param.operator == '@':
                filter_type = 'regexp'
                if field_data['has_full_version']:
                    name = '%s.full' % name
                filter_value = param.value
            elif param.operator in operator_wildcards:
                filter_type = 'query'

                # Wildcard operations are better applied to a non-analyzed
                # field (called "full") if there is one.
                if field_data['has_full_version']:
                    name = '%s.full' % name

                q_args = {}
                q_args[name] = (
                    operator_wildcards[param.operator] % param.value
                )
                query = Q('wildcard', **q_args)
                args = query.to_dict()

            if filter_value is not None:
                args[name] = filter_value

            if args:
                new_filter = F(filter_type, **args)
                if param.operator_not:
                    new_filter = ~new_filter

                if sub_filters is None:
                    sub_filters = new_filter
                elif filter_type == 'range':
                    sub_filters &= new_filter
                else:
                    sub_filters |= new_filter

            continue

        if sub_filters is not None:
            filters.append(sub_filters)

    search = search.filter(F('bool', must=filters))

    # Restricting returned fields.
    fields = []

    # We keep track of the requested columns in order to make sure we
    # return those column names and not aliases for example.
    self.request_columns = []
    for param in params['_columns']:
        for value in param.value:
            if not value:
                continue
            self.request_columns.append(value)
            field_name = self.get_field_name(value, full=False)
            fields.append(field_name)

    search = search.fields(fields)

    # Sorting.
    sort_fields = []
    for param in params['_sort']:
        for value in param.value:
            if not value:
                continue

            # Values starting with a '-' are sorted in descending order.
            # In order to retrieve the database name of the field, we
            # must first remove the '-' part and add it back later.
            # Example: given ['product', '-version'], the results will be
            # sorted by ascending product then descending version.
            desc = False
            if value.startswith('-'):
                desc = True
                value = value[1:]

            field_name = self.get_field_name(value)

            if desc:
                # The underlying library understands that '-' means
                # sorting in descending order.
                field_name = '-' + field_name

            sort_fields.append(field_name)

    search = search.sort(*sort_fields)

    # Pagination.
    results_to = results_from + results_number
    search = search[results_from:results_to]

    # Create facets.
    if facets_size:
        self._create_aggregations(
            params,
            search,
            facets_size,
            histogram_intervals
        )

    # Query and compute results.
    hits = []

    if params['_return_query'][0].value[0]:
        # Return only the JSON query that would be sent to elasticsearch.
        return {
            'query': search.to_dict(),
            'indices': indices,
        }

    errors = []

    # We call elasticsearch with a computed list of indices, based on
    # the date range. However, if that list contains indices that do not
    # exist in elasticsearch, an error will be raised. We thus want to
    # remove all failing indices until we either have a valid list, or
    # an empty list in which case we return no result.
    while True:
        try:
            results = search.execute()
            for hit in results:
                hits.append(self.format_fields(hit.to_dict()))

            total = search.count()

            aggregations = getattr(results, 'aggregations', {})
            if aggregations:
                aggregations = self.format_aggregations(aggregations)

            shards = getattr(results, '_shards', {})

            break  # Yay! Results!
        except NotFoundError as e:
            missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0]
            if missing_index in indices:
                del indices[indices.index(missing_index)]
            else:
                # Wait what? An error caused by an index that was not
                # in the request? That should never happen, but in case
                # it does, better know it.
                raise

            errors.append({
                'type': 'missing_index',
                'index': missing_index,
            })

            if indices:
                # Update the list of indices and try again.
                # Note: we need to first empty the list of indices before
                # updating it, otherwise the removed indices never get
                # actually removed.
                search = search.index().index(*indices)
            else:
                # There is no index left in the list, return an empty
                # result.
                hits = []
                total = 0
                aggregations = {}
                shards = None
                break
        except RequestError as exception:
            # Try to handle it gracefully if we can find out what
            # input was bad and caused the exception.
            try:
                bad_input = ELASTICSEARCH_PARSE_EXCEPTION_REGEX.findall(
                    exception.error
                )[-1]
                # Loop over the original parameters to try to figure
                # out which *key* had the bad input.
                for key, value in kwargs.items():
                    if value == bad_input:
                        raise BadArgumentError(key)
            except IndexError:
                # Not an ElasticsearchParseException exception
                pass

            raise