async def get_invalidWordsES(*,
                             projectName: str = Path(...),
                             fullMatch: Optional[bool] = False,
                             highlight: Optional[List[str]] = Query(['']),
                             showReturn: Optional[List[str]] = Query(['']),
                             searchItemID: Optional[str] = None,
                             searchItem: Optional[str] = None,
                             dateRange: Optional[List[str]] = Query(['', '']),
                             currentPage: int = 1,
                             pageSize: int = 10,
                             operatorFilter: Optional[List[str]] = Query(['']),
                             sourceFilter: Optional[List[str]] = Query([''])):
    """Fetch the invalid-word list from ES."""
    projectId = await findProjectIdFromProjectName(
        dbPrefix, 'Project',
        queryDict={'projectName': projectName}, showDict={'_id': 1})
    if not projectId:
        raise HTTPException(status_code=503, detail='projectNotExist')

    # Pagination offsets
    start = 0
    end = 0

    # ES index to search (the equivalent of a database in Mongo)
    _index = f'KWM-{projectId}.InvalidDict'.lower()
    #print('_index', _index)

    s = Search()

    if operatorFilter != ['']:
        # An operator filter was supplied
        operatorFilter = unquote(operatorFilter[0], 'utf-8').split(',')
        #queryDict['operator'] = {'$in': operatorFilter}
        operatorFilter = '"' + '" "'.join(operatorFilter) + '"'
        #print('ccc', operatorFilter)
        q = Q("query_string", query=operatorFilter, fields=['operator'])
        s = s.query(q)

    if sourceFilter != ['']:
        # A source filter was supplied
        sourceFilter = unquote(sourceFilter[0], 'utf-8').split(',')
        #queryDict['source'] = {'$in': sourceFilter}
        sourceFilter = '"' + '" "'.join(sourceFilter) + '"'
        q = Q("query_string", query=sourceFilter, fields=['source'])
        s = s.query(q)

    if dateRange != ['', '']:
        dateRange = unquote(dateRange[0], 'utf-8').split(',')
        #print('dateRange', dateRange)
        if dateRange != ['', '']:
            # s.query('range', **{...}) would also work; Q is used here for consistency
            r = Q('range',
                  **{'modifiedTime': {'gte': dateRange[0], 'lt': dateRange[1]}})
            s = s.query(r)

    if searchItem:
        # Single lookups do not go through ES, so only searchItem is handled
        # here; searchItemID never applies.
        # Keyword search (contains, case-insensitive):
        #queryDict['word'] = {'$regex': searchItem, '$options': 'i'}
        q = Q("multi_match", query=f"{searchItem.strip()}", fields=['word'])
        s = s.query(q)

    # Which fields to return
    if showReturn != ['']:
        showReturn = unquote(showReturn[0], 'utf-8').split(',')
        s = s.source(includes=showReturn)
    else:
        s = s.source(includes=[])

    # Which fields to highlight
    if highlight != ['']:
        highlight = unquote(highlight[0], 'utf-8').split(',')
        #print(highlight)
        s = s.highlight_options(order='score')
        s = s.highlight_options(pre_tags="<strong>")
        s = s.highlight_options(post_tags="</strong>")
        for ele in highlight:
            # Add each highlighted field one by one
            s = s.highlight(ele)

    # Pagination
    if currentPage == 0 and pageSize == 0:
        # Return everything. The 10000 cap is hard-coded and anything beyond it
        # raises an error; scan() would be the proper fix, but scan() does not
        # sort. To be addressed later.
        s = s[0:10000]
    else:
        start = (currentPage - 1) * pageSize
        end = start + pageSize
        s = s[start:end]

    # Execute
    try:
        response = await esRun(s.to_dict(), _index)  #s.execute(ignore_cache=True)
    except Exception as e:
        print(e)
        return {'count': 0, 'content': []}
    else:
        totalCount = response.hits.total.value
        temp = response.to_dict()['hits']['hits']
        result = []
        for item in temp:
            tt = {'_id': {'$oid': item['_id']}}
            tt.update(item['_source'])
            if item.get('highlight'):
                tt.update({'highlight': item['highlight']})
            if start >= 0 and end > 0:
                tt.update({'id': start + 1})
            result.append(tt)
            start = start + 1
        #print(result)
        return {'count': totalCount, 'content': result}
def _build_query(q, cql): # this would be handled by the AST with the traverse of CQL model op, node = get_next_node(cql.__root__) q.operation = op if isinstance(node, list): query_list = [] for elem in node: op, next_node = get_next_node(elem) if not getattr(next_node, 'between', 0) == 0: property = next_node.between.value.__root__.__root__.property lower = next_node.between.lower.__root__.__root__ upper = next_node.between.upper.__root__.__root__ query_list.append( Q({'range': { f'{property}': { 'gte': lower, 'lte': upper } }})) if not getattr(next_node, '__root__', 0) == 0: scalars = tuple(next_node.__root__.eq.__root__) property = scalars[0].__root__.property value = scalars[1].__root__.__root__ query_list.append(Q({'match': {f'{property}': f'{value}'}})) q.must(query_list) elif not getattr(node, 'between', 0) == 0: property = node.between.value.__root__.__root__.property lower = None if not getattr(node.between.lower, '__root__', 0) == 0: lower = node.between.lower.__root__.__root__ upper = None if not getattr(node.between.upper, '__root__', 0) == 0: upper = node.between.upper.__root__.__root__ query = Q({'range': {f'{property}': {'gte': lower, 'lte': upper}}}) q.must(query) elif not getattr(node, '__root__', 0) == 0: next_op, next_node = get_next_node(node) if not getattr(next_node, 'eq', 0) == 0: scalars = tuple(next_node.eq.__root__) property = scalars[0].__root__.property value = scalars[1].__root__.__root__ query = Q({'match': {f'{property}': f'{value}'}}) q.must(query) elif not getattr(node, 'intersects', 0) == 0: property = node.intersects.__root__[0].__root__.property if property == 'geometry': geom_type = node.intersects.__root__[ 1].__root__.__root__.__root__.type if geom_type.value == 'Polygon': coordinates = node.intersects.__root__[ 1].__root__.__root__.__root__.coordinates coords_list = [ poly_coords.__root__ for poly_coords in coordinates[0] ] filter_ = Q({ 'geo_shape': { 'geometry': { 'shape': { 'type': 'envelope', 'coordinates': get_envelope(coords_list) }, 'relation': 'intersects' } } }) query_all = Q({'match_all': {}}) q.must(query_all) q.filter(filter_) return q.build()
def query(s, q):
    s.query = Q("query_string", query=q)
    return s
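# A minimal usage sketch for the helper above (assumption: elasticsearch_dsl is
# installed and the "articles" index name is purely illustrative).
from elasticsearch_dsl import Search

s = Search(index="articles")
s = query(s, "title:python AND status:published")
# The assignment serializes to a single query_string clause:
# {'query': {'query_string': {'query': 'title:python AND status:published'}}}
print(s.to_dict())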
class Meta:
    """Configuration for OAI server search."""

    default_filter = Q('exists', field='_oai.id')
import time, datetime, threading from elasticsearch import Elasticsearch from elasticsearch_dsl import Search, Q cipher = '4a:25:b7:26:47:d3:29:67:14:58:e6:48:e2:0a:8e:5f:6d:f1:67:fe:4a:1d:19:cf:30:80:2e:a2:c4:5a:65:77:20:66:28:ef:03:5e:88:b7:c4:fb:ba:8b:e5:c6:1d:b1:f1:94:7a:11:8b:d7:06:be:49:b7:3a:29:7b:7e:23:ae:5f:37:19:ff:6a:24:d3:60:fd:24:ac:96:38:dd:e1:8b:75:7a:80:20:88:80:8d:d3:b0:a9:8e:c6:9b:14:7f:5e:9a:3e:55:75:af:98:36:e7:fc:d5:aa:4e:08:5a:63:f4:60:45:22:95:e1:ca:09:bd:de:c1:13:b0:31:9c:d2:70:84:e3:59:25:f1:32:8a:f0:bb:b3:c8:ff:6a:e3:fd:35:80:11:92:9e:c3:61:c9:35:c4:9f:64:b0:96:66:1c:36:ac:01:9c:74:44:3c:af:9e:fb:e0:f0:a2:3d:ab:d5:03:6d:39:a7:ca:14:41:cf:e0:c1:f3:4b:f1:50:6e:1e:98:62:67:53:12:b9:43:b7:84:21:fe:64:c0:7d:ca:fe:b9:67:9a:8f:39:0b:83:09:ae:50:98:69:13:85:ad:3b:47:c6:47:e0:ec:f2:46:87:ca:83:83:79:fc:1b:de:10:81:83:d4:48:0b:70:82:c5:a9:38:23:9f:6c:04:5b:f4:e6:78:9e:af:2d:24:17:a4:bd:e9:45:7d:81:52:e2:7f:45:d2:2f:b2:ea:d2:a6:4f:a3:e6:59:4b:17:7d:75:6a:04:b1:fe:1c:31:22:a7:0e:07:3b:42:e6:d1:44:ea:dc:62:c4:68:23:83:0c:37:07:15:61:a2:25:8b:af:5c:fe:45:dd:5d:e7:25:db:e9:fb:f6:57:7a:66:71:c0:1b:bd:45:d0:4e:4e:5c:f2:ae:aa:6f:e7:a2:34:c5:5a:6a:19:ff:d0:de:eb:b7:db:58:80:16:fe:07:7a:c2:f1:43:4c:dc:57:4b:d7:a1:9c:66:b9:66:1c:03:7e:9d:81:f8:d6:a6:36:8b:5b:03:f4:a1:48:5e:fd:7b:99:d8:c3:69:99:93:68:0f:7e:61:2f:19:b9:e8:c2:e4:65:1f:fa:57:ff:8f:85:7e:dd:da:a4:df:5e:63:77:50:2a:64:ad:d2:66:5f:35:67:21:ff:07:1e:23:50:5d:0c:48:9f:89:c5:63:93:05:59:71:81:90:79:cd:0b:47:ec:4d:15:6b:df:dd:6a:7a:45:62:16:07:0d:4c:3d:f0:95:5b:01:d5:65:d4:29:a9:78:a1:ad:82:43:30:10:60:f2:07:08:9c:91:f3:ea:81:fa:cf:14:71:ae:eb:de:c4:8e:40:7c:ee:48:97:5a:c3:86:cf:a1:b7:f3:d8:30:50:4d:03:97:b5:30:de:91:44:69:13:77:57:90:11:d5:97:7f:a6:55:7e:19:e6:88:d9:c4:4f:8a:34:5d:53:90:aa:c8:40:23:9a:2f:73:b7:34:f5:af:5e:69:d0:c9:d6:40:b3:80:74:b9:c5:74:8a:4f:d4:18:d9:72:01:d8:ba:86:bb:39:f0:7b:f6:25:81:8b:54:a2:89:84:50:ed:55:6e:ae:75:e2:b7:f5:f2:ab:70:8d:07:d4:0f:8e:4e:25:83:39:27:c4:87:ac:a6:ad:02:1b:e2:16:b1:06:63:0c:0f:dc:b3:6c:d7:9f:29:78:e9:3a:d9:1d:1a:93:77:7d:b6:07:d6:3c:03:9c:16:e4:6e:11:30:38:d4:ea:a4:d5:8e:fd:ad:52:e4:19:3f:91:b1:bb:8e:51:fd:ca:d2:71:c5:fa:ab:f0:8b:44:fd:c5:be:dd:fb:b4:16:5a:e5:48:6f:82:a1:31:74:d8:43:b8:99:a3:da:40:85:7a:a0:38:82:dc:73:06:a5:53:7c:a4:51:df:ea:a1:27:a1:8c:f3:46:30:60:7d:54:09:af:7f:7b:09:90:17:79:0c:0a:d7:29:43:e3:00:1d:0f:71:70:a2:c0:b0:93:46:0f:5b:c1:5b:c7:76:a7:bd:1b:d6:82:0a:a4:cf:c6:f7:0b:19:19:1c:0c:21:df:cd:cd:b8:bf:6a:94:bf:80:28:03:94:a0:75:bc:83:82:1d:97:c8:7d:a7:ea:c0:f7:e4:f6:a3:c0:ba:ad:0b:c6:44:68:44:fe:cf:aa:2a:7d:e0:73:23:89:a3:5a:6b:ca:40:bd:8f:05:cc:1d:67:67:72:0f:83:28:4f:39:6f:4f:0a:72:2b:3e:43:5a:eb:b3:18:ba:c1:1f:fe:26:f9:46:ba:ec:a2:63:fa:fb:22:d4:c7:15:29:3f:8a:82:ea:a2:d9:b0:19:af:02:6d:c2:d0:78:bf:31:8e:80:1d:2c:34:ec:40:60:b9:3b:08:f7:e6:23:ae:dd:70:00:7c:0c:0f:e8:7f:91:30:8f:42:9d:c1:4d:08:22:97:b1:86:a5:84:9a:9a:df:c0:e3:82:ef:26:91:fb:17:5b:3d:b3:4c:73:e6:26:c7:54:d1:23:d1:fc:80:5e:af:f9:a5:e2:09:e6:9a:2e:b8:52:86:1a:d9:0a:3b:7c:77:d7:83:76:6e:e1:54:64:fc:da:98:7c:a2:1f:8a:39:a5:18:ea:20:71:9a:90:43:9f:19:e8:69:5c:08:a9:70:a4:62:74:e2:21:f8:9c:01:69:48:9a:b5:4c:18:2e:df:ca:1f:8c:9e:b7:c1:6a:e2:a4:d2:8f:3b:f1:cb:25:64:6f:37:31:db:9b:0c:5d:70:cd:42:15:0c:c8:37:78:92:19:4b:18:ec:b1:43:82:68:19:76:e3:fd' ip_src = '192.168.2.182' timestamp = 1563843257911 msg_type = 14 es = Elasticsearch('192.168.2.140:9200') s = Search(using=es, index="cipher-*") s = s[0:10000] if msg_type == 12: q = Q('match', layers__kerberos_msg_type=11) & Q( 'match', 
layers__ip_dst=ip_src) & Q( 'match', layers__kerberos_cipher__keyword=cipher) if msg_type == 14: q = (Q('match', layers__kerberos_msg_type=11) | Q('match', layers__kerberos_msg_type=13)) & Q( 'match', layers__ip_dst=ip_src) & Q( 'match', layers__kerberos_cipher__keyword=cipher) s1 = s.query(q) response = s1.execute() if len(response) != 0: print('normal') else: qtime = Q('range', timestamp={ 'gte': int(timestamp),
def get_filter(self, named_query):
    dic = self._filters.get(named_query)
    key, val = next(iter(dic.items()))
    return Q(key, **val)
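# Hedged sketch of the lookup pattern used by get_filter(): the _filters
# mapping below is a hypothetical example of how a (query name -> params)
# entry becomes an elasticsearch_dsl Q object.
from elasticsearch_dsl import Q

_filters = {'recent': {'range': {'timestamp': {'gte': 'now-7d'}}}}
key, val = next(iter(_filters['recent'].items()))
q = Q(key, **val)    # equivalent to Q('range', timestamp={'gte': 'now-7d'})
print(q.to_dict())   # {'range': {'timestamp': {'gte': 'now-7d'}}}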
def _build_query(self): query = Q() source = ['id'] sort = [] aggregations = {} query_string = None as_list = as_dict = False for action, value in self.steps: if action == 'order_by': for key in value: if key.startswith('-'): sort.append({key[1:]: 'desc'}) else: sort.append(key) elif action == 'values': source.extend(value) as_list, as_dict = True, False elif action == 'values_dict': if value: source.extend(value) as_list, as_dict = False, True elif action == 'query': query &= self._process_queries(value) elif action == 'filter': query &= self._process_filters(value) elif action == 'source': source.extend(value) elif action == 'aggregate': aggregations.update(value) elif action == 'filter_query_string': query_string = value else: raise NotImplementedError(action) # If we have a raw query string we are going to apply all sorts # of boosts and filters to improve relevance scoring. # # We are using the same rules that `search.filters:SearchQueryFilter` # implements to have a single-source of truth for how our # scoring works. from olympia.search.filters import SearchQueryFilter search = Search().query(query) if query_string: search = SearchQueryFilter().apply_search_query( query_string, search) if sort: search = search.sort(*sort) if source: search = search.source(source) body = search.to_dict() # These are manually added for now to simplify a partial port to # elasticsearch-dsl if self.start: body['from'] = self.start if self.stop is not None: body['size'] = self.stop - self.start if aggregations: body['aggs'] = aggregations self.source, self.as_list, self.as_dict = source, as_list, as_dict return body
def get_es_query(self):
    return [Q(self.operator, **{self.es_field: self.get_value()})]
def count_logs(resources, userid=0):
    s = Search().extra(size=0)
    conds = []
    for res in resources:
        conds.append(
            Q('match', **{'context__' + res._my_subclass + '_id': res.id}))
    if userid != 0:
        s = s.query('bool',
                    must=[
                        Q("range", datetime={'gte': 'now-7d', 'lte': 'now-1d'}),
                        Q("match", component="resources"),
                        Q('bool', should=[
                            Q('match', action='access'),
                            Q('match', action='view')
                        ]),
                        Q('bool', should=conds),
                        Q('match', user_id=userid)
                    ])
    else:
        s = s.query('bool',
                    must=[
                        Q("range", datetime={'gte': 'now-7d', 'lte': 'now-1d'}),
                        Q("match", component="resources"),
                        Q('bool', should=[
                            Q('match', action='access'),
                            Q('match', action='view')
                        ]),
                        Q('bool', should=conds)
                    ])
    return s
def main(): print("") print(" /\ /\ ___ ___ _ __ / /(_)_ __ ___ ") print("/ / \ \/ __|/ _ \ '__/ / | | '_ \ / _ \\") print("\ \_/ /\__ \ __/ | / /__| | | | | __/") print(" \___/ |___/\___|_| \____/_|_| |_|\___| v{}".format( config.VERSION)) print("") print("Author: Chema Garcia (aka sch3m4)") print(" @sch3m4") print(" https://github.com/thiber-org/userline") print("") log = logging.getLogger(config.APP_NAME) log.setLevel(logging.INFO) std = logging.StreamHandler(sys.stdout) std.setLevel(logging.INFO) formatter = logging.Formatter(config.LOG_FORMAT) std.setFormatter(formatter) log.addHandler(std) parser = argparse.ArgumentParser() required = parser.add_argument_group('Required arguments') required.add_argument( "-H", "--eshosts", help= "Single or comma separated list of ElasticSearch hosts to query (default: localhost)", default=defaults.ES_HOSTS) required.add_argument("-S", "--pool-size", help="Connection pool size (default: {})".format( defaults.ES_POOL_SIZE), type=int, default=defaults.ES_POOL_SIZE) required.add_argument("-i", "--index", help="Index name/pattern", required=True) aux = parser.add_argument_group('Actions') action = aux.add_mutually_exclusive_group(required=True) action.add_argument("-L", "--last-shutdown", help="Gets last shutdown data", action='store_true', default=False) action.add_argument("-E", "--last-event", help="Gets last event data", action='store_true', default=False) action.add_argument("-l", "--logons", help="Shows user logon activity", action='store_true', default=False) action.add_argument("-w", "--who-was-at", help="Shows only logged on users at a given time", metavar="DATE") output = parser.add_argument_group('Output') output.add_argument("-c", "--csv-output", help="CSV Output file", type=argparse.FileType('w'), metavar="PATH") output.add_argument( "-n", "--neo4j", help="Neo4j bolt with auth (format: bolt://user:pass@host:port)", metavar="BOLT") csvout = parser.add_argument_group('CSV options') csvout.add_argument("-F", "--disable-timeframe", help="Do not create timeframe entries", action='store_true', default=False) neoargs = parser.add_argument_group('Neo4J options') neoargs.add_argument( "-f", "--neo4j-full-info", help="Saves full logon/logoff info in Neo4j relations", action='store_true', default=False) neoargs.add_argument("-s", "--unique-logon-rels", help="Sets unique logon relations", action='store_true', default=False) optional = parser.add_argument_group('Optional filtering arguments') optional.add_argument( "-t", "--min-date", help="Searches since specified date (default: {})".format( defaults.MIN_DATE), default=defaults.MIN_DATE) optional.add_argument( "-T", "--max-date", help="Searches up to specified date (default: {})".format( defaults.MAX_DATE), default=defaults.MAX_DATE) optional.add_argument("-p", "--pattern", help="Includes pattern in search") optional.add_argument( "-I", "--include-local", help="Includes local services logons (default: Excluded)", action='store_true', default=False) optional.add_argument( "-k", "--include-locks", help="Includes workstation/screensaver lock events (default: Excluded)", action='store_true', default=False) optional.add_argument("-v", "--verbose", help="Enables verbose mode", action='store_true', default=False) extrainfo = parser.add_argument_group('Extra information') extrainfo.add_argument("-m", "--mark-if-logged-at", help="Marks logged in users at a given time", metavar="DATETIME") args = parser.parse_args() if args.last_event is False and args.logons is False and args.who_was_at is None and 
args.last_shutdown is False: log.critical("You need to specify at least one action argument") return if args.verbose is True: log.setLevel(logging.DEBUG) std.setLevel(logging.DEBUG) try: mindate = int(dateparser.parse(args.min_date).timestamp() * 10**3) maxdate = int(dateparser.parse(args.max_date).timestamp() * 10**3) if args.who_was_at is not None: whowasat = int( dateparser.parse(args.who_was_at).timestamp() * 10**3) if args.mark_if_logged_at is not None: whowasat = int( dateparser.parse(args.mark_if_logged_at).timestamp() * 10**3) except Exception as exc: log.critical("Error parsing date: {}".format(exc)) return # setup elasticsearch connections.create_connection(hosts=args.eshosts.split(','), maxsize=args.pool_size) conn = connections.get_connection() # shows last shutdown if args.last_shutdown is True: aux = utils.get_last_shutdown(args.index, maxdate) if aux is not None: log.info("Last shutdown:") for k in aux.keys(): item = aux[k] evt = utils.build_event_from_source(item) lastraw = utils.get_last_event(args.index, evt['computer']) lastevt = utils.build_event_from_source(lastraw) uptime = timedelta( microseconds=(lastevt['timestamp'] - evt['timestamp']) * 10**3) log.info("Computer: {}".format(evt['computer'])) log.info("\t- Datetime: {}".format(evt['datetime'])) log.info("\t- Uptime: {}".format(uptime)) log.info("\t- EvtIndex: {}".format(evt['index'])) log.info("\t- EvtId: {}".format(evt['sourceid'])) else: log.info("No shutdown found") return # shows last stored event if args.last_event is True: aux = utils.get_last_event(args.index) if aux is not None: log.info("Last event:") for k in aux.keys(): lastevt = utils.build_event_from_source(aux[k]) log.info("Computer: {}".format(lastevt['computer'])) log.info(json.dumps(lastevt, sort_keys=True, indent=4)) else: log.info("No events found") return # we need an output format if args.csv_output is None and args.neo4j is None: log.critical("This option requires CSV/Neo4J output") return csv = None if args.csv_output is not None: csv = CSV(args.csv_output) if args.mark_if_logged_at is None: csv.disable_mark() neo = None if args.neo4j is not None: neo = Neo4J(args.neo4j) log.info("Building query") # Look for first required events q = Q('match', data_type='windows:evtx:record') & utils.get_dsl_logon_query( args.include_locks) if args.pattern is not None: q = q & Q('query_string', query=args.pattern, analyze_wildcard=True) s = Search(using=conn, index=args.index).query(q).filter('range', datetime={ 'gte': mindate, 'lte': maxdate }).sort('datetime') log.debug("Getting events count") total = s.execute().hits.total log.info("Found {} events to be processed".format(total)) # timeframe if total > 0 and csv is not None and args.disable_timeframe is False: frame = dict(config.EVENT_STRUCT) for k in frame.keys(): frame[k] = "-" * 10 frame[config.CSV_FIELDS[0]] = "TIMEFRAME/START" frame['logon.datetime'] = args.min_date frame['logon.timestamp'] = mindate frame['logoff.datetime'] = args.min_date frame['logoff.timestamp'] = mindate csv.add_sequence(frame) count = 0 proglen = 0 progress = 0 begin = time.time() log.info("Processing events") for hit in s.scan(): login = utils.build_event_from_source(hit) log.debug("Got logon event: {}".format(login['id'])) duration = '' logout = None # local service check if args.include_local is False and ( \ login['domain'] == config.LOCAL_DOMAIN or \ login['username'].upper() == "{}$".format(login['computer'].split('.')[0]).upper() or \ login['logonid'] == config.CONSTANT_NA \ ): discard = True log.debug("Discarding 
event") # workstation/screensaver locks elif args.include_locks is False and login[ 'type'] == config.LOGON_TYPE_UNLOCK: discard = True log.debug("Discarding event") else: aux = utils.get_logout_event(args.index, login['logonid'], login['timestamp'], maxdate, args.include_locks) logout = utils.build_event_from_source(aux) log.debug("Got logoff event for login id {}".format(login['id'])) if logout['timestamp'] > 0: aux = logout['timestamp'] - login['timestamp'] try: duration = str(timedelta(microseconds=aux * 10**3)) except: duration = '-' event = utils.build_logon_sequence(duration, login, logout) if logout is not None: log.debug("Logon sequence complete") if args.mark_if_logged_at is not None: event['mark.description'] = "Logged on at {}".format( args.mark_if_logged_at) if login['timestamp'] > whowasat or ( logout['timestamp'] > 0 and logout['timestamp'] < whowasat): event['mark.value'] = False else: event['mark.value'] = True discard = False if args.who_was_at is not None and ( login['timestamp'] > whowasat or (logout['timestamp'] > 0 and logout['timestamp'] < whowasat)): discard = True log.debug("Discarding event") if discard is False: count += 1 if csv is not None: csv.add_sequence(event) if neo is not None: neo.add_sequence(event, args.neo4j_full_info, args.unique_logon_rels) log.debug("Event stored") progress += 1 proglen = utils.draw_progress_bar( float((progress * 100 / total) / 100.0), begin, proglen) # timeframe if total > 0 and csv is not None and args.disable_timeframe is False: frame = dict(config.EVENT_STRUCT) for k in frame.keys(): frame[k] = "-" * 10 frame[config.CSV_FIELDS[0]] = "TIMEFRAME/END" frame['logon.datetime'] = args.max_date frame['logon.timestamp'] = maxdate frame['logoff.datetime'] = args.max_date frame['logoff.timestamp'] = maxdate csv.add_sequence(frame) total = timedelta(microseconds=int((time.time() - begin) * 10**6)) print("") log.info("{} Logons processed in {}".format(count, total)) if neo is not None: neo.finish() return
def get_es_query(self):
    # Just using 'terms' would not work, as it would return any tag match
    # in the list, but we want to exactly match all of them.
    return [Q('term', tags=tag) for tag in self.get_value()]
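# Hedged illustration of the comment above: a single 'terms' query matches ANY
# of the listed tags, whereas AND-ing individual 'term' queries requires ALL of
# them (the tag values are illustrative).
from elasticsearch_dsl import Q

any_tag = Q('terms', tags=['linux', 'firefox'])
all_tags = Q('term', tags='linux') & Q('term', tags='firefox')
print(any_tag.to_dict())   # {'terms': {'tags': ['linux', 'firefox']}}
print(all_tags.to_dict())  # bool query with both term clauses under 'must'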
def query_parser(qstr=None):
    """Default parser that uses the Q() from elasticsearch_dsl."""
    if qstr:
        return Q('query_string', query=qstr)
    return Q()
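# Hedged sketch: with no query string, the bare Q() falls back to match_all.
from elasticsearch_dsl import Q

print(query_parser("mass spectrometry").to_dict())
# {'query_string': {'query': 'mass spectrometry'}}
print(query_parser().to_dict())
# {'match_all': {}}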
def get_object_list(self, request): user = request.user query_text = request.GET.get('query', None) if not user.is_authenticated: result_dict = simple_search_public_data(query_text) return [SearchObject(id=1, hits=result_dict)] groups = user.groups.all() index_list = ['experiments', 'dataset', 'datafile'] ms = MultiSearch(index=index_list) query_exp = Q("match", title=query_text) query_exp_oacl = Q("term", objectacls__entityId=user.id) | \ Q("term", public_access=100) for group in groups: query_exp_oacl = query_exp_oacl | \ Q("term", objectacls__entityId=group.id) query_exp = query_exp & query_exp_oacl ms = ms.add( Search(index='experiments').extra( size=MAX_SEARCH_RESULTS, min_score=MIN_CUTOFF_SCORE).query(query_exp)) query_dataset = Q("match", description=query_text) query_dataset_oacl = Q("term", **{'experiments.objectacls.entityId': user.id}) | \ Q("term", **{'experiments.public_access': 100}) for group in groups: query_dataset_oacl = query_dataset_oacl | \ Q("term", **{'experiments.objectacls.entityId': group.id}) ms = ms.add( Search(index='dataset').extra( size=MAX_SEARCH_RESULTS, min_score=MIN_CUTOFF_SCORE).query(query_dataset).query( 'nested', path='experiments', query=query_dataset_oacl)) query_datafile = Q("match", filename=query_text) query_datafile_oacl = Q("term", experiments__objectacls__entityId=user.id) | \ Q("term", experiments__public_access=100) for group in groups: query_datafile_oacl = query_datafile_oacl | \ Q("term", experiments__objectacls__entityId=group.id) query_datafile = query_datafile & query_datafile_oacl ms = ms.add( Search(index='datafile').extra( size=MAX_SEARCH_RESULTS, min_score=MIN_CUTOFF_SCORE).query(query_datafile)) results = ms.execute() result_dict = {k: [] for k in ["experiments", "datasets", "datafiles"]} for item in results: for hit in item.hits.hits: if hit["_index"] == "dataset": result_dict["datasets"].append(hit.to_dict()) elif hit["_index"] == "experiments": result_dict["experiments"].append(hit.to_dict()) elif hit["_index"] == "datafile": result_dict["datafiles"].append(hit.to_dict()) return [SearchObject(id=1, hits=result_dict)]
def search(request):
    q = request.params.get("q", '')

    if q:
        should = []
        for field in SEARCH_FIELDS:
            kw = {"query": q}
            if field in SEARCH_BOOSTS:
                kw["boost"] = SEARCH_BOOSTS[field]
            should.append(Q("match", **{field: kw}))

        # Add a prefix query if ``q`` is longer than one character.
        if len(q) > 1:
            should.append(Q('prefix', normalized_name=q))

        query = request.es.query("dis_max", queries=should)
        query = query.suggest("name_suggestion", q, term={"field": "name"})
    else:
        query = request.es.query()

    if request.params.get("o"):
        query = query.sort(request.params["o"])

    if request.params.getall("c"):
        query = query.filter("terms", classifiers=request.params.getall("c"))

    try:
        page_num = int(request.params.get("page", 1))
    except ValueError:
        raise HTTPBadRequest("'page' must be an integer.")

    page = ElasticsearchPage(
        query,
        page=page_num,
        url_maker=paginate_url_factory(request),
    )

    if page.page_count and page_num > page.page_count:
        return HTTPNotFound()

    available_filters = collections.defaultdict(list)

    classifiers_q = (
        request.db.query(Classifier)
        .with_entities(Classifier.classifier)
        .filter(
            exists([release_classifiers.c.trove_id])
            .where(release_classifiers.c.trove_id == Classifier.id))
        .order_by(Classifier.classifier))

    for cls in classifiers_q:
        first, *_ = cls.classifier.split(' :: ')
        available_filters[first].append(cls.classifier)

    def filter_key(item):
        try:
            return 0, SEARCH_FILTER_ORDER.index(item[0]), item[0]
        except ValueError:
            return 1, 0, item[0]

    return {
        "page": page,
        "term": q,
        "order": request.params.get("o", ''),
        "available_filters": sorted(available_filters.items(), key=filter_key),
        "applied_filters": request.params.getall("c"),
    }
def filter_queryset(self, qs):
    qs = super().filter_queryset(qs)
    qs = qs.query(query.Bool(filter=[Q('term', is_recommended=True)]))
    return qs.query(
        'function_score',
        functions=[query.SF('random_score')]).sort('_score')
def resource_accessess(resource, userid=0):
    s = Search().extra(size=0)
    if userid != 0:
        s = s.query(
            'bool',
            must=[
                Q("range", datetime={'gte': 'now-7d', 'lte': 'now-1d'}),
                Q("match", component="resources"),
                Q('bool', should=[
                    Q('match', action='access'),
                    Q('match', action='view')
                ]),
                Q('match',
                  **{'context__' + resource._my_subclass + '_id': resource.id}),
                Q('match', user_id=userid)
            ])
    else:
        s = s.query(
            'bool',
            must=[
                Q("range", datetime={'gte': 'now-7d', 'lte': 'now-1d'}),
                Q("match", component="resources"),
                Q('bool', should=[
                    Q('match', action='access'),
                    Q('match', action='view')
                ]),
                Q('match',
                  **{'context__' + resource._my_subclass + '_id': resource.id})
            ])
    return s
def apply_ao_specific_query_params(query, **kwargs): must_clauses = [] if kwargs.get('ao_no'): must_clauses.append(Q('terms', no=kwargs.get('ao_no'))) if kwargs.get('ao_name'): must_clauses.append(Q('match', name=' '.join(kwargs.get('ao_name')))) if kwargs.get('ao_is_pending') is not None: must_clauses.append(Q('term', is_pending=kwargs.get('ao_is_pending'))) if kwargs.get('ao_status'): must_clauses.append(Q('match', status=kwargs.get('ao_status'))) if kwargs.get('ao_requestor'): must_clauses.append(Q('match', requestor_names=kwargs.get('ao_requestor'))) citation_queries = [] if kwargs.get('ao_regulatory_citation'): for citation in kwargs.get('ao_regulatory_citation'): exact_match = re.match(r"(?P<title>\d+)\s+C\.?F\.?R\.?\s+§*\s*(?P<part>\d+)\.(?P<section>\d+)", citation) if(exact_match): citation_queries.append(Q("nested", path="regulatory_citations", query=Q("bool", must=[Q("term", regulatory_citations__title=int(exact_match.group('title'))), Q("term", regulatory_citations__part=int(exact_match.group('part'))), Q("term", regulatory_citations__section=int(exact_match.group('section')))]))) if kwargs.get('ao_statutory_citation'): for citation in kwargs.get('ao_statutory_citation'): exact_match = re.match(r"(?P<title>\d+)\s+U\.?S\.?C\.?\s+§*\s*(?P<section>\d+).*\.?)", citation) if(exact_match): citation_queries.append(Q("nested", path="statutory_citations", query=Q("bool", must=[Q("term", statutory_citations__title=int(exact_match.group('title'))), Q("term", statutory_citations__section=int(exact_match.group('section')))]))) if kwargs.get('ao_citation_require_all'): must_clauses.append(Q('bool', must=citation_queries)) else: must_clauses.append(Q('bool', should=citation_queries, minimum_should_match=1)) if kwargs.get('ao_requestor_type'): requestor_types = {1: 'Federal candidate/candidate committee/officeholder', 2: 'Publicly funded candidates/committees', 3: 'Party committee, national', 4: 'Party committee, state or local', 5: 'Nonconnected political committee', 6: 'Separate segregated fund', 7: 'Labor Organization', 8: 'Trade Association', 9: 'Membership Organization, Cooperative, Corporation W/O Capital Stock', 10: 'Corporation (including LLCs electing corporate status)', 11: 'Partnership (including LLCs electing partnership status)', 12: 'Governmental entity', 13: 'Research/Public Interest/Educational Institution', 14: 'Law Firm', 15: 'Individual', 16: 'Other'} must_clauses.append(Q("terms", requestor_types=[requestor_types[r] for r in kwargs.get('ao_requestor_type')])) date_range = {} if kwargs.get('ao_min_issue_date'): date_range['gte'] = kwargs.get('ao_min_issue_date') if kwargs.get('ao_max_issue_date'): date_range['lte'] = kwargs.get('ao_max_issue_date') if date_range: must_clauses.append(Q("range", issue_date=date_range)) date_range = {} if kwargs.get('ao_min_request_date'): date_range['gte'] = kwargs.get('ao_min_request_date') if kwargs.get('ao_max_request_date'): date_range['lte'] = kwargs.get('ao_max_request_date') if date_range: must_clauses.append(Q("range", request_date=date_range)) if kwargs.get('ao_entity_name'): must_clauses.append(Q('bool', should=[Q('match', commenter_names=' '.join(kwargs.get('ao_entity_name'))), Q('match', representative_names=' '.join(kwargs.get('ao_entity_name')))], minimum_should_match=1)) query = query.query('bool', must=must_clauses) return query
def test_complex_example(): s = search.Search() s = s.query('match', title='python') \ .query(~Q('match', title='ruby')) \ .filter(F('term', category='meetup') | F('term', category='conference')) \ .post_filter('terms', tags=['prague', 'czech']) \ .script_fields(more_attendees="doc['attendees'].value + 42") s.aggs.bucket('per_country', 'terms', field='country')\ .metric('avg_attendees', 'avg', field='attendees') s.query.minimum_should_match = 2 s = s.highlight_options(order='score').highlight('title', 'body', fragment_size=50) assert { 'query': { 'filtered': { 'filter': { 'bool': { 'should': [{ 'term': { 'category': 'meetup' } }, { 'term': { 'category': 'conference' } }] } }, 'query': { 'bool': { 'must': [{ 'match': { 'title': 'python' } }], 'must_not': [{ 'match': { 'title': 'ruby' } }], 'minimum_should_match': 2 } } } }, 'post_filter': { 'terms': { 'tags': ['prague', 'czech'] } }, 'aggs': { 'per_country': { 'terms': { 'field': 'country' }, 'aggs': { 'avg_attendees': { 'avg': { 'field': 'attendees' } } } } }, "highlight": { 'order': 'score', 'fields': { 'title': { 'fragment_size': 50 }, 'body': { 'fragment_size': 50 } } }, 'script_fields': { 'more_attendees': { 'script': "doc['attendees'].value + 42" } } } == s.to_dict()
def __call__(self, search, _):
    return search.query(
        Q("bool", must=[Q("terms", references=self.annotation_ids)]))
def get(cls, id, kind):
    s = cls.search()
    s.query = Q('bool', must=[Q('term', id=id), Q('term', kind=kind)])
    rs = s.execute()
    if rs:
        return rs.hits[0]
def get(self, **kwargs): """Return a list of results and aggregations based on parameters. The list of accepted parameters (with types and default values) is in the database and can be accessed with the super_search_fields service. """ # Require that the list of fields be passed. if not kwargs.get('_fields'): raise MissingArgumentError('_fields') self.all_fields = kwargs['_fields'] # Filter parameters and raise potential errors. params = self.get_parameters(**kwargs) # Find the indices to use to optimize the elasticsearch query. indices = self.get_indices(params['date']) # Create and configure the search object. search = Search( using=self.get_connection(), index=indices, doc_type=self.context.get_doctype(), ) # Create filters. filters = [] histogram_intervals = {} for field, sub_params in params.items(): sub_filters = None for param in sub_params: if param.name.startswith('_'): # By default, all param values are turned into lists, # even when they have and can have only one value. # For those we know there can only be one value, # so we just extract it from the made-up list. if param.name == '_results_offset': results_from = param.value[0] elif param.name == '_results_number': results_number = param.value[0] if results_number > 1000: raise BadArgumentError( '_results_number', msg=('_results_number cannot be greater ' 'than 1,000')) if results_number < 0: raise BadArgumentError( '_results_number', msg='_results_number cannot be negative') elif param.name == '_facets_size': facets_size = param.value[0] # Why cap it? # Because if the query is covering a lot of different # things you can get a really really large query # which can hog resources excessively. # Downloading, as an example, 100k facets (and 0 hits) # when there is plenty of data yields a 11MB JSON # file. if facets_size > 10000: raise BadArgumentError( '_facets_size greater than 10,000') for f in self.histogram_fields: if param.name == '_histogram_interval.%s' % f: histogram_intervals[f] = param.value[0] # Don't use meta parameters in the query. continue field_data = self.all_fields[param.name] name = self.get_full_field_name(field_data) if param.data_type in ('date', 'datetime'): param.value = datetimeutil.date_to_string(param.value) elif param.data_type == 'enum': param.value = [x.lower() for x in param.value] elif param.data_type == 'str' and not param.operator: param.value = [x.lower() for x in param.value] # Operators needing wildcards, and the associated value # transformation with said wildcards. operator_wildcards = { '~': '*%s*', # contains '^': '%s*', # starts with '$': '*%s' # ends with } # Operators needing ranges, and the associated Elasticsearch # comparison operator. operator_range = { '>': 'gt', '<': 'lt', '>=': 'gte', '<=': 'lte', } args = {} filter_type = 'term' filter_value = None if not param.operator: # contains one of the terms if len(param.value) == 1: val = param.value[0] if not isinstance(val, six.string_types) or ' ' not in val: # There's only one term and no white space, this # is a simple term filter. filter_value = val else: # If the term contains white spaces, we want to # perform a phrase query. filter_type = 'query' args = Q( 'simple_query_string', query=param.value[0], fields=[name], default_operator='and', ).to_dict() else: # There are several terms, this is a terms filter. 
filter_type = 'terms' filter_value = param.value elif param.operator == '=': # is exactly if field_data['has_full_version']: name = '%s.full' % name filter_value = param.value elif param.operator in operator_range: filter_type = 'range' filter_value = { operator_range[param.operator]: param.value } elif param.operator == '__null__': filter_type = 'missing' args['field'] = name elif param.operator == '__true__': filter_type = 'term' filter_value = True elif param.operator == '@': filter_type = 'regexp' if field_data['has_full_version']: name = '%s.full' % name filter_value = param.value elif param.operator in operator_wildcards: filter_type = 'query' # Wildcard operations are better applied to a non-analyzed # field (called "full") if there is one. if field_data['has_full_version']: name = '%s.full' % name q_args = {} q_args[name] = operator_wildcards[ param.operator] % param.value query = Q('wildcard', **q_args) args = query.to_dict() if filter_value is not None: args[name] = filter_value if args: new_filter = F(filter_type, **args) if param.operator_not: new_filter = ~new_filter if sub_filters is None: sub_filters = new_filter elif filter_type == 'range': sub_filters &= new_filter else: sub_filters |= new_filter continue if sub_filters is not None: filters.append(sub_filters) search = search.filter(F('bool', must=filters)) # Restricting returned fields. fields = [] # We keep track of the requested columns in order to make sure we # return those column names and not aliases for example. self.request_columns = [] for param in params['_columns']: for value in param.value: if not value: continue self.request_columns.append(value) field_name = self.get_field_name(value, full=False) fields.append(field_name) search = search.fields(fields) # Sorting. sort_fields = [] for param in params['_sort']: for value in param.value: if not value: continue # Values starting with a '-' are sorted in descending order. # In order to retrieve the database name of the field, we # must first remove the '-' part and add it back later. # Example: given ['product', '-version'], the results will be # sorted by ascending product then descending version. desc = False if value.startswith('-'): desc = True value = value[1:] field_name = self.get_field_name(value) if desc: # The underlying library understands that '-' means # sorting in descending order. field_name = '-' + field_name sort_fields.append(field_name) search = search.sort(*sort_fields) # Pagination. results_to = results_from + results_number search = search[results_from:results_to] # Create facets. if facets_size: self._create_aggregations(params, search, facets_size, histogram_intervals) # Query and compute results. hits = [] if params['_return_query'][0].value[0]: # Return only the JSON query that would be sent to elasticsearch. return { 'query': search.to_dict(), 'indices': indices, } errors = [] # We call elasticsearch with a computed list of indices, based on # the date range. However, if that list contains indices that do not # exist in elasticsearch, an error will be raised. We thus want to # remove all failing indices until we either have a valid list, or # an empty list in which case we return no result. while True: try: results = search.execute() for hit in results: hits.append(self.format_fields(hit.to_dict())) total = search.count() aggregations = getattr(results, 'aggregations', {}) if aggregations: aggregations = self.format_aggregations(aggregations) shards = getattr(results, '_shards', {}) break # Yay! Results! 
except NotFoundError as e: missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0] if missing_index in indices: del indices[indices.index(missing_index)] else: # Wait what? An error caused by an index that was not # in the request? That should never happen, but in case # it does, better know it. raise errors.append({ 'type': 'missing_index', 'index': missing_index, }) if indices: # Update the list of indices and try again. # Note: we need to first empty the list of indices before # updating it, otherwise the removed indices never get # actually removed. search = search.index().index(*indices) else: # There is no index left in the list, return an empty # result. hits = [] total = 0 aggregations = {} shards = None break except RequestError as exception: exc_type, exc_value, exc_tb = sys.exc_info() # Try to handle it gracefully if we can find out what # input was bad and caused the exception. try: bad_input = ELASTICSEARCH_PARSE_EXCEPTION_REGEX.findall( exception.error)[-1] # Loop over the original parameters to try to figure # out which *key* had the bad input. for key, value in kwargs.items(): if value == bad_input: raise BadArgumentError(key) except IndexError: # Not an ElasticsearchParseException exception pass # Re-raise the original exception with the correct traceback six.reraise(exc_type, exc_value, exc_tb) if shards and shards.failed: # Some shards failed. We want to explain what happened in the # results, so the client can decide what to do. failed_indices = defaultdict(int) for failure in shards.failures: failed_indices[failure.index] += 1 for index, shards_count in failed_indices.items(): errors.append({ 'type': 'shards', 'index': index, 'shards_count': shards_count, }) return { 'hits': hits, 'total': total, 'facets': aggregations, 'errors': errors, }
def search(search_params, index, page_size, ip, request, filter_dead, page=1) -> Tuple[List[Hit], int, int]: """ Given a set of keywords and an optional set of filters, perform a ranked paginated search. :param search_params: Search parameters. See :class: `ImageSearchQueryStringSerializer`. :param index: The Elasticsearch index to search (e.g. 'image') :param page_size: The number of results to return per page. :param ip: The user's hashed IP. Hashed IPs are used to anonymously but uniquely identify users exclusively for ensuring query consistency across Elasticsearch shards. :param request: Django's request object. :param filter_dead: Whether dead links should be removed. :param page: The results page number. :return: Tuple with a List of Hits from elasticsearch, the total count of pages and results. """ s = Search(index=index) # Apply term filters. Each tuple pairs a filter's parameter name in the API # with its corresponding field in Elasticsearch. "None" means that the # names are identical. filters = [ ('extension', None), ('categories', None), ('aspect_ratio', None), ('size', None), ('source', 'provider'), ('license', 'license__keyword'), ('license_type', 'license__keyword') ] for tup in filters: api_field, elasticsearch_field = tup s = _apply_filter(s, search_params, api_field, elasticsearch_field) # Get suggestions for any route s = s.suggest( 'get_suggestion', '', term={'field': 'creator'} ) # Exclude mature content unless explicitly enabled by the requester if not search_params.data['mature']: s = s.exclude('term', mature=True) # Hide data sources from the catalog dynamically. filter_cache_key = 'filtered_providers' filtered_providers = cache.get(key=filter_cache_key) if not filtered_providers: filtered_providers = models.ContentProvider.objects\ .filter(filter_content=True)\ .values('provider_identifier') cache.set( key=filter_cache_key, timeout=CACHE_TIMEOUT, value=filtered_providers ) to_exclude = [f['provider_identifier'] for f in filtered_providers] s = s.exclude('terms', provider=to_exclude) # Search either by generic multimatch or by "advanced search" with # individual field-level queries specified. 
search_fields = ['tags.name', 'title', 'description'] if 'q' in search_params.data: query = _quote_escape(search_params.data['q']) s = s.query( 'simple_query_string', query=query, fields=search_fields ) # Get suggestions for term query s = s.suggest( 'get_suggestion', query, term={'field': 'creator'} ) else: if 'creator' in search_params.data: creator = _quote_escape(search_params.data['creator']) s = s.query( 'simple_query_string', query=creator, fields=['creator'] ) # Get suggestions for creator s = s.suggest( 'get_suggestion', creator, term={'field': 'creator'} ) if 'title' in search_params.data: title = _quote_escape(search_params.data['title']) s = s.query( 'simple_query_string', query=title, fields=['title'] ) # Get suggestions for title s = s.suggest( 'get_suggestion', title, term={'field': 'title'} ) if 'tags' in search_params.data: tags = _quote_escape(search_params.data['tags']) s = s.query( 'simple_query_string', fields=['tags.name'], query=tags ) # Get suggestions for tags s = s.suggest( 'get_suggestion', tags, term={'field': 'tags.name'} ) # Boost by popularity metrics if POPULARITY_BOOST: queries = [] factors = ['comments', 'views', 'likes'] boost_factor = 100 / len(factors) for factor in factors: rank_feature_query = Q( 'rank_feature', field=factor, boost=boost_factor ) queries.append(rank_feature_query) s = Search().query( Q( 'bool', must=s.query, should=queries, minimum_should_match=1 ) ) # Use highlighting to determine which fields contribute to the selection of # top results. s = s.highlight(*search_fields) s = s.highlight_options(order='score') s.extra(track_scores=True) # Route users to the same Elasticsearch worker node to reduce # pagination inconsistencies and increase cache hits. s = s.params(preference=str(ip), request_timeout=7) # Paginate start, end = _get_query_slice(s, page_size, page, filter_dead) s = s[start:end] try: search_response = s.execute() log.info(f'query={s.to_dict()}, es_took_ms={search_response.took}') except RequestError as e: raise ValueError(e) results = _post_process_results( s, start, end, page_size, search_response, request, filter_dead ) suggestion = _query_suggestions(search_response) result_count, page_count = _get_result_and_page_count( search_response, results, page_size ) return results, page_count, result_count, suggestion
def search_elastic(term='', user=None, sort='id', order='desc', category='0_0', quality_filter='0', page=1, rss=False, admin=False, logged_in_user=None, per_page=75, max_search_results=1000): # This function can easily be memcached now es_client = Elasticsearch() es_sort_keys = { 'id': 'id', 'size': 'filesize', # 'name': 'display_name', # This is slow and buggy 'comments': 'comment_count', 'seeders': 'seed_count', 'leechers': 'leech_count', 'downloads': 'download_count' } sort_ = sort.lower() if sort_ not in es_sort_keys: flask.abort(400) es_sort = es_sort_keys[sort] order_keys = { 'desc': 'desc', 'asc': 'asc' } order_ = order.lower() if order_ not in order_keys: flask.abort(400) # Only allow ID, desc if RSS if rss: sort = es_sort_keys['id'] order = 'desc' # funky, es sort is default asc, prefixed by '-' if desc if 'desc' == order: es_sort = '-' + es_sort # Quality filter quality_keys = [ '0', # Show all '1', # No remakes '2', # Only trusted '3' # Only completed ] if quality_filter.lower() not in quality_keys: flask.abort(400) quality_filter = int(quality_filter) # Category filter main_category = None sub_category = None main_cat_id = 0 sub_cat_id = 0 if category: cat_match = re.match(r'^(\d+)_(\d+)$', category) if not cat_match: flask.abort(400) main_cat_id = int(cat_match.group(1)) sub_cat_id = int(cat_match.group(2)) if main_cat_id > 0: if sub_cat_id > 0: sub_category = models.SubCategory.by_category_ids(main_cat_id, sub_cat_id) if not sub_category: flask.abort(400) else: main_category = models.MainCategory.by_id(main_cat_id) if not main_category: flask.abort(400) # This might be useless since we validate users # before coming into this method, but just to be safe... if user: user = models.User.by_id(user) if not user: flask.abort(404) user = user.id same_user = False if logged_in_user: same_user = user == logged_in_user.id s = Search(using=es_client, index=app.config.get('ES_INDEX_NAME')) # todo, sukebei prefix # Apply search term if term: s = s.query('simple_query_string', analyzer='my_search_analyzer', default_operator="AND", query=term) # User view (/user/username) if user: s = s.filter('term', uploader_id=user) if not admin: # Hide all DELETED torrents if regular user s = s.filter('term', deleted=False) # If logged in user is not the same as the user being viewed, # show only torrents that aren't hidden or anonymous. # # If logged in user is the same as the user being viewed, # show all torrents including hidden and anonymous ones. # # On RSS pages in user view, show only torrents that # aren't hidden or anonymous no matter what if not same_user or rss: s = s.filter('term', hidden=False) s = s.filter('term', anonymous=False) # General view (homepage, general search view) else: if not admin: # Hide all DELETED torrents if regular user s = s.filter('term', deleted=False) # If logged in, show all torrents that aren't hidden unless they belong to you # On RSS pages, show all public torrents and nothing more. 
if logged_in_user and not rss: hiddenFilter = Q('term', hidden=False) userFilter = Q('term', uploader_id=logged_in_user.id) combinedFilter = hiddenFilter | userFilter s = s.filter('bool', filter=[combinedFilter]) else: s = s.filter('term', hidden=False) if main_category: s = s.filter('term', main_category_id=main_cat_id) elif sub_category: s = s.filter('term', main_category_id=main_cat_id) s = s.filter('term', sub_category_id=sub_cat_id) if quality_filter == 0: pass elif quality_filter == 1: s = s.filter('term', remake=False) elif quality_filter == 2: s = s.filter('term', trusted=True) elif quality_filter == 3: s = s.filter('term', complete=True) # Apply sort s = s.sort(es_sort) # Only show first RESULTS_PER_PAGE items for RSS if rss: s = s[0:per_page] else: max_page = min(page, int(math.ceil(max_search_results / float(per_page)))) from_idx = (max_page - 1) * per_page to_idx = min(max_search_results, max_page * per_page) s = s[from_idx:to_idx] highlight = app.config.get('ENABLE_ELASTIC_SEARCH_HIGHLIGHT') if highlight: s = s.highlight_options(tags_schema='styled') s = s.highlight("display_name") # Return query, uncomment print line to debug query # from pprint import pprint # print(json.dumps(s.to_dict())) return s.execute()
"params": {'param2'}, "tags": {"tag3"} } assert actual_col_to_whitelist_dict == col_to_whitelist_dict @pytest.mark.parametrize("test_parsed_filter,test_query,test_type", [ ({ 'type': 'parameter', 'key': 'param0', 'comparator': 'LIKE', 'value': '%va%' }, Q('bool', filter=[ Q("term", params__key="param0"), Q("wildcard", params__value="*va*") ]), "params"), ({ 'type': 'parameter', 'key': 'param0', 'comparator': 'ILIKE', 'value': '%va%' }, Q('bool', filter=[ Q("term", params__key="param0"), Q("wildcard", params__value="*va*") ]), "params"), ({ 'type': 'parameter',
class PhaseConfig: filter_fields = ('docclass', 'status', 'unit', 'discipline', 'document_type', 'under_review', 'overdue', 'leader', 'approver') indexable_fields = ['is_existing', 'can_be_transmitted'] es_field_types = { 'overdue': 'boolean', } column_fields = ( ('', 'under_preparation_by'), ('Document Number', 'document_number'), ('Title', 'title'), ('Rev.', 'current_revision'), ('Status', 'status'), ('Class', 'docclass'), ('Unit', 'unit'), ('Discipline', 'discipline'), ('Document type', 'document_type'), ('Review start date', 'review_start_date'), ('Review due date', 'review_due_date'), ('Under review', 'under_review'), ('Overdue', 'overdue'), ('Leader', 'leader'), ('Approver', 'approver'), ('Final revision', 'final_revision'), ) transmittal_columns = { 'Document Number': 'document_key', 'Title': 'title', 'Contract Number': 'contract_number', 'Originator': 'originator', 'Unit': 'unit', 'Discipline': 'discipline', 'Document Type': 'document_type', 'Sequential Number': 'sequential_number', 'Class': 'docclass', 'Revision': 'revision', 'Status': 'status', 'Received Date': 'received_date', 'Created': 'created_on', } export_fields = OrderedDict(( ('Document number', 'document_number'), ('Title', 'title'), ('Revision', 'revision_name'), ('Revision date', 'revision_date'), ('Status', 'status'), ('Doc category', 'category'), ('Class', 'docclass'), ('Contract Number', 'contract_number'), ('Originator', 'originator'), ('Unit', 'unit'), ('Discipline', 'discipline'), ('Document type', 'document_type'), ('Sequential number', 'sequential_number'), ('System', 'system'), ('WBS', 'wbs'), ('Weight', 'weight'), ('Is final revision', 'final_revision'), ('Received date', 'received_date'), ('Created on', 'created_on'), ('Review start date', 'review_start_date'), ('Review due date', 'review_due_date'), ('Leader', 'leader'), ('Approver', 'approver'), ('Outgoing transmittal', 'transmittal'), ('Sent date', 'transmittal_sent_date'), ('Purpose of issue', 'purpose_of_issue'), ('External due date', 'external_review_due_date'), ('Return code', 'return_code'), ('STD Planned', 'status_std_planned_date'), ('IDC Planned', 'status_idc_planned_date'), ('IFR Planned', 'status_ifr_planned_date'), ('IFA Planned', 'status_ifa_planned_date'), ('IFD Planned', 'status_ifd_planned_date'), ('IFC Planned', 'status_ifc_planned_date'), ('IFI Planned', 'status_ifi_planned_date'), ('ASB Planned', 'status_asb_planned_date'), ('STD Forecast', 'status_std_forecast_date'), ('IDC Forecast', 'status_idc_forecast_date'), ('IFR Forecast', 'status_ifr_forecast_date'), ('IFA Forecast', 'status_ifa_forecast_date'), ('IFD Forecast', 'status_ifd_forecast_date'), ('IFC Forecast', 'status_ifc_forecast_date'), ('IFI Forecast', 'status_ifi_forecast_date'), ('ASB Forecast', 'status_asb_forecast_date'), ('STD Actual', 'status_std_actual_date'), ('IFR Actual', 'status_ifr_actual_date'), ('IDC Actual', 'status_idc_actual_date'), ('IFA Actual', 'status_ifa_actual_date'), ('IFD Actual', 'status_ifd_actual_date'), ('IFC Actual', 'status_ifc_actual_date'), ('IFI Actual', 'status_ifi_actual_date'), ('ASB Actual', 'status_asb_actual_date'), )) custom_filters = OrderedDict((('show_cld_spd', { 'field': forms.BooleanField, 'label': _('Show CLD/SPD docs'), 'filters': { True: None, False: Q('term', is_existing=True), None: Q('term', is_existing=True) } }), ('outgoing_trs', { 'field': forms.BooleanField, 'label': _('Ready for outgoing TRS'), 'filters': { True: Q('term', can_be_transmitted=True), False: None, None: None, } })))
def inner(values):
    terms = current_custom_metadata.terms
    available_terms = current_custom_metadata.available_vocabulary_set
    conditions = []
    for value in values:
        # Matches this:
        # [vocabulary:term]:[value]
        parsed = re.match(
            r'^\[(?P<key>[-\w]+\:[-\w]+)\]\:\[(?P<val>.+)\]$', value)
        if not parsed:
            raise RESTValidationError(errors=[
                FieldError(
                    field,
                    'The parameter should have the format: '
                    'custom=[term]:[value].')
            ])
        parsed = parsed.groupdict()
        search_key = parsed['key']
        search_value = parsed['val']
        if search_key not in available_terms:
            raise RESTValidationError(errors=[
                FieldError(
                    field,
                    u'The "{}" term is not supported.'.format(search_key))
            ])

        custom_fields_mapping = dict(
            keyword='custom_keywords',
            text='custom_text',
            relationship='custom_relationships',
        )
        term_type = terms[search_key]['type']
        es_field = custom_fields_mapping[term_type]

        nested_clauses = [
            {'term': {'{}.key'.format(es_field): search_key}},
        ]
        if term_type in ('text', 'keyword'):
            nested_clauses.append({
                'query_string': {
                    'fields': ['{}.value'.format(es_field)],
                    'query': search_value,
                }
            })
        elif term_type == 'relationship':
            if ':' not in search_value:
                raise RESTValidationError(errors=[
                    FieldError(field, (
                        'Relationship term search values should '
                        'follow the format "<sub>:<obj>".'))
                ])
            sub, obj = search_value.split(':', 1)
            if sub:
                nested_clauses.append({
                    'query_string': {
                        'fields': [es_field + '.subject'],
                        'query': sub
                    }
                })
            if obj:
                nested_clauses.append({
                    'query_string': {
                        'fields': [es_field + '.object'],
                        'query': obj
                    }
                })

        conditions.append({
            'nested': {
                'path': es_field,
                'query': {'bool': {'must': nested_clauses}},
            }
        })
    return Q('bool', must=conditions)
def search(self):
    """Get Elasticsearch search instance."""
    s = self._search_cls(index=self._index)
    if self._query:
        # Search.query() returns a new Search object, so the result must be
        # reassigned or the query is silently dropped.
        s = s.query(Q('query_string', query=self._query))
    return s
def TermsMatch(key, value):
    return Q('terms', **{key: value})
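# Hedged sketch: TermsMatch builds a terms query for a field name that is only
# known at runtime (the field and values below are illustrative).
from elasticsearch_dsl import Q

q = TermsMatch('tags', ['python', 'search'])
print(q.to_dict())  # {'terms': {'tags': ['python', 'search']}}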
def obj_create(self, bundle, **kwargs):
    user = bundle.request.user
    groups = user.groups.all()
    # if anonymous user search public data only
    query_text = bundle.data.get("text", None)
    type_tag = bundle.data.get("TypeTag", [])
    index_list = []
    for type in type_tag:
        if type == 'Experiment':
            index_list.append('experiments')
        elif type == 'Dataset':
            index_list.append('dataset')
        elif type == 'Datafile':
            index_list.append('datafile')
    end_date = bundle.data.get("EndDate", None)
    start_date = bundle.data.get("StartDate", None)
    if end_date is not None:
        end_date_utc = datetime.datetime.strptime(end_date, "%Y-%m-%dT%H:%M:%S.%fZ") \
            .replace(tzinfo=pytz.timezone('UTC'))
        end_date = end_date_utc.astimezone(LOCAL_TZ).date()
    else:
        # set end date to today's date
        end_date = datetime.datetime.today().replace(tzinfo=pytz.timezone('UTC'))
    if start_date:
        start_date_utc = datetime.datetime.strptime(start_date, "%Y-%m-%dT%H:%M:%S.%fZ") \
            .replace(tzinfo=pytz.timezone('UTC'))
        start_date = start_date_utc.astimezone(LOCAL_TZ).date()
    instrument_list = bundle.data.get("InstrumentList", None)
    instrument_list_id = []
    if instrument_list:
        for ins in instrument_list:
            instrument_list_id.append(Instrument.objects.get(name__exact=ins).id)
    # query for experiment model
    ms = MultiSearch(index=index_list)
    if 'experiments' in index_list:
        query_exp = Q("match", title=query_text)
        if user.is_authenticated:
            query_exp_oacl = Q("term", objectacls__entityId=user.id) | \
                Q("term", public_access=100)
            for group in groups:
                query_exp_oacl = query_exp_oacl | \
                    Q("term", objectacls__entityId=group.id)
        else:
            query_exp_oacl = Q("term", public_access=100)
        if start_date is not None:
            query_exp = query_exp & Q("range", created_time={'gte': start_date,
                                                             'lte': end_date})
        query_exp = query_exp & query_exp_oacl
        ms = ms.add(Search(index='experiments')
                    .extra(size=MAX_SEARCH_RESULTS, min_score=MIN_CUTOFF_SCORE)
                    .query(query_exp))
    if 'dataset' in index_list:
        query_dataset = Q("match", description=query_text)
        if user.is_authenticated:
            query_dataset_oacl = Q("term", **{'experiments.objectacls.entityId': user.id}) | \
                Q("term", **{'experiments.public_access': 100})
            for group in groups:
                query_dataset_oacl = query_dataset_oacl | \
                    Q("term", **{'experiments.objectacls.entityId': group.id})
        else:
            query_dataset_oacl = Q("term", **{'experiments.public_access': 100})
        if start_date is not None:
            query_dataset = query_dataset & Q("range", created_time={'gte': start_date,
                                                                     'lte': end_date})
        if instrument_list:
            # add instrument query
            query_dataset = query_dataset & Q("terms", **{'instrument.id': instrument_list_id})
        ms = ms.add(Search(index='dataset')
                    .extra(size=MAX_SEARCH_RESULTS, min_score=MIN_CUTOFF_SCORE)
                    .query(query_dataset)
                    .query('nested', path='experiments', query=query_dataset_oacl))
    if 'datafile' in index_list:
        query_datafile = Q("match", filename=query_text)
        if user.is_authenticated:
            query_datafile_oacl = Q("term", experiments__objectacls__entityId=user.id) | \
                Q("term", experiments__public_access=100)
            for group in groups:
                query_datafile_oacl = query_datafile_oacl | \
                    Q("term", experiments__objectacls__entityId=group.id)
        else:
            query_datafile_oacl = Q("term", experiments__public_access=100)
        if start_date is not None:
            query_datafile = query_datafile & Q("range", created_time={'gte': start_date,
                                                                       'lte': end_date})
        query_datafile = query_datafile & query_datafile_oacl
        ms = ms.add(Search(index='datafile')
                    .extra(size=MAX_SEARCH_RESULTS, min_score=MIN_CUTOFF_SCORE)
                    .query(query_datafile))
    result = ms.execute()
    result_dict = {k: [] for k in ["experiments", "datasets", "datafiles"]}
    for item in result:
        for hit in item.hits.hits:
            if hit["_index"] == "dataset":
                result_dict["datasets"].append(hit.to_dict())
            elif hit["_index"] == "experiments":
                result_dict["experiments"].append(hit.to_dict())
            elif hit["_index"] == "datafile":
                result_dict["datafiles"].append(hit.to_dict())
    if bundle.request.method == 'POST':
        bundle.obj = SearchObject(id=1, hits=result_dict)
    return bundle
def get(self, **kwargs):
    """Return a list of results and aggregations based on parameters.

    The list of accepted parameters (with types and default values) is in
    the database and can be accessed with the super_search_fields service.
    """
    # Filter parameters and raise potential errors.
    params = self.get_parameters(**kwargs)

    # Find the indices to use to optimize the elasticsearch query.
    indices = self.get_indices(params['date'])

    # Create and configure the search object.
    search = Search(
        using=self.get_connection(),
        index=indices,
        doc_type=self.config.elasticsearch.elasticsearch_doctype,
    )

    # Create filters.
    filters = None
    histogram_intervals = {}

    for field, sub_params in params.items():
        sub_filters = None
        for param in sub_params:
            if param.name.startswith('_'):
                # By default, all param values are turned into lists,
                # even when they have and can have only one value.
                # For those we know there can only be one value,
                # so we just extract it from the made-up list.
                if param.name == '_results_offset':
                    results_from = param.value[0]
                elif param.name == '_results_number':
                    results_number = param.value[0]
                elif param.name == '_facets_size':
                    facets_size = param.value[0]

                for f in self.histogram_fields:
                    if param.name == '_histogram_interval.%s' % f:
                        histogram_intervals[f] = param.value[0]

                # Don't use meta parameters in the query.
                continue

            field_data = self.all_fields[param.name]

            name = '%s.%s' % (
                field_data['namespace'],
                field_data['in_database_name']
            )

            if param.data_type in ('date', 'datetime'):
                param.value = datetimeutil.date_to_string(param.value)
            elif param.data_type == 'enum':
                param.value = [x.lower() for x in param.value]
            elif param.data_type == 'str' and not param.operator:
                param.value = [x.lower() for x in param.value]

            args = {}
            filter_type = 'term'
            filter_value = None

            if not param.operator:
                # contains one of the terms
                if len(param.value) == 1:
                    val = param.value[0]
                    if not isinstance(val, basestring) or (
                        isinstance(val, basestring) and ' ' not in val
                    ):
                        filter_value = val

                    # If the term contains white spaces, we want to perform
                    # a phrase query. Thus we do nothing here and let this
                    # value be handled later.
                else:
                    filter_type = 'terms'
                    filter_value = param.value
            elif param.operator == '=':
                # is exactly
                if field_data['has_full_version']:
                    name = '%s.full' % name
                filter_value = param.value
            elif param.operator == '>':
                # greater than
                filter_type = 'range'
                filter_value = {
                    'gt': param.value
                }
            elif param.operator == '<':
                # lower than
                filter_type = 'range'
                filter_value = {
                    'lt': param.value
                }
            elif param.operator == '>=':
                # greater than or equal to
                filter_type = 'range'
                filter_value = {
                    'gte': param.value
                }
            elif param.operator == '<=':
                # lower than or equal to
                filter_type = 'range'
                filter_value = {
                    'lte': param.value
                }
            elif param.operator == '__null__':
                # is null
                filter_type = 'missing'
                args['field'] = name

            if filter_value is not None:
                args[name] = filter_value

            if args:
                if param.operator_not:
                    new_filter = ~F(filter_type, **args)
                else:
                    new_filter = F(filter_type, **args)

                if sub_filters is None:
                    sub_filters = new_filter
                elif param.data_type == 'enum':
                    sub_filters |= new_filter
                else:
                    sub_filters &= new_filter

                continue

            # These use a wildcard and thus need to be in a query
            # instead of a filter.
            operator_wildcards = {
                '~': '*%s*',  # contains
                '$': '%s*',   # starts with
                '^': '*%s'    # ends with
            }
            if param.operator in operator_wildcards:
                if field_data['has_full_version']:
                    name = '%s.full' % name

                query_type = 'wildcard'
                args[name] = (
                    operator_wildcards[param.operator] % param.value
                )
            elif not param.operator:
                # This is a phrase that was passed down.
                query_type = 'simple_query_string'
                args['query'] = param.value[0]
                args['fields'] = [name]
                args['default_operator'] = 'and'

            if args:
                query = Q(query_type, **args)
                if param.operator_not:
                    query = ~query
                search = search.query(query)
            else:
                # If we reach this point, that means the operator is
                # not supported, and we should raise an error about that.
                raise NotImplementedError(
                    'Operator %s is not supported' % param.operator
                )

        if filters is None:
            filters = sub_filters
        elif sub_filters is not None:
            filters &= sub_filters

    search = search.filter(filters)

    # Restricting returned fields.
    fields = []
    for param in params['_columns']:
        for value in param.value:
            if not value:
                continue
            field_name = self.get_field_name(value, full=False)
            fields.append(field_name)
    search = search.fields(fields)

    # Sorting.
    sort_fields = []
    for param in params['_sort']:
        for value in param.value:
            if not value:
                continue
            # Values starting with a '-' are sorted in descending order.
            # In order to retrieve the database name of the field, we
            # must first remove the '-' part and add it back later.
            # Example: given ['product', '-version'], the results will be
            # sorted by ascending product and descending version.
            desc = False
            if value.startswith('-'):
                desc = True
                value = value[1:]
            field_name = self.get_field_name(value, full=False)
            if desc:
                # The underlying library understands that '-' means
                # sorting in descending order.
                field_name = '-' + field_name
            sort_fields.append(field_name)
    search = search.sort(*sort_fields)

    # Pagination.
    results_to = results_from + results_number
    search = search[results_from:results_to]

    # Create facets.
    for param in params['_facets']:
        for value in param.value:
            if not value:
                continue
            field_name = self.get_field_name(value)
            search.aggs.bucket(
                value,
                'terms',
                field=field_name,
                size=facets_size,
            )

    # Create signature aggregations.
    if params.get('_aggs.signature'):
        sig_bucket = A(
            'terms',
            field=self.get_field_name('signature'),
            size=facets_size,
        )
        for param in params['_aggs.signature']:
            for value in param.value:
                if not value:
                    continue
                field_name = self.get_field_name(value)
                sig_bucket.bucket(
                    value,
                    'terms',
                    field=field_name,
                    size=facets_size,
                )
        search.aggs.bucket('signature', sig_bucket)

    # Create histograms.
    for f in self.histogram_fields:
        if params.get('_histogram.%s' % f):
            histogram_type = (
                self.all_fields[f]['query_type'] == 'date'
                and 'date_histogram' or 'histogram'
            )
            date_bucket = A(
                histogram_type,
                field=self.get_field_name(f),
                interval=histogram_intervals[f],
            )
            for param in params['_histogram.%s' % f]:
                for value in param.value:
                    if not value:
                        continue
                    field_name = self.get_field_name(value)
                    val_bucket = A(
                        'terms',
                        field=field_name,
                        size=facets_size,
                    )
                    date_bucket.bucket(value, val_bucket)
            search.aggs.bucket('histogram_%s' % f, date_bucket)

    # Query and compute results.
    hits = []

    if params['_return_query'][0].value[0]:
        # Return only the JSON query that would be sent to elasticsearch.
        return {
            'query': search.to_dict(),
            'indices': indices,
        }

    # We call elasticsearch with a computed list of indices, based on
    # the date range. However, if that list contains indices that do not
    # exist in elasticsearch, an error will be raised. We thus want to
    # remove all failing indices until we either have a valid list, or
    # an empty list in which case we return no result.
    while True:
        try:
            results = search.execute()
            for hit in results:
                hits.append(self.format_fields(hit.to_dict()))

            total = search.count()
            aggregations = self.format_aggregations(results.aggregations)
            break  # Yay! Results!
        except NotFoundError as e:
            missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0]
            if missing_index in indices:
                del indices[indices.index(missing_index)]
            else:
                # Wait what? An error caused by an index that was not
                # in the request? That should never happen, but in case
                # it does, better know it.
                raise

            if indices:
                # Update the list of indices and try again.
                # Note: we need to first empty the list of indices before
                # updating it, otherwise the removed indices never get
                # actually removed.
                search = search.index().index(*indices)
            else:
                # There is no index left in the list, return an empty
                # result.
                hits = []
                total = 0
                aggregations = {}
                break
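# Minimal sketch (not from the original source) of the retry pattern used at
# the end of get(): drop any index that elasticsearch reports as missing and
# re-run the search until it succeeds or no indices remain. BAD_INDEX_PATTERN
# is an assumed error format; the original relies on its own BAD_INDEX_REGEX.
import re

from elasticsearch.exceptions import NotFoundError
from elasticsearch_dsl import Search

BAD_INDEX_PATTERN = re.compile(r'no such index \[?([^\]\s]+)\]?')  # assumption

def execute_with_index_pruning(search, indices):
    """Execute `search` against `indices`, pruning indices that do not exist."""
    while indices:
        try:
            # Reset the index list before re-applying it, otherwise removed
            # indices would stick to the Search object.
            return search.index().index(*indices).execute()
        except NotFoundError as e:
            match = BAD_INDEX_PATTERN.search(str(e))
            if not match or match.group(1) not in indices:
                raise  # the failure was not caused by a requested index
            indices.remove(match.group(1))
    return None  # no valid index left, nothing to return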