def requestquery(page=0):
    """ Run the search described by the query string of the current request.

    `page` is the 0-indexed result page. It is only used to derive the
    offset when the parsed settings contain no explicit 'start'.

    Returns the answer from parser.adapt_query. Highlights are cleaned if
    the 'highlight' setting is on, and the mode's format/export function is
    called on the answer if 'format' or 'export' is set.

    Parse failures are raised as eh.KarpQueryError, authentication failures
    as eh.KarpAuthenticationError; eh.KarpException instances pass through.
    """
    query = request.query_string
    auth, permitted = validate_user(mode="read")
    defaults = {'size': 25, 'page': page, 'version': 'true'}
    try:
        settings = parser.make_settings(permitted, defaults)
        elasticq = parser.parse(query, settings)
    except PErr.QueryError as e:
        logging.exception(e)
        raise eh.KarpQueryError('Parse error', debug_msg=e.message,
                                query=query)
    except PErr.AuthenticationError as e:
        logging.exception(e)
        raise eh.KarpAuthenticationError(e.message)
    except eh.KarpException as e:
        # karp's own exceptions are passed on untouched
        logging.exception(e)
        raise
    except Exception as e:
        # anything else is reported as a query problem
        logging.exception(e)
        raise eh.KarpQueryError("Could not parse data", debug_msg=e,
                                query=query)

    mode = settings['mode']
    sort = sortorder(settings, mode, settings.get('query_command', ''))

    # an explicit 'start' takes precedence over the page-derived offset
    if 'start' in settings:
        start = settings['start']
    else:
        start = settings['page'] * settings['size']
    size = settings['size']
    index, typ = configM.get_mode_index(mode)

    # callers for whom validate_user returned a falsy auth value do not
    # get the mode's secret fields
    if auth:
        exclude = []
    else:
        exclude = configM.searchfield(mode, 'secret_fields')

    es = configM.elastic(mode=mode)
    body = loads(elasticq)
    search_kwargs = {
        'size': size,
        'sort': sort,
        'from_': start,
        'index': index,
        '_source_exclude': exclude,
        'version': settings['version'],
        'search_type': 'dfs_query_then_fetch'
    }
    ans = parser.adapt_query(size, start, es, body, search_kwargs)

    if settings.get('highlight', False):
        clean_highlight(ans)

    if settings.get('format') or settings.get('export'):
        if 'format' in settings:
            formatmethod = 'format'
        else:
            formatmethod = 'export'
        toformat = settings.get(formatmethod)
        msg = 'Unkown %s %s for mode %s' % (formatmethod, toformat, mode)
        # falls back to helpers.notdefined(msg) when the mode has no such
        # function configured
        format_posts = configM.extra_src(mode, formatmethod,
                                         helpers.notdefined(msg))
        format_posts(ans, configM.elastic(mode=mode), mode, index, toformat)
    return ans
def publish_group(group, suffix):
    """ Point the aliases of `group` (and its parents) at the `suffix` index.

    A group that is not itself an index is published by recursing into its
    subgroups. For an index, the old `<group>_*` aliases are removed first
    (failures are reported and ignored) and the new aliases are then added.
    Returns the answer of the final update_aliases call for an index, and
    None for a non-index group.
    """
    # TODO for some reason, this sometimes removes siblings
    # of the group from an mode. Eg. publish_group(saldo)
    # may lead to 'saldogroup' not containing 'external'.
    es = configM.elastic(group)
    if not configM.searchconfig.get(group)['is_index']:
        for subgroup in configM.searchconfig.get(group)['groups']:
            publish_group(subgroup, suffix)
        return None

    name = make_indexname(group, suffix)
    print("Publish %s as %s" % (name, group))

    parents = list(make_parents(group))
    add_actions = ['{"add" : {"index": "%s", "alias": "%s"}}' % (name, parent)
                   for parent in parents]
    rem_actions = ['{"remove": {"index":"%s_*", "alias":"%s"}}'
                   % (group, parent)
                   for parent in parents]
    print('remove %s' % (rem_actions,))
    print('add %s' % (add_actions,))

    removal = '{"actions" : [%s]}' % ','.join(rem_actions)
    try:
        print('remove old aliases')
        es.indices.update_aliases(removal)
    except Exception:
        # best effort: a first-time publish has nothing to remove
        print('No previous aliased indices, could not do remove any')
        print(rem_actions)

    addition = '{"actions" : [%s]}' % ','.join(add_actions)
    return es.indices.update_aliases(addition)
def create_mode(alias, suffix, with_id=False):
    """ Create and populate every index belonging to the mode `alias`.

    alias:   key into configM.searchconfig; either an index itself or a
             group whose 'groups' entry lists the indices to create.
    suffix:  passed to make_indexname to build each new index name.
    with_id: forwarded to load().

    Raises Exception('Index already exists') when Elasticsearch refuses to
    create an index. If loading fails, the newly created index is deleted
    and the original exception is re-raised.
    """
    es = configM.elastic(alias)
    if configM.searchconfig[alias]['is_index']:
        to_create = [alias]
    else:
        to_create = configM.searchconfig.get(alias)['groups']

    typ = configM.searchconfig[alias]['type']
    for index in to_create:
        data = get_mapping(index)
        newname = make_indexname(index, suffix)
        try:
            ans = es.indices.create(index=newname, body=data)
        except esExceptions.TransportError as e:
            print(e)
            raise Exception('Index already exists')
        print(ans)
        try:
            lexicons = configM.get_lexiconlist(index)
            load(lexicons, newname, typ, es, with_id=with_id)
        except Exception:
            # delete the index if things did not go well
            ans = es.indices.delete(newname)
            print('Any documentes uploaded to ES index %s are removed.'
                  % newname)
            # adjacent literals instead of a backslash continuation, so the
            # source indentation does not end up inside the message
            print('If data was uploaded to SQL you will have to '
                  'remove it manually.')
            raise
def add_lexicon(to_add_name, to_add_file, alias, suffix):
    """ Create a new index for `alias` and upload one lexicon file into it.

    to_add_name: name of the lexicon, forwarded to upload().
    to_add_file: path of the file whose contents are uploaded as 'json'.
    alias:       mode whose mapping, type and sql setting are used.
    suffix:      passed to make_indexname to build the new index name.

    Any exception from index creation is re-raised after a hint is printed.
    If the upload or the final reindex fails, the new index is deleted and
    the exception is re-raised. Data already written to SQL is not removed.
    """
    es = configM.elastic(alias)
    data = get_mapping(alias)
    indexname = make_indexname(alias, suffix)
    typ = configM.searchconfig[alias]['type']
    try:
        ans = es.indices.create(index=indexname, body=data)
        print(ans)
    except Exception:
        print('Could not create index. Check if it needs manual removal')
        raise

    try:
        # context manager so the file handle is closed even if read() fails
        with open(to_add_file, 'r') as infile:
            inpdata = infile.read()
        sql = configM.get_mode_sql(alias)
        upload('json', to_add_name, '', inpdata, es, indexname, typ, sql=sql)
        # do this last to get the avoid version conflicts (in case some user
        # is updating the index during this process)
        reindex_alias(alias, suffix, create_index=False)
    except Exception:
        # delete the index if things did not go well
        ans = es.indices.delete(indexname)
        print(ans)
        print('Any documentes uploaded to ES index %s are removed.'
              % indexname)
        # adjacent literals instead of a backslash continuation, so the
        # source indentation does not end up inside the message
        print('If data was uploaded to SQL you will have to '
              'remove it manually.')
        raise
def internalize_lexicon(mode, to_add):
    """ Reads all entries from the lexicons specified in to_add
        and puts them in sql. They may then be edited.
        The ES index is not effected and this operation may hence
        be run at any time

    mode:   used both to pick the ES client and as the index to scan.
    to_add: iterable of lexicon names.

    Raises Exception with the database error if db.update_bulk reports one,
    and Exception("No data") if no document at all was loaded.
    """
    ok = 0
    es = configM.elastic(mode)
    for lex in to_add:
        print('Internalize %s' % (lex,))
        # Go through each lexicon separately
        query = {"query": {"term": {"lexiconName": lex}}}
        # scan and scroll
        ans = helpers.scan(es, query=query, scroll=u'3m',
                           raise_on_error=True, preserve_order=False,
                           index=mode)
        sql_bulk = []
        for hit in ans:  # ans is an iterator of objects from in hits.hits
            _id = hit.get('_id')
            source = hit.get('_source')
            if isinstance(source, dict):
                source = json.dumps(source)
            sql_bulk.append((_id, source, 'admin',
                             'entry automatically added or reloaded',
                             lex, 'imported'))

        db_loaded, db_error = db.update_bulk(lex, sql_bulk)
        if db_error:
            raise Exception(db_error)
        ok += db_loaded

    if not ok:
        # the warning used to sit after the raise and was never printed
        sys.stderr.write("Warning. 0 documents uploaded\n\n")
        raise Exception("No data")
    print("Ok. %s documents loaded to sql\n" % ok)
def create_sequence_index(index_name='', start=''):
    """ Make sure the "sequence" index exists and optionally seed a counter.

    index_name: used to pick the ES client and as the id of the sequence
                document.
    start:      if truthy, a bulk index request with this value as external
                version is sent for the sequence document.

    Returns the answer of the bulk request when `start` is given,
    otherwise None.
    """
    es = configM.elastic(mode=index_name)
    sequence_index = Index("sequence", using=es)
    if sequence_index.exists():
        logging.debug('sequence id %s already exists' % index_name)
    else:
        logging.debug('create sequence id %s starting at %s' %
                      (index_name, start or 0))
        sequence_index.settings(
            number_of_shards=1,
            number_of_replicas=0
        )
        sequence_index.create()

        m = Mapping("sequence")
        m.meta("_all", enabled=False)
        m.meta("_source", enabled=False)
        m.save("sequence", using=es)

    if not start:
        # Nothing to seed. Returning explicitly keeps this path from ever
        # touching `result`, which is only bound below.
        return None

    tasks = ('{"index": {"_index": "sequence", "_type": "sequence", '
             '"_id": "%s", "version": "%s", "version_type": "external"}}'
             '\n{}\n' % (index_name, start))
    result = es.bulk(body=tasks)
    logging.debug('sequence id starting at %s: %s' % (start, result))
    return result
def recover_add(index, suffix, lexicon):
    """ Recovers the data to ES, uses SQL as the trusted base version.
        Find the last version of every SQL entry and send this to ES.
        Adds the specified lexicons to an existing index

    index:   used to pick the ES client and as the target index.
    suffix:  not used by this function.
    lexicon: iterable of lexicon names to read from SQL.

    Raises Exception if helpers.bulk reports errors.
    """
    # TODO test this
    import src.server.translator.bulkify as bulk
    es = configM.elastic(index)
    print('Save %s to %s' % (lexicon, index))
    to_keep = {}
    for lex in lexicon:
        engine, db_entry = db.get_engine(lex, echo=False)
        for entry in db.dbselect(lex, engine=engine, db_entry=db_entry,
                                 max_hits=-1):
            _id = entry['id']
            if not _id:
                # don't add things without id, they are errors
                continue
            previous = to_keep.get(_id)
            # keep only the entry with the latest date for every id
            if previous is None or previous['date'] < entry['date']:
                to_keep[_id] = entry

    print('%s entries to keep' % len(to_keep))
    data = bulk.bulkify_sql(to_keep, bulk_info={'index': index})
    ok, err = helpers.bulk(es, data)
    if err:
        # adjacent literals instead of a backslash continuation, so the
        # source indentation does not end up inside the message
        msg = ("Error during upload. %s documents successfully uploaded. "
               "Message: %s.\n")
        # error items are not guaranteed to be strings; format each one
        # before joining
        details = '\n'.join('%s' % (item,) for item in err)
        raise Exception(msg % (ok, details))
    print('recovery done')
def reindex_help(alias, source_index, target_index, create_index=True):
    """ Copy all documents from `source_index` to `target_index`.

    When `create_index` is true the target index is first created with the
    mapping of `alias`. The answers from Elasticsearch are printed; nothing
    is returned.
    """
    print('Reindex from %s to %s' % (source_index, target_index))
    client = configM.elastic(alias)
    if create_index:
        print('create %s' % target_index)
        mapping = get_mapping(alias)
        created = client.indices.create(index=target_index, body=mapping)
        print('Created index %s' % (created,))
    # TODO when elasticsearch is updated to >=2.3: use es.reindex instead
    outcome = helpers.reindex(client, source_index, target_index)
    print(outcome)
def querycount(page=0):
    """ Answer the query and add the per-lexicon distribution of its hits.

    Returns a jsonify'd dict with 'query' (the answer of requestquery for
    `page`) and 'distribution' (the 'lexiconOrder' buckets of the
    statistics aggregation).

    Elasticsearch request/transport errors are raised as
    eh.KarpElasticSearchError, other failures as eh.KarpQueryError;
    eh.KarpException instances pass through.
    """
    # TODO error if buckets is used here
    # TODO validate_user is also done once in requestquery
    # but since we need the permitted dict, it is called
    # here as well
    auth, permitted = validate_user(mode="read")
    try:
        # TODO buckets should be gathered from some config
        defaults = {
            "buckets": ['lexiconOrder', 'lexiconName'],
            "size": configM.setupconfig['MAX_PAGE']
        }
        settings = parser.make_settings(permitted, defaults)
        q_ans = requestquery(page=page)

        # TODO does search_type=count work with the new es version?
        # if not, use query_then_fetch, size=0

        # the statistics call uses the configured maximum as its size
        stat_size = configM.setupconfig['MAX_PAGE']
        lexicon_order = {"lexiconOrder": ("_term", "asc")}
        count_elasticq, more = parser.statistics(request.query_string,
                                                 settings,
                                                 order=lexicon_order,
                                                 show_missing=False,
                                                 force_size=stat_size)
        mode = settings['mode']
        es = configM.elastic(mode=mode)
        index, typ = configM.get_mode_index(mode)
        count_ans = es.search(index=index,
                              body=loads(count_elasticq),
                              search_type="count",
                              size=stat_size)
        statistics_agg = count_ans['aggregations']['q_statistics']
        distribution = statistics_agg['lexiconOrder']['buckets']
    except eh.KarpException as e:
        # karp's own exceptions are passed on untouched
        logging.exception(e)
        raise
    except (elasticsearch.RequestError, elasticsearch.TransportError) as e:
        logging.exception(e)
        raise eh.KarpElasticSearchError(
            "ElasticSearch failure. Message: %s.\n" % e)
    except Exception as e:
        # everything else; remember that 'buckets' is not allowed here
        logging.exception(e)
        raise eh.KarpQueryError("Could not parse data", debug_msg=e,
                                query=request.query_string)

    return jsonify({'query': q_ans, 'distribution': distribution})
def statlist():
    """ Return the statistics of the query as a flat table.

    The 'STAT_'-prefixed aggregations of the answer are turned into rows
    with generate_table and truncated to the 'size' setting. Returns a
    jsonify'd dict with 'stat_table' and 'is_more'.

    Authentication failures are raised as eh.KarpAuthenticationError, other
    failures as eh.KarpGeneralError; eh.KarpException instances pass
    through.
    """
    # TODO add is_more here (as above)
    auth, permitted = validate_user(mode="read")
    try:
        query = request.query_string
        mode = parser.get_mode(query)
        logging.debug('mode is %s' % mode)
        defaults = {
            "buckets": configM.searchfield(mode, 'statistics_buckets'),
            "size": 100,
            "cardinality": False
        }
        settings = parser.make_settings(permitted, defaults)

        # secret fields are excluded when validate_user returned a falsy
        # auth value
        if auth:
            exclude = []
        else:
            exclude = configM.searchfield(mode, 'secret_fields')

        elasticq, more = parser.statistics(query, settings, exclude=exclude,
                                           prefix='STAT_')
        es = configM.elastic(mode=settings['mode'])
        is_more = check_bucketsize(more, settings["size"], mode, es)
        # TODO allow more than 100 000 hits here?
        size = settings['size']
        index, typ = configM.get_mode_index(settings['mode'])
        ans = es.search(index=index, body=loads(elasticq),
                        search_type="count", size=size)

        tables = []
        for key, val in ans['aggregations']['q_statistics'].items():
            if not key.startswith('STAT_'):
                continue
            tables.extend(generate_table(val, []))

        # the length of tables might be longer than size, so truncate it
        # generating shorter tables is not faster than generating all of it
        # and then truncating
        if size:
            tables = tables[:size]

        return jsonify({"stat_table": tables, "is_more": is_more})

    except eh.KarpException as e:
        # karp's own exceptions are passed on untouched
        logging.exception(e)
        raise
    except PErr.AuthenticationError as e:
        logging.exception(e)
        raise eh.KarpAuthenticationError(e.message)
    except Exception as e:
        # everything else is reported as a general error
        logging.exception(e)
        raise eh.KarpGeneralError("Unknown error", debug_msg=e, query=query)
def minientry():
    """ Answer the query, returning only the mode's mini-entry fields.

    The fields shown default to the mode's 'minientry_fields'. The mode's
    'secret_fields' are removed from them when validate_user returned a
    falsy auth value. The size sent to Elasticsearch is capped at the
    configured MINIENTRY_PAGE. Returns the jsonify'd answer.

    Authentication failures are raised as eh.KarpAuthenticationError, parse
    failures as eh.KarpQueryError, other failures as eh.KarpGeneralError;
    eh.KarpException instances pass through.
    """
    max_page = configM.setupconfig['MINIENTRY_PAGE']
    auth, permitted = validate_user(mode="read")
    try:
        query = request.query_string
        mode = parser.get_mode(query)
        defaults = {
            'show': configM.searchfield(mode, 'minientry_fields'),
            'size': 25
        }
        settings = parser.make_settings(permitted, defaults)
        elasticq = parser.parse(query, settings)

        show = settings['show']
        if not auth:
            # show = show - exclude
            secret = configM.searchfield(mode, 'secret_fields')
            show = list(set(show).difference(secret))

        sort = sortorder(settings, mode, settings.get('query_command', ''))
        start = settings.get('start', 0)
        es = configM.elastic(mode=settings['mode'])
        index, typ = configM.get_mode_index(settings['mode'])
        body = loads(elasticq)
        search_kwargs = {
            'index': index,
            '_source': show,
            'from_': start,
            'sort': sort,
            'size': min(settings['size'], max_page),
            'search_type': 'dfs_query_then_fetch'
        }
        ans = parser.adapt_query(settings['size'], start, es, body,
                                 search_kwargs)
        if settings.get('highlight', False):
            clean_highlight(ans)

        return jsonify(ans)

    except PErr.AuthenticationError as e:
        logging.exception(e)
        raise eh.KarpAuthenticationError(e.message)
    except PErr.QueryError as e:
        raise eh.KarpQueryError("Parse error, %s" % e.message, debug_msg=e,
                                query=query)
    except eh.KarpException as e:
        # karp's own exceptions are passed on untouched
        logging.exception(e)
        raise
    except Exception as e:
        # everything else is reported as a general error
        logging.exception(e)
        raise eh.KarpGeneralError("Unknown error", debug_msg=e, query=query)
def get_update_index(lexicon, suggestion=False):
    """ Look up where updates for `lexicon` should be written.

    Returns a tuple (es client, index, type). The suggestion index is used
    when `suggestion` is true, otherwise the lexicon's ordinary index.
    Any failure is raised as eh.KarpElasticSearchError.
    """
    # bound up front so the error message below can always be built
    index = ''
    try:
        if suggestion:
            lookup = configM.get_lexicon_suggindex
        else:
            lookup = configM.get_lexicon_index
        index, typ = lookup(lexicon)
        client = configM.elastic(lexicon=lexicon)
        return client, index, typ
    except Exception as e:
        logging.exception(e)
        msg = "No writable mode for lexicon %s was found" % lexicon
        raise eh.KarpElasticSearchError(msg,
                                        debug_msg=msg + ", index: " + index)
def recover(alias, suffix, lexicon, create_new=True):
    """ Recovers the data to ES, uses SQL as the trusted base version.
        Find the last version of every SQL entry and send this to ES.

    alias:      mode whose mapping, type and ES client are used.
    suffix:     passed to make_indexname to build the target index name.
    lexicon:    iterable of lexicon names to read from SQL.
    create_new: when true, the target index is created first.

    Raises Exception if helpers.bulk reports errors; exceptions raised by
    helpers.bulk itself are re-raised after the bulk data is printed.
    """
    # TODO test this
    import src.server.translator.bulkify as bulk
    to_keep = {}
    # if not lexicon:
    #     lexicon = conf.keys()
    mapping = get_mapping(alias)
    index = make_indexname(alias, suffix)
    typ = configM.get_mode_type(alias)
    print('Save %s to %s' % (lexicon or 'all', index))

    es = configM.elastic(alias)
    if create_new:
        # Create the index
        ans = es.indices.create(index=index, body=mapping)
        print(ans)

    for lex in lexicon:
        engine, db_entry = db.get_engine(lex, echo=False)
        for entry in db.dbselect(lex, engine=engine, db_entry=db_entry,
                                 max_hits=-1):
            _id = entry['id']
            if not _id:
                # don't add things without id, they are errors
                continue
            previous = to_keep.get(_id)
            # keep only the entry with the latest date for every id
            if previous is None or previous['date'] < entry['date']:
                to_keep[_id] = entry

    print('%s entries to keep' % len(to_keep))
    data = bulk.bulkify_sql(to_keep, bulk_info={'index': index, 'type': typ})
    try:
        ok, err = helpers.bulk(es, data)
    except Exception:
        # Show what was being sent, then let the real error propagate.
        # Falling through would hit `err` while it is still unbound.
        print(data)
        raise
    if err:
        # adjacent literals instead of a backslash continuation, so the
        # source indentation does not end up inside the message
        msg = ("Error during upload. %s documents successfully uploaded. "
               "Message: %s.\n")
        # error items are not guaranteed to be strings; format each one
        # before joining
        details = '\n'.join('%s' % (item,) for item in err)
        raise Exception(msg % (ok, details))
    print('recovery done')
def explain():
    """ Return the query, its answer and Elasticsearch's explanation of it.

    The jsonify'd result has the keys 'elastic_json_query' (the parsed
    query), 'ans' (the answer of requestquery for page 0) and 'explain'
    (the answer of indices.validate_query with explain=True).
    Parse failures are raised as eh.KarpQueryError.
    """
    query = request.query_string
    auth, permitted = validate_user(mode="read")
    defaults = {'size': 25, 'page': 0}
    try:
        settings = parser.make_settings(permitted, defaults)
        elasticq = parser.parse(query, settings)
    except PErr.QueryError as e:
        raise eh.KarpQueryError("Parse error", debug_msg=e, query=query)

    mode = settings['mode']
    es = configM.elastic(mode=mode)
    index, typ = configM.get_mode_index(mode)
    explanation = es.indices.validate_query(index=index,
                                            body=loads(elasticq),
                                            explain=True)
    answer = requestquery(page=0)
    payload = {
        'elastic_json_query': loads(elasticq),
        'ans': answer,
        'explain': explanation
    }
    return jsonify(payload)
def savesuggestion(lexicon, _id, status='accepted', source=''):
    """ Accept the suggestion `_id` for `lexicon` and store it as an entry.

    The suggestion is fetched from the suggestion index. It (or `source`,
    if given) is then used to update the original entry when the suggestion
    has an 'origid', or is added as a new entry otherwise. Afterwards the
    suggestion is marked with `status` in the database, removed from the
    suggestion index, and the suggesting user is notified.

    Returns the answer from the update/add call, extended with
    'sugg_db_loaded' and 'sugg_es_ans'.
    Raises eh.KarpAuthenticationError if the user may not update `lexicon`.
    """
    from dbhandler.dbhandler import dbselect
    sugg_index, typ = configM.get_lexicon_suggindex(lexicon)
    es = configM.elastic(lexicon=lexicon)
    suggestion = es.get(index=sugg_index, id=_id)
    auth, permitted = validate_user()
    set_lexicon = suggestion["_source"]["lexiconName"]
    # NOTE(review): update_doc calls check_lexiconName with (_id, action) as
    # the last two arguments; here they are (action, _id) and the action
    # reads 'rejectsuggestion' -- confirm against the helper's signature.
    helpers.check_lexiconName(lexicon, set_lexicon, 'rejectsuggestion', _id)
    if lexicon not in permitted:
        raise eh.KarpAuthenticationError(
            'You are not allowed to update lexicon %s' % lexicon)

    rows = dbselect(lexicon, suggestion=True, _id=_id, max_hits=1)
    origin = rows[0]
    origid = origin['origid']

    request.get_data()
    posted = loads(request.data)
    message = posted.get('message')
    suggestion['message'] = message
    suggestion['version'] = origin['version']
    # without an explicit source, the stored suggestion itself is saved
    source = source or suggestion

    # the user log in is checked in add_doc
    # add_doc raises exception if ES
    if origid:
        # the suggestion concerns an existing entry: update it in ES
        ans = update.update_doc(lexicon, origid, data=source, live=False)
    else:
        # the suggestion is a new entry: add it to ES
        ans = update.add_doc(lexicon, live=False, data=source)
        origid = ans.get('_id')

    # mark as accepted
    ok, err = update.modify_db(_id, lexicon, message, status, origid=origid)
    # delete from suggestion index
    suggans = update.delete_entry(lexicon, _id, sql=False, live=False,
                                  suggestion=True)
    ans['sugg_db_loaded'] = ok
    ans['sugg_es_ans'] = suggans
    if not ok:
        logging.debug(err)
    update.send_notification(origin['user'], message, _id, status)
    return ans
def random():
    """ Return entries picked by the query built by parser.random.

    By default one entry is returned, showing the mode's mini-entry fields.
    The mode's 'secret_fields' are removed from the shown fields when
    validate_user returned a falsy auth value. Returns the jsonify'd search
    answer.

    Authentication failures are raised as eh.KarpAuthenticationError, other
    failures as eh.KarpGeneralError; eh.KarpException instances pass
    through.
    """
    auth, permitted = validate_user(mode="read")
    try:
        query = request.query_string
        mode = parser.get_mode(query)
        defaults = {
            "show": configM.searchfield(mode, 'minientry_fields'),
            "size": 1
        }
        settings = parser.make_settings(permitted, defaults)
        elasticq = parser.random(query, settings)
        es = configM.elastic(mode=mode)
        index, typ = configM.get_mode_index(mode)
        search_kwargs = {
            'index': index,
            'body': loads(elasticq),
            'size': settings['size']
        }

        show = settings['show']
        if show:
            if not auth:
                # show = show - exclude
                secret = configM.searchfield(mode, 'secret_fields')
                show = list(set(show).difference(secret))
            search_kwargs['_source'] = show

        return jsonify(es.search(**search_kwargs))

    except PErr.AuthenticationError as e:
        logging.exception(e)
        raise eh.KarpAuthenticationError(e.message)
    except eh.KarpException as e:
        # karp's own exceptions are passed on untouched
        logging.exception(e)
        raise
    except Exception as e:
        # everything else is reported as a general error
        logging.exception(e)
        raise eh.KarpGeneralError("Unknown error", debug_msg=e, query=query)
def formatpost():
    """ Format the posted data into the format named in the query string.

    The posted body must be JSON; a single object is wrapped in a list.
    The mode's 'format_list' function does the formatting. Returns a
    jsonify'd dict with 'all' (number of posted items), 'ok' and 'data'.

    Raises eh.KarpParsingError if the body is not valid JSON and
    eh.KarpQueryError if no format is given.
    """
    # read and decode the posted body
    request.get_data()
    raw = request.data
    try:
        data = loads(raw)
    except ValueError as e:
        raise eh.KarpParsingError(str(e))

    # set all allowed lexicons (to avoid authentication exception)
    auth, permitted = validate_user(mode="read")

    # find the wanted format
    settings = parser.make_settings(permitted, {'size': 25})
    query = request.query_string
    parsed = parser.parse_qs(query)
    parser.parse_extra(parsed, settings)
    to_format = settings.get('format', '')
    mode = parser.get_mode(query)
    logging.debug('mode "%s"' % mode)
    index, typ = configM.get_mode_index(mode)

    if not to_format:
        raise eh.KarpQueryError('Unkown format %s' % to_format)

    if type(data) is not list:
        data = [data]
    errmsg = 'Unkown format %s for mode %s' % (settings['format'], mode)
    # falls back to helpers.notdefined(errmsg) when the mode has no
    # 'format_list' function configured
    format_list = configM.extra_src(mode, 'format_list',
                                    helpers.notdefined(errmsg))
    ok, html = format_list(data, configM.elastic(mode=mode),
                           settings['format'], index)
    return jsonify({'all': len(data), 'ok': ok, 'data': html})
def statistics():
    """ Return the raw statistics (aggregations) for the query.

    The search answer is returned jsonify'd, with the result of
    check_bucketsize added under 'is_more'.

    Authentication failures are raised as eh.KarpAuthenticationError, other
    failures as eh.KarpGeneralError; eh.KarpException instances pass
    through.
    """
    auth, permitted = validate_user(mode="read")
    try:
        query = request.query_string
        mode = parser.get_mode(query)
        defaults = {
            "buckets": configM.searchfield(mode, 'statistics_buckets'),
            "size": 100,
            "cardinality": False
        }
        settings = parser.make_settings(permitted, defaults)

        # secret fields are excluded when validate_user returned a falsy
        # auth value
        if auth:
            exclude = []
        else:
            exclude = configM.searchfield(mode, 'secret_fields')

        elasticq, more = parser.statistics(query, settings, exclude=exclude)
        es = configM.elastic(mode=settings['mode'])
        # NOTE(review): statlist passes settings["size"] as the second
        # argument here, this passes the whole settings dict -- confirm
        # which one check_bucketsize expects.
        is_more = check_bucketsize(more, settings, mode, es)

        index, typ = configM.get_mode_index(settings['mode'])
        # TODO allow more than 100 000 hits here?
        ans = es.search(index=index, body=loads(elasticq),
                        search_type="count", size=settings['size'])
        ans["is_more"] = is_more
        return jsonify(ans)

    except PErr.AuthenticationError as e:
        logging.exception(e)
        raise eh.KarpAuthenticationError(e.message)
    except eh.KarpException as e:
        # karp's own exceptions are passed on untouched
        logging.exception(e)
        raise
    except Exception as e:
        # everything else is reported as a general error
        logging.exception(e)
        raise eh.KarpGeneralError("Unknown error", debug_msg=e, query=query)
def reset_sequence(index_name):
    """ Delete the sequence document of `index_name`, if there is one. """
    client = configM.elastic(mode=index_name)
    target = {'index': "sequence", 'doc_type': "sequence", 'id': index_name}
    try:
        client.delete(**target)
    except elasticsearch.exceptions.NotFoundError:
        # no sequence document stored for this index: nothing to reset
        return
def update_doc(lexicon, _id, data=None, live=True):
    """ Updates a posted document in the index 'index' with type 'typ'.
        The document must contain a field called 'doc' with
        the information to be sent.
        The fields 'version' and 'message' are optional.

    lexicon: lexicon the entry belongs to; must be in the user's permitted
             lexicons.
    _id:     id of the entry to update.
    data:    the posted data; read with helpers.read_data() when None.
    live:    when true the answer is returned jsonify'd, otherwise as a
             plain dict.

    Returns a dict (or its jsonify'd form) with 'es_loaded', 'sql_loaded'
    and 'es_ans'.
    Raises eh.KarpElasticSearchError if the entry cannot be found or the
    update fails, and eh.KarpAuthenticationError if the user may not modify
    the lexicon.
    """
    # send user name and password,
    # {'doc' : es_doc, 'version' : last version, 'message' : update message}
    authdict, permitted = auth.validate_user()

    if data is None:
        data = helpers.read_data()

    # bound up front: the handler below puts the index in its debug message
    # and would otherwise fail if the index lookup itself is what raised
    index = ''
    try:
        index, typ = configM.get_lexicon_index(lexicon)
        es = configM.elastic(lexicon=lexicon)
        origin = es.get(index=index, id=_id)
    except Exception as e:
        logging.warning("Looking for entry at the wrong place:")
        logging.exception(e)
        msg = "The entry %s in lexicon %s was not found" % (_id, lexicon)
        raise eh.KarpElasticSearchError(msg,
                                        debug_msg=msg + " in index " + index)

    lexiconName = origin['_source']['lexiconName']
    helpers.check_lexiconName(lexicon, lexiconName, _id, 'update')
    data_doc = data.get('doc') or data.get('_source')
    version = data.get('version')
    # NOTE(review): the docstring calls 'message' optional, but a missing
    # key raises KeyError here -- confirm the intended contract.
    msg = data["message"]

    if lexicon not in permitted:
        raise eh.KarpAuthenticationError('You are not allowed to modify the '
                                         'lexicon %s, only %s'
                                         % (lexicon, permitted),
                                         status_code=403)

    # TODO validate data_doc, but this is so far sb specific!
    validate.validate_json(data_doc, lexicon)
    date = datetime.datetime.now()
    user = helpers.get_user()
    auto_update_document(data_doc, lexicon, 'update', user, date)

    index_kwargs = {'index': index, 'doc_type': typ, 'id': _id,
                    'body': data_doc, 'op_type': 'index'}
    # a version is only sent when the caller supplied a real one
    if version is not None and version != -1:
        index_kwargs['version'] = version
    try:
        ans = es.index(**index_kwargs)
    except (esExceptions.RequestError, esExceptions.TransportError) as e:
        # Transport error might be version conflict
        logging.exception(e)
        logging.debug('index: %s, type: %s, id: %s' % (index, typ, _id))
        handle_update_error(e, {"id": _id, "data": data}, user, 'update')
        raise eh.KarpElasticSearchError("Error during update. Message: %s.\n"
                                        % str(e))
    except Exception as e:
        handle_update_error(e, {"id": _id, "data": data}, user, 'update')
        raise eh.KarpElasticSearchError("Unexpected error during update.")

    db_loaded, db_error = update_db(_id, data_doc, user, msg, lexiconName,
                                    status='changed', date=date)

    jsonans = {'es_loaded': 1, 'sql_loaded': db_loaded, 'es_ans': ans}
    if db_error:
        logging.debug(db_error)
    if live:
        return jsonify(jsonans)
    return jsonans
def get_id_sequence(index_name, size):
    """ Yield `size` version numbers from the sequence document.

    The sequence document of `index_name` is indexed `size` times in one
    bulk request. The '_version' reported for each of those operations is
    yielded in turn.
    """
    action = ('{"index": {"_index": "sequence", "_type": "sequence", "_id": "'
              + index_name + '"}}\n{}\n')
    # one identical action per requested id
    tasks = action * size
    client = configM.elastic(mode=index_name)
    response = client.bulk(body=tasks)
    for item in response['items']:
        yield item["index"]["_version"]
def autocomplete():
    """ Returns lemgrams matching the query text.
        Each mode specifies in the configs which fields that should be
        considered.
        The parameter 'q' or 'query' is used when only one word form is to be
        processed.
        The parameter 'multi' is used when multiple word forms should be
        processed.
        The format of result depends on which flag that is set.

    With 'multi' the jsonify'd result maps every word form to its answer;
    otherwise it is the answer of the last word form processed.

    Authentication failures are raised as eh.KarpAuthenticationError, other
    failures as eh.KarpGeneralError; eh.KarpException instances pass
    through.
    """
    auth, permitted = validate_user(mode="read")
    query = request.query_string

    try:
        settings = parser.make_settings(permitted, {'size': 1000})
        parsed = parser.parse_qs(query)
        mode = parser.get_mode(query)
        p_extra = parser.parse_extra(parsed, settings)

        wordforms = parsed.get('q', []) or parsed.get('query', [])
        multi = not wordforms
        if multi:
            # no single word form given: look for several to complete
            wordforms = settings.get('multi', [])
            logging.debug('qs %s' % wordforms)

        # use utf8, escape '"'
        wordforms = [w.decode('utf8').replace('"', '\\"') for w in wordforms]

        headboost = configM.searchfield(mode, 'boosts')[0]
        res = {}
        ans = {}
        # if multi is not true, only one iteration of this loop will be done
        for q in wordforms:
            boost = ('''"functions": [{"boost_factor" : "500", "filter":{"term":{"%s":"%s"}}}]'''
                     % (headboost, q))

            # the mode may bring its own query builder; autocompletequery
            # is the fallback
            autocompleteq = configM.extra_src(mode, 'autocomplete',
                                              autocompletequery)
            exp = autocompleteq(mode, boost, q)
            autocomplete_field = configM.searchonefield(mode,
                                                        'autocomplete_field')
            fields = ['"exists": {"field" : "%s"}' % autocomplete_field]
            # last argument is the 'fields' used for highlightning
            # TODO use filter?
            elasticq = parser.search([exp] + p_extra, fields, '',
                                     usefilter=True)

            es = configM.elastic(mode=mode)
            logging.debug('_source: %s' % autocomplete_field)
            logging.debug(elasticq)
            index, typ = configM.get_mode_index(mode)
            search_kwargs = {
                'size': settings['size'],
                'index': index,
                '_source': autocomplete_field
            }
            ans = parser.adapt_query(settings['size'], 0, es,
                                     loads(elasticq), search_kwargs)
            # save the results for multi
            res[q] = ans

        if multi:
            return jsonify(res)
        # single querys: only return the latest answer
        return jsonify(ans)

    except PErr.AuthenticationError as e:
        logging.exception(e)
        raise eh.KarpAuthenticationError(e.message)
    except eh.KarpException as e:
        # karp's own exceptions are passed on untouched
        logging.exception(e)
        raise
    except Exception as e:
        # everything else is reported as a general error
        logging.exception(e)
        raise eh.KarpGeneralError("Unknown error", debug_msg=e, query=query)
def get_context(lexicon):
    """ Find and return the alphabetically (or similar, as specified for the
    lexicon) context of a word/entry.

    The center entry is the one whose id is given in the 'center' setting,
    or the first entry of the lexicon (in sort order) when no center is
    given. An optional 'q' setting restricts the context entries.

    lexicon: name of the lexicon to search; must be in the user's permitted
             lexicons.

    Returns a jsonify'd dict with 'pre' and 'post' (at most 'size' entries
    each) and 'center'.
    Raises eh.KarpAuthenticationError if the lexicon is not permitted and
    eh.KarpElasticSearchError if no center entry can be found.
    """
    auth, permitted = validate_user(mode="read")
    if lexicon not in permitted:
        raise eh.KarpAuthenticationError('You are not allowed to search the '
                                         'lexicon %s' % lexicon)
    # make default settings
    settings = parser.make_settings(permitted, {
        "size": 10,
        "resource": lexicon
    })
    # parse querystring
    query = request.query_string
    parsed = parser.parse_qs(query)
    # parse parameter settings
    parser.parse_extra(parsed, settings)

    # set searching configurations
    mode = configM.get_lexicon_mode(lexicon)
    settings['mode'] = mode
    es = configM.elastic(mode=mode)
    index, typ = configM.get_mode_index(mode)

    # get the sort_by list (eg. ['baseform.sort', 'lemmaid.search'])
    # leave out lexiconOrder and _score
    sortfieldnames = [
        field for field in configM.searchconf(mode, 'sort_by')
        if field not in ['_score', 'lexiconOrder']
    ]
    # get the sort field paths (eg. ['FormRep.baseform.raw', 'lemmaid.raw'])
    # Used for sorting.
    sortfield = sum([F.lookup_multiple(f, mode) for f in sortfieldnames], [])
    # get the field name of the head sort field. Used for searching
    sortfieldname = sortfieldnames[0]

    # find the center entry (by its id)
    has_center = 'center' in settings
    if has_center:
        center_id = settings['center']
        center_q = {"query": {"term": {"_id": center_id}}}
    # if no center id is given, pick the first entry of the lexicon
    else:
        center_id = None
        exps = []
        parser.parse_ext('and|resource|equals|%s' % lexicon, exps, [], mode)
        center_q = parser.search(exps, [], [], usefilter=True)
    lexstart = es.search(index=index, doc_type=typ, size=1, body=center_q,
                         sort=['%s:asc' % f for f in sortfield])

    # Check for hits before touching hits[0], so that an empty result gives
    # the Karp error below instead of an IndexError.
    center_hits = lexstart['hits']['hits']
    if not center_hits:
        logging.error('No center found %s, %s' % (center_id, lexstart))
        if not has_center:
            raise eh.KarpElasticSearchError(
                "Could not find any entry in lexicon %s" % lexicon)
        raise eh.KarpElasticSearchError("Could not find entry %s"
                                        % center_id)

    centerentry = center_hits[0]
    if not has_center:
        center_id = centerentry['_id']
    logging.debug('center %s, %s' % (centerentry, centerentry['_id']))
    # TODO what to do if the sort key is not in the lexicon? as below?
    # origentry_sort = centerentry['sort'][0]
    origentry_sort = [key for key in centerentry['sort']
                      if key is not None][0]
    sortvalue = control_escape(origentry_sort)
    logging.debug(u'Orig entry escaped key %s' % sortvalue)

    # Construct queries to es
    exps = []
    # the query string from the user
    querystring = settings.get('q', '').decode('utf8')
    parser.parse_ext('and|resource|equals|%s' % lexicon, exps, [], mode)
    if querystring:
        if querystring.startswith('simple'):
            querystring = ('and|anything|equals|%s'
                           % querystring.split('|')[-1])
        else:
            querystring = re.sub(r'extended\|\|', '', querystring)
        parser.parse_ext(querystring, exps, [], mode)

    preexps = copy.deepcopy(exps)  # deep copy for the pre-query
    parser.parse_ext('and|%s|gte|%s' % (sortfieldname, sortvalue), exps, [],
                     mode)
    elasticq_post = parser.search(exps, [], [], usefilter=True)

    parser.parse_ext('and|%s|lte|%s' % (sortfieldname, sortvalue), preexps,
                     [], mode)
    elasticq_pre = parser.search(preexps, [], [], usefilter=True)

    # +1 to compensate for the word itself being in the context
    size = settings['size'] + 1
    show = configM.searchfield(mode, 'minientry_fields')
    # TODO size*3 (magic number) because many entries may have the same sort
    # value (eg homographs in saol)
    ans_pre = parser.adapt_query(
        size * 3, 0, es, elasticq_pre, {
            'size': size * 3,
            'from_': 0,
            'sort': ['%s:desc' % f for f in sortfield],
            'index': index,
            '_source': show,
            'search_type': 'dfs_query_then_fetch'
        })
    ans_post = parser.adapt_query(
        size * 3, 0, es, elasticq_post, {
            'size': size * 3,
            'from_': 0,
            'sort': ['%s:asc' % f for f in sortfield],
            'index': index,
            '_source': show,
            'search_type': 'dfs_query_then_fetch'
        })

    hits_pre = ans_pre.get('hits', {}).get('hits', [])
    hits_post = ans_post.get('hits', {}).get('hits', [])
    hits_pre = go_to_sortkey(hits_pre, origentry_sort, center_id)
    hits_post = go_to_sortkey(hits_post, origentry_sort, center_id)
    return jsonify({
        "pre": hits_pre[:settings['size']],
        "post": hits_post[:settings['size']],
        "center": centerentry
    })
source_suffix = sys.argv[3] target_suffix = sys.argv[4] reindex(index, source_suffix, target_suffix) elif sys.argv[1] == '--printlatestversion': printlatestversion(sys.argv[2], debug=False) elif sys.argv[1] == '--exportlatestversion': printlatestversion(sys.argv[2], debug=False, with_id=True) elif sys.argv[1] == '--deleteindex': try: index = sys.argv[2] group = sys.argv[3] print 'Delete %s (belonging to %s)' % (index, group) es = configM.elastic(group) ans = es.indices.delete(index=index) print ans except esExceptions.TransportError as e: print 'Index %s was not present, can not be deleted' % index elif sys.argv[1] == '--getmapping': import offline.getmapping as gm alias = sys.argv[2] outfile = sys.argv[3] gm.getmapping(alias, outfile) elif sys.argv[1] == '--create_mode': # Creates every index (suffixed with 'suffix') # which are used in mode 'mode' # python upload_offline.py --create_mode karp 170119