def api_tag_list(request, user, params):
    """ Get list of available tags for API user (via auth_token) """
    dataset_id = params['dataset']

    ds = Datasets()
    ds.activate_datasets_by_id(dataset_id, use_default=False)
    # Check if dataset_id is valid
    if not ds.is_active():
        error = {'error': 'invalid dataset parameter'}
        data_json = json.dumps(error)
        return HttpResponse(data_json, status=400, content_type='application/json')

    es_m = ds.build_manager(ES_Manager)
    mass_helper = MassHelper(es_m)
    tag_set = mass_helper.get_unique_tags()
    tag_frequency = mass_helper.get_tag_frequency(tag_set)
    tag_models = set(tagger.description for tagger in Task.objects.filter(task_type=TaskTypes.TRAIN_TAGGER.value))

    data = []
    for tag in sorted(tag_frequency.keys()):
        count = tag_frequency[tag]
        has_model = tag in tag_models
        doc = {'description': tag, 'count': count, 'has_model': has_model}
        data.append(doc)

    data_json = json.dumps(data)
    return HttpResponse(data_json, status=200, content_type='application/json')

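# A hypothetical client-side call against the endpoint above, for illustration
# only: the route, token transport, and payload shape are assumptions, since
# this section shows the view but not the URL configuration or the auth_token
# decorator that resolves `user` and `params`.
import requests

resp = requests.post(
    'https://texta.example.com/api/tag_list',           # hypothetical route
    json={'auth_token': '<your-token>', 'dataset': 5},  # hypothetical payload
)
print(resp.json())  # -> [{'description': ..., 'count': ..., 'has_model': ...}, ...]
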
def get_all_rows(es_params, request):
    buffer_ = StringIO()
    writer = csv.writer(buffer_)

    writer.writerow([feature for feature in es_params['features']])

    ds = Datasets().activate_dataset(request.session)
    es_m = ds.build_manager(ES_Manager)
    es_m.build(es_params)
    es_m.set_query_parameter('size', ES_SCROLL_BATCH)

    features = sorted(es_params['features'])

    response = es_m.scroll()
    scroll_id = response['_scroll_id']
    hits = response['hits']['hits']

    while hits:
        process_hits(hits, features, write=True, writer=writer)

        buffer_.seek(0)
        data = buffer_.read()
        buffer_.seek(0)
        buffer_.truncate()
        yield data

        response = es_m.scroll(scroll_id=scroll_id)
        hits = response['hits']['hits']
        scroll_id = response['_scroll_id']

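# Both get_all_rows variants (and the get_rows generator further down) delegate
# row extraction to process_hits, which is not defined in this section. A
# plausible sketch, modeled on the dotted-path walking in get_all_matched_rows
# below; the exact signature and return shape are assumptions:
def process_hits(hits, features, write=False, writer=None):
    rows = []
    for hit in hits:
        row = []
        for feature in features:
            # Walk the dotted feature path into the document source.
            source = hit['_source']
            for component in feature.split('.'):
                source = source.get(component, '') if isinstance(source, dict) else ''
            row.append(source)
        if write:
            writer.writerow(row)  # Stream the row straight into the csv writer.
        else:
            rows.append(row)      # Collect rows for the caller to slice.
    return rows
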
def get_all_rows(es_params, request):
    features = es_params['features']

    # Prepare in-memory csv writer.
    buffer_ = StringIO()
    writer = csv.writer(buffer_)

    # Write the header row.
    writer.writerow(features)

    ds = Datasets().activate_datasets(request.session)
    es_m = ds.build_manager(ES_Manager)
    es_m.build(es_params)
    es_m.set_query_parameter('size', ES_SCROLL_BATCH)

    # Fetch the initial scroll results.
    response = es_m.scroll()
    scroll_id = response['_scroll_id']
    hits = response['hits']['hits']

    while hits:
        process_hits(hits, features, write=True, writer=writer)

        # Return a batch of data through the StreamingHttpResponse.
        yield _get_buffer_data(buffer_)

        # Continue with the scroll.
        response = es_m.scroll(scroll_id=scroll_id)
        hits = response['hits']['hits']
        scroll_id = response['_scroll_id']

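# The generator above (and the get_rows generator near the end of this section)
# yields through a _get_buffer_data helper that is not shown here. A minimal
# sketch, assuming it simply drains and resets the StringIO buffer the same way
# the first get_all_rows variant does inline:
def _get_buffer_data(buffer_):
    # Read everything written since the last yield...
    buffer_.seek(0)
    data = buffer_.read()
    # ...then reset the buffer for the next batch of rows.
    buffer_.seek(0)
    buffer_.truncate()
    return data
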
def api_search_list(request, user, params):
    """ Get list of available searches for API user (via auth_token) """
    # Read all params
    dataset_id = int(params['dataset'])

    ds = Datasets()
    ds.activate_datasets_by_id(dataset_id, use_default=False)
    # Check if dataset_id is valid
    if not ds.is_active():
        error = {'error': 'invalid dataset parameter'}
        data_json = json.dumps(error)
        return HttpResponse(data_json, status=400, content_type='application/json')

    # Build response structure
    data = []
    dataset = Dataset(pk=dataset_id)
    search_list = list(Search.objects.filter(dataset=dataset))
    for search in search_list:
        row = {
            'dataset': dataset_id,
            'search': search.id,
            'description': search.description
        }
        data.append(row)

    data_json = json.dumps(data)
    return HttpResponse(data_json, status=200, content_type='application/json')

def mlt_query(request):
    logger = LogManager(__name__, 'SEARCH MLT')

    es_params = request.POST
    mlt_fields = [json.loads(field)['path'] for field in es_params.getlist('mlt_fields')]

    handle_negatives = request.POST['handle_negatives']
    docs_accepted = [a.strip() for a in request.POST['docs'].split('\n') if a]
    docs_rejected = [a.strip() for a in request.POST['docs_rejected'].split('\n') if a]

    # Collect stopwords from the selected lexicons.
    stopword_lexicon_ids = request.POST.getlist('mlt_stopword_lexicons')
    stopwords = []
    for lexicon_id in stopword_lexicon_ids:
        lexicon = Lexicon.objects.get(id=int(lexicon_id))
        words = Word.objects.filter(lexicon=lexicon)
        stopwords += [word.wrd for word in words]

    ds = Datasets().activate_dataset(request.session)
    es_m = ds.build_manager(ES_Manager)
    es_m.build(es_params)

    response = es_m.more_like_this_search(mlt_fields,
                                          docs_accepted=docs_accepted,
                                          docs_rejected=docs_rejected,
                                          handle_negatives=handle_negatives,
                                          stopwords=stopwords)

    documents = []
    for hit in response['hits']['hits']:
        fields_content = get_fields_content(hit, mlt_fields)
        documents.append({'id': hit['_id'], 'content': fields_content})

    template_params = {'STATIC_URL': STATIC_URL,
                       'URL_PREFIX': URL_PREFIX,
                       'documents': documents}
    template = loader.get_template('mlt_results.html')
    return HttpResponse(template.render(template_params, request))

def run(self, task_id):
    self.task_id = task_id
    task = Task.objects.get(pk=self.task_id)
    params = json.loads(task.parameters)
    task.update_status(Task.STATUS_RUNNING)

    try:
        ds = Datasets().activate_datasets_by_id(params['dataset'])
        es_m = ds.build_manager(ES_Manager)
        es_m.load_combined_query(self._parse_query(params))

        self.es_m = es_m
        self.params = params

        valid, msg = self._check_if_request_bad(self.params)
        if valid:
            self._preprocessor_worker()
        else:
            raise UserWarning(msg)

    except TaskCanceledException as e:
        # If here, the task was canceled while processing; delete it.
        task = Task.objects.get(pk=self.task_id)
        task.delete()
        logging.getLogger(INFO_LOGGER).info(json.dumps({'process': 'PROCESSOR WORK', 'event': 'processor_worker_canceled', 'data': {'task_id': self.task_id}}), exc_info=True)
        print("--- Task canceled")

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(json.dumps({'process': 'PROCESSOR WORK', 'event': 'processor_worker_failed', 'data': {'task_id': self.task_id}}), exc_info=True)
        # Declare the job as failed.
        task = Task.objects.get(pk=self.task_id)
        task.result = json.dumps({'error': repr(e)})
        task.update_status(Task.STATUS_FAILED, set_time_completed=True)

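# Several workers in this section call self._parse_query(params), which is not
# shown. A minimal sketch, assuming it resolves the saved search referenced by
# the task parameters the same way the __init__ methods further down load
# Search.query; the empty-query fallback shape is a guess.
def _parse_query(self, params):
    search_id = params.get('search')
    if search_id:
        # Load the combined query stored with the saved search.
        return json.loads(Search.objects.get(pk=int(search_id)).query)
    # Hypothetical fallback: an empty combined query that matches everything.
    return {'main': {'query': {'bool': {'must': [], 'must_not': [], 'should': []}}}}
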
def save(request):
    logger = LogManager(__name__, 'SAVE SEARCH')

    ds = Datasets().activate_dataset(request.session)
    es_m = ds.build_manager(ES_Manager)

    es_params = request.POST
    es_m.build(es_params)
    combined_query = es_m.get_combined_query()

    try:
        q = combined_query
        desc = request.POST['search_description']
        s_content = json.dumps([request.POST[x] for x in request.POST.keys() if 'match_txt' in x])
        search = Search(author=request.user,
                        search_content=s_content,
                        description=desc,
                        dataset=Dataset.objects.get(pk=int(request.session['dataset'])),
                        query=json.dumps(q))
        search.save()
        logger.set_context('user_name', request.user.username)
        logger.set_context('search_id', search.id)
        logger.info('search_saved')
    except Exception as e:
        print('-- Exception[{0}] {1}'.format(__name__, e))
        logger.set_context('es_params', es_params)
        logger.exception('search_saving_failed')

    return HttpResponse()

def index(request):
    ds = Datasets().activate_dataset(request.session)
    es_m = ds.build_manager(ES_Manager)

    fields = get_fields(es_m)
    datasets = Datasets().get_allowed_datasets(request.user)
    language_models = Task.objects.filter(task_type='train_model').filter(status__iexact='completed').order_by('-pk')

    preprocessors = collect_map_entries(preprocessor_map)
    enabled_preprocessors = [preprocessor for preprocessor in preprocessors]

    # Hide fact graph if no fact_str_val type is present in fields
    display_fact_graph = 'hidden'
    for i in fields:
        if json.loads(i['data'])['type'] == "fact_str_val":
            display_fact_graph = ''
            break

    template_params = {'display_fact_graph': display_fact_graph,
                       'STATIC_URL': STATIC_URL,
                       'URL_PREFIX': URL_PREFIX,
                       'fields': fields,
                       'searches': Search.objects.filter(author=request.user),
                       'lexicons': Lexicon.objects.all().filter(author=request.user),
                       'dataset': ds.get_index(),
                       'language_models': language_models,
                       'allowed_datasets': datasets,
                       'enabled_preprocessors': enabled_preprocessors,
                       'task_params': task_params}

    template = loader.get_template('searcher.html')
    return HttpResponse(template.render(template_params, request))

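# collect_map_entries is used by several index views here but is not defined in
# this section. A plausible sketch, assuming each *_map is a dict of
# {key: options-dict} and the helper flattens it into a list of entry dicts
# (the 'is_enabled' and 'name' keys filtered on elsewhere would come from the
# options-dict itself):
def collect_map_entries(map_):
    entries = []
    for key, value in map_.items():
        entry = dict(value)  # Copy so the source map is not mutated.
        entry['key'] = key
        entries.append(entry)
    return entries
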
def index(request):
    template = loader.get_template('mwe_miner.html')
    lexicons = []
    runs = []
    for run in Run.objects.all().filter(user=request.user).order_by('-pk'):
        try:
            groups = json.loads(run.results).values()
            num_mwes = sum(len(group['mwes']) for group in groups)
            setattr(run, 'num_groups', len(groups))
            setattr(run, 'num_mwes', num_mwes)
            # setattr(run,'committed',len({approved_candidate.candidate for approved_candidate in approved_term_candidates} & {committed_candidate.term for committed_candidate in Term.objects.filter(author=request.user)}))
        except ValueError as e:
            print('Exception', e)
        runs.append(run)

    for lexicon in Lexicon.objects.all().filter(author=request.user):
        setattr(lexicon, 'size', Word.objects.all().filter(lexicon=lexicon.id).count())
        lexicons.append(lexicon)

    # Define selected mapping
    ds = Datasets().activate_dataset(request.session)
    es_m = ds.build_manager(ES_Manager)
    fields = es_m.get_column_names()

    datasets = Datasets().get_allowed_datasets(request.user)
    language_models = Task.objects.filter(task_type='train_model').filter(status__iexact='completed').order_by('-pk')

    return HttpResponse(template.render({'lexicons': lexicons,
                                         'STATIC_URL': STATIC_URL,
                                         'runs': runs,
                                         'fields': fields,
                                         'language_models': language_models,
                                         'allowed_datasets': datasets}, request))

def api_mass_train_tagger(request, user, params):
    """ Apply mass train tagger (via auth_token) """
    # Read all params
    dataset_id = params.get('dataset', None)
    selected_tags = set(params.get('tags', []))
    field = params.get("field", None)
    normalizer_opt = params.get("normalizer_opt", "0")
    classifier_opt = params.get("classifier_opt", "0")
    reductor_opt = params.get("reductor_opt", "0")
    extractor_opt = params.get("extractor_opt", "0")
    retrain_only = params.get("retrain_only", False)

    ds = Datasets()
    ds.activate_datasets_by_id(dataset_id, use_default=False)
    # Check if dataset_id is valid
    if not ds.is_active():
        error = {'error': 'invalid dataset parameter'}
        data_json = json.dumps(error)
        return HttpResponse(data_json, status=400, content_type='application/json')

    es_m = ds.build_manager(ES_Manager)
    mass_helper = MassHelper(es_m)

    data = mass_helper.schedule_tasks(selected_tags, normalizer_opt, classifier_opt, reductor_opt, extractor_opt, field, dataset_id, user)
    data_json = json.dumps(data)
    return HttpResponse(data_json, status=200, content_type='application/json')

def export_matched_data(request):
    search_id = request.GET['search_id']
    inclusive_metaquery = json.loads(request.GET['inclusive_grammar'])

    ds = Datasets().activate_dataset(request.session)

    component_query = ElasticGrammarQuery(inclusive_metaquery, None).generate()

    es_m = ds.build_manager(ES_Manager)

    if search_id == '-1':  # Full search
        es_m.combined_query = component_query
    else:
        saved_query = json.loads(Search.objects.get(pk=search_id).query)
        es_m.load_combined_query(saved_query)
        es_m.merge_combined_query_with_query_dict(component_query)

    inclusive_instructions = generate_instructions(inclusive_metaquery)

    response = StreamingHttpResponse(get_all_matched_rows(es_m.combined_query['main'], request, inclusive_instructions), content_type='text/csv')
    response['Content-Disposition'] = 'attachment; filename="%s"' % 'extracted.csv'

    return response

def index(request):
    ds = Datasets().activate_datasets(request.session)
    es_m = ds.build_manager(ES_Manager)

    fields = get_fields(es_m)
    datasets = Datasets().get_allowed_datasets(request.user)
    language_models = Task.objects.filter(task_type=TaskTypes.TRAIN_MODEL.value).filter(status__iexact=Task.STATUS_COMPLETED).order_by('-pk')

    # Hide fact graph if no fact_str_val type is present in fields
    display_fact_graph = 'hidden'
    for i in fields:
        if json.loads(i['data'])['type'] == "fact_str_val":
            display_fact_graph = ''
            break

    template_params = {'display_fact_graph': display_fact_graph,
                       'STATIC_URL': STATIC_URL,
                       'URL_PREFIX': URL_PREFIX,
                       'fields': fields,
                       'searches': Search.objects.filter(author=request.user),
                       'lexicons': Lexicon.objects.all().filter(author=request.user),
                       'language_models': language_models,
                       'allowed_datasets': datasets,
                       }

    template = loader.get_template('searcher.html')
    return HttpResponse(template.render(template_params, request))

def index(request):
    template = loader.get_template('mwe_miner.html')
    lexicons = []
    runs = []
    for run in Run.objects.all().filter(user=request.user).order_by('-pk'):
        try:
            groups = json.loads(run.results).values()
            num_mwes = sum(len(group['mwes']) for group in groups)
            setattr(run, 'num_groups', len(groups))
            setattr(run, 'num_mwes', num_mwes)
            # setattr(run,'committed',len({approved_candidate.candidate for approved_candidate in approved_term_candidates} & {committed_candidate.term for committed_candidate in Term.objects.filter(author=request.user)}))
        except ValueError as e:
            logging.getLogger(ERROR_LOGGER).exception(e)
        runs.append(run)

    for lexicon in Lexicon.objects.all().filter(author=request.user):
        setattr(lexicon, 'size', Word.objects.all().filter(lexicon=lexicon.id).count())
        lexicons.append(lexicon)

    # Define selected mapping
    ds = Datasets().activate_datasets(request.session)
    es_m = ds.build_manager(ES_Manager)
    fields = es_m.get_column_names()

    datasets = Datasets().get_allowed_datasets(request.user)
    language_models = Task.objects.filter(task_type=TaskTypes.TRAIN_MODEL.value).filter(status__iexact=Task.STATUS_COMPLETED).order_by('-pk')

    return HttpResponse(template.render({'lexicons': lexicons,
                                         'STATIC_URL': STATIC_URL,
                                         'runs': runs,
                                         'fields': fields,
                                         'language_models': language_models,
                                         'allowed_datasets': datasets}, request))

def __init__(self, request):
    self.es_params = request.POST
    self.ds = Datasets().activate_dataset(request.session)
    self.index = self.ds.get_index()
    self.mapping = self.ds.get_mapping()
    self.es_m = ES_Manager(self.index, self.mapping)
    self.field = 'texta_facts'

def search(es_params, request):
    logger = LogManager(__name__, 'SEARCH CORPUS')

    ds = Datasets().activate_datasets(request.session)
    es_m = ds.build_manager(ES_Manager)
    es_m.build(es_params)

    try:
        out = execute_search(es_m, es_params)
    except Exception as e:
        log_dict = {'task': 'SEARCH DOCUMENTS', 'event': 'documents_queried_failed'}
        logging.getLogger(ERROR_LOGGER).error("Document query failed", extra=log_dict, exc_info=True)

        print('-- Exception[{0}] {1}'.format(__name__, e))
        logger.set_context('user_name', request.user.username)
        logger.error('documents_queried_failed')

        out = {'column_names': [],
               'aaData': [],
               'iTotalRecords': 0,
               'iTotalDisplayRecords': 0,
               'lag': 0}

    logger.set_context('query', es_m.get_combined_query())
    logger.set_context('user_name', request.user.username)
    logger.info('documents_queried')

    return out

def save_grammar(request):
    grammar_dict = json.loads(request.POST['json'])
    grammar_id = grammar_dict[0]['id']

    if grammar_id == 'new':
        name = grammar_dict[0]['text']
        ds = Datasets().activate_dataset(request.session)
        dataset = ds.get_index()
        mapping = ds.get_mapping()
        grammar = Grammar(name=name, json='', author=request.user,
                          dataset=Dataset.objects.filter(index=dataset, mapping=mapping)[0])
        grammar.save()
        grammar_dict[0]['id'] = grammar.id
    else:
        grammar = Grammar.objects.get(id=grammar_id)

    grammar.json = json.dumps(grammar_dict)
    grammar.save()

    return HttpResponse(json.dumps({'id': grammar.id}))

def run(self, task_id):
    self.task_id = task_id
    self.task_obj = Task.objects.get(pk=self.task_id)
    params = json.loads(self.task_obj.parameters)
    self.task_obj.update_status(Task.STATUS_RUNNING)

    try:
        ds = Datasets().activate_datasets_by_id(params['dataset'])
        es_m = ds.build_manager(ES_Manager)
        # es_m.load_combined_query(self._parse_query(params))

        self.es_m = es_m
        self.params = params

        result = self._start_subworker()
        self.task_obj.result = result
        self.task_obj.update_status(Task.STATUS_COMPLETED, set_time_completed=True)

    except TaskCanceledException as e:
        # If here, the task was canceled while processing; delete it.
        self.task_obj.delete()
        logging.getLogger(INFO_LOGGER).info(json.dumps({'process': 'PROCESSOR WORK', 'event': 'management_worker_canceled', 'data': {'task_id': self.task_id}}))
        print("--- Task canceled")

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(json.dumps({'process': 'PROCESSOR WORK', 'event': 'manager_worker_failed', 'data': {'task_id': self.task_id}}), exc_info=True)
        # Declare the job as failed.
        self.task_obj.result = json.dumps({'error': repr(e)})
        self.task_obj.update_status(Task.STATUS_FAILED, set_time_completed=True)

    print('Done with management task')

def update(request):
    logger = LogManager(__name__, 'CHANGE_SETTINGS')
    parameters = request.POST

    if 'model' in parameters:
        model = str(parameters['model'])
        request.session['model'] = model
        logger.clean_context()
        logger.set_context('user_name', request.user.username)
        logger.set_context('new_model', model)
        logger.info('model_updated')

    if 'dataset' in parameters:
        # TODO: check that it is a valid mapping_id before changing session['dataset']
        new_dataset = parameters['dataset']

        if request.user.has_perm('permission_admin.can_access_dataset_' + str(new_dataset)):
            request.session['dataset'] = new_dataset

            logger.clean_context()
            logger.set_context('user_name', request.user.username)
            logger.set_context('new_dataset', new_dataset)
            logger.info('dataset_updated')

            ds = Datasets().activate_dataset(request.session)
            es_m = ds.build_manager(ES_Manager)

    return HttpResponseRedirect(URL_PREFIX + '/')

def get_example_texts(request, field, value):
    ds = Datasets().activate_dataset(request.session)
    dataset = ds.get_index()
    mapping = ds.get_mapping()

    query = json.dumps({
        "size": 10,
        "highlight": {"fields": {field: {}}},
        "query": {"match": {field: value}}
    })

    response = ES_Manager.plain_scroll(es_url, dataset, mapping, query)

    matched_sentences = []
    for hit in response['hits']['hits']:
        for match in hit['highlight'].values():
            matched_sentences.append(match[0])

    return matched_sentences

def api_document_tags_list(request, user, params):
    """ Get document tags (via auth_token) """
    dataset_id = params.get('dataset', None)
    document_ids = params.get('document_ids', None)

    ds = Datasets()
    ds.activate_datasets_by_id(dataset_id, use_default=False)
    # Check if dataset_id is valid
    if not ds.is_active():
        error = {'error': 'invalid dataset parameter'}
        data_json = json.dumps(error)
        return HttpResponse(data_json, status=400, content_type='application/json')

    es_m = ds.build_manager(ES_Manager)
    mass_helper = MassHelper(es_m)

    resp = mass_helper.get_document_by_ids(document_ids)

    data = []
    for doc in resp['hits']['hits']:
        for f in doc['_source'].get('texta_facts', []):
            if f['fact'] == 'TEXTA_TAG':
                doc_id = doc['_id']
                doc_path = f['doc_path']
                doc_tag = f['str_val']
                data.append({'document_id': doc_id, 'field': doc_path, 'tag': doc_tag})

    data_json = json.dumps(data)
    return HttpResponse(data_json, status=200, content_type='application/json')

def dashboard_visualize(request):
    es_params = request.POST
    ds = Datasets().activate_datasets(request.session)
    es_m = ds.build_manager(ES_Manager)
    es_m.build(es_params)

    # Sanitize index names for use in the dashboard template
    # (empty default avoids an AttributeError when no index is chosen).
    indices = es_params.get("chosen_index", "").split(',')
    indices = [index.replace('.', '-').replace("*", "WILDCARD") for index in indices]

    color_setting = request.POST['dashboard-color']
    color_max = request.POST['dashboard-color-maximum']
    color_min = request.POST['dashboard-color-minimum']

    template = loader.get_template('dashboard/dashboard.html')
    return HttpResponse(template.render({'STATIC_URL': STATIC_URL,
                                         'color_setting': color_setting,
                                         'color_max': color_max,
                                         'color_min': color_min,
                                         'URL_PREFIX': URL_PREFIX,
                                         'indices': indices}, request))

def get_query(request):
    es_params = request.POST
    ds = Datasets().activate_datasets(request.session)
    es_m = ds.build_manager(ES_Manager)
    es_m.build(es_params)
    # Get only the main query
    query = es_m.combined_query['main']
    return HttpResponse(json.dumps(query))

def __init__(self, params):
    self.field = json.loads(params['field'])['path']
    query = json.loads(Search.objects.get(pk=int(params['search'])).query)

    # Define selected mapping
    ds = Datasets().activate_dataset_by_id(params['dataset'])
    self.es_m = ds.build_manager(ES_Manager)
    self.es_m.load_combined_query(query)

def selectLexicon(request):
    try:
        template = loader.get_template('lexicon.html')
        lexicon = Lexicon.objects.get(id=request.GET['id'])
        words = Word.objects.filter(lexicon=lexicon)
        words = [a.wrd for a in words]
        lexicons = Lexicon.objects.filter(author=request.user)
        datasets = Datasets().get_allowed_datasets(request.user)
        language_models = Task.objects.filter(task_type=TaskTypes.TRAIN_MODEL.value).filter(status=Task.STATUS_COMPLETED).order_by('-pk')

        # Define selected mapping
        ds = Datasets().activate_datasets(request.session)
        es_m = ds.build_manager(ES_Manager)
        fields = es_m.get_column_names()

        log_dict = {
            'task': 'CREATE LEXICON',
            'event': 'lexicon_selected',
            'arguments': {'user_name': request.user.username, 'lexicon_id': request.GET['id']},
            'data': {'lexicon_terms': words}
        }
        logging.getLogger(INFO_LOGGER).info("Lexicon selected", extra=log_dict)

        return HttpResponse(template.render({'words': words,
                                             'selected': request.GET['id'],
                                             'selected_name': lexicon,
                                             'lexicons': lexicons,
                                             'STATIC_URL': STATIC_URL,
                                             'features': fields,
                                             'language_models': language_models,
                                             'allowed_datasets': datasets}, request))
    except Exception as e:
        log_dict = {
            'task': 'CREATE LEXICON',
            'event': 'lexicon_selection_failed',
            'arguments': {'user_name': request.user.username, 'lexicon_id': request.GET['id']}
        }
        logging.getLogger(ERROR_LOGGER).error("Lexicon selection failed", extra=log_dict, exc_info=True)
        return HttpResponseRedirect(URL_PREFIX + '/lexicon_miner')

def get_grammar_listing(request):
    ds = Datasets().activate_dataset(request.session)
    dataset = ds.get_index()
    mapping = ds.get_mapping()

    grammars = Grammar.objects.filter(author=request.user,
                                      dataset__index=dataset,
                                      dataset__mapping=mapping).order_by('-last_modified')

    grammar_json = json.dumps([{'id': grammar.id,
                                'name': grammar.name,
                                'last_modified': grammar.last_modified.strftime("%d/%m/%y %H:%M:%S")}
                               for grammar in grammars])

    return HttpResponse(grammar_json)

def parse_request(self, request):
    self.lookup_types = request.POST['lookup_types'].split(',')
    self.key_constraints = request.POST['key_constraints'].split(',')
    self.content = request.POST['content'].split('\n')[-1].strip()

    ds = Datasets().activate_datasets(request.session)
    self.es_m = ds.build_manager(ES_Manager)
    self.user = request.user

def remove_by_query(request):
    es_params = request.POST

    ds = Datasets().activate_datasets(request.session)
    es_m = ds.build_manager(ES_Manager)
    es_m.build(es_params)

    # Process(target=remove_worker, args=(es_m, 'notimetothink')).start()
    response = remove_worker(es_m, 'notimetothink')
    return HttpResponse(response)

def get_example_texts(request, field, value):
    ds = Datasets().activate_datasets(request.session)
    es_m = ds.build_manager(ES_Manager)

    query = {
        "size": 10,
        "highlight": {"fields": {field: {}}},
        "query": {"match": {field: value}}
    }
    response = es_m.perform_query(query)

    matched_sentences = []
    for hit in response['hits']['hits']:
        for match in hit['highlight'].values():
            matched_sentences.append(match[0])

    return matched_sentences

def __init__(self, parameters, callback_progress=None):
    ds = Datasets().activate_dataset_by_id(parameters['dataset'])
    query = self._parse_query(parameters)

    self.field = json.loads(parameters['field'])['path']
    self.es_m = ds.build_manager(ES_Manager)
    self.es_m.load_combined_query(query)

    self.callback_progress = callback_progress

    if self.callback_progress:
        total_elements = self.get_total_documents()
        callback_progress.set_total(total_elements)

def parse_request(self, request):
    self.lookup_types = request.POST['lookup_types'].split(',')
    self.key_constraints = request.POST['key_constraints'].split(',')
    self.content = request.POST['content'].split('\n')[-1].strip()
    print(self.content)

    ds = Datasets().activate_dataset(request.session)
    self.dataset = ds.get_index()
    self.mapping = ds.get_mapping()
    self.es_m = ES_Manager(self.dataset, self.mapping)
    self.user = request.user

def index(request):
    ds = Datasets().activate_datasets(request.session)
    datasets = Datasets().get_allowed_datasets(request.user)
    language_models = Task.objects.filter(task_type=TaskTypes.TRAIN_MODEL.value).filter(status__iexact=Task.STATUS_COMPLETED).order_by('-pk')

    es_m = ds.build_manager(ES_Manager)
    fields = get_fields(es_m)

    preprocessors = collect_map_entries(preprocessor_map)
    enabled_preprocessors = [preprocessor for preprocessor in preprocessors if preprocessor['is_enabled'] is True]
    enabled_preprocessors = sorted(enabled_preprocessors, key=itemgetter('name'), reverse=False)

    tasks = []
    for task in Task.objects.all().order_by('-pk'):
        task_dict = task.__dict__
        task_dict['user'] = task.user
        task_dict['parameters'] = translate_parameters(task_dict['parameters'])

        if task_dict['result']:
            task_dict['result'] = json.loads(task_dict['result'])

        tasks.append(task_dict)

    if 'dataset' in request.session:
        get_fact_names(es_m)
        tag_set = fact_names if fact_names else []

        context = {
            'task_params': task_params,
            'tasks': tasks,
            'task_statuses': Task.STATUS_DICT,
            'language_models': language_models,
            'allowed_datasets': datasets,
            'searches': Search.objects.filter(datasets__in=[Dataset.objects.get(pk=ads.id).id for ads in ds.active_datasets]).distinct(),
            'enabled_preprocessors': enabled_preprocessors,
            'STATIC_URL': STATIC_URL,
            'fields': fields,
            'text_tags': tag_set,
            'lexicons': Lexicon.objects.all(),
        }
    else:
        messages.warning(request, "No dataset selected, please select a dataset before using Task Manager!")
        return HttpResponseRedirect('/')

    pipe_builder = get_pipeline_builder()
    context['train_tagger_extractor_opt_list'] = pipe_builder.get_extractor_options()
    context['train_tagger_reductor_opt_list'] = pipe_builder.get_reductor_options()
    context['train_tagger_normalizer_opt_list'] = pipe_builder.get_normalizer_options()
    context['train_tagger_classifier_opt_list'] = pipe_builder.get_classifier_options()

    template = loader.get_template('task_manager.html')
    return HttpResponse(template.render(context, request))

def table_header_mlt(request):
    ds = Datasets().activate_datasets(request.session)
    es_m = ds.build_manager(ES_Manager)

    # Get column names from the ES mapping.
    fields = es_m.get_column_names(facts=True)

    template_params = {'STATIC_URL': STATIC_URL,
                       'URL_PREFIX': URL_PREFIX,
                       'fields': fields,
                       'searches': Search.objects.filter(author=request.user),
                       'columns': [{'index': index, 'name': field_name} for index, field_name in enumerate(fields)]}

    template = loader.get_template('mlt_results.html')
    return HttpResponse(template.render(template_params, request))

def get_table_header(request):
    ds = Datasets().activate_dataset(request.session)
    es_m = ds.build_manager(ES_Manager)

    # Get column names from the ES mapping.
    fields = es_m.get_column_names()

    template_params = {'STATIC_URL': STATIC_URL,
                       'URL_PREFIX': URL_PREFIX,
                       'fields': fields,
                       'searches': Search.objects.filter(author=request.user),
                       'columns': [{'index': index, 'name': field_name} for index, field_name in enumerate(fields)],
                       'dataset': ds.get_index(),
                       'mapping': ds.get_mapping()}

    template = loader.get_template('searcher_results.html')
    return HttpResponse(template.render(template_params, request))

def cluster_query(request):
    params = request.POST
    ds = Datasets().activate_dataset(request.session)
    es_m = ds.build_manager(ES_Manager)
    es_m.build(params)

    cluster_m = ClusterManager(es_m, params)
    clustering_data = convert_clustering_data(cluster_m, params)

    template_params = {'STATIC_URL': STATIC_URL,
                       'URL_PREFIX': URL_PREFIX,
                       'clusters': clustering_data}

    template = loader.get_template('cluster_results.html')
    return HttpResponse(template.render(template_params, request))

def apply(self, task_id):
    self.task_id = task_id
    task = Task.objects.get(pk=self.task_id)
    params = json.loads(task.parameters)

    ds = Datasets().activate_dataset_by_id(params['dataset'])
    es_m = ds.build_manager(ES_Manager)
    es_m.load_combined_query(self._parse_query(params))

    self.es_m = es_m
    self.params = params

    # Process(target=self._preprocessor_worker()).start()
    self._preprocessor_worker()  # Apache wsgi problem with multiprocessing

    return True

def get_all_matched_rows(query, request, inclusive_instructions):
    buffer_ = StringIO()
    writer = csv.writer(buffer_)

    ds = Datasets().activate_dataset(request.session)
    es_m = ds.build_manager(ES_Manager)

    features = sorted([field['path'] for field in es_m.get_mapped_fields()])

    query['size'] = ES_SCROLL_BATCH
    writer.writerow(features)

    request_url = os.path.join(es_url, ds.get_index(), ds.get_mapping(), '_search?scroll=1m')
    response = requests.get(request_url, data=json.dumps(query)).json()

    scroll_id = response['_scroll_id']
    hits = response['hits']['hits']

    while hits:
        for hit in hits:
            feature_dict = {}
            row = []
            for feature_name in features:
                # Walk the dotted feature path into the document source.
                feature_path = feature_name.split('.')
                parent_source = hit['_source']
                for path_component in feature_path:
                    if path_component in parent_source:
                        parent_source = parent_source[path_component]
                    else:
                        parent_source = ""
                        break

                content = parent_source
                row.append(content)
                feature_dict[feature_name] = content

            layer_dict = matcher.LayerDict(feature_dict)
            if inclusive_instructions.match(layer_dict):
                writer.writerow(row)

        buffer_.seek(0)
        data = buffer_.read()
        buffer_.seek(0)
        buffer_.truncate()
        yield data

        # Rebuild the payload with the latest scroll_id before continuing.
        scroll_payload = json.dumps({'scroll': '1m', 'scroll_id': scroll_id})
        response = requests.get(os.path.join(es_url, '_search', 'scroll'), data=scroll_payload).json()
        hits = response['hits']['hits']
        scroll_id = response['_scroll_id']

def run(self, task_id):
    self.task_id = task_id
    self.task_obj = Task.objects.get(pk=self.task_id)
    params = json.loads(self.task_obj.parameters)
    self.task_obj.update_status(Task.STATUS_RUNNING)

    try:
        ds = Datasets().activate_datasets_by_id(params['dataset'])
        es_m = ds.build_manager(ES_Manager)
        # es_m.load_combined_query(self._parse_query(params))

        self.es_m = es_m
        self.params = params

        result = self._start_subworker()
        self.task_obj.result = result
        self.task_obj.update_status(Task.STATUS_COMPLETED, set_time_completed=True)

    except TaskCanceledException as e:
        # If here, the task was canceled while processing; delete it.
        self.task_obj.delete()
        log_dict = {
            'task': 'PROCESSOR WORK',
            'event': 'management_worker_canceled',
            'data': {'task_id': self.task_id}
        }
        self.info_logger.info("Management worker canceled", extra=log_dict)
        print("--- Task canceled")

    except Exception as e:
        log_dict = {
            'task': 'PROCESSOR WORK',
            'event': 'manager_worker_failed',
            'data': {'task_id': self.task_id}
        }
        self.error_logger.exception("Manager worker failed", extra=log_dict, exc_info=True)
        # Declare the job as failed.
        self.task_obj.result = json.dumps({'error': repr(e)})
        self.task_obj.update_status(Task.STATUS_FAILED, set_time_completed=True)

    print('Done with management task')

def get_rows(es_params, request):
    # In-memory buffer for the csv writer (BytesIO as a fallback).
    try:
        buffer_ = StringIO()
    except:
        buffer_ = BytesIO()
    writer = csv.writer(buffer_)

    features = es_params['features']
    writer.writerow(features)

    ds = Datasets().activate_datasets(request.session)
    es_m = ds.build_manager(ES_Manager)
    es_m.build(es_params)

    es_m.set_query_parameter('from', es_params['examples_start'])
    q_size = es_params['num_examples'] if es_params['num_examples'] <= ES_SCROLL_BATCH else ES_SCROLL_BATCH
    es_m.set_query_parameter('size', q_size)

    response = es_m.scroll()
    scroll_id = response['_scroll_id']
    left = es_params['num_examples']
    hits = response['hits']['hits']

    while hits and left:
        rows = process_hits(hits, features, write=False)

        if left > len(rows):
            for row in rows:
                writer.writerow(row)
            yield _get_buffer_data(buffer_)
            left -= len(rows)
            response = es_m.scroll(scroll_id=scroll_id)
            hits = response['hits']['hits']
            scroll_id = response['_scroll_id']
        elif left == len(rows):
            for row in rows:
                writer.writerow(row)
            yield _get_buffer_data(buffer_)
            break
        else:
            for row in rows[:left]:
                writer.writerow(row)
            yield _get_buffer_data(buffer_)
            break

def index(request):
    template = loader.get_template('conceptualiser.html')

    lexicons = []
    for lexicon in Lexicon.objects.all().filter(author=request.user):
        setattr(lexicon, 'size', Word.objects.all().filter(lexicon=lexicon.id).count())
        lexicons.append(lexicon)

    methods = ["PCA", "TSNE", "MDS"]

    datasets = Datasets().get_allowed_datasets(request.user)
    language_models = Task.objects.filter(task_type=TaskTypes.TRAIN_MODEL.value).filter(status__iexact=Task.STATUS_COMPLETED).order_by('-pk')

    return HttpResponse(template.render({'STATIC_URL': STATIC_URL,
                                         'lexicons': lexicons,
                                         'methods': methods,
                                         'language_models': language_models,
                                         'allowed_datasets': datasets}, request))

def index(request):
    indices = ES_Manager.get_indices()
    indices = sorted(indices, key=lambda x: x['index'])  # Sort alphabetically
    datasets = get_datasets(indices=indices)

    users = User.objects.all()
    users = annotate_users_with_permissions(users, datasets)

    allowed_datasets = Datasets().get_allowed_datasets(request.user)
    language_models = Task.objects.filter(task_type=TaskTypes.TRAIN_MODEL.value).filter(status__iexact=Task.STATUS_COMPLETED).order_by('-pk')

    template = loader.get_template('permission_admin.html')
    return HttpResponse(template.render({'users': users,
                                         'datasets': datasets,
                                         'indices': indices,
                                         'STATIC_URL': STATIC_URL,
                                         'URL_PREFIX': URL_PREFIX,
                                         'allowed_datasets': allowed_datasets,
                                         'language_models': language_models}, request))

def index(request):
    template = loader.get_template('dataset_importer.html')
    jobs = DatasetImport.objects.all()

    archive_formats = collect_map_entries(extractor_map)
    single_document_formats = collect_map_entries(entity_reader_map)
    document_collection_formats = collect_map_entries(collection_reader_map)
    database_formats = collect_map_entries(database_reader_map)
    # preprocessors = collect_map_entries(preprocessor_map)
    # enabled_preprocessors = [preprocessor for preprocessor in preprocessors if preprocessor['is_enabled'] is True]

    datasets = Datasets().get_allowed_datasets(request.user)
    language_models = Task.objects.filter(task_type=TaskTypes.TRAIN_MODEL.value).filter(status__iexact=Task.STATUS_COMPLETED).order_by('-pk')

    analyzers = ES_Manager.get_analyzers()

    context = {
        # 'enabled_input_types': DATASET_IMPORTER_CONF['enabled_input_types'],
        'archive_formats': archive_formats,
        'single_document_formats': single_document_formats,
        'document_collection_formats': document_collection_formats,
        'database_formats': database_formats,
        'language_models': language_models,
        'allowed_datasets': datasets,
        'jobs': jobs,
        'analyzers': analyzers,
        # 'enabled_preprocessors': enabled_preprocessors
    }

    return HttpResponse(template.render(context, request))

def dashboard_endpoint(request):
    es_params = request.POST
    ds = Datasets().activate_datasets(request.session)
    es_m = ds.build_manager(ES_Manager)
    es_m.build(es_params)

    query_dict = None if es_m.is_combined_query_empty() else es_m.combined_query['main']

    indices = request.POST.get("chosen_index", None)
    if not indices:
        raise ValueError("Please import an index first.")

    dashboard = MultiSearcherDashboard(es_url=es_url, indices=indices, query_body=query_dict)
    query_result = dashboard.conduct_query()
    formatted_result = dashboard.format_result(query_result)

    return JsonResponse(formatted_result)
