def _get_color(self, color_code_list):
    """Collapse a list of '#rrggbb' color codes into one color.

    Returns 'none' for an empty list or when any entry starts with 'none'.
    With self._average_colors set, returns the per-channel average;
    otherwise the most frequent code wins.
    """
    if not color_code_list:
        return 'none'
    if not self._average_colors:
        # Majority vote: most common color code in the list.
        return Counter(color_code_list).most_common(1)[0][0]
    totals = [0, 0, 0]
    for code in color_code_list:
        if code.startswith('none'):
            # Defensive: upstream span derivation can leak a 'none' entry here.
            logger = LogManager(__name__, '_GET_COLOR')
            logger.set_context('color_code_list', color_code_list)
            logger.info('Highlighter color_code_list contained "none", returning "none"')
            return 'none'
        # Accumulate the red/green/blue hex channels separately.
        for channel, (start, stop) in enumerate(((1, 3), (3, 5), (5, None))):
            totals[channel] += int(code[start:stop], 16)
    count = len(color_code_list)
    averaged = tuple(int(total / count) for total in totals)
    return "#%02x%02x%02x" % averaged
def search(es_params, request):
    """Execute a corpus search for the active datasets.

    Returns the datatables-style result dict produced by execute_search;
    on failure, logs the error and returns an empty result structure.
    """
    logger = LogManager(__name__, 'SEARCH CORPUS')
    ds = Datasets().activate_datasets(request.session)
    es_m = ds.build_manager(ES_Manager)
    es_m.build(es_params)
    try:
        out = execute_search(es_m, es_params)
    except Exception as e:
        log_dict = {'task': 'SEARCH DOCUMENTS', 'event': 'documents_queried_failed'}
        logging.getLogger(ERROR_LOGGER).error("Documents queried failed", extra=log_dict, exc_info=True)
        print('-- Exception[{0}] {1}'.format(__name__, e))
        logger.set_context('user_name', request.user.username)
        logger.error('documents_queried_failed')
        out = {'column_names': [], 'aaData': [], 'iTotalRecords': 0, 'iTotalDisplayRecords': 0, 'lag': 0}
    else:
        # Bug fix: the success log previously ran even after an exception,
        # so failed queries were also recorded as 'documents_queried'.
        logger.set_context('query', es_m.get_combined_query())
        logger.set_context('user_name', request.user.username)
        logger.info('documents_queried')
    return out
def init_logs(self):
    """Create the time-series loggers and dashboard URL when logging is enabled;
    otherwise leave self.loggers as None."""
    if self.opt['logs'] is None:
        self.loggers = None
        return
    self.log_manager = LogManager(self.logs_folder)
    self.loggers = self.get_ts_loggers()
    self.register_raw_logs()
    self.log_url = 'http://{}/deep-dashboard?id={}'.format(
        self.opt['localhost'], self.model_id)
    self.log.info('Visualization can be viewed at: {}'.format(self.log_url))
def delete(request):
    """Delete the single saved search identified by the 'pk' GET parameter.

    Always responds with the requested id; failures are logged, not raised.
    """
    logger = LogManager(__name__, 'DELETE SEARCH')
    search_id = request.GET['pk']
    logger.set_context('user_name', request.user.username)
    logger.set_context('search_id', search_id)
    try:
        Search.objects.get(pk=search_id).delete()
    except Exception as e:
        print('-- Exception[{0}] {1}'.format(__name__, e))
        logger.exception('search_deletion_failed')
    else:
        logger.info('search_deleted')
    return HttpResponse(search_id)
def mlt_query(request):
    """Render a "more like this" search: find documents similar to accepted
    example documents, optionally penalizing rejected ones and stopwords.
    """
    # NOTE(review): logger is created but never used in this view.
    logger = LogManager(__name__, 'SEARCH MLT')
    es_params = request.POST
    # Each POSTed mlt_field is a JSON blob; only its 'path' is needed.
    mlt_fields = [json.loads(field)['path'] for field in es_params.getlist('mlt_fields')]
    handle_negatives = request.POST['handle_negatives']
    # Example documents arrive one per line; blank lines are dropped.
    docs_accepted = [a.strip() for a in request.POST['docs'].split('\n') if a]
    docs_rejected = [a.strip() for a in request.POST['docs_rejected'].split('\n') if a]

    # Collect stopwords from every selected lexicon.
    stopword_lexicon_ids = request.POST.getlist('mlt_stopword_lexicons')
    stopwords = []
    for lexicon_id in stopword_lexicon_ids:
        lexicon = Lexicon.objects.get(id=int(lexicon_id))
        words = Word.objects.filter(lexicon=lexicon)
        stopwords+=[word.wrd for word in words]

    ds = Datasets().activate_dataset(request.session)
    es_m = ds.build_manager(ES_Manager)
    es_m.build(es_params)

    response = es_m.more_like_this_search(mlt_fields,docs_accepted=docs_accepted,docs_rejected=docs_rejected,handle_negatives=handle_negatives,stopwords=stopwords)

    # Shape the ES hits into {id, content} pairs for the template.
    documents = []
    for hit in response['hits']['hits']:
        fields_content = get_fields_content(hit,mlt_fields)
        documents.append({'id':hit['_id'],'content':fields_content})

    template_params = {'STATIC_URL': STATIC_URL, 'URL_PREFIX': URL_PREFIX, 'documents':documents}
    template = loader.get_template('mlt_results.html')
    return HttpResponse(template.render(template_params, request))
def highlight(self, original_text, highlight_data, tagged_text=None):
    """Wrap the spans described by highlight_data in styled tags.

    highlight_data = [{'spans': [[1,7],[25,36]], 'name': 'LOC', 'value': '5',
                       'category': '[fact]', 'color': '#ababab'}]
    """
    logger = LogManager(__name__, 'HIGHLIGHT')
    if not tagged_text:
        # No tagged variant: highlight the original text directly.
        alignment = list(range(len(original_text)))
        tagged_text = original_text
    elif self._derive_spans:
        # Spans are derived from the tagged text, then applied to the original.
        alignment = list(range(len(original_text)))
        highlight_data.extend(self._derive_highlight_data(tagged_text))
        tagged_text = original_text
    else:
        # Map character positions of the tagged text onto the original.
        alignment = self._align_texts(original_text, tagged_text)
    spans_to_tags = self._get_tags_for_text_index(tagged_text, alignment, highlight_data)
    cut_points = [index for span, _ in spans_to_tags for index in span]
    pieces = self._split_text_at_indices(tagged_text, cut_points)
    tags = [tag for _, tag in spans_to_tags]
    return self._merge_text_and_tags(pieces, tags)
def delete(request):
    """Bulk-delete the saved searches listed under 'pks' in the POSTed JSON."""
    payload = json.loads(request.POST['data'])
    pks = payload['pks']
    logger = LogManager(__name__, 'DELETE SEARCH')
    logger.set_context('user_name', request.user.username)
    logger.set_context('search_ids', pks)
    try:
        for pk in pks:
            Search.objects.get(pk=pk).delete()
            logger.info('search_deleted:' + pk)
    except Exception as e:
        print('-- Exception[{0}] {1}'.format(__name__, e))
        logger.exception('search_deletion_failed')
    return HttpResponse()
def update_model(request):
    """Store the selected model (pk/description/uuid) in the session.

    Returns JSON {'status': 'success'}, or {'status': 'error'} if the
    expected POST parameters are missing.
    """
    logger = LogManager(__name__, 'CHANGE_SETTINGS')
    parameters = request.POST
    try:
        model = {
            "pk": parameters["model_pk"],
            "description": parameters["model_description"],
            "unique_id": parameters["model_uuid"]
        }
        request.session['model'] = model
        logger.clean_context()
        logger.set_context('user_name', request.user.username)
        logger.set_context('new_model', model)
        logger.info('model_updated')
        return HttpResponse(json.dumps({'status': 'success'}))
    except Exception:
        # Bug fix: was a bare `except:`, which also swallows SystemExit and
        # KeyboardInterrupt; narrowed to Exception.
        return HttpResponse(json.dumps({'status': 'error'}))
def export_pages(request):
    """Stream search results as a CSV attachment.

    If no export parameters are stored in the session, logs an error and
    returns an empty response.
    """
    es_params = request.session.get('export_args')
    if es_params is None:
        logger = LogManager(__name__, 'SEARCH CORPUS')
        logger.set_context('user_name', request.user.username)
        logger.error('export pages failed, parameters empty')
        return HttpResponse()
    if es_params['num_examples'] == '*':
        row_source = get_all_rows(es_params, request)
    else:
        row_source = get_rows(es_params, request)
    response = StreamingHttpResponse(row_source, content_type='text/csv')
    response['Content-Disposition'] = 'attachment; filename="%s"' % (es_params['filename'])
    return response
def delete(request):
    """Delete every saved search whose primary key appears under 'pks' in
    the POSTed JSON payload."""
    logger = LogManager(__name__, 'DELETE SEARCH')
    data = json.loads(request.POST['data'])
    ids_to_remove = data['pks']
    logger.set_context('user_name', request.user.username)
    logger.set_context('search_ids', ids_to_remove)
    try:
        for sid in ids_to_remove:
            Search.objects.get(pk=sid).delete()
            logger.info('search_deleted:' + sid)
    except Exception as e:
        print('-- Exception[{0}] {1}'.format(__name__, e))
        logger.exception('search_deletion_failed')
    return HttpResponse()
def _get_color(self, color_code_list):
    """Reduce several '#rrggbb' codes to a single display color.

    Empty input or any 'none' entry yields 'none'. Averaging mode blends
    the channels; otherwise the most common code is returned.
    """
    if not color_code_list:
        return 'none'
    if self._average_colors:
        n = len(color_code_list)
        red = green = blue = 0
        for code in color_code_list:
            if code.startswith('none'):
                # Can happen when span derivation produced a 'none' color.
                log = LogManager(__name__, '_GET_COLOR')
                log.set_context('color_code_list', color_code_list)
                log.info('Highlighter color_code_list contained "none", returning "none"')
                return 'none'
            red += int(code[1:3], 16)
            green += int(code[3:5], 16)
            blue += int(code[5:], 16)
        return "#%02x%02x%02x" % (int(red / n), int(green / n), int(blue / n))
    return Counter(color_code_list).most_common(1)[0][0]
def _get_tags_for_text_index(self, text, alignment, highlight_data):
    """Resolve which highlight entries cover each character of `text` and
    collapse them into (span, tag) pairs via _get_spans_to_data_indices.
    """
    logger = LogManager(__name__, '_GET_TAGS_FOR_TEXT_INDEX')
    # Index each highlight datum so sets of indices can reference them later.
    data_mapping = {index: datum for index, datum in enumerate(highlight_data)}
    data_index_to_spans = [datum['spans'] for datum in highlight_data]
    # For every character position, the list of highlight entries covering it.
    text_index_to_data_index = [[] for i in range(len(text))]
    for data_index, spans in enumerate(data_index_to_spans):
        for span in spans:
            for text_index in range(*span):
                try:
                    text_index_to_data_index[alignment[text_index]].append(data_index)
                except Exception as e:
                    # Index can fall out of range, e.g. when
                    # _derive_highlight_data uses >= instead of > for span ends.
                    print('-- Exception[{0}] {1}'.format(__name__, e))
                    logger.set_context('text', text)
                    logger.exception('_get_tags_for_text_index try catch execption')
    # Freeze per-character sets so identical coverage can be grouped into runs.
    text_index_to_data_index = [frozenset(data_indices) for data_indices in text_index_to_data_index]
    spans_to_tags = [(spans, self._get_tag_from_highlight_data([data_mapping[data_index] for data_index in data_indices])) for spans, data_indices in self._get_spans_to_data_indices(text_index_to_data_index)]
    return spans_to_tags
def update_dataset(request):
    """Store the POSTed dataset ids in the session, keeping only datasets
    the user has permission to access.

    Returns JSON {'status': 'success'} or {'status': 'error'} on failure.
    """
    logger = LogManager(__name__, 'CHANGE_SETTINGS')
    parameters = request.POST
    try:
        # TODO: check if is a valid mapping_id before change session[dataset]
        new_datasets = parameters.getlist('dataset[]')
        new_datasets = [
            new_dataset for new_dataset in new_datasets
            if request.user.has_perm('permission_admin.can_access_dataset_' + str(new_dataset))
        ]
        request.session['dataset'] = new_datasets
        logger.clean_context()
        logger.set_context('user_name', request.user.username)
        logger.set_context('new_datasets', new_datasets)
        logger.info('datasets_updated')
        # Called for its side effect of activating the new selection; the
        # previously-bound (and unused) local `ds` was dropped.
        Datasets().activate_datasets(request.session)
        return HttpResponse(json.dumps({'status': 'success'}))
    except Exception:
        # Bug fix: narrowed from a bare `except:` so SystemExit and
        # KeyboardInterrupt are not swallowed.
        return HttpResponse(json.dumps({'status': 'error'}))
def search(es_params, request):
    """Execute a corpus search for the active datasets (JSON-log variant).

    Returns the result dict from execute_search, or an empty structure when
    the query fails.
    """
    logger = LogManager(__name__, 'SEARCH CORPUS')
    ds = Datasets().activate_datasets(request.session)
    es_m = ds.build_manager(ES_Manager)
    es_m.build(es_params)
    try:
        out = execute_search(es_m, es_params)
    except Exception as e:
        logging.getLogger(ERROR_LOGGER).error(
            json.dumps({'process': 'SEARCH DOCUMENTS', 'event': 'documents_queried_failed'}), exc_info=True)
        print('-- Exception[{0}] {1}'.format(__name__, e))
        logger.set_context('user_name', request.user.username)
        logger.error('documents_queried_failed')
        out = {'column_names': [], 'aaData': [], 'iTotalRecords': 0, 'iTotalDisplayRecords': 0, 'lag': 0}
    else:
        # Bug fix: success logging previously executed on the failure path
        # too, recording failed searches as 'documents_queried'.
        logger.set_context('query', es_m.get_combined_query())
        logger.set_context('user_name', request.user.username)
        logger.info('documents_queried')
    return out
def _get_tags_for_text_index(self, text, alignment, highlight_data):
    """Resolve which highlight entries cover each character of `text` and
    collapse them into (span, tag) pairs via _get_spans_to_data_indices.
    """
    # NOTE(review): logger is created but never used — the except below
    # silently passes instead of logging.
    logger = LogManager(__name__, '_GET_TAGS_FOR_TEXT_INDEX')
    data_mapping = {index: datum for index, datum in enumerate(highlight_data)}
    data_index_to_spans = [datum['spans'] for datum in highlight_data]
    # For every character position, the list of highlight entries covering it.
    text_index_to_data_index = [[] for i in range(len(text))]
    for data_index, spans in enumerate(data_index_to_spans):
        for span in spans:
            for text_index in range(*span):
                try:
                    text_index_to_data_index[alignment[text_index]].append(data_index)
                except Exception as e:
                    # Index out of range is swallowed on purpose.
                    # Possibly started happening when _derive_highlight_data
                    # started using >= instead of >.
                    # As a result, the highlights will be slightly misaligned,
                    # but won't break the search(?)
                    # Also possibly caused by double quotes in text sometimes.
                    # TODO something to handle this better
                    pass
    # Freeze per-character sets so identical coverage can be grouped into runs.
    text_index_to_data_index = [frozenset(data_indices) for data_indices in text_index_to_data_index]
    spans_to_tags = [(spans, self._get_tag_from_highlight_data([data_mapping[data_index] for data_index in data_indices])) for spans, data_indices in self._get_spans_to_data_indices(text_index_to_data_index)]
    return spans_to_tags
def export_pages(request):
    """Stream the stored export as CSV; log and return empty when the
    session holds no export parameters."""
    es_params = request.session.get('export_args')
    if es_params is not None:
        # '*' means export everything; otherwise only the requested page range.
        rows = get_all_rows(es_params, request) if es_params['num_examples'] == '*' else get_rows(es_params, request)
        response = StreamingHttpResponse(rows, content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename="%s"' % (es_params['filename'])
        return response
    logger = LogManager(__name__, 'SEARCH CORPUS')
    logger.set_context('user_name', request.user.username)
    logger.error('export pages failed, parameters empty')
    return HttpResponse()
def save(request):
    """Persist the current search (single-dataset version) under the
    POSTed description. Failures are logged and an empty response returned."""
    logger = LogManager(__name__, 'SAVE SEARCH')
    ds = Datasets().activate_dataset(request.session)
    es_m = ds.build_manager(ES_Manager)
    es_params = request.POST
    es_m.build(es_params)
    combined_query = es_m.get_combined_query()
    try:
        description = request.POST['search_description']
        # All free-text inputs the user typed into match boxes.
        match_texts = [request.POST[key] for key in request.POST.keys() if 'match_txt' in key]
        search = Search(author=request.user,
                        search_content=json.dumps(match_texts),
                        description=description,
                        dataset=Dataset.objects.get(pk=int(request.session['dataset'])),
                        query=json.dumps(combined_query))
        search.save()
        logger.set_context('user_name', request.user.username)
        logger.set_context('search_id', search.id)
        logger.info('search_saved')
    except Exception as e:
        print('-- Exception[{0}] {1}'.format(__name__, e))
        logger.set_context('es_params', es_params)
        logger.exception('search_saving_failed')
    return HttpResponse()
# NOTE(review): fragment of a training script; names such as box_model,
# trainer, args, tf, saver, step and model_id are defined elsewhere in the
# original file — confirm against the full script.
log.info('Building model')
m = box_model.get_model(model_opt, device=device)

log.info('Loading dataset')
dataset = trainer.get_dataset(args.dataset, data_opt)
if model_opt['fixed_order']:
    # Sort segmentation labels by segment size for deterministic ordering.
    dataset['train']['label_segmentation'] = trainer.sort_by_segm_size(
        dataset['train']['label_segmentation'])
    dataset['valid']['label_segmentation'] = trainer.sort_by_segm_size(
        dataset['valid']['label_segmentation'])

sess = tf.Session()

# Create time series loggers
if train_opt['logs']:
    log_manager = LogManager(logs_folder)
    loggers = get_ts_loggers(model_opt, restore_step=step)
    trainer.register_raw_logs(log_manager, log, model_opt, saver)
    samples = get_plot_loggers(model_opt, train_opt)
    log_url = 'http://{}/deep-dashboard?id={}'.format(
        train_opt['localhost'], model_id)
    log.info('Visualization can be viewed at: {}'.format(log_url))

# Restore/initialize weights
if args.restore:
    saver.restore(sess, ckpt_fname)
else:
    sess.run(tf.initialize_all_variables())

batch_size = args.batch_size
log.info('Batch size: {}'.format(batch_size))
name='Encoder hidden activation sparsity', buffer_size=1) hdec_sparsity_logger = TimeSeriesLogger( os.path.join(logs_folder, 'hdec_sparsity.csv'), 'hdec sparsity', name='Decoder hidden activation sparsity', buffer_size=1) step_time_logger = TimeSeriesLogger(os.path.join( logs_folder, 'step_time.csv'), 'step time (ms)', buffer_size=10) w1_image_fname = os.path.join(logs_folder, 'w1.png') decoder_image_fname = os.path.join(logs_folder, 'decoder.png') gen_image_fname = os.path.join(logs_folder, 'gen.png') registered_image = False log_manager = LogManager(logs_folder) log_manager.register(log.filename, 'plain', 'Raw logs') log.info('Curves can be viewed at: http://{}/visualizer?id={}'.format( args.localhost, model_id)) else: log = logger.get() log.log_args() # Set device if args.gpu >= 0: device = '/gpu:{}'.format(args.gpu) else: device = '/cpu:0'
def search(es_params, request):
    """Run a highlighted corpus search and shape the hits into a
    datatables-style payload (column_names, aaData, counters, lag).
    """
    logger = LogManager(__name__, 'SEARCH CORPUS')
    try:
        start_time = time.time()
        out = {'column_names': [], 'aaData': [], 'iTotalRecords': 0, 'iTotalDisplayRecords': 0, 'lag': 0}
        ds = Datasets().activate_dataset(request.session)
        es_m = ds.build_manager(ES_Manager)
        es_m.build(es_params)
        # DEFINING THE EXAMPLE SIZE
        es_m.set_query_parameter('from', es_params['examples_start'])
        es_m.set_query_parameter('size', es_params['num_examples'])
        # HIGHLIGHTING THE MATCHING FIELDS
        pre_tag = '<span class="[HL]" style="background-color:#FFD119">'
        post_tag = "</span>"
        highlight_config = {"fields": {}, "pre_tags": [pre_tag], "post_tags": [post_tag]}
        # Highlight every matched field except those under a must_not operator.
        for field in es_params:
            if 'match_field' in field and es_params['match_operator_'+field.split('_')[-1]] != 'must_not':
                f = es_params[field]
                highlight_config['fields'][f] = {"number_of_fragments": 0}
        es_m.set_query_parameter('highlight', highlight_config)
        response = es_m.search()
        out['iTotalRecords'] = response['hits']['total']
        out['iTotalDisplayRecords'] = response['hits']['total']  # number of docs
        if int(out['iTotalDisplayRecords']) > 10000:  # Allow less pages if over page limit
            out['iTotalDisplayRecords'] = '10000'
        # get columns names from ES mapping
        out['column_names'] = es_m.get_column_names()
        for hit in response['hits']['hits']:
            # NOTE(review): hit_id is computed but never used below.
            hit_id = str(hit['_id'])
            row = OrderedDict([(x, '') for x in out['column_names']])  # OrderedDict to remember column names with their content
            inner_hits = hit['inner_hits'] if 'inner_hits' in hit else {}
            # Group inner hits (facts) by the document path they point at.
            name_to_inner_hits = defaultdict(list)
            for inner_hit_name, inner_hit in inner_hits.items():
                hit_type, _, _ = inner_hit_name.rsplit('_', 2)
                for inner_hit_hit in inner_hit['hits']['hits']:
                    source = inner_hit_hit['_source']
                    source['hit_type'] = hit_type
                    name_to_inner_hits[source['doc_path']].append(source)
            # Fill the row content respecting the order of the columns
            cols_data = {}
            for col in out['column_names']:
                # If the content is nested, need to break the flat name in a path list
                filed_path = col.split('.')
                # Walk the field path inside the hit source; missing fields
                # become '' so dynamic mappings don't break column alignment.
                content = hit['_source']
                for p in filed_path:
                    if col == u'texta_facts' and p in content:
                        # Render facts as bolded name plus per-value counts.
                        new_content = []
                        facts = ['{ "'+x["fact"]+'": "'+x["str_val"]+'"}' for x in sorted(content[p], key=lambda k: k['fact'])]
                        fact_counts = Counter(facts)
                        facts = sorted(list(set(facts)))
                        facts_dict = [json.loads(x) for x in facts]
                        for i, d in enumerate(facts_dict):
                            for k in d:
                                # Make factnames bold for searcher
                                if '<b>'+k+'</b>' not in new_content:
                                    new_content.append('<b>'+k+'</b>')
                                new_content.append(' {}: {}'.format(d[k], fact_counts[facts[i]]))
                        content = '\n'.join(new_content)
                    else:
                        content = content[p] if p in content else ''
                # To strip fields with whitespace in front
                try:
                    old_content = content.strip()
                except:
                    old_content = content
                # Substitute feature value with value highlighted by Elasticsearch
                if col in highlight_config['fields'] and 'highlight' in hit:
                    content = hit['highlight'][col][0] if col in hit['highlight'] else ''
                # Prettify and standardize highlights
                highlight_data = []
                if name_to_inner_hits[col]:
                    color_map = ColorPicker.get_color_map(keys={hit['fact'] for hit in name_to_inner_hits[col]})
                    for inner_hit in name_to_inner_hits[col]:
                        datum = {
                            'spans': json.loads(inner_hit['spans']),
                            'name': inner_hit['fact'],
                            'category': '[{0}]'.format(inner_hit['hit_type']),
                            'color': color_map[inner_hit['fact']]
                        }
                        if inner_hit['hit_type'] == 'fact_val':
                            datum['value'] = inner_hit['str_val']
                        highlight_data.append(datum)
                    content = Highlighter(average_colors=True, derive_spans=True,
                                          additional_style_string='font-weight: bold;').highlight(
                        old_content,
                        highlight_data,
                        tagged_text=content)
                # NOTE(review): a commented-out legacy fallback existed here
                # for old-format documents, guarding the Highlighter call with
                # an isinstance(content, (str, bytes)) check.
                # Append the final content of this col to the row
                if(row[col] == ''):
                    row[col] = content
                cols_data[col] = {'highlight_data': highlight_data, 'content': content, 'old_content': old_content}
            # Transliterate the highlighting between different cols
            translit_search_cols = ['text', 'translit', 'lemmas']
            hl_cols = [x for x in cols_data if len(x.split('.')) > 1 and x.split('.')[-1] in translit_search_cols]  # To get value before '.' as well
            row = highlight_transliterately(cols_data, row, hl_cols=hl_cols)
            # Checks if user wants to see full text or short version
            for col in row:
                if 'show_short_version' in es_params.keys():
                    row[col] = additional_option_cut_text(row[col], es_params['short_version_n_char'])
            out['aaData'].append(row.values())
        out['lag'] = time.time()-start_time
        logger.set_context('query', es_m.get_combined_query())
        logger.set_context('user_name', request.user.username)
        logger.info('documents_queried')
        return out
    except Exception as e:
        logging.getLogger(ERROR_LOGGER).error(json.dumps({'process': 'SEARCH DOCUMENTS', 'event': 'documents_queried_failed'}), exc_info=True)
        print('-- Exception[{0}] {1}'.format(__name__, e))
        logger.set_context('user_name', request.user.username)
        logger.error('documents_queried_failed')
        out = {'column_names': [], 'aaData': [], 'iTotalRecords': 0, 'iTotalDisplayRecords': 0, 'lag': 0}
        return out
class TrainingExperimentBase(ExperimentBase):
    """Base class for training experiments: wires up logging, datasets and
    the train/validation loop. Subclasses override the get_runner_* hooks."""

    def init_logs(self):
        # Create time series loggers; when logging is disabled, loggers stays None.
        if self.opt['logs'] is not None:
            self.log_manager = LogManager(self.logs_folder)
            self.loggers = self.get_ts_loggers()
            self.register_raw_logs()
            self.log_url = 'http://{}/deep-dashboard?id={}'.format(
                self.opt['localhost'], self.model_id)
            self.log.info('Visualization can be viewed at: {}'.format(self.log_url))
        else:
            self.loggers = None

    def init_cmd_logger(self):
        # Command-line logger: file-backed when a logs folder is configured.
        if self.opt['logs'] is not None:
            self.logs_folder = self.opt['logs']
            self.logs_folder = os.path.join(self.logs_folder, self.model_id)
            self.log = logger.get(os.path.join(self.logs_folder, 'raw'))
        else:
            self.log = logger.get()

    def register_raw_logs(self):
        # Expose raw logs, the command line and model options on the dashboard.
        self.log_manager.register(self.log.filename, 'plain', 'Raw logs')
        cmd_fname = os.path.join(self.log_manager.folder, 'cmd.log')
        with open(cmd_fname, 'w') as f:
            f.write(' '.join(sys.argv))
        self.log_manager.register(cmd_fname, 'plain', 'Command-line arguments')
        model_opt_fname = os.path.join(self.log_manager.folder, 'model_opt.yaml')
        self.saver.save_opt(model_opt_fname, self.model_opt)
        self.log_manager.register(model_opt_fname, 'plain', 'Model hyperparameters')

    def get_dataset(self):
        # Load train and validation splits through the data provider.
        dataset = {}
        dataset['train'] = data_provider.get(self.dataset_name, self.data_opt, split='train', h5_fname=self.opt['h5_fname_train'])
        dataset['valid'] = data_provider.get(self.dataset_name, self.data_opt, split='valid', h5_fname=self.opt['h5_fname_valid'])
        return dataset

    def get_ts_loggers(self):
        # Override to supply {name: TimeSeriesLogger} mappings.
        return {}

    def get_runner_trainval(self):
        return EmptyTrainer()

    def get_runner_train(self):
        return EmptyTrainer()

    def get_runner_valid(self):
        return EmptyTrainer()

    def get_runner_plot_train(self):
        return EmptyTrainer()

    def get_runner_plot_valid(self):
        return EmptyTrainer()

    def run(self):
        """Main training loop: plot, train, validate and checkpoint on their
        configured step intervals, then finalize all runners."""
        runner_trainval = self.get_runner_trainval()
        runner_train = self.get_runner_train()
        runner_plot_train = self.get_runner_plot_train()
        has_valid = self.opt['has_valid']
        # Bug fix: runner_valid/runner_plot_valid were only bound when
        # has_valid was true, yet used unconditionally below (NameError).
        runner_valid = self.get_runner_valid() if has_valid else None
        runner_plot_valid = self.get_runner_plot_valid() if has_valid else None
        nstart = self.step.get()
        # Progress bar.
        it = tqdm(range(nstart, self.opt['num_steps']), desc=self.model_id)
        step_prev = self.step.get()
        while self.step.get() < self.opt['num_steps']:
            it.update(self.step.get() - step_prev)
            step_prev = self.step.get()

            # Plot samples
            if self.step.get() % self.opt['steps_per_plot'] == 0:
                self.log.info('Plot train samples')
                runner_plot_train.run_step()
                if runner_plot_valid is not None:
                    self.log.info('Plot valid samples')
                    runner_plot_valid.run_step()

            # Train step
            runner_train.run_step()

            # Run validation stats
            if has_valid:
                if self.step.get() % self.opt['steps_per_valid'] == 0:
                    self.log.info('Running validation')
                    runner_valid.run_step()

            # Train stats
            if self.step.get() % self.opt['steps_per_trainval'] == 0:
                self.log.info('Running train validation')
                runner_trainval.run_step()

            # Save model
            if self.step.get() % self.opt['steps_per_ckpt'] == 0:
                if self.opt['save_ckpt']:
                    self.log.info('Saving checkpoint')
                    self.saver.save(self.sess, global_step=self.step.get())
                else:
                    self.log.warning('Saving is turned off. Use -save_ckpt flag to save.')

        it.close()
        runner_train.finalize()
        if runner_valid is not None:
            runner_valid.finalize()
        runner_trainval.finalize()
        runner_plot_train.finalize()
        if runner_plot_valid is not None:
            runner_plot_valid.finalize()
        self.sess.close()
        # Bug fixes: guard against loggers being None (logs disabled),
        # use .values() instead of py2-only .itervalues(), and stop
        # clobbering self.logger with a loop variable.
        if self.loggers:
            for ts_logger in self.loggers.values():
                ts_logger.close()
def update(request):
    """Apply model and/or dataset selection changes posted by the settings
    form, then redirect to the landing page."""
    logger = LogManager(__name__, 'CHANGE_SETTINGS')
    parameters = request.POST
    if 'model_pk' in parameters:
        selected_model = {"pk": parameters["model_pk"],
                          "description": parameters["model_description"],
                          "unique_id": parameters["model_uuid"]}
        request.session['model'] = selected_model
        logger.clean_context()
        logger.set_context('user_name', request.user.username)
        logger.set_context('new_model', selected_model)
        logger.info('model_updated')
    if 'dataset[]' in parameters:
        # TODO: check if is a valid mapping_id before change session[dataset]
        requested = parameters.getlist('dataset[]')
        # Keep only datasets the user may access.
        allowed = [ds_id for ds_id in requested
                   if request.user.has_perm('permission_admin.can_access_dataset_' + str(ds_id))]
        request.session['dataset'] = allowed
        logger.clean_context()
        logger.set_context('user_name', request.user.username)
        logger.set_context('new_datasets', allowed)
        logger.info('datasets_updated')
        # Activation is done for its side effect on the session.
        ds = Datasets().activate_datasets(request.session)
    return HttpResponseRedirect(URL_PREFIX + '/')
def remove_facts_from_document(self, rm_facts_dict, bs=7500):
    '''remove a certain fact from all documents given a [str]key and [str]val

    Scrolls through matching documents in batches of `bs`, rebuilding each
    document's facts field without the facts named in rm_facts_dict, and
    bulk-posts the updates back to Elasticsearch.
    '''
    logger = LogManager(__name__, 'FACT MANAGER REMOVE FACTS')
    try:
        # Clears readonly block just in case the index has been set to read only
        self.es_m.clear_readonly_block()

        query = self._fact_deletion_query(rm_facts_dict)
        self.es_m.load_combined_query(query)
        response = self.es_m.scroll(size=bs, field_scroll=self.field)
        scroll_id = response['_scroll_id']
        total_docs = response['hits']['total']
        docs_left = total_docs  # DEBUG
        print('Starting.. Total docs - ', total_docs)  # DEBUG
        # NOTE(review): batch is never incremented, so the logged value is always 0.
        batch = 0
        while total_docs > 0:
            print('Docs left:', docs_left)  # DEBUG
            data = ''
            for document in response['hits']['hits']:
                new_field = []  # The new facts field
                for fact in document['_source'][self.field]:
                    # If the fact name is in rm_facts_dict keys
                    if fact["fact"] in rm_facts_dict:
                        # If the fact value is not in the delete key values
                        if fact['str_val'] not in rm_facts_dict.getlist(fact["fact"]):
                            new_field.append(fact)
                    else:
                        new_field.append(fact)
                # Update dataset
                data += json.dumps({"update": {"_id": document['_id'], "_type": document['_type'], "_index": document['_index']}}) + '\n'
                document = {'doc': {self.field: new_field}}
                data += json.dumps(document) + '\n'
            # Fetch the next scroll page before posting the current batch.
            response = self.es_m.scroll(scroll_id=scroll_id, size=bs, field_scroll=self.field)
            total_docs = len(response['hits']['hits'])
            docs_left -= bs  # DEBUG
            scroll_id = response['_scroll_id']
            self.es_m.plain_post_bulk(self.es_m.es_url, data)
        print('DONE')  # DEBUG
        logger.set_context('docs_left', total_docs)
        logger.set_context('batch', batch)
        logger.info('remove_facts_from_document')
    except:
        # NOTE(review): bare except — intentionally best-effort, but it also
        # swallows KeyboardInterrupt/SystemExit; consider `except Exception`.
        print(traceback.format_exc())
        logger.set_context('es_params', self.es_params)
        logger.exception('remove_facts_from_document_failed')
def __init__(self):
    """Initialize the entity and attach a logger named after the concrete class."""
    super(BaseEntity, self).__init__()
    class_name = self.__class__.__name__
    self.logger = LogManager.get_logger(class_name)
def save(request):
    """Persist the current multi-dataset search under the POSTed description,
    linking it to every dataset active in the session."""
    logger = LogManager(__name__, 'SAVE SEARCH')
    ds = Datasets().activate_datasets(request.session)
    es_m = ds.build_manager(ES_Manager)
    es_params = request.POST
    es_m.build(es_params)
    combined_query = es_m.get_combined_query()
    try:
        description = request.POST['search_description']
        # Group the typed match texts by the field they were entered for.
        grouped_content = {}
        for key in request.POST.keys():
            if 'match_txt' not in key:
                continue
            # The trailing number ties match_txt_N to its match_field_N input.
            field_id = key.rsplit("_", 1)[-1]
            match_field = request.POST['match_field_' + field_id]
            grouped_content.setdefault(match_field, []).append(request.POST[key])
        search = Search(author=request.user, search_content=json.dumps(grouped_content), description=description, query=json.dumps(combined_query))
        logger.info('Saving search for datasets: {}'.format(request.session['dataset']))
        search.save()
        for dataset_id in request.session['dataset']:
            dataset = Dataset.objects.get(pk=int(dataset_id))
            search.datasets.add(dataset)
        search.save()
        logger.set_context('user_name', request.user.username)
        logger.set_context('search_id', search.id)
        logger.info('search_saved')
    except Exception as e:
        print('-- Exception[{0}] {1}'.format(__name__, e))
        logger.set_context('es_params', es_params)
        logger.exception('search_saving_failed')
    return HttpResponse()
def save(request):
    """Persist the current multi-dataset search under the POSTed description,
    linking it to every dataset active in the session."""
    logger = LogManager(__name__, 'SAVE SEARCH')
    ds = Datasets().activate_datasets(request.session)
    es_m = ds.build_manager(ES_Manager)
    es_params = request.POST
    es_m.build(es_params)
    combined_query = es_m.get_combined_query()
    try:
        q = combined_query
        desc = request.POST['search_description']
        s_content = {}
        # Group typed match texts by their target field (serialized as JSON).
        for x in request.POST.keys():
            if 'match_txt' in x:
                # get the ID of the field, eg match_txt_1 returns 1 match_txt_1533 returns 1533
                field_id = x.rsplit("_", 1)[-1]
                match_field = request.POST['match_field_' + field_id]
                if match_field in s_content.keys():
                    s_content[match_field].append(request.POST[x])
                else:
                    s_content[match_field] = [request.POST[x]]
        search = Search(author=request.user, search_content=json.dumps(s_content), description=desc, query=json.dumps(q))
        logger.info('Saving search for datasets: {}'.format(request.session['dataset']))
        search.save()
        # Link the saved search to every active dataset (M2M), then re-save.
        for dataset_id in request.session['dataset']:
            dataset = Dataset.objects.get(pk=int(dataset_id))
            search.datasets.add(dataset)
        search.save()
        logger.set_context('user_name', request.user.username)
        logger.set_context('search_id', search.id)
        logger.info('search_saved')
    except Exception as e:
        print('-- Exception[{0}] {1}'.format(__name__, e))
        logger.set_context('es_params', es_params)
        logger.exception('search_saving_failed')
    return HttpResponse()
def update(request):
    """Update the session's active model and/or dataset (single-dataset
    variant) from POST parameters, then redirect home."""
    logger = LogManager(__name__, 'CHANGE_SETTINGS')
    parameters = request.POST
    if 'model' in parameters:
        chosen_model = str(parameters['model'])
        request.session['model'] = chosen_model
        logger.clean_context()
        logger.set_context('user_name', request.user.username)
        logger.set_context('new_model', chosen_model)
        logger.info('dataset_updated')
    if 'dataset' in parameters:
        # TODO: check if is a valid mapping_id before change session[dataset]
        chosen_dataset = parameters['dataset']
        permission = 'permission_admin.can_access_dataset_' + str(chosen_dataset)
        if request.user.has_perm(permission):
            request.session['dataset'] = chosen_dataset
            logger.clean_context()
            logger.set_context('user_name', request.user.username)
            logger.set_context('new_dataset', chosen_dataset)
            logger.info('dataset_updated')
            # Activate the newly chosen dataset for this session.
            ds = Datasets().activate_dataset(request.session)
            es_m = ds.build_manager(ES_Manager)
    return HttpResponseRedirect(URL_PREFIX + '/')
def facts_agg(es_params, request):
    """Aggregate fact counts over saved searches and/or the current query,
    returning a bar-chart table payload (data rows, height, distinct counts).
    """
    logger = LogManager(__name__, 'FACTS AGGREGATION')
    distinct_values = []
    query_results = []
    lexicon = []
    aggregation_data = es_params['aggregate_over']
    aggregation_data = json.loads(aggregation_data)
    original_aggregation_field = aggregation_data['path']
    # Facts are always aggregated over the texta_link.facts field.
    aggregation_field = 'texta_link.facts'
    try:
        aggregation_size = 50
        aggregations = {"strings": {es_params['sort_by']: {"field": aggregation_field, 'size': 0}},
                        "distinct_values": {"cardinality": {"field": aggregation_field}}}
        # Define selected mapping
        ds = Datasets().activate_dataset(request.session)
        dataset = ds.get_index()
        mapping = ds.get_mapping()
        date_range = ds.get_date_range()
        es_m = ES_Manager(dataset, mapping, date_range)
        # One aggregation pass per selected saved search.
        for item in es_params:
            if 'saved_search' in item:
                s = Search.objects.get(pk=es_params[item])
                name = s.description
                saved_query = json.loads(s.query)
                es_m.load_combined_query(saved_query)
                es_m.set_query_parameter('aggs', aggregations)
                response = es_m.search()
                # Keep only buckets under the requested field prefix, with the
                # bare fact name as the key.
                bucket_filter = '{0}.'.format(original_aggregation_field.lower())
                final_bucket = []
                for b in response['aggregations']['strings']['buckets']:
                    if bucket_filter in b['key']:
                        fact_name = b['key'].split('.')[-1]
                        b['key'] = fact_name
                        final_bucket.append(b)
                final_bucket = final_bucket[:aggregation_size]
                response['aggregations']['distinct_values']['value'] = len(final_bucket)
                response['aggregations']['strings']['buckets'] = final_bucket
                normalised_counts,labels = normalise_agg(response, es_m, es_params, 'strings')
                lexicon = list(set(lexicon+labels))
                query_results.append({'name':name,'data':normalised_counts,'labels':labels})
                distinct_values.append({'name':name,'data':response['aggregations']['distinct_values']['value']})
        es_m.build(es_params)
        # FIXME
        # this is confusing for the user
        if not es_m.is_combined_query_empty():
            es_m.set_query_parameter('aggs', aggregations)
            response = es_m.search()
            # Same bucket filtering as above, for the live query.
            bucket_filter = '{0}.'.format(original_aggregation_field.lower())
            final_bucket = []
            for b in response['aggregations']['strings']['buckets']:
                if bucket_filter in b['key']:
                    fact_name = b['key'].split('.')[-1]
                    b['key'] = fact_name
                    final_bucket.append(b)
            final_bucket = final_bucket[:aggregation_size]
            response['aggregations']['distinct_values']['value'] = len(final_bucket)
            response['aggregations']['strings']['buckets'] = final_bucket
            normalised_counts,labels = normalise_agg(response, es_m, es_params, 'strings')
            lexicon = list(set(lexicon+labels))
            query_results.append({'name':'Query','data':normalised_counts,'labels':labels})
            distinct_values.append({'name':'Query','data':response['aggregations']['distinct_values']['value']})
        # Build the table: one row per lexicon word, one column per query.
        data = [a+zero_list(len(query_results)) for a in map(list, zip(*[lexicon]))]
        data = [['Word']+[query_result['name'] for query_result in query_results]]+data
        for i,word in enumerate(lexicon):
            for j,query_result in enumerate(query_results):
                for k,label in enumerate(query_result['labels']):
                    if word == label:
                        data[i+1][j+1] = query_result['data'][k]
        logger.set_context('user_name', request.user.username)
        logger.info('facts_aggregation_queried')
    except Exception as e:
        print('-- Exception[{0}] {1}'.format(__name__, e))
        logger.set_context('user_name', request.user.username)
        logger.exception('facts_aggregation_query_failed')
    # NOTE(review): if an exception fires before `data` is assigned, the
    # len(data) below raises NameError — consider initializing data = [] up top.
    table_height = len(data)*15
    table_height = table_height if table_height > 500 else 500
    return {'data':[data[0]]+sorted(data[1:], key=lambda x: sum(x[1:]), reverse=True),'height':table_height,'type':'bar','distinct_values':json.dumps(distinct_values)}