Example #1
    def _get_color(self, color_code_list):
        if not color_code_list:
            return 'none'

        if self._average_colors:
            red, green, blue = 0, 0, 0

            for color_code in color_code_list:
                # 'none' can appear in color_code_list when _derive_highlight_data
                # produces an off-by-one span (>= instead of >); there is no hex
                # code to parse in that case, so return 'none' outright
                if color_code.startswith('none'):
                    logger = LogManager(__name__, '_GET_COLOR')
                    logger.set_context('color_code_list', color_code_list)
                    logger.info('Highlighter color_code_list contained "none", returning "none"')
                    return 'none'

                r, g, b = int(color_code[1:3], 16), int(color_code[3:5], 16), int(color_code[5:7], 16)

                red += r
                green += g
                blue += b

            red = int(red / len(color_code_list))
            green = int(green / len(color_code_list))
            blue = int(blue / len(color_code_list))

            return '#%02x%02x%02x' % (red, green, blue)
        else:
            return Counter(color_code_list).most_common(1)[0][0]
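The averaging branch above works channel by channel on '#rrggbb' strings. A standalone sketch of the same idea, under a hypothetical helper name that is not part of the project:

def average_hex_colors(color_codes):
    """Average a list of '#rrggbb' color codes channel by channel."""
    n = len(color_codes)
    red = sum(int(code[1:3], 16) for code in color_codes) // n
    green = sum(int(code[3:5], 16) for code in color_codes) // n
    blue = sum(int(code[5:7], 16) for code in color_codes) // n
    return '#%02x%02x%02x' % (red, green, blue)

print(average_hex_colors(['#ff0000', '#0000ff']))  # '#7f007f'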
Example #2
def search(es_params, request):
    logger = LogManager(__name__, 'SEARCH CORPUS')

    ds = Datasets().activate_datasets(request.session)
    es_m = ds.build_manager(ES_Manager)
    es_m.build(es_params)
    try:
        out = execute_search(es_m, es_params)
        logger.set_context('query', es_m.get_combined_query())
        logger.set_context('user_name', request.user.username)
        logger.info('documents_queried')
    except Exception as e:
        log_dict = {
            'task': 'SEARCH DOCUMENTS',
            'event': 'documents_queried_failed'
        }
        logging.getLogger(ERROR_LOGGER).error('Document query failed',
                                              extra=log_dict,
                                              exc_info=True)
        print('-- Exception[{0}] {1}'.format(__name__, e))
        logger.set_context('user_name', request.user.username)
        logger.error('documents_queried_failed')

        # Fall back to an empty result set so the caller still gets a response
        out = {
            'column_names': [],
            'aaData': [],
            'iTotalRecords': 0,
            'iTotalDisplayRecords': 0,
            'lag': 0
        }

    return out
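LogManager itself is the project's own logging utility and its implementation is not shown in this listing. A minimal stand-in consistent with the call sites in these examples (constructor, set_context, clean_context, info, error, exception), assuming it merely attaches key/value context to standard logging calls; the names and formatting here are assumptions:

import json
import logging

class LogManager(object):
    """Minimal sketch: collects key/value context and emits it with each message."""

    def __init__(self, module_name, context_name):
        self._logger = logging.getLogger(module_name)
        self._context_name = context_name
        self._context = {}

    def set_context(self, key, value):
        self._context[key] = value

    def clean_context(self):
        self._context = {}

    def _format(self, message):
        return '{0} [{1}] {2}'.format(
            self._context_name, json.dumps(self._context, default=str), message)

    def info(self, message):
        self._logger.info(self._format(message))

    def error(self, message):
        self._logger.error(self._format(message))

    def exception(self, message):
        self._logger.exception(self._format(message))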
Example #3
  def init_logs(self):
    # Create time series loggers
    if self.opt['logs'] is not None:
      self.log_manager = LogManager(self.logs_folder)
      self.loggers = self.get_ts_loggers()
      self.register_raw_logs()
      self.log_url = 'http://{}/deep-dashboard?id={}'.format(
          self.opt['localhost'], self.model_id)
      self.log.info('Visualization can be viewed at: {}'.format(self.log_url))
    else:
      self.loggers = None
Example #4
def delete(request):

    logger = LogManager(__name__, 'DELETE SEARCH')
    search_id = request.GET['pk']
    logger.set_context('user_name', request.user.username)
    logger.set_context('search_id', search_id)
    try:
        Search.objects.get(pk=search_id).delete()
        logger.info('search_deleted')

    except Exception as e:
        print('-- Exception[{0}] {1}'.format(__name__, e))
        logger.exception('search_deletion_failed')

    return HttpResponse(search_id)
Example #5
def mlt_query(request):
    logger = LogManager(__name__, 'SEARCH MLT')
    es_params = request.POST

    mlt_fields = [json.loads(field)['path'] for field in es_params.getlist('mlt_fields')]

    handle_negatives = request.POST['handle_negatives']
    docs_accepted = [a.strip() for a in request.POST['docs'].split('\n') if a]
    docs_rejected = [a.strip() for a in request.POST['docs_rejected'].split('\n') if a]

    # stopwords
    stopword_lexicon_ids = request.POST.getlist('mlt_stopword_lexicons')
    stopwords = []

    for lexicon_id in stopword_lexicon_ids:
        lexicon = Lexicon.objects.get(id=int(lexicon_id))
        words = Word.objects.filter(lexicon=lexicon)
        stopwords += [word.wrd for word in words]

    ds = Datasets().activate_dataset(request.session)
    es_m = ds.build_manager(ES_Manager)
    es_m.build(es_params)

    response = es_m.more_like_this_search(mlt_fields,
                                          docs_accepted=docs_accepted,
                                          docs_rejected=docs_rejected,
                                          handle_negatives=handle_negatives,
                                          stopwords=stopwords)

    documents = []
    for hit in response['hits']['hits']:
        fields_content = get_fields_content(hit, mlt_fields)
        documents.append({'id': hit['_id'], 'content': fields_content})

    template_params = {'STATIC_URL': STATIC_URL,
                       'URL_PREFIX': URL_PREFIX,
                       'documents': documents}
    template = loader.get_template('mlt_results.html')
    return HttpResponse(template.render(template_params, request))
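get_fields_content is not defined in this snippet; judging by its use, it extracts each dotted mlt_fields path from the hit's _source. A plausible sketch (an assumption, not the project's actual helper):

def get_fields_content(hit, field_paths):
    """Collect the values behind each dotted field path in a hit's _source."""
    contents = []
    for path in field_paths:
        value = hit['_source']
        for key in path.split('.'):
            # Descend one level per path component; missing keys yield ''
            value = value.get(key, '') if isinstance(value, dict) else ''
        contents.append(value)
    return contents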
Example #6
    def highlight(self, original_text, highlight_data, tagged_text=None):
        """highlight_data = [{'spans': [[1,7],[25,36]], 'name': 'LOC', 'value': '5', 'category': '[fact]', 'color': '#ababab'}]
        """
        logger = LogManager(__name__, 'HIGHLIGHT')

        if tagged_text:
            if self._derive_spans:
                alignment = list(range(len(original_text)))
                highlight_data.extend(self._derive_highlight_data(tagged_text))
                tagged_text = original_text
            else:
                alignment = self._align_texts(original_text, tagged_text)
        else:
            alignment = list(range(len(original_text)))
            tagged_text = original_text

        spans_to_tags = self._get_tags_for_text_index(tagged_text, alignment, highlight_data)
        split_text = self._split_text_at_indices(tagged_text, [index for span, tag in spans_to_tags for index in span])

        return self._merge_text_and_tags(split_text, [tag for span, tag in spans_to_tags])
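_split_text_at_indices is not shown here. One plausible implementation, assuming it cuts the text at each boundary index so that tags can be merged between the resulting pieces (an assumption, not the project's actual helper):

def split_text_at_indices(text, indices):
    """Split text into pieces at the given character indices."""
    pieces = []
    previous = 0
    for index in sorted(indices):
        pieces.append(text[previous:index])
        previous = index
    pieces.append(text[previous:])
    return pieces

split_text_at_indices('hello world', [5, 6])  # ['hello', ' ', 'world']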
Example #7
def delete(request):
    post_data = json.loads(request.POST['data'])
    logger = LogManager(__name__, 'DELETE SEARCH')
    search_ids = post_data['pks']
    logger.set_context('user_name', request.user.username)
    logger.set_context('search_ids', search_ids)
    try:
        for search_id in search_ids:
            Search.objects.get(pk=search_id).delete()
            logger.info('search_deleted:{0}'.format(search_id))

    except Exception as e:
        print('-- Exception[{0}] {1}'.format(__name__, e))
        logger.exception('search_deletion_failed')

    return HttpResponse()
Example #8
def update_model(request):
    logger = LogManager(__name__, 'CHANGE_SETTINGS')
    parameters = request.POST
    try:
        model = {
            "pk": parameters["model_pk"],
            "description": parameters["model_description"],
            "unique_id": parameters["model_uuid"]
        }
        request.session['model'] = model
        logger.clean_context()
        logger.set_context('user_name', request.user.username)
        logger.set_context('new_model', model)
        logger.info('model_updated')
        return HttpResponse(json.dumps({'status': 'success'}))
    except Exception:
        return HttpResponse(json.dumps({'status': 'error'}))
Example #9
def export_pages(request):

    es_params = request.session.get('export_args')
    if es_params is not None:
        if es_params['num_examples'] == '*':
            response = StreamingHttpResponse(get_all_rows(es_params, request), content_type='text/csv')
        else:
            response = StreamingHttpResponse(get_rows(es_params, request), content_type='text/csv')

        response['Content-Disposition'] = 'attachment; filename="%s"' % (es_params['filename'])

        return response

    logger = LogManager(__name__, 'SEARCH CORPUS')
    logger.set_context('user_name', request.user.username)
    logger.error('export pages failed, parameters empty')
    return HttpResponse()
Example #10
    def _get_tags_for_text_index(self, text, alignment, highlight_data):
        logger = LogManager(__name__, '_GET_TAGS_FOR_TEXT_INDEX')
        data_mapping = {index: datum for index, datum in enumerate(highlight_data)}
        data_index_to_spans = [datum['spans'] for datum in highlight_data]

        text_index_to_data_index = [[] for i in range(len(text))]
        for data_index, spans in enumerate(data_index_to_spans):
            for span in spans:
                for text_index in range(*span):
                    try:
                        text_index_to_data_index[alignment[text_index]].append(data_index)
                    except Exception as e:
                        # IndexError: alignment can be shorter than the span,
                        # e.g. when _derive_highlight_data uses >= instead of >
                        print('-- Exception[{0}] {1}'.format(__name__, e))
                        logger.set_context('text', text)
                        logger.exception('_get_tags_for_text_index exception')

        text_index_to_data_index = [frozenset(data_indices) for data_indices in text_index_to_data_index]

        spans_to_tags = [(spans, self._get_tag_from_highlight_data([data_mapping[data_index] for data_index in data_indices]))
                         for spans, data_indices in self._get_spans_to_data_indices(text_index_to_data_index)]

        return spans_to_tags
Example #11
def update_dataset(request):
    logger = LogManager(__name__, 'CHANGE_SETTINGS')
    parameters = request.POST

    try:
        # TODO: check that it is a valid mapping_id before changing session['dataset']
        new_datasets = parameters.getlist('dataset[]')
        new_datasets = [
            new_dataset for new_dataset in new_datasets
            if request.user.has_perm('permission_admin.can_access_dataset_' +
                                     str(new_dataset))
        ]
        request.session['dataset'] = new_datasets

        logger.clean_context()
        logger.set_context('user_name', request.user.username)
        logger.set_context('new_datasets', new_datasets)
        logger.info('datasets_updated')

        ds = Datasets().activate_datasets(request.session)
        return HttpResponse(json.dumps({'status': 'success'}))
    except Exception:
        return HttpResponse(json.dumps({'status': 'error'}))
Example #12
File: views.py Project: ekt68/texta
def search(es_params, request):
    logger = LogManager(__name__, 'SEARCH CORPUS')

    ds = Datasets().activate_datasets(request.session)
    es_m = ds.build_manager(ES_Manager)
    es_m.build(es_params)
    try:
        out = execute_search(es_m, es_params)
        logger.set_context('query', es_m.get_combined_query())
        logger.set_context('user_name', request.user.username)
        logger.info('documents_queried')
    except Exception as e:
        logging.getLogger(ERROR_LOGGER).error(
            json.dumps({'process': 'SEARCH DOCUMENTS', 'event': 'documents_queried_failed'}), exc_info=True)
        print('-- Exception[{0}] {1}'.format(__name__, e))
        logger.set_context('user_name', request.user.username)
        logger.error('documents_queried_failed')

        # Fall back to an empty result set so the caller still gets a response
        out = {'column_names': [], 'aaData': [], 'iTotalRecords': 0, 'iTotalDisplayRecords': 0, 'lag': 0}
    return out
Example #13
    def _get_tags_for_text_index(self, text, alignment, highlight_data):
        logger = LogManager(__name__, '_GET_TAGS_FOR_TEXT_INDEX')
        data_mapping = {index: datum for index, datum in enumerate(highlight_data)}
        data_index_to_spans = [datum['spans'] for datum in highlight_data]

        text_index_to_data_index = [[] for i in range(len(text))]
        for data_index, spans in enumerate(data_index_to_spans):
            for span in spans:
                for text_index in range(*span):
                    try:
                        text_index_to_data_index[alignment[text_index]].append(data_index)
                    except Exception:
                        # IndexError when alignment is shorter than the span.
                        # Possibly started happening when _derive_highlight_data
                        # started using >= instead of >; also possibly caused by
                        # double quotes in the text. As a result the highlights
                        # may be slightly misaligned, but won't break the search.
                        # TODO: handle this more gracefully
                        pass

        text_index_to_data_index = [frozenset(data_indices) for data_indices in text_index_to_data_index]
        spans_to_tags = [(spans, self._get_tag_from_highlight_data([data_mapping[data_index] for data_index in data_indices]))
                         for spans, data_indices in self._get_spans_to_data_indices(text_index_to_data_index)]

        return spans_to_tags
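_get_spans_to_data_indices is also not shown. Based on how its result is consumed above, it plausibly collapses runs of identical data-index sets into ([start, end], data_indices) pairs and skips unhighlighted stretches; this sketch is an assumption, not the project's code:

def get_spans_to_data_indices(text_index_to_data_index):
    """Collapse runs of identical data-index sets into ([start, end], indices) pairs."""
    pairs = []
    run_start = 0
    n = len(text_index_to_data_index)
    for i in range(1, n + 1):
        # Close the current run at the end of the list or when the set changes
        if i == n or text_index_to_data_index[i] != text_index_to_data_index[run_start]:
            if text_index_to_data_index[run_start]:  # skip unhighlighted runs
                pairs.append(([run_start, i], text_index_to_data_index[run_start]))
            run_start = i
    return pairs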
Example #14
def export_pages(request):

    es_params = request.session.get('export_args')
    if es_params is not None:
        if es_params['num_examples'] == '*':
            response = StreamingHttpResponse(get_all_rows(es_params, request),
                                             content_type='text/csv')
        else:
            response = StreamingHttpResponse(get_rows(es_params, request),
                                             content_type='text/csv')

        response['Content-Disposition'] = 'attachment; filename="%s"' % (
            es_params['filename'])

        return response

    logger = LogManager(__name__, 'SEARCH CORPUS')
    logger.set_context('user_name', request.user.username)
    logger.error('export pages failed, parameters empty')
    return HttpResponse()
Example #15
def save(request):
    logger = LogManager(__name__, 'SAVE SEARCH')

    ds = Datasets().activate_dataset(request.session)
    es_m = ds.build_manager(ES_Manager)

    es_params = request.POST
    es_m.build(es_params)
    combined_query = es_m.get_combined_query()

    try:
        q = combined_query
        desc = request.POST['search_description']
        s_content = json.dumps([request.POST[x] for x in request.POST.keys() if 'match_txt' in x])
        search = Search(author=request.user,
                        search_content=s_content,
                        description=desc,
                        dataset=Dataset.objects.get(pk=int(request.session['dataset'])),
                        query=json.dumps(q))
        search.save()
        logger.set_context('user_name', request.user.username)
        logger.set_context('search_id', search.id)
        logger.info('search_saved')

    except Exception as e:
        print('-- Exception[{0}] {1}'.format(__name__, e))
        logger.set_context('es_params', es_params)
        logger.exception('search_saving_failed')

    return HttpResponse()
Example #16
    log.info('Building model')
    m = box_model.get_model(model_opt, device=device)

    log.info('Loading dataset')
    dataset = trainer.get_dataset(args.dataset, data_opt)
    if model_opt['fixed_order']:
        dataset['train']['label_segmentation'] = trainer.sort_by_segm_size(
            dataset['train']['label_segmentation'])
        dataset['valid']['label_segmentation'] = trainer.sort_by_segm_size(
            dataset['valid']['label_segmentation'])

    sess = tf.Session()

    # Create time series loggers
    if train_opt['logs']:
        log_manager = LogManager(logs_folder)
        loggers = get_ts_loggers(model_opt, restore_step=step)
        trainer.register_raw_logs(log_manager, log, model_opt, saver)
        samples = get_plot_loggers(model_opt, train_opt)
        log_url = 'http://{}/deep-dashboard?id={}'.format(
            train_opt['localhost'], model_id)
        log.info('Visualization can be viewed at: {}'.format(log_url))

    # Restore/initialize weights
    if args.restore:
        saver.restore(sess, ckpt_fname)
    else:
        sess.run(tf.initialize_all_variables())

    batch_size = args.batch_size
    log.info('Batch size: {}'.format(batch_size))
Example #17
            name='Encoder hidden activation sparsity',
            buffer_size=1)
        hdec_sparsity_logger = TimeSeriesLogger(
            os.path.join(logs_folder, 'hdec_sparsity.csv'),
            'hdec sparsity',
            name='Decoder hidden activation sparsity',
            buffer_size=1)
        step_time_logger = TimeSeriesLogger(
            os.path.join(logs_folder, 'step_time.csv'),
            'step time (ms)',
            buffer_size=10)
        w1_image_fname = os.path.join(logs_folder, 'w1.png')
        decoder_image_fname = os.path.join(logs_folder, 'decoder.png')
        gen_image_fname = os.path.join(logs_folder, 'gen.png')
        registered_image = False
        log_manager = LogManager(logs_folder)
        log_manager.register(log.filename, 'plain', 'Raw logs')

        log.info('Curves can be viewed at: http://{}/visualizer?id={}'.format(
            args.localhost, model_id))
    else:
        log = logger.get()

    log.log_args()

    # Set device
    if args.gpu >= 0:
        device = '/gpu:{}'.format(args.gpu)
    else:
        device = '/cpu:0'
Example #18
def search(es_params, request):
    logger = LogManager(__name__, 'SEARCH CORPUS')

    try:

        start_time = time.time()
        out = {'column_names': [],
               'aaData': [],
               'iTotalRecords': 0,
               'iTotalDisplayRecords': 0,
               'lag': 0}

        ds = Datasets().activate_dataset(request.session)
        es_m = ds.build_manager(ES_Manager)
        es_m.build(es_params)

        # DEFINING THE EXAMPLE SIZE
        es_m.set_query_parameter('from', es_params['examples_start'])
        es_m.set_query_parameter('size', es_params['num_examples'])

        # HIGHLIGHTING THE MATCHING FIELDS
        pre_tag = '<span class="[HL]" style="background-color:#FFD119">'
        post_tag = "</span>"
        highlight_config = {"fields": {}, "pre_tags": [pre_tag], "post_tags": [post_tag]}
        for field in es_params:
            if 'match_field' in field and es_params['match_operator_'+field.split('_')[-1]] != 'must_not':
                f = es_params[field]
                highlight_config['fields'][f] = {"number_of_fragments": 0}
        es_m.set_query_parameter('highlight', highlight_config)
        response = es_m.search()

        out['iTotalRecords'] = response['hits']['total']
        out['iTotalDisplayRecords'] = response['hits']['total'] # number of docs

        if int(out['iTotalDisplayRecords']) > 10000: # Allow fewer pages if over the page limit
            out['iTotalDisplayRecords'] = '10000'

        # get columns names from ES mapping
        out['column_names'] = es_m.get_column_names()

        for hit in response['hits']['hits']:
            hit_id = str(hit['_id'])
            row = OrderedDict([(x, '') for x in out['column_names']]) # OrderedDict to remember column names with their content

            inner_hits = hit['inner_hits'] if 'inner_hits' in hit else {}
            name_to_inner_hits = defaultdict(list)
            for inner_hit_name, inner_hit in inner_hits.items():
                hit_type, _, _ = inner_hit_name.rsplit('_', 2)
                for inner_hit_hit in inner_hit['hits']['hits']:
                    source = inner_hit_hit['_source']
                    source['hit_type'] = hit_type
                    name_to_inner_hits[source['doc_path']].append(source)

            # Fill the row content respecting the order of the columns
            cols_data = {}
            for col in out['column_names']:

                # If the content is nested, break the flat column name into a path list
                field_path = col.split('.')

                # Get content for this field path:
                #   - Starts with the hit structure
                #   - For every field in field_path, retrieve the specific content
                #   - Repeat this until arrives at the last field
                #   - If the field in the field_path is not in this hit structure,
                #     make content empty (to allow dynamic mapping without breaking alignment)
                content = hit['_source']
                for p in field_path:
                    if col == u'texta_facts' and p in content:
                        new_content = []
                        facts = ['{ "'+x["fact"]+'": "'+x["str_val"]+'"}' for x in sorted(content[p], key=lambda k: k['fact'])]
                        fact_counts = Counter(facts)

                        facts = sorted(list(set(facts)))
                        facts_dict = [json.loads(x) for x in facts]
                        for i, d in enumerate(facts_dict):
                            for k in d:
                                # Make fact names bold for the searcher view
                                if '<b>'+k+'</b>' not in new_content:
                                    new_content.append('<b>'+k+'</b>')
                                new_content.append('    {}: {}'.format(d[k], fact_counts[facts[i]]))
                        content = '\n'.join(new_content)
                    else:
                        content = content[p] if p in content else ''


                # Strip surrounding whitespace from string fields; non-string
                # content (e.g. lists) is kept as-is
                try:
                    old_content = content.strip()
                except AttributeError:
                    old_content = content

                # Substitute feature value with value highlighted by Elasticsearch
                if col in highlight_config['fields'] and 'highlight' in hit:
                    content = hit['highlight'][col][0] if col in hit['highlight'] else ''
                # Prettify and standardize highlights
                highlight_data = []
                if name_to_inner_hits[col]:
                    color_map = ColorPicker.get_color_map(keys={inner_hit['fact'] for inner_hit in name_to_inner_hits[col]})
                    for inner_hit in name_to_inner_hits[col]:
                        datum = {
                            'spans': json.loads(inner_hit['spans']),
                            'name': inner_hit['fact'],
                            'category': '[{0}]'.format(inner_hit['hit_type']),
                            'color': color_map[inner_hit['fact']]
                        }

                        if inner_hit['hit_type'] == 'fact_val':
                            datum['value'] = inner_hit['str_val']
                        highlight_data.append(datum)

                    content = Highlighter(average_colors=True, derive_spans=True,
                                              additional_style_string='font-weight: bold;').highlight(
                                                  old_content,
                                                  highlight_data,
                                                  tagged_text=content)
                # else:
                #     # WHEN USING OLD FORMAT DOCUMENTS, SOMETIMES BREAKS AT HIGHLIGHTER, CHECK IF ITS STRING INSTEAD OF FOR EXAMPLE LIST
                #     highlight_data = []
                #     if (isinstance(content, str)) or (isinstance(content, bytes)):
                #         content = Highlighter(average_colors=True, derive_spans=True,
                #                                 additional_style_string='font-weight: bold;').highlight(
                #                                     old_content,
                #                                     highlight_data,
                #                                     tagged_text=content)

                # Append the final content of this col to the row
                if row[col] == '':
                    row[col] = content

                cols_data[col] = {'highlight_data': highlight_data, 'content': content, 'old_content': old_content}

            # Transliterate the highlighting between different cols
            translit_search_cols = ['text', 'translit', 'lemmas']
            hl_cols = [x for x in cols_data if len(x.split('.')) > 1 and x.split('.')[-1] in translit_search_cols] # To get value before '.' as well
            row = highlight_transliterately(cols_data, row, hl_cols=hl_cols)

            # Checks if user wants to see full text or short version
            for col in row:
                if 'show_short_version' in es_params.keys():
                    row[col] = additional_option_cut_text(row[col], es_params['short_version_n_char'])

            out['aaData'].append(row.values())

        # Log once for the whole query, outside the per-hit loop
        out['lag'] = time.time() - start_time
        logger.set_context('query', es_m.get_combined_query())
        logger.set_context('user_name', request.user.username)
        logger.info('documents_queried')

        return out

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).error(json.dumps({'process': 'SEARCH DOCUMENTS', 'event': 'documents_queried_failed'}), exc_info=True)

        print('-- Exception[{0}] {1}'.format(__name__, e))
        logger.set_context('user_name', request.user.username)
        logger.error('documents_queried_failed')

        out = {'column_names': [], 'aaData': [], 'iTotalRecords': 0, 'iTotalDisplayRecords': 0, 'lag': 0}
        return out
Example #19
class TrainingExperimentBase(ExperimentBase):

  def init_logs(self):
    # Create time series loggers
    if self.opt['logs'] is not None:
      self.log_manager = LogManager(self.logs_folder)
      self.loggers = self.get_ts_loggers()
      self.register_raw_logs()
      self.log_url = 'http://{}/deep-dashboard?id={}'.format(
          self.opt['localhost'], self.model_id)
      self.log.info('Visualization can be viewed at: {}'.format(self.log_url))
    else:
      self.loggers = None

  def init_cmd_logger(self):
    # Logger
    if self.opt['logs'] is not None:
      self.logs_folder = self.opt['logs']
      self.logs_folder = os.path.join(self.logs_folder, self.model_id)
      self.log = logger.get(os.path.join(self.logs_folder, 'raw'))
    else:
      self.log = logger.get()

  def register_raw_logs(self):
    self.log_manager.register(self.log.filename, 'plain', 'Raw logs')
    cmd_fname = os.path.join(self.log_manager.folder, 'cmd.log')
    with open(cmd_fname, 'w') as f:
      f.write(' '.join(sys.argv))
    self.log_manager.register(cmd_fname, 'plain', 'Command-line arguments')
    model_opt_fname = os.path.join(self.log_manager.folder, 'model_opt.yaml')
    self.saver.save_opt(model_opt_fname, self.model_opt)
    self.log_manager.register(model_opt_fname, 'plain', 'Model hyperparameters')

  def get_dataset(self):
    dataset = {}
    dataset['train'] = data_provider.get(self.dataset_name,
                                         self.data_opt,
                                         split='train',
                                         h5_fname=self.opt['h5_fname_train'])
    dataset['valid'] = data_provider.get(self.dataset_name,
                                         self.data_opt,
                                         split='valid',
                                         h5_fname=self.opt['h5_fname_valid'])
    return dataset

  def get_ts_loggers(self):
    return {}

  def get_runner_trainval(self):
    return EmptyTrainer()

  def get_runner_train(self):
    return EmptyTrainer()

  def get_runner_valid(self):
    return EmptyTrainer()

  def get_runner_plot_train(self):
    return EmptyTrainer()

  def get_runner_plot_valid(self):
    return EmptyTrainer()

  def run(self):
    runner_trainval = self.get_runner_trainval()
    runner_train = self.get_runner_train()
    runner_plot_train = self.get_runner_plot_train()
    if self.opt['has_valid']:
      runner_valid = self.get_runner_valid()
      runner_plot_valid = self.get_runner_plot_valid()

    nstart = self.step.get()
    # Progress bar.
    it = tqdm(range(nstart, self.opt['num_steps']), desc=self.model_id)
    step_prev = self.step.get()
    while self.step.get() < self.opt['num_steps']:
      it.update(self.step.get() - step_prev)
      step_prev = self.step.get()

      # Plot samples
      if self.step.get() % self.opt['steps_per_plot'] == 0:
        self.log.info('Plot train samples')
        runner_plot_train.run_step()
        if self.opt['has_valid']:
          self.log.info('Plot valid samples')
          runner_plot_valid.run_step()

      # Train step
      runner_train.run_step()

      # Run validation stats
      if self.opt['has_valid']:
        if self.step.get() % self.opt['steps_per_valid'] == 0:
          self.log.info('Running validation')
          runner_valid.run_step()

      # Train stats
      if self.step.get() % self.opt['steps_per_trainval'] == 0:
        self.log.info('Running train validation')
        runner_trainval.run_step()

      # Save model
      if self.step.get() % self.opt['steps_per_ckpt'] == 0:
        if self.opt['save_ckpt']:
          self.log.info('Saving checkpoint')
          self.saver.save(self.sess, global_step=self.step.get())
        else:
          self.log.warning('Saving is turned off. Use -save_ckpt flag to save.')

    it.close()
    runner_train.finalize()
    if self.opt['has_valid']:
      runner_valid.finalize()
      runner_plot_valid.finalize()
    runner_trainval.finalize()
    runner_plot_train.finalize()
    self.sess.close()

    if self.loggers is not None:
      for ts_logger in self.loggers.itervalues():
        ts_logger.close()
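EmptyTrainer is not defined in this listing; given that run() only ever calls run_step() and finalize() on it, it is presumably a no-op runner used when a phase is disabled. A minimal stand-in (an assumption):

class EmptyTrainer(object):
  """No-op runner: satisfies the run_step/finalize interface used by run()."""

  def run_step(self):
    pass

  def finalize(self):
    pass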
Example #20
File: views.py Project: ekt68/texta
def update(request):
    logger = LogManager(__name__, 'CHANGE_SETTINGS')

    parameters = request.POST
    if 'model_pk' in parameters:
        model = {"pk": parameters["model_pk"],
                 "description": parameters["model_description"],
                 "unique_id": parameters["model_uuid"]}
        request.session['model'] = model
        logger.clean_context()
        logger.set_context('user_name', request.user.username)
        logger.set_context('new_model', model)
        logger.info('model_updated')

    if 'dataset[]' in parameters:
        # TODO: check that it is a valid mapping_id before changing session['dataset']
        new_datasets = parameters.getlist('dataset[]')
        new_datasets = [new_dataset for new_dataset in new_datasets
                        if request.user.has_perm('permission_admin.can_access_dataset_' + str(new_dataset))]

        request.session['dataset'] = new_datasets

        logger.clean_context()
        logger.set_context('user_name', request.user.username)
        logger.set_context('new_datasets', new_datasets)
        logger.info('datasets_updated')

        ds = Datasets().activate_datasets(request.session)

    return HttpResponseRedirect(URL_PREFIX + '/')
Example #21
    def remove_facts_from_document(self, rm_facts_dict, bs=7500):
        '''Remove a certain fact from all documents, given a [str] key and [str] value.'''
        logger = LogManager(__name__, 'FACT MANAGER REMOVE FACTS')

        try:
            # Clears readonly block just in case the index has been set to read only
            self.es_m.clear_readonly_block()

            query = self._fact_deletion_query(rm_facts_dict)
            self.es_m.load_combined_query(query)
            response = self.es_m.scroll(size=bs, field_scroll=self.field)
            scroll_id = response['_scroll_id']
            total_docs = response['hits']['total']
            docs_left = total_docs  # DEBUG
            print('Starting.. Total docs - ', total_docs)  # DEBUG
            batch = 0
            while total_docs > 0:
                print('Docs left:', docs_left)  # DEBUG
                data = ''
                for document in response['hits']['hits']:
                    new_field = []  # The new facts field
                    for fact in document['_source'][self.field]:
                        # If the fact name is in rm_facts_dict keys
                        if fact["fact"] in rm_facts_dict:
                            # If the fact value is not in the delete key values
                            if fact['str_val'] not in rm_facts_dict.getlist(
                                    fact["fact"]):
                                new_field.append(fact)
                        else:
                            new_field.append(fact)
                    # Update dataset
                    data += json.dumps({
                        "update": {
                            "_id": document['_id'],
                            "_type": document['_type'],
                            "_index": document['_index']
                        }
                    }) + '\n'
                    document = {'doc': {self.field: new_field}}
                    data += json.dumps(document) + '\n'
                response = self.es_m.scroll(scroll_id=scroll_id,
                                            size=bs,
                                            field_scroll=self.field)
                total_docs = len(response['hits']['hits'])
                docs_left -= bs  # DEBUG
                scroll_id = response['_scroll_id']
                self.es_m.plain_post_bulk(self.es_m.es_url, data)
            print('DONE')  # DEBUG

            logger.set_context('docs_left', total_docs)
            logger.set_context('batch', batch)
            logger.info('remove_facts_from_document')
        except Exception:
            print(traceback.format_exc())
            logger.set_context('es_params', self.es_params)
            logger.exception('remove_facts_from_document_failed')
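For reference, the data string assembled above follows the Elasticsearch bulk API format: an action line followed by a partial-document line, each JSON-encoded and newline-terminated. A minimal illustration (the index and id values here are made up):

import json

action = json.dumps({"update": {"_id": "42", "_type": "doc", "_index": "my_index"}})
partial = json.dumps({"doc": {"texta_facts": []}})  # the new facts field
bulk_payload = action + '\n' + partial + '\n'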
Example #22
    def __init__(self):
        super(BaseEntity, self).__init__()
        self.logger = LogManager.get_logger(self.__class__.__name__)
Example #23
def save(request):
    logger = LogManager(__name__, 'SAVE SEARCH')

    ds = Datasets().activate_datasets(request.session)
    es_m = ds.build_manager(ES_Manager)

    es_params = request.POST
    es_m.build(es_params)
    combined_query = es_m.get_combined_query()

    try:
        q = combined_query
        desc = request.POST['search_description']
        s_content = {}

        # Build the search content dict, keyed by match field
        for x in request.POST.keys():
            if 'match_txt' in x:
                # Get the ID of the field, e.g. match_txt_1 yields 1, match_txt_1533 yields 1533
                field_id = x.rsplit("_", 1)[-1]
                match_field = request.POST['match_field_' + field_id]
                if match_field in s_content.keys():
                    s_content[match_field].append(request.POST[x])
                else:
                    s_content[match_field] = [request.POST[x]]

        search = Search(author=request.user,
                        search_content=json.dumps(s_content),
                        description=desc,
                        query=json.dumps(q))
        logger.info('Saving search for datasets: {}'.format(
            request.session['dataset']))
        search.save()
        for dataset_id in request.session['dataset']:
            dataset = Dataset.objects.get(pk=int(dataset_id))
            search.datasets.add(dataset)
        search.save()
        logger.set_context('user_name', request.user.username)
        logger.set_context('search_id', search.id)
        logger.info('search_saved')
    except Exception as e:
        print('-- Exception[{0}] {1}'.format(__name__, e))
        logger.set_context('es_params', es_params)
        logger.exception('search_saving_failed')
    return HttpResponse()
Example #24
File: views.py Project: ekt68/texta
def save(request):
    logger = LogManager(__name__, 'SAVE SEARCH')

    ds = Datasets().activate_datasets(request.session)
    es_m = ds.build_manager(ES_Manager)

    es_params = request.POST
    es_m.build(es_params)
    combined_query = es_m.get_combined_query()

    try:
        q = combined_query
        desc = request.POST['search_description']
        s_content = {}

        # Build the search content dict, keyed by match field
        for x in request.POST.keys():
            if 'match_txt' in x:
                # Get the ID of the field, e.g. match_txt_1 yields 1, match_txt_1533 yields 1533
                field_id = x.rsplit("_", 1)[-1]
                match_field = request.POST['match_field_' + field_id]
                if match_field in s_content.keys():
                    s_content[match_field].append(request.POST[x])
                else:
                    s_content[match_field] = [request.POST[x]]

        search = Search(author=request.user, search_content=json.dumps(s_content), description=desc,
                        query=json.dumps(q))
        logger.info('Saving search for datasets: {}'.format(request.session['dataset']))
        search.save()
        for dataset_id in request.session['dataset']:
            dataset = Dataset.objects.get(pk=int(dataset_id))
            search.datasets.add(dataset)
        search.save()
        logger.set_context('user_name', request.user.username)
        logger.set_context('search_id', search.id)
        logger.info('search_saved')
    except Exception as e:
        print('-- Exception[{0}] {1}'.format(__name__, e))
        logger.set_context('es_params', es_params)
        logger.exception('search_saving_failed')
    return HttpResponse()
Example #25
def update(request):
    logger = LogManager(__name__, 'CHANGE_SETTINGS')

    parameters = request.POST

    if 'model' in parameters:
        model = str(parameters['model'])
        request.session['model'] = model
        logger.clean_context()
        logger.set_context('user_name', request.user.username)
        logger.set_context('new_model', model)
        logger.info('model_updated')

    if 'dataset' in parameters:
        # TODO: check that it is a valid mapping_id before changing session['dataset']
        new_dataset = parameters['dataset']

        if request.user.has_perm('permission_admin.can_access_dataset_' +
                                 str(new_dataset)):
            request.session['dataset'] = new_dataset

            logger.clean_context()
            logger.set_context('user_name', request.user.username)
            logger.set_context('new_dataset', new_dataset)
            logger.info('dataset_updated')

        ds = Datasets().activate_dataset(request.session)
        es_m = ds.build_manager(ES_Manager)

    return HttpResponseRedirect(URL_PREFIX + '/')
Example #26
def facts_agg(es_params, request):
    logger = LogManager(__name__, 'FACTS AGGREGATION')

    distinct_values = []
    query_results = []
    lexicon = []
    data = [['Word']]  # Header row only; left as-is if the aggregation below fails
    aggregation_data = es_params['aggregate_over']
    aggregation_data = json.loads(aggregation_data)
    original_aggregation_field = aggregation_data['path']
    aggregation_field = 'texta_link.facts'

    try:
        aggregation_size = 50
        aggregations = {"strings": {es_params['sort_by']: {"field": aggregation_field, 'size': 0}},
                        "distinct_values": {"cardinality": {"field": aggregation_field}}}

        # Define selected mapping
        ds = Datasets().activate_dataset(request.session)
        dataset = ds.get_index()
        mapping = ds.get_mapping()
        date_range = ds.get_date_range()
        es_m = ES_Manager(dataset, mapping, date_range)

        for item in es_params:
            if 'saved_search' in item:
                s = Search.objects.get(pk=es_params[item])
                name = s.description
                saved_query = json.loads(s.query)
                es_m.load_combined_query(saved_query)
                es_m.set_query_parameter('aggs', aggregations)
                response = es_m.search()

                # Filter response
                bucket_filter = '{0}.'.format(original_aggregation_field.lower())
                final_bucket = []
                for b in response['aggregations']['strings']['buckets']:
                    if bucket_filter in b['key']:
                        fact_name = b['key'].split('.')[-1]
                        b['key'] = fact_name
                        final_bucket.append(b)
                final_bucket = final_bucket[:aggregation_size]
                response['aggregations']['distinct_values']['value'] = len(final_bucket)
                response['aggregations']['strings']['buckets'] = final_bucket

                normalised_counts, labels = normalise_agg(response, es_m, es_params, 'strings')
                lexicon = list(set(lexicon + labels))
                query_results.append({'name': name, 'data': normalised_counts, 'labels': labels})
                distinct_values.append({'name': name, 'data': response['aggregations']['distinct_values']['value']})

        es_m.build(es_params)
        # FIXME
        # this is confusing for the user
        if not es_m.is_combined_query_empty():
            es_m.set_query_parameter('aggs', aggregations)
            response = es_m.search()

            # Filter response
            bucket_filter = '{0}.'.format(original_aggregation_field.lower())
            final_bucket = []
            for b in response['aggregations']['strings']['buckets']:
                if bucket_filter in b['key']:
                    fact_name = b['key'].split('.')[-1]
                    b['key'] = fact_name
                    final_bucket.append(b)
            final_bucket = final_bucket[:aggregation_size]
            response['aggregations']['distinct_values']['value'] = len(final_bucket)
            response['aggregations']['strings']['buckets'] = final_bucket

            normalised_counts, labels = normalise_agg(response, es_m, es_params, 'strings')
            lexicon = list(set(lexicon + labels))
            query_results.append({'name': 'Query', 'data': normalised_counts, 'labels': labels})
            distinct_values.append({'name': 'Query', 'data': response['aggregations']['distinct_values']['value']})

        data = [a + zero_list(len(query_results)) for a in map(list, zip(*[lexicon]))]
        data = [['Word'] + [query_result['name'] for query_result in query_results]] + data

        for i, word in enumerate(lexicon):
            for j, query_result in enumerate(query_results):
                for k, label in enumerate(query_result['labels']):
                    if word == label:
                        data[i + 1][j + 1] = query_result['data'][k]

        logger.set_context('user_name', request.user.username)
        logger.info('facts_aggregation_queried')

    except Exception as e:
        print('-- Exception[{0}] {1}'.format(__name__, e))
        logger.set_context('user_name', request.user.username)
        logger.exception('facts_aggregation_query_failed')

    table_height = len(data) * 15
    table_height = table_height if table_height > 500 else 500
    return {'data': [data[0]] + sorted(data[1:], key=lambda x: sum(x[1:]), reverse=True),
            'height': table_height,
            'type': 'bar',
            'distinct_values': json.dumps(distinct_values)}