def get_data(self, corpus, corpus_id, token_id, kwic_len):
    """
    Load a raw sentence, parse it into tree nodes and export the first
    tree listed in the corpus' tree display list.

    Nodes which the regular parser wrapped into an error are repaired:
    the parent attribute and all the reference attributes are taken from
    a fallback parse (fetched lazily, at most once per call) and any
    remaining None value is replaced by the 'N/A' string.

    Args:
        corpus: a corpus object passed to the sentence loaders
        corpus_id: an identifier used to look up the tree configuration
            and the empty value placeholders
        token_id: a token identifier passed to the sentence loaders
        kwic_len: a KWIC length passed to the sentence loaders

    Returns (tuple):
        a 2-tuple (exported tree template, mbk.TreeNodeEncoder)

    Raises:
        BackendDataParseException: if a node is broken in both the
            regular and the fallback parse
    """
    trees = self._conf.get_trees(corpus_id)
    tree_id = self._conf.get_tree_display_list(corpus_id)[0]
    conf = trees[tree_id]
    raw_sent = self._load_raw_sent(
        corpus, corpus_id, token_id, kwic_len, conf.all_attrs)
    nodes = self._parse_raw_sent(
        raw_sent['data'],
        conf.all_attrs,
        self._conf.get_empty_value_placeholders(corpus_id))

    fallback = None
    for idx, node in enumerate(nodes):
        if not self.is_error_node(node):
            continue
        repaired = dict(node.result.items())
        if fallback is None:
            # fetched only once the first broken node is encountered
            fallback = self._fetch_fallback_info(
                corpus, corpus_id, token_id, kwic_len,
                conf.parent_attr, conf.attr_refs)
        # NOTE(review): assumes the fallback list is index-aligned with
        # the regular parse - confirm against _fetch_fallback_info()
        fallback_node = fallback[idx]
        if self.is_error_node(fallback_node):
            # even fallback is broken - nothing we can do
            raise BackendDataParseException('Failed to parse sentence')
        for attr, value in node.result.items():
            if attr == conf.parent_attr or attr in conf.attr_refs:
                repaired[attr] = fallback_node[attr]
            elif value is None:
                repaired[attr] = 'N/A'
        nodes[idx] = repaired

    if conf.root_node:
        nodes = [conf.root_node] + nodes
    self._decode_tree_data(nodes, conf.parent_attr, conf.attr_refs)
    tree_data = mbk.TreeBuilder().process(conf, nodes)
    template = UcnkTreeTemplate(
        tree_id, tree_data, raw_sent['kwic_pos'], trees)
    return template.export(), mbk.TreeNodeEncoder
def _parse_raw_sent(in_data, tree_attrs, empty_val_placeholders): """ Args: in_data (list of str): a string-encoded sentence and required attribute metadata (see _load_raw_sent()) tree_attrs (list of str): a list of attributes used by nodes/edges of the tree empty_val_placeholders (list of str): a list of values which may represent an empty value in a raw sentence data Returns (list of dict): a list of dict items representing tree nodes """ def import_raw_val(v): return None if v in empty_val_placeholders or v == '' else v data = [] for i in range(0, len(in_data), 4): parsed = [import_raw_val(x) for x in in_data[i + 2].split('/')] if len(parsed) > len(tree_attrs): item = dict(zip(tree_attrs, len(tree_attrs) * [None])) item['word'] = in_data[i] # In case of a parsing error we wrap a partial result into # an error and try later to fetch essential data only (= parent # and other references to other values). data.append(BackendDataParseException(result=item)) else: item = dict(zip(tree_attrs, parsed)) item['word'] = in_data[i] data.append(item) return data
def _parse_raw_sent(in_data, tree_attrs, empty_val_placeholders, multival_separ=None): """ Args: in_data (list of str): a string-encoded sentence and required attribute metadata (see _load_raw_sent()) tree_attrs (list of str): a list of attributes used by nodes/edges of the tree empty_val_placeholders (list of str): a list of values which may represent an empty value in a raw sentence data Returns (list of dict): a list of dict items representing tree nodes """ def import_raw_val(v): return None if v in empty_val_placeholders or v == '' else v def expand_multivals(values): if multival_separ: expanded = [] for v in values: expanded.append( v.split(multival_separ) if v is not None else [None]) ans = [] for i in range(0, max(len(x) for x in expanded)): row = [] for v in expanded: if len(v) > i: row.append(v[i]) else: row.append(v[0]) ans.append(row) return ans return [values] data = [] for i in range(0, len(in_data), 4): parsed_m = expand_multivals( [import_raw_val(x) for x in in_data[i + 2].split('/')]) for j, parsed in enumerate(parsed_m): if len(parsed) > len(tree_attrs): item = dict(list(zip(tree_attrs, len(tree_attrs) * [None]))) item['word'] = in_data[i] item['multival_flag'] = None # In case of a parsing error we wrap a partial result into # an error and try later to fetch essential data only (= parent # and other references to other values). data.append(BackendDataParseException(result=item)) else: item = dict(list(zip(tree_attrs, parsed))) item['word'] = in_data[i] if len(parsed_m) > 1: if j == 0: item['multival_flag'] = 'start' elif j == len(parsed_m) - 1: item['multival_flag'] = 'end' else: item['multival_flag'] = None data.append(item) return data