def predict(self, doc: Document, **kwargs) -> Document:
    unpack = False
    if self.input_key:
        if isinstance(self.input_key, (tuple, list)):
            if isinstance(self.component, LambdaComponent):
                # Assume user-defined functions take multiple arguments
                input = [doc[key] for key in self.input_key]
                unpack = True
            else:
                # Zip the fields sentence-wise so each sample pairs up its fields
                input = list(list(zip(*sent)) for sent in zip(*[doc[key] for key in self.input_key]))
        else:
            input = doc[self.input_key]
    else:
        input = doc
    if self.kwargs:
        kwargs.update(self.kwargs)
    if unpack:
        kwargs['_elit_unpack'] = True
    output = self.component(input, **kwargs)
    if isinstance(output, types.GeneratorType):
        output = list(output)
    if self.output_key:
        if not isinstance(doc, Document):
            doc = Document()
        if isinstance(self.output_key, tuple):
            # Multiple output keys: distribute the outputs across them
            for key, value in zip(self.output_key, output):
                doc[key] = value
        else:
            doc[self.output_key] = output
        return doc
    return output
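
# A minimal, self-contained sketch of the input_key/output_key routing above.
# `run_pipe` is a hypothetical helper and the plain dict stands in for the
# library's Document class; it only illustrates how a pipe reads its inputs
# from the document and writes its output back under a key.
from typing import Callable, Tuple, Union


def run_pipe(doc: dict, func: Callable, input_key: Union[str, Tuple[str, ...]],
             output_key: str, **kwargs) -> dict:
    if isinstance(input_key, tuple):
        # Multiple input keys: pass each field as a separate positional argument
        args = [doc[key] for key in input_key]
        output = func(*args, **kwargs)
    else:
        output = func(doc[input_key], **kwargs)
    doc[output_key] = output
    return doc


doc = {'text': 'Hello world'}
doc = run_pipe(doc, str.split, input_key='text', output_key='tok')
doc = run_pipe(doc, lambda toks: [t.upper() for t in toks], input_key='tok', output_key='upper')
print(doc)  # {'text': 'Hello world', 'tok': ['Hello', 'world'], 'upper': ['HELLO', 'WORLD']}
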
def parse(self,
          text: Union[str, List[str]] = None,
          tokens: List[List[str]] = None,
          models=("lem", "pos", "ner", "con", "dep", "srl", "amr", "dcr", "ocr"),
          speaker_ids: Union[int, List[int]] = None,
          genre: str = None,
          coref_context: dict = None,
          return_coref_prob: bool = False,
          language='en',
          verbose=True,
          ) -> Document:
    assert text or tokens, 'At least one of text or tokens has to be specified.'
    response = self._send_post_json(self.url + '/parse', {
        'text': text,
        'tokens': tokens,
        'models': models,
        'speaker_ids': speaker_ids,
        'genre': genre,
        'coref_context': coref_context,
        'return_coref_prob': return_coref_prob,
        'language': language,
        'verbose': verbose
    })
    return Document(response)
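
# Roughly what the client method above does over HTTP, written with plain
# `requests`. The server URL is a placeholder assumption; the payload mirrors
# the fields the method passes to `_send_post_json`.
import requests

payload = {
    'text': 'Emory NLP is a research lab in Atlanta.',
    'tokens': None,
    'models': ['lem', 'pos', 'ner', 'dep'],
    'speaker_ids': None,
    'genre': None,
    'coref_context': None,
    'return_coref_prob': False,
    'language': 'en',
    'verbose': True,
}
response = requests.post('http://localhost:8000/parse', json=payload)
response.raise_for_status()
doc = response.json()  # the JSON body that the Document wrapper is built from
print(doc.keys())
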
def finalize_document(self, doc: Document, task_name: str):
    # Back-fill POS tags into the constituency trees: preterminals predicted
    # as the placeholder '_' are relabeled with the POS task's output,
    # consumed in left-to-right order.
    pos_key = prefix_match('pos', doc)
    pos: List[List[str]] = doc.get(pos_key, None)
    if pos:
        for tree, pos_per_sent in zip(doc[task_name], pos):
            tree: Tree = tree
            offset = 0
            for subtree in tree.subtrees(lambda t: t.height() == 2):  # preterminals only
                tag = subtree.label()
                if tag == '_':
                    subtree.set_label(pos_per_sent[offset])
                offset += 1
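
# A small demonstration of the back-fill above using nltk's Tree directly.
# The sentence and tags are made up for illustration: preterminals labeled
# '_' receive the POS tags predicted by a separate task, in order.
from nltk.tree import Tree

tree = Tree.fromstring('(S (_ I) (VP (_ love) (_ NLP)))')
pos_per_sent = ['PRP', 'VBP', 'NNP']
offset = 0
for subtree in tree.subtrees(lambda t: t.height() == 2):  # preterminals only
    if subtree.label() == '_':
        subtree.set_label(pos_per_sent[offset])
    offset += 1
print(tree)  # (S (PRP I) (VP (VBP love) (NNP NLP)))
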
def parse(self, inputs: List[Input]) -> List[Document]:
    self.service_tokenizer.tokenize_inputs(inputs)  # no effects (read-only) in server pipeline
    # Group inputs by the set of models they request so each group is parsed in one call
    inputs_by_tasks = defaultdict(list)
    for i, input in enumerate(inputs):
        tasks = tuple(sorted(input.models))
        inputs_by_tasks[tasks].append(i)
    results = [Document() for _ in inputs]
    for tasks, input_ids in inputs_by_tasks.items():
        group_inputs = [inputs[i] for i in input_ids]
        group_tokens = sum([input.tokens for input in group_inputs], [])
        annotations = self.parse_sents(group_tokens, tasks)
        for k, v in annotations.items():
            # Fit the ELIT standard: rotate the label field to the end,
            # and shift dependency heads to 0-based indexing
            if k == 'ner':
                for j, s in enumerate(v):
                    v[j] = [x[1:] + x[:1] for x in s]
            elif k == 'srl':
                for _v in v:
                    for j, s in enumerate(_v):
                        _v[j] = [x[1:] + x[:1] for x in s]
            elif k == 'dep':
                for j, s in enumerate(v):
                    v[j] = [(x[0] - 1, x[1]) for x in s]
        for i, input in zip(input_ids, group_inputs):
            for k, v in annotations.items():
                # Each input consumes the leading annotations for its own sentences
                results[i][k] = v[:len(input.tokens)]
                if k == 'ner':
                    if not input.verbose:
                        for j, s in enumerate(results[i][k]):
                            results[i][k][j] = [x[:-1] for x in s]
                elif k == 'srl':
                    if not input.verbose:
                        # Strip only this input's slice, not the whole group,
                        # so later inputs' annotations are left untouched
                        for _v in results[i][k]:
                            for j, s in enumerate(_v):
                                _v[j] = [x[:-1] for x in s]
                del v[:len(input.tokens)]
    return results
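
# The format shim above is just a left rotation of each span: the first field
# moves to the end. A toy check (the concrete field order is whatever
# parse_sents and the ELIT standard actually use; these values are made up):
span = ('PER', 0, 2)
rotated = span[1:] + span[:1]
assert rotated == (0, 2, 'PER')

# The dependency shim shifts heads from 1-based (where 0 denotes ROOT) to
# 0-based (where -1 denotes ROOT):
dep = [(2, 'nsubj'), (0, 'root'), (2, 'obj')]
assert [(h - 1, r) for h, r in dep] == [(1, 'nsubj'), (-1, 'root'), (1, 'obj')]
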
def predict(self,
            data: Union[str, List[str]],
            batch_size: int = None,
            tasks: Optional[Union[str, List[str]]] = None,
            resolve_dependencies=True,
            **kwargs) -> Document:
    doc = Document()
    if not data:
        return doc

    if resolve_dependencies:
        # Now we decide which tasks to perform and their orders
        tasks_in_topological_order = self._tasks_in_topological_order
        task_topological_order = self._task_topological_order
        computation_graph = self._computation_graph
        target_tasks = self._resolve_task_name(tasks)
        if not target_tasks:
            target_tasks = tasks_in_topological_order
        else:
            # Collect every transitive dependency of each requested task,
            # grouped by its depth in the topological order
            target_topological_order = defaultdict(set)
            for task_name in target_tasks:
                if task_name not in computation_graph:
                    continue
                for dependency in topological_sort(computation_graph, task_name):
                    target_topological_order[task_topological_order[dependency]].add(dependency)
            target_tasks = [item[1] for item in sorted(target_topological_order.items())]
    else:
        target_tasks = [set(tasks)] if isinstance(tasks, list) else [{tasks}]
    if not target_tasks:
        return Document()
    # Sort target tasks within the same group in a defined order
    target_tasks = [sorted(x, key=lambda _x: self.config.task_names.index(_x)) for x in target_tasks]
    flatten_target_tasks = [self.tasks[t] for group in target_tasks for t in group]
    cls_is_bos = any([x.cls_is_bos for x in flatten_target_tasks])
    sep_is_eos = any([x.sep_is_eos for x in flatten_target_tasks])
    # Now build the dataloaders and execute tasks
    first_task_name: str = list(target_tasks[0])[0]
    first_task: Task = self.tasks[first_task_name]
    encoder_transform, transform = self.build_transform(first_task)
    # Override the tokenizer config of the 1st task
    encoder_transform.sep_is_eos = sep_is_eos
    encoder_transform.cls_is_bos = cls_is_bos
    average_subwords = self.model.encoder.average_subwords
    flat = first_task.input_is_flat(data)
    if flat:
        data = [data]
    device = self.device
    samples = first_task.build_samples(data, cls_is_bos=cls_is_bos, sep_is_eos=sep_is_eos)
    dataloader = first_task.build_dataloader(samples, transform=transform, device=device)
    results = defaultdict(list)
    order = []
    for batch in dataloader:
        order.extend(batch[IDX])
        # Run the first task, let it make the initial batch for the successors
        output_dict = self.predict_task(first_task, first_task_name, batch, results,
                                        run_transform=True, cls_is_bos=cls_is_bos,
                                        sep_is_eos=sep_is_eos)
        # Run each task group in order
        for group_id, group in enumerate(target_tasks):
            # We could parallelize this in the future
            for task_name in group:
                if task_name == first_task_name:
                    continue
                output_dict = self.predict_task(self.tasks[task_name], task_name, batch,
                                                results, output_dict, run_transform=True,
                                                cls_is_bos=cls_is_bos, sep_is_eos=sep_is_eos)
            if group_id == 0:
                # We are kind of hard coding here. If the first task is a tokenizer,
                # we need to convert the hidden and mask to token level
                if 'token_token_span' not in batch:
                    spans = []
                    tokens = []
                    for span_per_sent, token_per_sent in zip(
                            output_dict[first_task_name]['prediction'],
                            results[first_task_name]):
                        if cls_is_bos:
                            span_per_sent = [(-1, 0)] + span_per_sent
                            token_per_sent = [BOS] + token_per_sent
                        if sep_is_eos:
                            span_per_sent = span_per_sent + [
                                (span_per_sent[-1][0] + 1, span_per_sent[-1][1] + 1)]
                            token_per_sent = token_per_sent + [EOS]
                        # Predicted offsets are 0-based while position 0 of the
                        # subword sequence is [CLS], so shift every offset by 1
                        if average_subwords:
                            span_per_sent = [list(range(x[0] + 1, x[1] + 1)) for x in span_per_sent]
                        else:
                            span_per_sent = [x[0] + 1 for x in span_per_sent]
                        spans.append(span_per_sent)
                        tokens.append(token_per_sent)
                    spans = PadSequenceDataLoader.pad_data(spans, 0, torch.long, device=device)
                    output_dict['hidden'] = pick_tensor_for_each_token(
                        output_dict['hidden'], spans, average_subwords)
                    batch['token_token_span'] = spans
                    batch['token'] = tokens
                    # noinspection PyTypeChecker
                    batch['token_length'] = torch.tensor([len(x) for x in tokens],
                                                         dtype=torch.long, device=device)
                    batch.pop('mask', None)
    # Put results into doc in the order of tasks
    for k in self.config.task_names:
        v = results.get(k, None)
        if v is None:
            continue
        doc[k] = reorder(v, order)
    # Allow task to perform finalization on document
    for group in target_tasks:
        for task_name in group:
            task = self.tasks[task_name]
            task.finalize_document(doc, task_name)
    # If no tok in doc, use raw input as tok
    if not any(k.startswith('tok') for k in doc):
        doc['tok'] = data
    if flat:
        for k, v in list(doc.items()):
            doc[k] = v[0]
    # If there is only one field, don't bother to wrap it
    # if len(doc) == 1:
    #     return list(doc.values())[0]
    return doc
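
# A minimal sketch of the dependency resolution above: given a computation
# graph mapping each task to the tasks it depends on, collect every transitive
# dependency of the requested tasks and group them by topological depth. The
# graph, depths, and task names are made-up stand-ins; `topological_sort` in
# the real code plays the role of the traversal below.
from collections import defaultdict


def transitive_deps(graph: dict, task: str) -> list:
    # Depth-first collection of a task and everything it depends on
    seen, stack, order = set(), [task], []
    while stack:
        node = stack.pop()
        if node in seen:
            continue
        seen.add(node)
        order.append(node)
        stack.extend(graph.get(node, ()))
    return order


graph = {'con': ['pos'], 'pos': ['tok'], 'dep': ['tok'], 'tok': []}
depth = {'tok': 0, 'pos': 1, 'dep': 1, 'con': 2}  # precomputed topological depth

requested = ['con', 'dep']
groups = defaultdict(set)
for task in requested:
    for dependency in transitive_deps(graph, task):
        groups[depth[dependency]].add(dependency)
target_tasks = [tasks for _, tasks in sorted(groups.items())]
print(target_tasks)  # [{'tok'}, {'pos', 'dep'}, {'con'}] (set order may vary)
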