Example #1
0
def predict_segment(model: BiLSTM, input_mapper, text):
    """Yield ``(start, end)`` character spans of predicted segments in ``text``.

    This is a generator (it contains ``yield``), so the early exits use a
    bare ``return``: the original ``return []`` only set a StopIteration
    value that iteration discards, which was misleading.

    Args:
        model: The BiLSTM model used to predict segment boundaries.
        input_mapper: Maps raw text to token spans plus character and word
            id tensors via ``transform_text``.
        text: The raw text to segment.
    """
    # Blank or whitespace-only input: nothing to predict.
    if len(text) == 0 or text.isspace():
        return
    with Processor.started_stopwatch('input_mapping'):
        tokens, char_ids, word_ids = input_mapper.transform_text(text)

    if len(char_ids) == 0:
        return

    # Slice the single-batch id sequences into windows no longer than
    # _max_sequence_length so the model never sees an over-long input.
    # (Distinct names avoid shadowing char_ids/word_ids below.)
    total = len(char_ids[0])
    windows = []
    for window_start in range(0, total, _max_sequence_length):
        lim = min(total, window_start + _max_sequence_length)
        if lim > window_start:
            windows.append((char_ids[0:1, window_start:lim],
                            word_ids[0:1, window_start:lim]))

    predictions = []
    for window_char_ids, window_word_ids in windows:
        with Processor.started_stopwatch('model_predict'):
            local_predictions = model.predict(window_char_ids, window_word_ids)
        predictions.extend(local_predictions[0])

    # A prediction of 1 marks a token that begins a new segment: emit the
    # previous segment, extending its end over any trailing punctuation.
    start_index = None
    prev_end = None
    for (start, end), prediction in zip(tokens, predictions):
        if prediction == 1:
            if start_index is not None:
                end_punct = _punct.match(text, prev_end)
                if end_punct is not None:
                    prev_end = end_punct.end()
                yield start_index, prev_end
            start_index = start
        prev_end = end
    # Flush the final open segment, if any.
    if start_index is not None and prev_end is not None:
        yield start_index, prev_end
Example #2
0
 def file_to_event(self,
                   f: Union[Path, str, io.IOBase],
                   *,
                   client: Optional[EventsClient] = None) -> Event:
     """Read a pickled event from ``f`` and deserialize it into an Event.

     ``f`` may be an already-open binary file object, or a path (Path or
     str) which is opened for binary reading.
     """
     import pickle
     with Processor.started_stopwatch('io'):
         try:
             # Assume f is file-like first; pickle raises TypeError when
             # it lacks the required read attributes.
             event_dict = pickle.load(f)
         except TypeError:
             with Path(f).open('rb') as fp:
                 event_dict = pickle.load(fp)
     with Processor.started_stopwatch('transform'):
         return dict_to_event(event_dict, client=client)
Example #3
0
 def file_to_event(self,
                   f: Union[Path, str, io.IOBase],
                   *,
                   client: Optional[EventsClient] = None) -> Event:
     """Read a JSON-serialized event from ``f`` into an Event.

     ``client`` is keyword-only, matching the other serializer
     implementations of ``file_to_event`` (pickle and yaml).

     Args:
         f: An open text file object, or a path (Path or str) to open.
         client: Optional events client passed through to ``dict_to_event``.
     """
     import json
     with Processor.started_stopwatch('io'):
         try:
             d = json.load(f)
         except AttributeError:
             # f has no .read(): treat it as a path and open it.
             if isinstance(f, str):
                 f = Path(f)
             with f.open('r') as f:
                 d = json.load(f)
     with Processor.started_stopwatch('transform'):
         return dict_to_event(d, client=client)
Example #4
0
 def event_to_file(self,
                   event: Event,
                   f: Union[Path, str, io.IOBase],
                   *,
                   include_label_text: bool = False):
     """Serialize ``event`` with pickle and write it to ``f``.

     Args:
         event: The event to serialize.
         f: An open binary file object, or a path (Path or str); when a
             path is given, missing parent directories are created.
         include_label_text: Passed through to ``event_to_dict``.
     """
     import pickle
     with Processor.started_stopwatch('transform'):
         d = event_to_dict(event, include_label_text=include_label_text)
     with Processor.started_stopwatch('io'):
         try:
             pickle.dump(d, f)
         except TypeError:
             # f was a path, not a file object: create missing parent
             # directories (consistent with the JSON serializer, which
             # already does this) so the write cannot fail on a fresh
             # directory tree, then dump the pickle there.
             f = Path(f)
             f.parent.mkdir(parents=True, exist_ok=True)
             with f.open('wb') as f:
                 pickle.dump(d, f)
Example #5
0
 def event_to_file(self,
                   event: Event,
                   f: Union[Path, str, io.IOBase],
                   *,
                   include_label_text: bool = False):
     """Serialize ``event`` to JSON and write it to ``f``.

     Args:
         event: The event to serialize.
         f: An open text file object, or a path (Path or str); when a
             path is given, missing parent directories are created
             before writing. (Annotation widened from ``Path`` to match
             the fallback handling below.)
         include_label_text: Passed through to ``event_to_dict``.
     """
     import json
     with Processor.started_stopwatch('transform'):
         d = event_to_dict(event, include_label_text=include_label_text)
     with Processor.started_stopwatch('io'):
         try:
             # Assume f is an open file; json.dump raises AttributeError
             # when f has no .write().
             json.dump(d, f)
         except AttributeError:
             f = Path(f)
             # Create missing parent directories before writing.
             f.parent.mkdir(parents=True, exist_ok=True)
             with f.open('w') as f:
                 json.dump(d, f)
Example #6
0
 def file_to_event(self,
                   f: Union[Path, str, io.IOBase],
                   *,
                   client: Optional[EventsClient] = None) -> Event:
     """Read a YAML-serialized event from ``f`` into an Event.

     Uses the C-accelerated loader when PyYAML was built with it,
     falling back to the pure-Python loader otherwise.
     """
     import yaml
     try:
         from yaml import CLoader as YamlLoader
     except ImportError:
         from yaml import Loader as YamlLoader
     with Processor.started_stopwatch('io'):
         if not isinstance(f, io.IOBase):
             # A path was given: open it and parse the YAML contents.
             with Path(f).open() as opened:
                 event_dict = yaml.load(opened, Loader=YamlLoader)
         else:
             event_dict = yaml.load(f, Loader=YamlLoader)
     with Processor.started_stopwatch('transform'):
         return dict_to_event(event_dict, client=client)
Example #7
0
 def event_to_file(self,
                   event: Event,
                   f: Union[Path, str, io.IOBase],
                   *,
                   include_label_text: bool = False):
     """Serialize ``event`` to YAML and write it to ``f``.

     Uses the C-accelerated dumper when PyYAML was built with it,
     otherwise the pure-Python one.
     """
     import yaml
     try:
         from yaml import CDumper as YamlDumper
     except ImportError:
         from yaml import Dumper as YamlDumper
     with Processor.started_stopwatch('transform'):
         event_dict = event_to_dict(event,
                                    include_label_text=include_label_text)
     with Processor.started_stopwatch('io'):
         if not isinstance(f, io.IOBase):
             # A path was given: open it for text writing and dump there.
             with Path(f).open('w') as opened:
                 yaml.dump(event_dict, opened, Dumper=YamlDumper)
         else:
             yaml.dump(event_dict, f, Dumper=YamlDumper)
Example #8
0
def predict_text(model: BiLSTM, input_mapper, text):
    """Yield ``(start, end)`` spans of predicted segments across ``text``.

    The text is divided at matches of the ``_split`` pattern; each piece
    between split points is run through ``predict_segment`` and the
    resulting spans are shifted back into whole-text coordinates. The
    splitting stopwatch is paused while prediction runs so it measures
    only the time spent scanning for split points.

    NOTE(review): any text after the final split match is never passed to
    ``predict_segment`` — confirm that ``_split`` is guaranteed to match
    at the end of input or that trailing text is intentionally dropped.
    """
    segment_start = 0
    with Processor.started_stopwatch('segment_splitting') as split_timer:
        for split_match in _split.finditer(text):
            # Pause the splitting timer while the model runs.
            split_timer.stop()
            piece = text[segment_start:split_match.start()]
            for begin, end in predict_segment(model, input_mapper, piece):
                yield segment_start + begin, segment_start + end
            segment_start = split_match.end()
            split_timer.start()
Example #9
0
 def call_process(self, event_id, params):
     """Run the wrapped processor on a single event.

     Args:
         event_id: Identifier of the event to open and process.
         params: Optional per-call parameter overrides merged over the
             component's base ``self.params``.

     Returns:
         A ``(result, times, created_indices)`` tuple: the processor's
         result, the timing map from the processing context, and the
         indices created on the event.

     Raises:
         Exception: Re-raises whatever the processor raised, after
             logging and bumping ``self.failure_count``.
     """
     self.processed += 1
     p = dict(self.params)
     if params is not None:
         p.update(params)
     with Processor.enter_context() as c, \
             Event(event_id=event_id, client=self.client) as event:
         try:
             # started_stopwatch yields an already-running stopwatch (the
             # other call sites in this file never call start() on it),
             # so the redundant explicit start() was removed.
             with Processor.started_stopwatch('process_method'):
                 result = self.processor.process(event, p)
             return result, c.times, event.created_indices
         except Exception as e:
             self.failure_count += 1
             logger.error(
                 'Processor "%s" failed while processing event with id: %s',
                 self.component_id, event_id)
             logger.error(e)
             # Bare raise preserves the original traceback.
             raise
Example #10
0
    def call_process(self, event_id, params):
        """Invoke the remote processor over gRPC for a single event.

        Args:
            event_id: Identifier of the event to process.
            params: Optional per-call parameter overrides merged over the
                component's base ``self.params``.

        Returns:
            A ``(result, times, created_indices)`` tuple: the result dict
            copied out of the response struct, the timing map recorded on
            the processing context, and a mapping of document name to the
            list of index names created on that document.

        Raises:
            Exception: Re-raises whatever the remote call raised, after
                logging and bumping ``self.failure_count``.
        """
        self.processed += 1
        p = dict(self.params or {})
        if params is not None:
            p.update(params)

        with EventProcessor.enter_context() as context:
            try:
                request = processing_pb2.ProcessRequest(
                    processor_id=self._processor_id, event_id=event_id)
                _structs.copy_dict_to_struct(p, request.params, [p])
                with Processor.started_stopwatch('remote_call'):
                    response = self._stub.Process(request)
                r = {}
                _structs.copy_struct_to_dict(response.result, r)

                # Fold the remote timing info into the local context.
                timing_info = response.timing_info
                for k, v in timing_info.items():
                    context.add_time(k, v.ToTimedelta())

                # Group created index names by document name; setdefault
                # replaces the previous try/except KeyError grouping.
                created_indices = {}
                for created_index in response.created_indices:
                    created_indices.setdefault(
                        created_index.document_name, []).append(
                        created_index.index_name)

                return r, context.times, created_indices
            except Exception as e:
                self.failure_count += 1
                logger.error(
                    'Processor "%s" failed while processing event with id: %s',
                    self.component_id, event_id)
                logger.error(e)
                # Bare raise preserves the original traceback.
                raise
Example #11
0
def test_preserves_times():
    """Adding the same key twice accumulates the recorded durations."""
    two_seconds = timedelta(seconds=2)
    with Processor.enter_context() as context:
        context.add_time("foo", two_seconds)
        context.add_time("foo", two_seconds)
        assert context.times["foo"] == timedelta(seconds=4)
Example #12
0
def test_stopwatch_no_fail_outside_context():
    """started_stopwatch must be usable outside any processor context."""
    entered = False
    with Processor.started_stopwatch('foo'):
        entered = True
    assert entered