def preprocss(data, tag_look_table, vocabulary_lookup_table):
    """Encode offset-annotated documents into padded id matrices.

    Args:
        data: iterable of documents consumable by `offset_to_biluo`,
            each with a `.text` attribute.
        tag_look_table: lookup object mapping BILUO tag -> integer id.
        vocabulary_lookup_table: lookup object mapping token -> integer id.

    Returns:
        (x, y): two right-padded integer matrices of shape
        (n_docs, max_sequence_length).
    """
    encoded_words = []
    encoded_tags = []

    for doc in data:
        biluo_tags = offset_to_biluo(doc)
        encoded_tags.append([tag_look_table.lookup(t) for t in biluo_tags])
        encoded_words.append(
            [vocabulary_lookup_table.lookup(w) for w in doc.text])

    maxlen = max(len(seq) for seq in encoded_words)

    # Right-pad inputs to the longest sequence in this batch.
    x = tf.keras.preprocessing.sequence.pad_sequences(
        encoded_words, maxlen, padding='post')

    # Label pad value 0 is arbitrary: padded positions are masked downstream.
    y = tf.keras.preprocessing.sequence.pad_sequences(
        encoded_tags, maxlen, value=0, padding='post')

    return x, y
def process_one_line(line, logger=sys.stderr):
    """Decode one BSON-encoded line into a (Document, SentenceX) pair.

    Args:
        line: raw BSON bytes/string holding `text`, `intent`, `id`,
            `domain` and an `entities` list.
        logger: writable stream for span-validation failures.

    Returns:
        (document, sentence): the Document with its span set populated and
        a SentenceX carrying the BILUO encoding plus domain/label metadata.

    Raises:
        CheckFailedError: if any entity span fails validation.
        AssertionError: possibly raised by `offset_to_biluo`.
    """
    obj = bson.loads(line)

    text = obj['text']
    intent = obj['intent']
    id = obj["id"]  # NOTE(review): shadows builtin `id`; kept for parity
    domain = obj["domain"]

    doc = Document(text, label=intent, id=id)
    doc.domain = domain

    for raw_entity in obj['entities']:
        begin = int(raw_entity['start'])  # original index starts at 0
        finish = int(raw_entity['end'])
        entity_type = raw_entity['entity']

        try:
            # Span construction validates offsets.
            span = Span(begin, finish, entity_type)
        except OffsetSpanCheckError as e:
            logger.write("{}\tspan init failed: {}\n".format(id, e))
            raise CheckFailedError

        doc.span_set.append(span)

    encoding = offset_to_biluo(doc)  # may raise AssertionError

    sentence = SentenceX(word_lines=text, attribute_lines=[encoding],
                         id=doc.id)
    sentence.meta = {'domain': domain, 'label': intent}

    return doc, sentence
def preprocss(data, maxlen):
    """Encode documents into padded id matrices, with an optional fixed length.

    Args:
        data: iterable of documents consumable by `offset_to_biluo`.
        maxlen: target sequence length; if None, uses the longest
            sequence in `data`.

    Returns:
        (x, y): right-padded integer matrices of shape (n_docs, maxlen).
    """
    word_id_seqs = []
    tag_id_seqs = []

    for doc in data:
        biluo_tags = offset_to_biluo(doc)
        tag_id_seqs.append([tag_lookuper.lookup(t) for t in biluo_tags])
        word_id_seqs.append([vocabulary_lookuper.lookup(w) for w in doc.text])

    if maxlen is None:
        maxlen = max(len(seq) for seq in word_id_seqs)

    print(">>> maxlen: {}".format(maxlen))

    # Right-pad inputs; label pad value 0 is masked downstream.
    x = tf.keras.preprocessing.sequence.pad_sequences(
        word_id_seqs, maxlen, padding="post"
    )
    y = tf.keras.preprocessing.sequence.pad_sequences(
        tag_id_seqs, maxlen, value=0, padding="post"
    )

    return x, y
def _keras_data_preprocss(self, data: 'List[Sequence]', tag_lookuper, maxlen=None):
    """Prepare Keras training arrays from offset-annotated sequences.

    Args:
        data: list of Sequence objects consumable by `offset_to_biluo`.
        tag_lookuper: lookup object mapping BILUO tag -> integer id.
        maxlen: target length; computed from the data when None.

    Returns:
        (x, y): features produced by `get_np_feature` and a right-padded
        integer tag matrix.
    """
    import tensorflow as tf
    from tokenizer_tools.tagset.converter.offset_to_biluo import offset_to_biluo

    raw_texts = []
    tag_id_seqs = []

    for intent_data in data:
        biluo_tags = offset_to_biluo(intent_data)
        tag_id_seqs.append([tag_lookuper.lookup(t) for t in biluo_tags])
        # Feature extraction works on the joined raw text, not word ids.
        raw_texts.append(''.join(intent_data.text))

    if maxlen is None:
        maxlen = max(len(s) for s in raw_texts)

    x = get_np_feature(raw_texts, maxlen)
    y = tf.keras.preprocessing.sequence.pad_sequences(
        tag_id_seqs, maxlen, value=0, padding='post')

    return x, y
def ner_preprocss(data, maxlen, cls_info_len):
    """Encode documents for NER, left-extending labels by `cls_info_len`.

    Args:
        data: iterable of documents consumable by `offset_to_biluo`.
        maxlen: target token length; computed from the data when None.
        cls_info_len: number of extra label slots pre-padded on the left
            (presumably room for a CLS-style prefix — TODO confirm).

    Returns:
        (x_ner, y_ner): x has width `maxlen`; y has width
        `maxlen + cls_info_len`. NOTE(review): the width asymmetry looks
        intentional for a multi-task prefix scheme — verify at call site.
    """
    word_id_seqs = []
    tag_id_seqs = []

    for doc in data:
        biluo_tags = offset_to_biluo(doc)
        tag_id_seqs.append([ner_tag_lookuper.lookup(t) for t in biluo_tags])
        word_id_seqs.append([vocabulary_lookuper.lookup(w) for w in doc.text])

    if maxlen is None:
        maxlen = max(len(seq) for seq in word_id_seqs)

    maxlen_mt = maxlen + cls_info_len

    print(">>> maxlen: {}".format(maxlen))

    # Right-pad the inputs.
    x_ner = tf.keras.preprocessing.sequence.pad_sequences(
        word_id_seqs, maxlen, padding="post")

    # Labels: right-pad to maxlen, then left-pad to maxlen + cls_info_len.
    y_ner = tf.keras.preprocessing.sequence.pad_sequences(
        tag_id_seqs, maxlen, value=0, padding="post")
    y_ner = tf.keras.preprocessing.sequence.pad_sequences(
        y_ner, maxlen_mt, value=0, padding="pre")

    return x_ner, y_ner
def preprocss(data, maxlen):
    """Encode documents for joint NER + intent classification.

    Args:
        data: iterable of documents with `.text`, `.label`, and span
            annotations consumable by `offset_to_biluo`.
        maxlen: target sequence length; computed from the data when None.

    Returns:
        (x, y_ner, y_cls): padded word-id matrix, padded NER tag matrix,
        and a column vector of intent class ids.
    """
    word_id_seqs = []
    ner_tag_seqs = []
    cls_label_ids = []

    for doc in data:
        biluo_tags = offset_to_biluo(doc)
        ner_tag_seqs.append([ner_tag_lookuper.lookup(t) for t in biluo_tags])
        cls_label_ids.append(cls_tag_lookuper.lookup(doc.label))
        word_id_seqs.append([vocabulary_lookuper.lookup(w) for w in doc.text])

    if maxlen is None:
        maxlen = max(len(seq) for seq in word_id_seqs)

    print(">>> maxlen: {}".format(maxlen))

    # Right-pad inputs and NER labels; pad value 0 is masked downstream.
    x = tf.keras.preprocessing.sequence.pad_sequences(
        word_id_seqs, maxlen, padding="post")
    y_ner = tf.keras.preprocessing.sequence.pad_sequences(
        ner_tag_seqs, maxlen, value=0, padding="post")

    # Shape (n_docs, 1) so it aligns with per-example classification heads.
    y_cls = np.array(cls_label_ids)[:, np.newaxis]

    return x, y_ner, y_cls
def parse_fn(offset_data):
    """Convert one offset-annotated document into ((words, n_words), tags).

    Args:
        offset_data: document-like object with a `.text` attribute and
            span annotations consumable by `offset_to_biluo`.

    Returns:
        ((words, len(words)), tags): the raw words, their count, and the
        BILUO tag sequence.

    Raises:
        AssertionError: if the word and tag sequences differ in length.
    """
    tags = offset_to_biluo(offset_data)
    words = offset_data.text
    assert len(words) == len(tags), "Words and tags lengths don't match"
    # BUG FIX: the original call `logger.debug((words, len(words)), tags)`
    # passed a tuple as the log message and `tags` as a %-format argument
    # with no placeholder, which makes the logging module emit a formatting
    # error instead of the intended record. Use lazy %-style placeholders.
    logger.debug("(%s, %s), %s", words, len(words), tags)
    return (words, len(words)), tags
def parse_fn(offset_data, vocabulary_lookup, tag_lookup):
    """Map one document to parallel word-id and tag-id sequences.

    Args:
        offset_data: document-like object with `.text` and span
            annotations consumable by `offset_to_biluo`.
        vocabulary_lookup: lookup object mapping token -> integer id.
        tag_lookup: lookup object mapping BILUO tag -> integer id.

    Returns:
        (words_id, tags_id): two equal-length lists of integer ids.

    Raises:
        AssertionError: if the word and tag sequences differ in length.
    """
    tags = offset_to_biluo(offset_data)
    words = offset_data.text

    assert len(words) == len(tags), "Words and tags lengths don't match"

    encoded_words = [vocabulary_lookup.lookup(w) for w in words]
    encoded_tags = [tag_lookup.lookup(t) for t in tags]

    return encoded_words, encoded_tags
def offset_to_sentence(sequence):
    """Build a SentenceX carrying the BILUO encoding and metadata of `sequence`.

    Args:
        sequence: document-like object with `.text`, `.id`, `.label`,
            `.extra_attr`, and span annotations.

    Returns:
        A SentenceX whose meta merges the label with all extra attributes.

    Raises:
        AssertionError: possibly raised by `offset_to_biluo`.
    """
    encoding = offset_to_biluo(sequence)  # may raise AssertionError

    sentence = SentenceX(
        word_lines=sequence.text,
        attribute_lines=[encoding],
        id=sequence.id,
    )

    meta = {'label': sequence.label}
    meta.update(sequence.extra_attr)
    sentence.meta = meta

    return sentence
def test_offset_to_biluo():
    '''Smoke-test offset_to_biluo by printing its output.

    NOTE(review): the live input below is a plain list of tag strings,
    while every other call site in this file passes a Document/Sequence
    carrying a `span_set` — this looks broken or intentionally exercising
    an error path; confirm against offset_to_biluo's contract. The
    original (disabled) Document-based fixture was:

    seq = Document("王小明在北京的清华大学读书。")
    seq.span_set.append(Span(0, 3, 'PERSON', '王小明'))
    seq.span_set.append(Span(4, 6, 'GPE', '北京'))
    seq.span_set.append(Span(7, 11, 'ORG', '清华大学'))
    '''
    seq = ['B-I', 'I-O']
    #check_result = seq.check_span_set()
    #print(check_result)
    encoding = offset_to_biluo(seq)
    print(encoding)
def preprocss(data, maxlen=None, intent_lookup_table=None):
    """Encode documents for joint tagging + intent, returning one-hot intents.

    Args:
        data: iterable of documents consumable by `offset_to_biluo`.
        maxlen: target sequence length; computed from the data when falsy.
        intent_lookup_table: optional Lookuper for intent labels; built
            from the observed labels when falsy.

    Returns:
        (x, intent_one_hot, y, intent_lookup_table).
    """
    word_id_seqs = []
    tag_id_seqs = []
    intent_labels = []

    for doc in data:
        biluo_tags = offset_to_biluo(doc)

        # The "label" field lives directly on the document; any other
        # configured intent field is read from extra_attr.
        field = config['intent_field']
        if field in ["label"]:
            label = getattr(doc, field)
        else:
            label = doc.extra_attr[field]

        tag_id_seqs.append([tag_lookuper.lookup(t) for t in biluo_tags])
        word_id_seqs.append([vocabulary_lookuper.lookup(w) for w in doc.text])
        intent_labels.append(label)

    if not intent_lookup_table:
        unique_intents = list(set(intent_labels))
        intent_lookup_table = Lookuper(
            {v: i for i, v in enumerate(unique_intents)})

    intent_int_list = [intent_lookup_table.lookup(i) for i in intent_labels]

    if not maxlen:
        maxlen = max(len(seq) for seq in word_id_seqs)

    # Right-pad inputs and labels; label pad value 0 is masked downstream.
    x = tf.keras.preprocessing.sequence.pad_sequences(
        word_id_seqs, maxlen, padding='post')
    y = tf.keras.preprocessing.sequence.pad_sequences(
        tag_id_seqs, maxlen, value=0, padding='post')

    intent_np_array = np.array(intent_int_list)
    intent_one_hot = one_hot(intent_np_array, np.max(intent_np_array) + 1)

    return x, intent_one_hot, y, intent_lookup_table
def preprocss(data, intent_lookup_table=None):
    """Encode documents for joint tagging + intent, returning integer intents.

    Args:
        data: iterable of documents with `.text`, `.label`, and span
            annotations consumable by `offset_to_biluo`.
        intent_lookup_table: optional Lookuper for intent labels; built
            from the observed labels when falsy.

    Returns:
        (x, intent_ids, y, intent_lookup_table) where intent_ids is a
        1-D numpy array of intent class ids.
    """
    word_id_seqs = []
    tag_id_seqs = []
    intent_labels = []

    for doc in data:
        biluo_tags = offset_to_biluo(doc)
        tag_id_seqs.append([tag_lookuper.lookup(t) for t in biluo_tags])
        word_id_seqs.append([vocabulary_lookuper.lookup(w) for w in doc.text])
        intent_labels.append(doc.label)

    if not intent_lookup_table:
        unique_intents = list(set(intent_labels))
        intent_lookup_table = Lookuper(
            {v: i for i, v in enumerate(unique_intents)})

    intent_int_list = [intent_lookup_table.lookup(i) for i in intent_labels]

    maxlen = max(len(seq) for seq in word_id_seqs)

    # Right-pad inputs and labels; label pad value 0 is masked downstream.
    x = tf.keras.preprocessing.sequence.pad_sequences(
        word_id_seqs, maxlen, padding='post')
    y = tf.keras.preprocessing.sequence.pad_sequences(
        tag_id_seqs, maxlen, value=0, padding='post')

    return x, numpy.array(intent_int_list), y, intent_lookup_table
def preprocess(
    data: List[Sequence],
    tag_lookup_table: Lookuper,
    vocabulary_look_table: Lookuper,
    seq_maxlen: Union[None, int] = None,
) -> Tuple[np.ndarray, np.ndarray, int]:
    """Encode documents into padded id matrices, reporting the length used.

    Args:
        data: list of Sequence objects consumable by `offset_to_biluo`.
        tag_lookup_table: Lookuper mapping BILUO tag -> integer id.
        vocabulary_look_table: Lookuper mapping token -> integer id.
        seq_maxlen: target sequence length; computed from the data when
            falsy.

    Returns:
        (x, y, seq_maxlen): right-padded word-id and tag-id matrices plus
        the (possibly computed) sequence length.
    """
    word_id_seqs = []
    tag_id_seqs = []

    for doc in data:
        biluo_tags = offset_to_biluo(doc)
        tag_id_seqs.append([tag_lookup_table.lookup(t) for t in biluo_tags])
        word_id_seqs.append(
            [vocabulary_look_table.lookup(w) for w in doc.text])

    if not seq_maxlen:
        seq_maxlen = max(len(seq) for seq in word_id_seqs)

    print(">>> maxlen: {}".format(seq_maxlen))

    # Right-pad inputs; label pad value 0 is masked downstream.
    x = tf.keras.preprocessing.sequence.pad_sequences(
        word_id_seqs, seq_maxlen, padding="post")
    y = tf.keras.preprocessing.sequence.pad_sequences(
        tag_id_seqs, seq_maxlen, value=0, padding="post")

    return x, y, seq_maxlen