Example #1
0
def preprocss(data, tag_look_table, vocabulary_lookup_table):
    """Encode offset-annotated documents into padded id matrices.

    Args:
        data: iterable of offset documents (each with a ``.text`` attribute).
        tag_look_table: lookup table mapping BILUO tag strings to ids.
        vocabulary_lookup_table: lookup table mapping tokens to ids.

    Returns:
        Tuple ``(x, y)`` of right-padded arrays of word ids and tag ids.
    """
    raw_x = []
    raw_y = []

    for offset_data in data:
        biluo_tags = offset_to_biluo(offset_data)
        tokens = offset_data.text

        # Look up tags first, then tokens, matching the established order.
        encoded_tags = [tag_look_table.lookup(tag) for tag in biluo_tags]
        encoded_tokens = [vocabulary_lookup_table.lookup(tok) for tok in tokens]

        raw_x.append(encoded_tokens)
        raw_y.append(encoded_tags)

    maxlen = max(len(seq) for seq in raw_x)

    # Right-pad inputs; labels are padded with 0 — any integer would do,
    # as padding positions are assumed to be masked downstream.
    x = tf.keras.preprocessing.sequence.pad_sequences(
        raw_x, maxlen, padding='post')
    y = tf.keras.preprocessing.sequence.pad_sequences(
        raw_y, maxlen, value=0, padding='post')

    return x, y
Example #2
0
def process_one_line(line, logger=sys.stderr):
    """Parse one BSON-encoded corpus line into a (Document, SentenceX) pair.

    Args:
        line: raw BSON bytes/string for a single record with keys
            ``text``, ``intent``, ``id``, ``domain`` and ``entities``.
        logger: writable stream for span-validation failure messages.

    Returns:
        Tuple ``(seq, sentence)``: the Document with its span set filled in,
        and the equivalent BILUO-encoded SentenceX.

    Raises:
        CheckFailedError: if any entity span fails offset validation.
        AssertionError: propagated from offset_to_biluo.
    """
    obj = bson.loads(line)
    text = obj['text']
    intent = obj['intent']
    doc_id = obj["id"]  # renamed local: don't shadow the builtin `id`
    domain = obj["domain"]
    seq = Document(text, label=intent, id=doc_id)
    seq.domain = domain
    for entity in obj['entities']:
        start = int(entity['start'])  # original index start at 0
        end = int(entity['end'])
        # Use a distinct name: rebinding `entity` shadowed the loop dict.
        entity_type = entity['entity']

        try:
            span = Span(start, end, entity_type)  # may raise OffsetSpanCheckError
        except OffsetSpanCheckError as e:
            logger.write("{}\tspan init failed: {}\n".format(doc_id, e))
            # Chain the cause so the original validation error isn't lost.
            raise CheckFailedError from e

        # get value which is not in corpus_item object
        # span.fill_text(corpus_item['text'])

        seq.span_set.append(span)

    encoding = offset_to_biluo(seq)  # may raise AssertionError

    sentence = SentenceX(word_lines=text,
                         attribute_lines=[encoding],
                         id=seq.id)
    sentence.meta = {'domain': domain, 'label': intent}

    return seq, sentence
Example #3
0
    def preprocss(data, maxlen):
        """Encode offset documents into fixed-length id matrices.

        Args:
            data: iterable of offset documents (each with a ``.text`` attribute).
            maxlen: target sequence length; ``None`` means "longest sequence".

        Returns:
            Tuple ``(x, y)`` of right-padded word-id and tag-id arrays.
        """
        raw_x, raw_y = [], []

        for offset_data in data:
            biluo_tags = offset_to_biluo(offset_data)
            tokens = offset_data.text

            # Tag lookups happen before token lookups, as before.
            encoded_tags = [tag_lookuper.lookup(tag) for tag in biluo_tags]
            encoded_tokens = [vocabulary_lookuper.lookup(tok) for tok in tokens]

            raw_x.append(encoded_tokens)
            raw_y.append(encoded_tags)

        if maxlen is None:
            maxlen = max(len(seq) for seq in raw_x)

        print(">>> maxlen: {}".format(maxlen))

        # Right-pad both matrices; label padding value 0 is assumed to be
        # masked downstream (any integer would work).
        x = tf.keras.preprocessing.sequence.pad_sequences(
            raw_x, maxlen, padding="post"
        )
        y = tf.keras.preprocessing.sequence.pad_sequences(
            raw_y, maxlen, value=0, padding="post"
        )

        return x, y
    def _keras_data_preprocss(self, data: 'List[Sequence]', tag_lookuper, maxlen=None):
        """Build (x, y) training arrays for the Keras model.

        Args:
            data: sequences carrying offset annotations.
            tag_lookuper: lookup table mapping BILUO tags to integer ids.
            maxlen: pad length; defaults to the longest raw text.

        Returns:
            Tuple ``(x, y)``: x produced by ``get_np_feature`` from the raw
            text strings, y a right-padded array of tag ids.
        """
        import tensorflow as tf
        from tokenizer_tools.tagset.converter.offset_to_biluo import offset_to_biluo

        raw_x, raw_y = [], []

        for intent_data in data:
            biluo_tags = offset_to_biluo(intent_data)
            encoded_tags = [tag_lookuper.lookup(tag) for tag in biluo_tags]
            # x entries are plain joined text strings, not ids —
            # presumably get_np_feature does its own featurization (verify).
            joined_text = ''.join(intent_data.text)

            raw_x.append(joined_text)
            raw_y.append(encoded_tags)

        if maxlen is None:
            maxlen = max(len(s) for s in raw_x)

        x = get_np_feature(raw_x, maxlen)

        y = tf.keras.preprocessing.sequence.pad_sequences(
            raw_y, maxlen, value=0, padding='post')

        return x, y
    def ner_preprocss(data, maxlen, cls_info_len):
        """Prepare NER inputs and labels for the multi-task model.

        Labels are right-padded to ``maxlen`` and then left-padded by
        ``cls_info_len`` extra zeros, so y ends up ``cls_info_len`` longer
        than x — presumably to align with class-info positions prepended
        elsewhere in the model input (TODO confirm against model layout).

        Returns:
            Tuple ``(x_ner, y_ner)``.
        """
        raw_x_ner, raw_y_ner = [], []

        for offset_data in data:
            biluo_tags = offset_to_biluo(offset_data)
            tokens = offset_data.text

            encoded_tags = [ner_tag_lookuper.lookup(tag) for tag in biluo_tags]
            encoded_tokens = [vocabulary_lookuper.lookup(tok) for tok in tokens]

            raw_x_ner.append(encoded_tokens)
            raw_y_ner.append(encoded_tags)

        if maxlen is None:
            maxlen = max(len(seq) for seq in raw_x_ner)

        maxlen_mt = maxlen + cls_info_len
        print(">>> maxlen: {}".format(maxlen))

        x_ner = tf.keras.preprocessing.sequence.pad_sequences(
            raw_x_ner, maxlen, padding="post")  # right padding

        # Stage 1: right-pad labels to maxlen.
        y_ner = tf.keras.preprocessing.sequence.pad_sequences(
            raw_y_ner, maxlen, value=0, padding="post")
        # Stage 2: left-pad with cls_info_len more zeros to reach maxlen_mt.
        y_ner = tf.keras.preprocessing.sequence.pad_sequences(
            y_ner, maxlen_mt, value=0, padding="pre")

        return x_ner, y_ner
Example #6
0
    def preprocss(data, maxlen):
        """Build joint NER + classification training arrays.

        Args:
            data: iterable of offset documents with ``.label`` and ``.text``.
            maxlen: pad length; ``None`` means "longest sequence".

        Returns:
            Tuple ``(x, y_ner, y_cls)``: padded word ids, padded tag ids,
            and an (n, 1) column vector of class-label ids.
        """
        raw_x = []
        raw_y_ner = []
        raw_y_cls = []

        for offset_data in data:
            biluo_tags = offset_to_biluo(offset_data)
            label = offset_data.label
            tokens = offset_data.text

            # Same lookup order as before: NER tags, class label, vocabulary.
            encoded_tags = [ner_tag_lookuper.lookup(tag) for tag in biluo_tags]
            encoded_label = cls_tag_lookuper.lookup(label)
            encoded_tokens = [vocabulary_lookuper.lookup(tok) for tok in tokens]

            raw_x.append(encoded_tokens)
            raw_y_ner.append(encoded_tags)
            raw_y_cls.append(encoded_label)

        if maxlen is None:
            maxlen = max(len(seq) for seq in raw_x)

        print(">>> maxlen: {}".format(maxlen))

        x = tf.keras.preprocessing.sequence.pad_sequences(
            raw_x, maxlen, padding="post")  # right padding

        y_ner = tf.keras.preprocessing.sequence.pad_sequences(
            raw_y_ner, maxlen, value=0, padding="post")

        # Reshape to (n, 1) so it matches a per-sample classification head.
        y_cls = np.array(raw_y_cls)[:, np.newaxis]

        return x, y_ner, y_cls
Example #7
0
def parse_fn(offset_data):
    """Convert one offset document to the ``((words, n_words), tags)`` format.

    Args:
        offset_data: offset-annotated document with a ``.text`` attribute.

    Returns:
        Tuple ``((words, len(words)), tags)``.

    Raises:
        AssertionError: if the tag sequence length does not match the text.
    """
    tags = offset_to_biluo(offset_data)
    words = offset_data.text
    assert len(words) == len(tags), "Words and tags lengths don't match"

    # BUG FIX: logging's first argument must be a format string; the old call
    # passed two positional objects, which raised a formatting error when the
    # record was emitted.
    logger.debug("%s %s", (words, len(words)), tags)

    return (words, len(words)), tags
Example #8
0
def parse_fn(offset_data, vocabulary_lookup, tag_lookup):
    """Encode one offset document into parallel word-id and tag-id lists.

    Args:
        offset_data: offset-annotated document with a ``.text`` attribute.
        vocabulary_lookup: lookup table mapping tokens to ids.
        tag_lookup: lookup table mapping BILUO tags to ids.

    Returns:
        Tuple ``(words_id, tags_id)`` of equal-length id lists.

    Raises:
        AssertionError: if text and tag sequence lengths differ.
    """
    tags = offset_to_biluo(offset_data)
    words = offset_data.text
    assert len(words) == len(tags), "Words and tags lengths don't match"

    encoded_words = [vocabulary_lookup.lookup(w) for w in words]
    encoded_tags = [tag_lookup.lookup(t) for t in tags]

    return encoded_words, encoded_tags
Example #9
0
def offset_to_sentence(sequence):
    """Render an offset-annotated sequence as a SentenceX.

    Copies the label and all extra attributes into ``sentence.meta``.
    May raise AssertionError, propagated from ``offset_to_biluo``.
    """
    encoding = offset_to_biluo(sequence)

    sentence = SentenceX(
        word_lines=sequence.text,
        attribute_lines=[encoding],
        id=sequence.id,
    )
    sentence.meta = {'label': sequence.label}
    sentence.meta.update(sequence.extra_attr)

    return sentence
def test_offset_to_biluo():
    """Smoke-test offset_to_biluo on a plain tag list and print the result.

    A richer Document/Span fixture existed previously but was disabled;
    the current input is simply a list of tag strings.
    """
    seq = ['B-I', 'I-O']

    encoding = offset_to_biluo(seq)
    print(encoding)
Example #11
0
def preprocss(data, maxlen=None, intent_lookup_table=None):
    """Encode documents into (x, intent one-hot, y, intent_lookup_table).

    Args:
        data: iterable of offset documents.
        maxlen: pad length; falsy means "use the longest sequence".
        intent_lookup_table: optional Lookuper for intent labels; built
            from the data when not supplied.

    Returns:
        Padded word ids, one-hot intent matrix, padded tag ids, and the
        (possibly freshly built) intent lookup table.
    """
    raw_x, raw_y, raw_intent = [], [], []

    for offset_data in data:
        biluo_tags = offset_to_biluo(offset_data)
        tokens = offset_data.text
        intent_field = config['intent_field']
        # "label" lives directly on the document; any other field is read
        # from extra_attr.
        if intent_field in ["label"]:
            label = getattr(offset_data, intent_field)
        else:
            label = offset_data.extra_attr[intent_field]

        encoded_tags = [tag_lookuper.lookup(tag) for tag in biluo_tags]
        encoded_tokens = [vocabulary_lookuper.lookup(tok) for tok in tokens]

        raw_x.append(encoded_tokens)
        raw_y.append(encoded_tags)
        raw_intent.append(label)

    if not intent_lookup_table:
        distinct_intents = list(set(raw_intent))
        intent_lookup_table = Lookuper(
            {intent: index for index, intent in enumerate(distinct_intents)})

    intent_int_list = [intent_lookup_table.lookup(i) for i in raw_intent]

    if not maxlen:
        maxlen = max(len(seq) for seq in raw_x)

    x = tf.keras.preprocessing.sequence.pad_sequences(
        raw_x, maxlen, padding='post')  # right padding
    # Label padding value 0 is assumed to be masked downstream.
    y = tf.keras.preprocessing.sequence.pad_sequences(
        raw_y, maxlen, value=0, padding='post')

    # NOTE(review): one-hot width comes from the data's max intent id, not
    # the lookup-table size — a pre-built table whose highest ids are absent
    # from this batch would yield a narrower matrix; confirm intended.
    intent_np_array = np.array(intent_int_list)
    intent_one_hot = one_hot(intent_np_array, np.max(intent_np_array) + 1)

    return x, intent_one_hot, y, intent_lookup_table
def preprocss(data, intent_lookup_table=None):
    """Encode documents into (x, intent id vector, y, intent_lookup_table).

    The pad length is always the longest sequence in ``data``; intents are
    returned as a 1-D numpy array of ids (no one-hot encoding).

    Args:
        data: iterable of offset documents with ``.label`` and ``.text``.
        intent_lookup_table: optional Lookuper for intent labels; built
            from the data when not supplied.
    """
    raw_x, raw_y, raw_intent = [], [], []

    for offset_data in data:
        biluo_tags = offset_to_biluo(offset_data)
        tokens = offset_data.text

        encoded_tags = [tag_lookuper.lookup(tag) for tag in biluo_tags]
        encoded_tokens = [vocabulary_lookuper.lookup(tok) for tok in tokens]

        raw_x.append(encoded_tokens)
        raw_y.append(encoded_tags)
        raw_intent.append(offset_data.label)

    if not intent_lookup_table:
        distinct_intents = list(set(raw_intent))
        intent_lookup_table = Lookuper(
            {intent: index for index, intent in enumerate(distinct_intents)})

    intent_int_list = [intent_lookup_table.lookup(i) for i in raw_intent]

    maxlen = max(len(seq) for seq in raw_x)

    x = tf.keras.preprocessing.sequence.pad_sequences(
        raw_x, maxlen, padding='post')  # right padding
    # Label padding value 0 is assumed to be masked downstream.
    y = tf.keras.preprocessing.sequence.pad_sequences(
        raw_y, maxlen, value=0, padding='post')

    return x, numpy.array(intent_int_list), y, intent_lookup_table
Example #13
0
def preprocess(
    data: List[Sequence],
    tag_lookup_table: Lookuper,
    vocabulary_look_table: Lookuper,
    seq_maxlen: Union[None, int] = None,
) -> Tuple[np.ndarray, np.ndarray, int]:
    """Encode offset documents into padded id matrices.

    Args:
        data: offset-annotated sequences.
        tag_lookup_table: maps BILUO tag strings to ids.
        vocabulary_look_table: maps tokens to ids.
        seq_maxlen: pad length; ``None`` means "use the longest sequence
            in ``data``".

    Returns:
        Tuple ``(x, y, seq_maxlen)``: right-padded word-id and tag-id
        arrays, plus the sequence length actually used.
    """
    raw_x = []
    raw_y = []

    for offset_data in data:
        tags = offset_to_biluo(offset_data)
        words = offset_data.text

        tag_ids = [tag_lookup_table.lookup(i) for i in tags]
        word_ids = [vocabulary_look_table.lookup(i) for i in words]

        raw_x.append(word_ids)
        raw_y.append(tag_ids)

    # BUG FIX: test against None explicitly — `not seq_maxlen` also treated
    # an explicit 0 as "unset", contradicting the Union[None, int] contract.
    if seq_maxlen is None:
        seq_maxlen = max(len(s) for s in raw_x)

    print(">>> maxlen: {}".format(seq_maxlen))

    x = tf.keras.preprocessing.sequence.pad_sequences(
        raw_x, seq_maxlen, padding="post")  # right padding

    # Labels are padded with 0; any integer works, as padding positions are
    # assumed to be masked downstream.
    y = tf.keras.preprocessing.sequence.pad_sequences(raw_y,
                                                      seq_maxlen,
                                                      value=0,
                                                      padding="post")

    return x, y, seq_maxlen