Ejemplo n.º 1
0
def generate_words_tags_from_tsv(tsv_file_path, lower=False, gold=True, max_seq_length=None, sent_delimiter=None,
                                 char_level=False, hard_constraint=False):
    """Yield ``(words, tags)`` pairs from a ``.tsv`` tagging file.

    Args:
        tsv_file_path: Path to a tab separated file; column 0 holds the token,
            column 1 holds the tag.
        lower: Lowercase every token if ``True``.
        gold: Read gold tags from column 1; otherwise yield ``None`` for tags.
        max_seq_length: If set, over-long sentences are split with
            ``split_long_sentence_into`` and yielded piece by piece.
        sent_delimiter: Delimiter token(s) preferred as split points.
        char_level: Measure sentence length in characters instead of tokens.
        hard_constraint: Enforce ``max_seq_length`` even when no delimiter
            is available.

    Yields:
        Tuples of ``(list_of_words, list_of_tags_or_None)``.

    Raises:
        ValueError: If ``gold`` is ``True`` and a row lacks its tag column.
    """
    for sent in read_tsv_as_sents(tsv_file_path):
        words = [cells[0] for cells in sent]
        if max_seq_length:
            offset = 0
            # Try to split the sequence to make each piece fit into max_seq_length.
            for shorter_words in split_long_sentence_into(words, max_seq_length, sent_delimiter, char_level,
                                                          hard_constraint):
                if gold:
                    # Tags are sliced by running offset so they stay aligned
                    # with the corresponding piece of the sentence.
                    shorter_tags = [cells[1] for cells in sent[offset:offset + len(shorter_words)]]
                    offset += len(shorter_words)
                else:
                    shorter_tags = None
                if lower:
                    shorter_words = [word.lower() for word in shorter_words]
                yield shorter_words, shorter_tags
        else:
            if gold:
                try:
                    tags = [cells[1] for cells in sent]
                except IndexError as e:
                    # A row is missing its tag column; a bare `except:` here would
                    # also swallow SystemExit/KeyboardInterrupt and hide the cause.
                    raise ValueError(f'Failed to load {tsv_file_path}: {sent}') from e
            else:
                tags = None
            if lower:
                words = [word.lower() for word in words]
            yield words, tags
Ejemplo n.º 2
0
def generator_words_tags(tsv_file_path,
                         lower=True,
                         gold=True,
                         max_seq_length=None):
    """Yield ``(words, tags)`` pairs from a ``.tsv`` tagging file.

    Sentences longer than ``max_seq_length`` are broken up with
    ``split_long_sentence_into`` and yielded piece by piece; tags (column 1)
    are sliced in lockstep with each piece when ``gold`` is ``True``.
    """
    for rows in read_tsv(tsv_file_path):
        tokens = [row[0] for row in rows]
        if not (max_seq_length and len(tokens) > max_seq_length):
            # Short enough: emit the whole sentence at once.
            labels = [row[1] for row in rows] if gold else None
            if lower:
                tokens = [tok.lower() for tok in tokens]
            yield tokens, labels
            continue
        # Over-long sentence: emit aligned (piece, labels) chunks.
        start = 0
        for piece in split_long_sentence_into(tokens, max_seq_length):
            end = start + len(piece)
            labels = [row[1] for row in rows[start:end]] if gold else None
            start = end
            if lower:
                piece = [tok.lower() for tok in piece]
            yield piece, labels
Ejemplo n.º 3
0
 def test_split_127(self):
     """A 127-character sentence with limit 126 must split after the last
     delimiter (the final '，') rather than mid-clause, producing one
     126-character piece plus the 9-character remainder."""
     sent = [
         '“', '旧', '货', '”', '不', '仅', '仅', '是', '指', '新', '货', '被', '使',
         '用', '才', '成', '为', '旧', '货', ';', '还', '包', '括', '商', '品', '的',
         '调', '剂', ',', '即', '卖', '出', '旧', '货', '的', '人', '是', '为', '了',
         '买', '入', '新', '货', ',', '买', '入', '旧', '货', '的', '人', '是', '因',
         '为', '符', '合', '自', '己', '的', '需', '要', ',', '不', '管', '新', '旧',
         ';', '有', '的', '商', '店', '还', '包', '括', '一', '些', '高', '档', '的',
         '工', '艺', '品', '、', '古', '董', '、', '字', '画', '、', '家', '具', '等',
         '商', '品', ';', '有', '的', '还', '包', '括', '新', '货', '卖', '不', '出',
         '去', ',', '企', '业', '或', '店', '主', '为', '了', '盘', '活', '资', '金',
         ',', '削', '价', '销', '售', '积', '压', '产', '品', '。'
     ]
     results = list(split_long_sentence_into(sent, 126))
     # Expected: everything up to and including the penultimate '，', then the tail.
     self.assertListEqual([[
         '“', '旧', '货', '”', '不', '仅', '仅', '是', '指', '新', '货', '被', '使',
         '用', '才', '成', '为', '旧', '货', ';', '还', '包', '括', '商', '品', '的',
         '调', '剂', ',', '即', '卖', '出', '旧', '货', '的', '人', '是', '为', '了',
         '买', '入', '新', '货', ',', '买', '入', '旧', '货', '的', '人', '是', '因',
         '为', '符', '合', '自', '己', '的', '需', '要', ',', '不', '管', '新', '旧',
         ';', '有', '的', '商', '店', '还', '包', '括', '一', '些', '高', '档', '的',
         '工', '艺', '品', '、', '古', '董', '、', '字', '画', '、', '家', '具', '等',
         '商', '品', ';', '有', '的', '还', '包', '括', '新', '货', '卖', '不', '出',
         '去', ',', '企', '业', '或', '店', '主', '为', '了', '盘', '活', '资', '金',
         ','
     ], ['削', '价', '销', '售', '积', '压', '产', '品', '。']], results)
Ejemplo n.º 4
0
    def load_file(self, filepath: str):
        """Load tokenized corpus. The format is one sentence per line, where each line consisits of tokens seperated
        by a delimiter (usually space).

        .. highlight:: bash
        .. code-block:: bash

            $ head train.txt
            上海 浦东 开发 与 法制 建设 同步
            新华社 上海 二月 十日 电 ( 记者 谢金虎 、 张持坚 )

        Args:
            filepath: The path to the corpus.

        Yields:
            Samples of the form ``{'token': [...]}``; sentences longer than
            ``self.max_seq_len`` are split into several samples.
        """
        f = TimingFileIterator(filepath)
        for line in f:
            line = line.rstrip('\n')
            # str.split never returns an empty list — a blank line splits to [''],
            # which is truthy — so test the line itself to skip empty sentences.
            if not line:
                continue
            tokens = line.split(self.delimiter)
            if self.max_seq_len and sum(len(t) for t in tokens) > self.max_seq_len:
                # Over-long sentence: emit it as several shorter samples.
                for short_sents in split_long_sentence_into(tokens, self.max_seq_len, self.sent_delimiter,
                                                            char_level=self.char_level,
                                                            hard_constraint=self.hard_constraint):
                    yield {'token': short_sents}
            else:
                yield {'token': tokens}
            f.log(line[:20])
        f.erase()
Ejemplo n.º 5
0
File: tsv.py — Project: lei1993/HanLP
    def load_file(self, filepath):
        """Load a ``.tsv`` file. A ``.tsv`` file for tagging is defined as a tab separated text file, where non-empty
        lines have two columns for token and tag respectively, empty lines mark the end of sentences.

        Args:
            filepath: Path to a ``.tsv`` tagging file.

        Yields:
            Samples of the form ``{'token': [...], 'tag': [...]}``; sentences
            longer than ``self.max_seq_len`` are split into aligned pieces.

        .. highlight:: bash
        .. code-block:: bash

            $ head eng.train.tsv
            -DOCSTART-      O

            EU      S-ORG
            rejects O
            German  S-MISC
            call    O
            to      O
            boycott O
            British S-MISC
            lamb    O

        """
        filepath = get_resource(filepath)
        for words, tags in generate_words_tags_from_tsv(filepath, lower=False):
            if self.max_seq_len:
                # Slice tags with a running offset so they stay aligned with
                # each shortened piece of the sentence.
                start = 0
                for short_sents in split_long_sentence_into(
                        words,
                        self.max_seq_len,
                        self.sent_delimiter,
                        char_level=self.char_level,
                        hard_constraint=self.hard_constraint):
                    end = start + len(short_sents)
                    yield {'token': short_sents, 'tag': tags[start:end]}
                    start = end
            else:
                yield {'token': words, 'tag': tags}
Ejemplo n.º 6
0
 def test_split_long_sentence_into(self):
     """With max length 2 the sentence should break after every ',' delimiter."""
     tokens = ['a', 'b', 'c', ',', 'd', 'e', ',', 'f', 'g', ',', 'h']
     chunks = list(split_long_sentence_into(tokens, 2))
     expected = [['a', 'b', 'c', ','], ['d', 'e', ','], ['f', 'g', ','], ['h']]
     self.assertListEqual(expected, chunks)