Example 1
    def load_file(self, filepath):
        """Load a tokenized corpus and cache it as a flat binary file of 4-byte little-endian token ids."""
        cache, valid = file_cache(filepath, not self.cache)
        if not valid or (self.vocab.mutable and not os.path.isfile(self.vocab_path)):
            with open(cache, 'wb') as out:
                tokens, lines = 0, 0
                f = TimingFileIterator(filepath)
                for line in f:
                    if self.strip:
                        line = line.strip()
                        if not line:
                            continue
                    sample = {'text': line}
                    sample = self.transform_sample(sample, inplace=True)
                    for token_id in sample['token_id']:
                        out.write(token_id.to_bytes(4, 'little'))
                    tokens += len(sample['token_id'])
                    lines += 1
                    f.log(f'{tokens // 1000000}M tokens, {lines // 1000000}M lines\n'
                          f'{sample["token"][:10]}')
                f.erase()
                if self.vocab.mutable:
                    self.vocab.lock()
                    # Persist the locked vocab so cached runs can reload it
                    # (assumes Vocab.to_dict / copy_from for (de)serialization).
                    hanlp_common.io.save_json(self.vocab.to_dict(), self.vocab_path)
                self.num_tokens = tokens
        else:
            # Each token id occupies 4 bytes, so the cache size divided by 4 gives the token count.
            self.num_tokens = int(os.path.getsize(cache) / 4)
            if self.vocab.mutable:
                # Restore the vocab saved above (same Vocab.to_dict / copy_from assumption).
                self.vocab.copy_from(hanlp_common.io.load_json(self.vocab_path))
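The cache written above is just a flat stream of 4-byte little-endian token ids, which is also why the cached branch recovers the token count as the file size divided by 4. A minimal sketch of reading such a cache back without HanLP (the ``tokens.cache`` path is hypothetical):

import struct

def read_token_ids(cache_path: str):
    """Yield token ids from a cache of 4-byte little-endian unsigned integers."""
    with open(cache_path, 'rb') as src:
        while True:
            chunk = src.read(4)
            if len(chunk) < 4:
                break
            yield struct.unpack('<I', chunk)[0]

num_tokens = sum(1 for _ in read_token_ids('tokens.cache'))  # equals os.path.getsize(...) // 4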
Example 2
    def dataset_document_iterator(self, file_path: str) -> Iterator[List[OntonotesSentence]]:
        """An iterator over CoNLL-formatted files which yields documents, regardless
        of the number of document annotations in a particular file. This is useful
        for CoNLL data which has been preprocessed, such as the preprocessing that
        takes place for the 2012 CoNLL Coreference Resolution task.

        Args:
            file_path: The path to a CoNLL-formatted file.

        Yields:
            One list of ``OntonotesSentence`` per document in the file.
        """
        open_file = TimingFileIterator(file_path)
        conll_rows = []
        document: List[OntonotesSentence] = []
        for line in open_file:
            open_file.log(f'Loading {os.path.basename(file_path)}')
            line = line.strip()
            if line != "" and not line.startswith("#"):
                # Non-empty line. Collect the annotation.
                conll_rows.append(line)
            else:
                if conll_rows:
                    document.append(self._conll_rows_to_sentence(conll_rows))
                    conll_rows = []
            if line.startswith("#end document"):
                yield document
                document = []
        open_file.erase()
        if document:
            # Collect any stragglers or files which might not
            # have the '#end document' format for the end of the file.
            yield document
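The loop above collects annotation rows into a sentence at every blank or comment line and flushes a document at ``#end document``. A simplified, self-contained sketch of the same grouping that keeps plain rows instead of building ``OntonotesSentence`` objects (the sample lines are invented):

from typing import Iterable, Iterator, List

def iter_documents(lines: Iterable[str]) -> Iterator[List[List[str]]]:
    """Group CoNLL-style lines into documents, each a list of sentences (lists of rows)."""
    rows, document = [], []
    for line in lines:
        line = line.strip()
        if line and not line.startswith('#'):
            rows.append(line)        # annotation row of the current sentence
        elif rows:                   # blank line or comment closes the sentence
            document.append(rows)
            rows = []
        if line.startswith('#end document'):
            yield document
            document = []
    if document:                     # tolerate files without a trailing '#end document'
        yield document

sample = ['#begin document (demo)', 'w1 ...', 'w2 ...', '', '#end document']
print(list(iter_documents(sample)))  # [[['w1 ...', 'w2 ...']]]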
Example 3
    def load_file(self, filepath: str):
        """Load tokenized corpus. The format is one sentence per line, where each line consisits of tokens seperated
        by a delimiter (usually space).

        .. highlight:: bash
        .. code-block:: bash

            $ head train.txt
            上海 浦东 开发 与 法制 建设 同步
            新华社 上海 二月 十日 电 ( 记者 谢金虎 、 张持坚 )

        Args:
            filepath: The path to the corpus.
        """
        f = TimingFileIterator(filepath)
        # longest_sent = 0
        for line in f:
            line = line.rstrip('\n')
            tokens = line.split(self.delimiter)
            if not tokens:
                continue
            if self.max_seq_len and sum(len(t) for t in tokens) > self.max_seq_len:
                # debug = []
                for short_sents in split_long_sentence_into(tokens, self.max_seq_len, self.sent_delimiter,
                                                            char_level=self.char_level,
                                                            hard_constraint=self.hard_constraint):
                    # debug.extend(short_sents)
                    # longest_sent = max(longest_sent, len(''.join(short_sents)))
                    yield {'token': short_sents}
                # assert debug == tokens
            else:
                # longest_sent = max(longest_sent, len(''.join(tokens)))
                yield {'token': tokens}
            f.log(line[:20])
        f.erase()
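The over-length branch above delegates to HanLP's ``split_long_sentence_into``; the helper below is a deliberately simplified stand-in (not the library function, and the delimiter set is invented) that conveys the idea: cut at a sentence delimiter once the character budget is reached, so each yielded piece is still a list of tokens.

def naive_split(tokens, max_chars, sent_delimiter=('。', '！', '？', '，')):
    """Cut a token list at the first delimiter reached after the character budget is exceeded."""
    piece, length = [], 0
    for token in tokens:
        piece.append(token)
        length += len(token)
        if token in sent_delimiter and length >= max_chars:
            yield piece
            piece, length = [], 0
    if piece:  # emit the trailing piece even without a closing delimiter
        yield piece

print(list(naive_split(['今天', '天气', '好', '，', '我们', '出去', '玩', '。'], max_chars=5)))
# [['今天', '天气', '好', '，'], ['我们', '出去', '玩', '。']]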
Example 4
    def load_file(self, filepath: str):
        """Load ``.jsonlines`` CoNLL12-style corpus. Samples of this corpus can be found using the following scripts.

        .. highlight:: python
        .. code-block:: python

            import json
            from hanlp_common.document import Document
            from hanlp.datasets.srl.ontonotes5.chinese import ONTONOTES5_CONLL12_CHINESE_DEV
            from hanlp.utils.io_util import get_resource

            with open(get_resource(ONTONOTES5_CONLL12_CHINESE_DEV)) as src:
                for line in src:
                    doc = json.loads(line)
                    print(Document(doc))
                    break

        Args:
            filepath: ``.jsonlines`` CoNLL12 corpus.
        """
        filename = os.path.basename(filepath)
        reader = TimingFileIterator(filepath)
        num_docs, num_sentences = 0, 0
        for line in reader:
            doc = json.loads(line)
            num_docs += 1
            num_tokens_in_doc = 0
            for sid, (sentence,
                      srl) in enumerate(zip(doc['sentences'], doc['srl'])):
                if self.doc_level_offset:
                    srl = [(x[0] - num_tokens_in_doc, x[1] - num_tokens_in_doc,
                            x[2] - num_tokens_in_doc, x[3]) for x in srl]
                else:
                    srl = [(x[0], x[1], x[2], x[3]) for x in srl]
                for x in srl:
                    if any(o < 0 for o in x[:3]):
                        raise ValueError(
                            'Negative offset occurred; maybe set doc_level_offset=False'
                        )
                    if any(o >= len(sentence) for o in x[:3]):
                        raise ValueError(
                            'Offset exceeds sentence length; maybe set doc_level_offset=True'
                        )
                deduplicated_srl = set()
                pa_set = set()
                for p, b, e, l in srl:
                    pa = (p, b, e)
                    if pa in pa_set:
                        continue
                    pa_set.add(pa)
                    deduplicated_srl.add((p, b, e, l))
                yield self.build_sample(sentence, deduplicated_srl, doc, sid)
                num_sentences += 1
                num_tokens_in_doc += len(sentence)
            reader.log(
                f'{filename} {num_docs} documents, {num_sentences} sentences [blink][yellow]...[/yellow][/blink]'
            )
        reader.erase()
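To make the offset arithmetic above concrete: each ``srl`` entry is a per-sentence list of ``[predicate, begin, end, label]`` tuples, and with ``doc_level_offset=True`` the indices count from the start of the document, so they are shifted back by the number of tokens seen so far. A toy record (all values invented):

doc = {
    'sentences': [['He', 'left'], ['She', 'stayed', 'home']],
    # document-level indices: 'He'=0, 'left'=1, 'She'=2, 'stayed'=3, 'home'=4
    'srl': [[[1, 0, 0, 'ARG0']], [[3, 2, 2, 'ARG0']]],
}

num_tokens_in_doc = len(doc['sentences'][0])  # 2 tokens precede the second sentence
sentence_level = [(p - num_tokens_in_doc, b - num_tokens_in_doc, e - num_tokens_in_doc, l)
                  for p, b, e, l in doc['srl'][1]]
print(sentence_level)  # [(1, 0, 0, 'ARG0')]: 'stayed' is the predicate, 'She' the ARG0 span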
Example 5
File: eos.py  Project: lei1993/HanLP
    def load_file(self, filepath: str):
        """Load eos corpus.

        Args:
            filepath: Path to the corpus.

        .. highlight:: bash
        .. code-block:: bash

            $ head -n 2 ctb8.txt
            中国经济简讯
            新华社北京十月二十九日电中国经济简讯

        """
        f = TimingFileIterator(filepath)
        sents = []
        eos_offsets = []
        offset = 0
        for line in f:
            if not line.strip():
                continue
            line = line.rstrip('\n')
            eos_offsets.append(offset + len(line.rstrip()) - 1)
            offset += len(line)
            if self.append_after_sentence:
                line += self.append_after_sentence
                offset += len(self.append_after_sentence)
            f.log(line)
            sents.append(line)
        f.erase()
        corpus = list(itertools.chain.from_iterable(sents))

        if self.eos_chars:
            if not isinstance(self.eos_chars, set):
                self.eos_chars = set(self.eos_chars)
        else:
            eos_chars = Counter()
            for i in eos_offsets:
                eos_chars[corpus[i]] += 1
            self.eos_chars = set(k for (k, v) in eos_chars.most_common()
                                 if v >= self.eos_char_min_freq and (
                                     not self.eos_char_is_punct or ispunct(k)))
            cprint(f'eos_chars = [yellow]{self.eos_chars}[/yellow]')

        eos_index = 0
        eos_offsets = [i for i in eos_offsets if corpus[i] in self.eos_chars]
        window_size = self.window_size
        for i, c in enumerate(corpus):
            if c in self.eos_chars:
                window = corpus[i - window_size:i + window_size + 1]
                label_id = 1. if eos_offsets[eos_index] == i else 0.
                if label_id > 0:
                    eos_index += 1
                yield {'char': window, 'label_id': label_id}
        assert eos_index == len(
            eos_offsets), f'{eos_index} != {len(eos_offsets)}'
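As a toy illustration of the samples the final loop yields (the text and ``window_size=2`` are made up): every occurrence of an EOS character produces one character window, labelled 1.0 only when that occurrence is a genuine line-final boundary.

corpus = list('他说“好。”走吧。')   # one original line, flattened to characters
eos_chars = {'。'}
eos_offsets = [8]                    # only the line-final '。' is a real boundary
window_size = 2

eos_index = 0
for i, c in enumerate(corpus):
    if c in eos_chars:
        window = corpus[max(0, i - window_size):i + window_size + 1]
        label_id = 1. if eos_index < len(eos_offsets) and eos_offsets[eos_index] == i else 0.
        if label_id > 0:
            eos_index += 1
        print({'char': ''.join(window), 'label_id': label_id})
# {'char': '“好。”走', 'label_id': 0.0}  <- the quoted '。' is not a boundary
# {'char': '走吧。', 'label_id': 1.0}    <- the line-final '。' is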
Example 6
    def load_file(self, filepath: str):
        """Load ``.jsonlines`` NER corpus. Samples of this corpus can be found using the following scripts.

        .. highlight:: python
        .. code-block:: python

            import json
            from hanlp_common.document import Document
            from hanlp.datasets.srl.ontonotes5.chinese import ONTONOTES5_CONLL12_CHINESE_DEV
            from hanlp.utils.io_util import get_resource

            with open(get_resource(ONTONOTES5_CONLL12_CHINESE_DEV)) as src:
                for line in src:
                    doc = json.loads(line)
                    print(Document(doc))
                    break

        Args:
            filepath: ``.jsonlines`` NER corpus.
        """
        filename = os.path.basename(filepath)
        reader = TimingFileIterator(filepath)
        num_docs, num_sentences = 0, 0
        for line in reader:
            line = line.strip()
            if not line:
                continue
            doc = json.loads(line)
            num_docs += 1
            num_tokens_in_doc = 0
            for sentence, ner in zip(doc['sentences'], doc['ner']):
                if self.doc_level_offset:
                    ner = [(x[0] - num_tokens_in_doc, x[1] - num_tokens_in_doc,
                            x[2]) for x in ner]
                else:
                    ner = [(x[0], x[1], x[2]) for x in ner]
                if self.tagset:
                    ner = [x for x in ner if x[2] in self.tagset]
                    if isinstance(self.tagset, dict):
                        ner = [(x[0], x[1], self.tagset[x[2]]) for x in ner]
                deduplicated_ner = []
                be_set = set()
                for b, e, l in ner:
                    be = (b, e)
                    if be in be_set:
                        continue
                    be_set.add(be)
                    deduplicated_ner.append((b, e, l))
                yield {'token': sentence, 'ner': deduplicated_ner}
                num_sentences += 1
                num_tokens_in_doc += len(sentence)
            reader.log(
                f'{filename} {num_docs} documents, {num_sentences} sentences [blink][yellow]...[/yellow][/blink]'
            )
        reader.erase()
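Analogously to the SRL loader, each ``ner`` entry holds per-sentence ``(begin, end, label)`` spans (end indices assumed inclusive here) with document-level offsets when ``doc_level_offset=True``, optionally filtered and remapped through ``tagset``. A toy record (all values invented):

doc = {
    'sentences': [['Bob', 'works'], ['He', 'joined', 'IBM']],
    # document-level indices: 'Bob'=0, 'works'=1, 'He'=2, 'joined'=3, 'IBM'=4
    'ner': [[[0, 0, 'PERSON']], [[2, 2, 'PERSON'], [4, 4, 'ORG']]],
}
tagset = {'PERSON': 'PER', 'ORG': 'ORG'}   # keep only these labels and rename them

num_tokens_in_doc = len(doc['sentences'][0])  # 2 tokens precede the second sentence
ner = [(b - num_tokens_in_doc, e - num_tokens_in_doc, tagset[l])
       for b, e, l in doc['ner'][1] if l in tagset]
print({'token': doc['sentences'][1], 'ner': ner})
# {'token': ['He', 'joined', 'IBM'], 'ner': [(0, 0, 'PER'), (2, 2, 'ORG')]}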