def load_word2vec(path, delimiter=' ', cache=True) -> Tuple[Dict[str, np.ndarray], int]:
    realpath = get_resource(path)
    binpath = replace_ext(realpath, '.pkl')
    if cache:
        try:
            flash('Loading word2vec from cache [blink][yellow]...[/yellow][/blink]')
            word2vec, dim = load_pickle(binpath)
            flash('')
            return word2vec, dim
        except IOError:
            pass

    dim = None
    word2vec = dict()
    f = TimingFileIterator(realpath)
    for idx, line in enumerate(f):
        f.log('Loading word2vec from text file [blink][yellow]...[/yellow][/blink]')
        line = line.rstrip().split(delimiter)
        if len(line) > 2:
            if dim is None:
                dim = len(line)
            else:
                if len(line) != dim:
                    logger.warning('{}#{} length mismatch with {}'.format(path, idx + 1, dim))
                    continue
            word, vec = line[0], line[1:]
            word2vec[word] = np.array(vec, dtype=np.float32)
    # The first column is the word itself, so the embedding dimension is one less than the row length.
    dim -= 1
    if cache:
        flash('Caching word2vec [blink][yellow]...[/yellow][/blink]')
        save_pickle((word2vec, dim), binpath)
        flash('')
    return word2vec, dim
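
# A minimal usage sketch for load_word2vec, assuming a whitespace-delimited embedding text file in
# the common "word v1 v2 ... vN" format; the function name and path below are hypothetical.
def _demo_load_word2vec():
    word2vec, dim = load_word2vec('data/embedding/glove.6B.100d.txt')
    print(f'{len(word2vec)} vectors of dimension {dim}')
    print(word2vec.get('hello'))  # plain dict lookup; returns None for out-of-vocabulary words
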
def dataset_document_iterator(self, file_path: str) -> Iterator[List[OntonotesSentence]]:
    """An iterator over CONLL formatted files which yields documents, regardless
    of the number of document annotations in a particular file. This is useful
    for conll data which has been preprocessed, such as the preprocessing which
    takes place for the 2012 CONLL Coreference Resolution task.

    Args:
        file_path: Path to a CONLL formatted file.

    Yields:
        One document at a time, as a list of ``OntonotesSentence``.
    """
    open_file = TimingFileIterator(file_path)
    conll_rows = []
    document: List[OntonotesSentence] = []
    for line in open_file:
        open_file.log(f'Loading {os.path.basename(file_path)}')
        line = line.strip()
        if line != "" and not line.startswith("#"):
            # Non-empty line. Collect the annotation.
            conll_rows.append(line)
        else:
            if conll_rows:
                document.append(self._conll_rows_to_sentence(conll_rows))
                conll_rows = []
            if line.startswith("#end document"):
                yield document
                document = []
    open_file.erase()
    if document:
        # Collect any stragglers or files which might not
        # have the '#end document' format for the end of the file.
        yield document
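
# For reference, a hypothetical fragment of the CoNLL-2012 layout the iterator above consumes:
# annotation rows form a sentence, blank lines separate sentences, '#'-prefixed lines are never
# collected as annotation, and '#end document' closes a document (a file missing the marker still
# yields its last document at the end).
#
#   #begin document (bc/cctv/00/cctv_0000); part 000
#   bc/cctv/00/cctv_0000   0   0   Hello   UH   ...
#   bc/cctv/00/cctv_0000   0   1   world   NN   ...
#
#   #end document
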
def load_file(self, filepath):
    cache, valid = file_cache(filepath, not self.cache)
    if not valid or (self.vocab.mutable and not os.path.isfile(self.vocab_path)):
        with open(cache, 'wb') as out:
            tokens, lines = 0, 0
            f = TimingFileIterator(filepath)
            for line in f:
                if self.strip:
                    line = line.strip()
                if not line:
                    continue
                sample = {'text': line}
                sample = self.transform_sample(sample, inplace=True)
                # Token ids are stored as consecutive 4-byte little-endian integers.
                for token_id in sample['token_id']:
                    out.write(token_id.to_bytes(4, 'little'))
                tokens += len(sample['token_id'])
                lines += 1
                f.log(f'{tokens // 1000000}M tokens, {lines // 1000000}M lines\n'
                      f'{sample["token"][:10]}')
            f.erase()
        if self.vocab.mutable:
            self.vocab.lock()
            # Assumes the vocab exposes to_dict()/copy_from() for (de)serialization.
            hanlp_common.io.save_json(self.vocab.to_dict(), self.vocab_path)
        self.num_tokens = tokens
    else:
        # Reuse the binary cache: each token id occupies 4 bytes.
        self.num_tokens = int(os.path.getsize(cache) / 4)
        if self.vocab.mutable:
            self.vocab.copy_from(hanlp_common.io.load_json(self.vocab_path))
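
# A minimal sketch (not part of the original code) of how the binary cache written above can be
# read back: token ids are stored as consecutive 4-byte little-endian integers, which is also why
# num_tokens is recovered as file size divided by 4. The function name and path are hypothetical.
def read_token_id_cache(cache_path: str):
    import os
    token_ids = []
    with open(cache_path, 'rb') as src:
        while True:
            chunk = src.read(4)
            if not chunk:
                break
            token_ids.append(int.from_bytes(chunk, 'little'))
    assert len(token_ids) == os.path.getsize(cache_path) // 4
    return token_ids
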
def load_file(self, filepath: tuple):
    phrase_tree_path = get_resource(filepath[0])
    dep_tree_path = get_resource(filepath[1])
    pf = TimingFileIterator(phrase_tree_path)
    message_prefix = f'Loading {os.path.basename(phrase_tree_path)} and {os.path.basename(dep_tree_path)}'
    for i, (dep_sent, phrase_sent) in enumerate(zip(read_tsv_as_sents(dep_tree_path), pf)):
        # The phrase-tree file contains escaped slashes, e.g. '\/'; unescape them.
        phrase_sent = phrase_sent.replace('\\/', '/')

        token = [x[1] for x in dep_sent]
        pos = [x[3] for x in dep_sent]
        head = [int(x[6]) for x in dep_sent]
        rel = [x[7] for x in dep_sent]
        phrase_tree = load_trees_from_str(phrase_sent, [head], [rel], [token])
        assert len(phrase_tree) == 1, f'{phrase_tree_path} must have one tree per line.'
        phrase_tree = phrase_tree[0]

        yield {
            'FORM': token,
            'CPOS': pos,
            'HEAD': head,
            'DEPREL': rel,
            'tree': phrase_tree,
            'hpsg': phrase_tree.convert()
        }
        pf.log(f'{message_prefix} {i + 1} samples [blink][yellow]...[/yellow][/blink]')
    pf.erase()
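
# A hedged illustration (hypothetical mini example, not from the source): for a two-token
# dependency sentence whose CoNLL-X rows are
#
#   1   Economy   _   NN   NN   _   2   nsubj   _   _
#   2   grows     _   VV   VV   _   0   root    _   _
#
# the loop above extracts token == ['Economy', 'grows'], pos == ['NN', 'VV'], head == [2, 0] and
# rel == ['nsubj', 'root'], and pairs them with the bracketed constituency tree read from the
# corresponding line of the phrase-tree file.
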
def load_file(self, filepath):
    """Both ``.conllx`` and ``.conllu`` are supported. Their descriptions can be found in
    :class:`hanlp_common.conll.CoNLLWord` and :class:`hanlp_common.conll.CoNLLUWord` respectively.

    Args:
        filepath: ``.conllx`` or ``.conllu`` file path.
    """
    if filepath.endswith('.conllu'):
        # See https://universaldependencies.org/format.html
        field_names = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS',
                       'HEAD', 'DEPREL', 'DEPS', 'MISC']
    else:
        field_names = ['ID', 'FORM', 'LEMMA', 'CPOS', 'POS', 'FEATS',
                       'HEAD', 'DEPREL', 'PHEAD', 'PDEPREL']
    fp = TimingFileIterator(filepath)
    for idx, sent in enumerate(read_conll(fp)):
        sample = {}
        for i, field in enumerate(field_names):
            sample[field] = [cell[i] for cell in sent]
        if not self._prune or not self._prune(sample):
            yield sample
        fp.log(f'{idx + 1} samples [blink][yellow]...[/yellow][/blink]')
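
# A small illustration, not taken from the source: for a CoNLL-U sentence such as
#
#   1   Hello   hello   INTJ   UH   _   0   root    _   _
#   2   !       !       PUNCT  .    _   1   punct   _   _
#
# the loader yields one sample per sentence where each field name maps to that column's values,
# e.g. sample['FORM'] == ['Hello', '!'] and sample['DEPREL'] == ['root', 'punct'] (how the ID and
# HEAD cells are typed depends on read_conll).
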
def load_file(self, filepath: str):
    """Load tokenized corpus. The format is one sentence per line, where each line consists of
    tokens separated by a delimiter (usually space).

    .. highlight:: bash
    .. code-block:: bash

        $ head train.txt
        上海 浦东 开发 与 法制 建设 同步
        新华社 上海 二月 十日 电 ( 记者 谢金虎 、 张持坚 )

    Args:
        filepath: The path to the corpus.
    """
    f = TimingFileIterator(filepath)
    for line in f:
        line = line.rstrip('\n')
        tokens = line.split(self.delimiter)
        if not tokens:
            continue
        if self.max_seq_len and sum(len(t) for t in tokens) > self.max_seq_len:
            # Split over-long sentences into shorter pieces, each yielded as its own sample.
            for short_sents in split_long_sentence_into(tokens, self.max_seq_len, self.sent_delimiter,
                                                        char_level=self.char_level,
                                                        hard_constraint=self.hard_constraint):
                yield {'token': short_sents}
        else:
            yield {'token': tokens}
        f.log(line[:20])
    f.erase()
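
# An illustrative sample shape (using the sentence from the docstring above): a line
# '上海 浦东 开发 与 法制 建设 同步' split on the default space delimiter is yielded as
# {'token': ['上海', '浦东', '开发', '与', '法制', '建设', '同步']}; lines longer than max_seq_len
# are instead yielded as several such samples, one per piece returned by split_long_sentence_into.
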
def load_file(self, filepath: str):
    """Load ``.jsonlines`` CoNLL12-style corpus. Samples of this corpus can be found using the
    following scripts.

    .. highlight:: python
    .. code-block:: python

        import json
        from hanlp_common.document import Document
        from hanlp.datasets.srl.ontonotes5.chinese import ONTONOTES5_CONLL12_CHINESE_DEV
        from hanlp.utils.io_util import get_resource

        with open(get_resource(ONTONOTES5_CONLL12_CHINESE_DEV)) as src:
            for line in src:
                doc = json.loads(line)
                print(Document(doc))
                break

    Args:
        filepath: ``.jsonlines`` CoNLL12 corpus.
    """
    filename = os.path.basename(filepath)
    reader = TimingFileIterator(filepath)
    num_docs, num_sentences = 0, 0
    for line in reader:
        doc = json.loads(line)
        num_docs += 1
        num_tokens_in_doc = 0
        for sid, (sentence, srl) in enumerate(zip(doc['sentences'], doc['srl'])):
            if self.doc_level_offset:
                srl = [(x[0] - num_tokens_in_doc, x[1] - num_tokens_in_doc,
                        x[2] - num_tokens_in_doc, x[3]) for x in srl]
            else:
                srl = [(x[0], x[1], x[2], x[3]) for x in srl]
            for x in srl:
                if any([o < 0 for o in x[:3]]):
                    raise ValueError('Negative offset occurred, maybe doc_level_offset should be False')
                if any([o >= len(sentence) for o in x[:3]]):
                    raise ValueError('Offset exceeds sentence length, maybe doc_level_offset should be True')
            deduplicated_srl = set()
            pa_set = set()
            for p, b, e, l in srl:
                pa = (p, b, e)
                if pa in pa_set:
                    continue
                pa_set.add(pa)
                deduplicated_srl.add((p, b, e, l))
            yield self.build_sample(sentence, deduplicated_srl, doc, sid)
            num_sentences += 1
            num_tokens_in_doc += len(sentence)
        reader.log(f'{filename} {num_docs} documents, {num_sentences} sentences '
                   f'[blink][yellow]...[/yellow][/blink]')
    reader.erase()
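
# A hedged, standalone sketch of the doc_level_offset rebasing above; the helper name and the
# numbers in the example are illustrative only.
def rebase_srl(srl, num_tokens_before_sentence):
    """Rebase document-level (predicate, begin, end, label) tuples to sentence-local offsets."""
    return [(p - num_tokens_before_sentence, b - num_tokens_before_sentence,
             e - num_tokens_before_sentence, l) for p, b, e, l in srl]

# If the first sentence has 8 tokens, an annotation (9, 10, 12, 'ARG0') on the second sentence
# becomes sentence-local: rebase_srl([(9, 10, 12, 'ARG0')], 8) == [(1, 2, 4, 'ARG0')].
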
def load_file(self, filepath: str):
    """Load eos corpus.

    Args:
        filepath: Path to the corpus.

    .. highlight:: bash
    .. code-block:: bash

        $ head -n 2 ctb8.txt
        中国经济简讯
        新华社北京十月二十九日电中国经济简讯
    """
    f = TimingFileIterator(filepath)
    sents = []
    eos_offsets = []
    offset = 0
    for line in f:
        if not line.strip():
            continue
        line = line.rstrip('\n')
        eos_offsets.append(offset + len(line.rstrip()) - 1)
        offset += len(line)
        if self.append_after_sentence:
            line += self.append_after_sentence
            offset += len(self.append_after_sentence)
        f.log(line)
        sents.append(line)
    f.erase()
    corpus = list(itertools.chain.from_iterable(sents))

    if self.eos_chars:
        if not isinstance(self.eos_chars, set):
            self.eos_chars = set(self.eos_chars)
    else:
        eos_chars = Counter()
        for i in eos_offsets:
            eos_chars[corpus[i]] += 1
        self.eos_chars = set(k for (k, v) in eos_chars.most_common()
                             if v >= self.eos_char_min_freq
                             and (not self.eos_char_is_punct or ispunct(k)))
        cprint(f'eos_chars = [yellow]{self.eos_chars}[/yellow]')

    eos_index = 0
    eos_offsets = [i for i in eos_offsets if corpus[i] in self.eos_chars]
    window_size = self.window_size
    for i, c in enumerate(corpus):
        if c in self.eos_chars:
            window = corpus[i - window_size:i + window_size + 1]
            label_id = 1. if eos_offsets[eos_index] == i else 0.
            if label_id > 0:
                eos_index += 1
            yield {'char': window, 'label_id': label_id}
    assert eos_index == len(eos_offsets), f'{eos_index} != {len(eos_offsets)}'
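
# A simplified, self-contained sketch of the window extraction above (not the original
# implementation): every occurrence of an EOS character becomes a sample of the surrounding
# 2 * window_size + 1 characters, labelled 1.0 only when it is a genuine sentence end.
def make_eos_windows(corpus, eos_offsets, eos_chars, window_size):
    gold = set(eos_offsets)
    for i, c in enumerate(corpus):
        if c in eos_chars:
            window = corpus[i - window_size:i + window_size + 1]
            yield {'char': window, 'label_id': 1. if i in gold else 0.}

# e.g. make_eos_windows(list('Hi. Ok? No'), eos_offsets=[2, 6], eos_chars={'.', '?'}, window_size=2)
# yields two samples whose 'char' windows are the five characters around '.' and '?'.
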
def load_file(self, filepath: str):
    """Load ``.jsonlines`` NER corpus. Samples of this corpus can be found using the following
    scripts.

    .. highlight:: python
    .. code-block:: python

        import json
        from hanlp_common.document import Document
        from hanlp.datasets.srl.ontonotes5.chinese import ONTONOTES5_CONLL12_CHINESE_DEV
        from hanlp.utils.io_util import get_resource

        with open(get_resource(ONTONOTES5_CONLL12_CHINESE_DEV)) as src:
            for line in src:
                doc = json.loads(line)
                print(Document(doc))
                break

    Args:
        filepath: ``.jsonlines`` NER corpus.
    """
    filename = os.path.basename(filepath)
    reader = TimingFileIterator(filepath)
    num_docs, num_sentences = 0, 0
    for line in reader:
        line = line.strip()
        if not line:
            continue
        doc = json.loads(line)
        num_docs += 1
        num_tokens_in_doc = 0
        for sentence, ner in zip(doc['sentences'], doc['ner']):
            if self.doc_level_offset:
                ner = [(x[0] - num_tokens_in_doc, x[1] - num_tokens_in_doc, x[2]) for x in ner]
            else:
                ner = [(x[0], x[1], x[2]) for x in ner]
            if self.tagset:
                ner = [x for x in ner if x[2] in self.tagset]
                if isinstance(self.tagset, dict):
                    ner = [(x[0], x[1], self.tagset[x[2]]) for x in ner]
            deduplicated_ner = []
            be_set = set()
            for b, e, l in ner:
                be = (b, e)
                if be in be_set:
                    continue
                be_set.add(be)
                deduplicated_ner.append((b, e, l))
            yield {'token': sentence, 'ner': deduplicated_ner}
            num_sentences += 1
            num_tokens_in_doc += len(sentence)
        reader.log(f'{filename} {num_docs} documents, {num_sentences} sentences '
                   f'[blink][yellow]...[/yellow][/blink]')
    reader.erase()
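
# An illustrative (hypothetical) sample yielded above:
#   {'token': ['Barack', 'Obama', 'visited', 'Paris'],
#    'ner': [(0, 1, 'PERSON'), (3, 3, 'GPE')]}
# where each ner tuple is (begin, end, label) in token offsets after any doc-level rebasing,
# tagset filtering/renaming, and span deduplication.
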