Example #1
 def __init__(self,
              category: str,
              midasi: str,
              sentence: BList,
              mid_range: range,
              mrph2dmid: Dict[Morpheme, int]):
     self.category: str = category
     self.midasi: str = midasi
     self.sid: str = sentence.sid
     self.mid_range: range = mid_range
     dmid_start = mrph2dmid[sentence.mrph_list()[mid_range[0]]]
     dmid_end = mrph2dmid[sentence.mrph_list()[mid_range[-1]]]
     self.dmid_range: range = range(dmid_start, dmid_end + 1)
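This constructor expects a precomputed mrph2dmid mapping from each Morpheme to its document-wide morpheme ID. A minimal sketch of how such a mapping could be built from a list of parsed sentences (the helper name build_mrph2dmid is hypothetical, not part of the original code):

from typing import Dict, List

from pyknp import BList, Morpheme


def build_mrph2dmid(sentences: List[BList], offset: int = 0) -> Dict[Morpheme, int]:
    """Assign a document-wide morpheme ID to every morpheme in the given sentences."""
    mrph2dmid: Dict[Morpheme, int] = {}
    dmid = offset
    for sentence in sentences:
        for mrph in sentence.mrph_list():
            mrph2dmid[mrph] = dmid
            dmid += 1
    return mrph2dmid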
Example #2
def split_kc(input_dir: Path, output_dir: Path, max_subword_length: int,
             tokenizer: BertTokenizer):
    """
    各文書を,tokenize したあとの長さが max_subword_length 以下になるように複数の文書に分割する.
    1文に分割しても max_subword_length を超えるような長い文はそのまま出力する
    """
    did2sids: Dict[str, List[str]] = defaultdict(list)
    did2cumlens: Dict[str, List[int]] = {}
    sid2knp: Dict[str, str] = {}

    for knp_file in input_dir.glob('*.knp'):
        with knp_file.open() as fin:
            did = knp_file.stem
            did2cumlens[did] = [0]
            buff = ''
            for line in fin:
                buff += line
                if line.strip() == 'EOS':
                    blist = BList(buff)
                    did2sids[did].append(blist.sid)
                    did2cumlens[did].append(did2cumlens[did][-1] + len(
                        tokenizer.tokenize(' '.join(
                            m.midasi for m in blist.mrph_list()))))
                    sid2knp[blist.sid] = buff
                    buff = ''

    for did, sids in did2sids.items():
        cum: List[int] = did2cumlens[did]
        end = 1
        # search for end
        while end < len(sids) and cum[end + 1] - cum[0] <= max_subword_length:
            end += 1

        idx = 0
        while end < len(sids) + 1:
            start = 0
            # search for start
            while cum[end] - cum[start] > max_subword_length:
                start += 1
                if start == end - 1:
                    break
            with output_dir.joinpath(f'{did}-{idx:02}.knp').open(
                    mode='w') as fout:
                fout.write(''.join(
                    sid2knp[sid]
                    for sid in sids[start:end]))  # write out sentences from start to end
            idx += 1
            end += 1
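A minimal invocation sketch for split_kc, assuming the Hugging Face transformers package provides the tokenizer; the model name and directory paths are placeholders:

from pathlib import Path

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
output_dir = Path('knp_split')
output_dir.mkdir(exist_ok=True)
split_kc(Path('knp'), output_dir, max_subword_length=128, tokenizer=tokenizer)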
Example #3
    def parse(self, sentence):
        """
        文字列を入力として構文解析を行い、文節列オブジェクトを返す

        Args:
            sentence (str): 文を表す文字列

        Returns:
            BList: 文節列オブジェクト
        """
        assert(isinstance(sentence, six.text_type))
        juman_lines = self.juman.juman_lines(sentence)
        juman_str = "%s%s" % (juman_lines, self.pattern)
        if not self.socket and not self.subprocess:
            if self.server is not None:
                self.socket = Socket(
                    self.server, self.port, "RUN -tab -normal\n")
            else:
                command = [self.command] + self.option
                if self.rcfile:
                    command.extend(['-r', self.rcfile])
                self.subprocess = Subprocess(command)

        if self.socket:
            knp_lines = self.socket.query(juman_str, pattern=r'^%s$'%(self.pattern))
        else:
            knp_lines = self.subprocess.query(juman_str, pattern=r'^%s$'%(self.pattern))
        return BList(knp_lines, self.pattern)
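For context, this parse method backs pyknp's usual top-level API; a minimal usage sketch, assuming JUMAN and KNP are installed and on the PATH:

from pyknp import KNP

knp = KNP()
blist = knp.parse("京都大学で自然言語処理を研究する。")
for bnst in blist.bnst_list():
    print(bnst.bnst_id, ''.join(mrph.midasi for mrph in bnst.mrph_list()))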
Example #4
    def parse_juman_result(self, juman_str, juman_format=JUMAN_FORMAT.DEFAULT):
        """
        JUMAN出力結果に対して構文解析を行い、文節列オブジェクトを返す

        Args:
            juman_str (str): ある文に関するJUMANの出力結果
            juman_format (JUMAN_FORMAT): Jumanのlattice出力形式

        Returns:
            BList: 文節列オブジェクト
        """
        if not self.socket and not self.subprocess:
            if self.server is not None:
                self.socket = Socket(self.server, self.port,
                                     "RUN -tab -normal\n")
            else:
                command = [self.command] + self.options
                if self.rcfile:
                    command.extend(['-r', self.rcfile])
                self.subprocess = Subprocess(command)

        if self.socket:
            knp_lines = self.socket.query(juman_str,
                                          pattern=r'^%s$' % self.pattern)
        else:
            knp_lines = self.subprocess.query(juman_str,
                                              pattern=r'^%s$' % self.pattern)
        return BList(knp_lines, self.pattern, juman_format)
Example #5
 def load_knp_result(self, knp_lines: str) -> BList:
     if knp_lines.strip().endswith("EOS"):
         blist = BList(knp_lines.strip(), self.knp.pattern)
         self.__register_attributes(blist)
         return blist
     else:
         raise NoEOSError
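load_knp_result expects the KNP output of a single sentence ending with 'EOS'. A sketch of feeding it per-sentence chunks from a document-level .knp file; `reader` is a placeholder for an instance of the surrounding class, which is not shown in this example:

from pathlib import Path


def iter_knp_sentences(knp_path: Path):
    """Yield one KNP result string per sentence, each ending with 'EOS'."""
    buff = ''
    for line in knp_path.open():
        buff += line
        if line.strip() == 'EOS':
            yield buff
            buff = ''

# for knp_lines in iter_knp_sentences(Path('sample.knp')):
#     blist = reader.load_knp_result(knp_lines)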
Example #6
 def __knp_parse(self, juman_str: str) -> BList:
     if self.knp.socket:
         knp_lines = self.knp.socket.query(juman_str,
                                           pattern=r"^%s$" %
                                           self.knp.pattern)
     else:
         knp_lines = self.knp.subprocess.query(juman_str,
                                               pattern=r"^%s$" %
                                               self.knp.pattern)
     return BList(knp_lines, self.knp.pattern)
Example #7
    def result(self, input_str):
        """
        ある文に関するKNP解析結果を文節列オブジェクトに変換する

        Args:
            input_str (str): ある文に関するKNPの出力結果

        Returns:
            BList: 文節列オブジェクト
        """
        return BList(input_str, self.pattern)
Example #8
    def result(self, input_str, juman_format=JUMAN_FORMAT.DEFAULT):
        """
        ある文に関するKNP解析結果を文節列オブジェクトに変換する

        Args:
            input_str (str): ある文に関するKNPの出力結果
            juman_format (JUMAN_FORMAT): Jumanのlattice出力形式

        Returns:
            BList: 文節列オブジェクト
        """
        return BList(input_str, self.pattern, juman_format)
Example #9
 def __call__(self, document: "Document", blist: BList) -> Sentence:
     sentence = Sentence(document, blist.sid, Builder.ssid, blist)
     start: Optional[Tag] = None
     end: Optional[Tag] = None
     head: Optional[Tag] = None
     for tag in blist.tag_list():
         if not start:
             start = tag
         if not head and "節-主辞" in tag.features:
             head = tag
         if not end and "節-区切" in tag.features:
             end = tag
             if head:
                 EventBuilder()(sentence, start, head, end)
             start, end, head = None, None, None
     document.sentences.append(sentence)
     Builder.ssid += 1
     for bid, bnst in enumerate(blist.bnst_list()):
         for tag in bnst.tag_list():
             Builder.stid_bid_map[(sentence.ssid, tag.tag_id)] = bid
             Builder.stid_tag_map[(sentence.ssid, tag.tag_id)] = tag
     return sentence
Example #10
    def parse_juman_result(self, juman_str, juman_format=JUMAN_FORMAT.DEFAULT):
        """
        JUMAN出力結果に対して構文解析を行い、文節列オブジェクトを返す

        Args:
            juman_str (str): ある文に関するJUMANの出力結果
            juman_format (JUMAN_FORMAT): Jumanのlattice出力形式

        Returns:
            BList: 文節列オブジェクト
        """

        knp_lines = self.analyzer.query(juman_str,
                                        pattern=r'^%s$' % self.pattern)
        return BList(knp_lines, self.pattern, juman_format)
Example #11
    def __init__(
        self,
        knp_string: str,
        dtid_offset: int,
        dmid_offset: int,
        doc_id: str,
    ) -> None:
        """

        Args:
            knp_string(str): KNP format string of this sentence.
            dtid_offset (int): The document-wide tag ID of the previous base phrase.
            dmid_offset (int): The document-wide morpheme ID of the previous morpheme.
            doc_id(str): The document ID of this sentence.
        """

        self.blist = BList(knp_string)
        self.doc_id: str = doc_id

        self.bps: List[BasePhrase] = []
        dtid = dtid_offset
        dmid = dmid_offset
        for tag in self.blist.tag_list():
            base_phrase = BasePhrase(tag, dmid, dtid, self.blist.sid, doc_id)
            self.bps.append(base_phrase)
            dtid += 1
            dmid += len(base_phrase)

        self._mrph2dmid: Dict[Morpheme, int] = dict(
            ChainMap(*(bp.mrph2dmid for bp in self.bps)))

        for bp in self.bps:
            if bp.tag.parent_id >= 0:
                bp.parent = self.bps[bp.tag.parent_id]
            for child in bp.tag.children:
                bp.children.append(self.bps[child.tag_id])
Example #12
    def __init__(
        self,
        knp_string: str,
        dtid_offset: int,
        dmid_offset: int,
        doc_id: str,
    ) -> None:
        """

        Args:
            knp_string(str): KNP format string of this sentence.
            dtid_offset (int): The document-wide tag ID of the previous base phrase.
            dmid_offset (int): The document-wide morpheme ID of the previous morpheme.
            doc_id(str): The document ID of this sentence.
        """

        self.blist = BList(knp_string)
        self.doc_id: str = doc_id

        self.bps: List[BasePhrase] = []
        dtid = dtid_offset
        dmid = dmid_offset
        for tag in self.blist.tag_list():
            base_phrase = BasePhrase(tag, dmid, dtid, self.blist.sid, doc_id)
            self.bps.append(base_phrase)
            dtid += 1
            dmid += len(base_phrase)

        self._mrph2dmid: Dict[Morpheme, int] = dict(
            ChainMap(*(bp.mrph2dmid for bp in self.bps)))

        for bp in self.bps:
            if bp.tag.parent_id >= 0:
                bp.parent = self.bps[bp.tag.parent_id]
            for child in bp.tag.children:
                bp.children.append(self.bps[child.tag_id])
Example #13
def generate_event_pairs_and_svg_from_json(args):
    json_file = args.json_file
    target_sub_category = args.target_sub_category
    target_relations = args.target_relation
    svg_dir = args.svg_dir
    svg_detail_dir = args.svg_detail_dir

    evgviz = EventGraphVisualizer()

    all_event_pairs = []
    with lzma.open(json_file, mode='rt') as f:
        for line in f:
            json_obj = json.loads(line.strip())
            sub_category = json_obj['sub_category']
            fuman_id = json_obj['id']
            fuman_split_knp = json_obj['fuman_split_knp']
            if target_sub_category is None or sub_category == target_sub_category:
                evg = EventGraph.build(
                    [BList(''.join(knp_list)) for knp_list in fuman_split_knp])
                # extract and print event pairs
                event_pairs = extract_event_pairs(evg, fuman_id,
                                                  target_relations)
                all_event_pairs.extend(event_pairs)
                # output an SVG if svg_dir is specified
                if event_pairs and fuman_id not in error_ids:
                    if svg_dir:
                        svg_filename = os.path.join(svg_dir, fuman_id + '.svg')
                        if not os.path.exists(svg_filename):
                            print("generating {}".format(svg_filename),
                                  file=sys.stderr)
                            try:
                                evgviz.make_image(evg,
                                                  svg_filename,
                                                  with_detail=False,
                                                  with_original_text=False)
                                if svg_detail_dir:
                                    svg_detail_filename = os.path.join(
                                        svg_detail_dir, fuman_id + '.svg')
                                    evgviz.make_image(evg,
                                                      svg_detail_filename,
                                                      with_original_text=False)
                            except subprocess.CalledProcessError as err:
                                print(
                                    "subprocess.CalledProcessError: {}".format(
                                        err),
                                    file=sys.stderr)
    print(json.dumps(all_event_pairs, indent=2, ensure_ascii=False))
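A minimal sketch of producing the BList inputs consumed by EventGraph.build above from raw text, assuming pyknp and the pyknp-eventgraph package (with JUMAN/KNP installed); the input sentences are placeholders:

from pyknp import KNP
from pyknp_eventgraph import EventGraph

knp = KNP()
blists = [knp.parse(sent) for sent in ['彼は学校に行った。', 'そこで勉強した。']]
evg = EventGraph.build(blists)
# evg can then be passed to extract_event_pairs() or EventGraphVisualizer.make_image() as above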
Example #14
    def __init__(
        self,
        knp_string: str,
        doc_id: str,
        cases: List[str],
        corefs: List[str],
        relax_cases: bool,
        extract_nes: bool,
        use_pas_tag: bool,
    ) -> None:
        self.knp_string: str = knp_string
        self.doc_id: str = doc_id
        self.cases: List[str] = cases
        self.corefs: List[str] = corefs
        self.relax_cases: bool = relax_cases
        self.extract_nes: bool = extract_nes
        self.use_pas_tag: bool = use_pas_tag

        self.sid2sentence: Dict[str, BList] = OrderedDict()
        buff = []
        for line in knp_string.strip().split('\n'):
            buff.append(line)
            if line.strip() == 'EOS':
                sentence = BList('\n'.join(buff) + '\n')
                if sentence.sid in self.sid2sentence:
                    logger.warning(f'{sentence.sid:24}duplicated sid found')
                self.sid2sentence[sentence.sid] = sentence
                buff = []

        self.bnst2dbid = {}
        self.tag2dtid = {}
        self.mrph2dmid = {}
        self._assign_document_wide_id()

        self._pas: Dict[int, Pas] = OrderedDict()
        self.mentions: Dict[int, Mention] = OrderedDict()
        self.entities: Dict[int, Entity] = OrderedDict()
        if use_pas_tag:
            self._analyze_pas()
        else:
            self._analyze_rel()

        if extract_nes:
            self.named_entities: List[NamedEntity] = []
            self._extract_nes()
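A minimal construction sketch for the class this __init__ belongs to; the class name Document is an assumption inferred from the attributes, and the case/coreference label lists and file path are placeholders:

from pathlib import Path

knp_string = Path('doc001.knp').read_text()
document = Document(
    knp_string,
    doc_id='doc001',
    cases=['ガ', 'ヲ', 'ニ'],  # placeholder case labels
    corefs=['='],              # placeholder coreference labels
    relax_cases=False,
    extract_nes=True,
    use_pas_tag=False,
)
print(len(document.sid2sentence))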
Example #15
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--knp-file', required=True, type=str, help='path to knp file')
    parser.add_argument('--tsv-file', required=True, type=str, help='path to tsv file')
    parser.add_argument('--output-dir', required=True, type=str,
                        help='path to directory where split knp files are exported')
    args = parser.parse_args()

    output_dir = Path(args.output_dir)
    output_dir.mkdir(exist_ok=True)

    sent2knp = {}
    with open(args.knp_file, mode='rt', errors='ignore') as f:
        buff = ''
        for line in tqdm(f.readlines(), desc='1/3'):
            buff += line
            if line.strip() == 'EOS':
                sent = ''.join(bnst.midasi for bnst in BList(buff).bnst_list())
                sent2knp[sent] = buff
                buff = ''

    did2knp = defaultdict(str)
    ignored_dids = set()
    with open(args.tsv_file, mode='rt', errors='ignore') as f:
        for line in tqdm(f.readlines(), desc='2/3'):
            line = line.strip()
            sid, _, sent = line.split('\t')
            did = '-'.join(sid.split('-')[:-1])
            if sent not in sent2knp:
                ignored_dids.add(did)
                continue
            knp_string = sent2knp[sent]
            assert knp_string.startswith('# ')
            knp_string = knp_string[:2] + f'S-ID:{sid} ' + knp_string[2:]
            did2knp[did] += knp_string

    for did, knp_result in tqdm(did2knp.items(), desc='3/3'):
        if did in ignored_dids:
            continue
        with output_dir.joinpath(f'{did}.knp').open(mode='wt') as f:
            f.write(knp_result)
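The header rewrite in step 2/3 above simply splices an S-ID field right after the leading '# ' of the KNP comment line; a tiny illustration with a made-up header and sentence ID:

header = '# KNP:4.2 DATE:2021/01/01\n'  # made-up KNP comment line
sid = 'doc001-1'                        # made-up sentence ID
assert header.startswith('# ')
print(header[:2] + f'S-ID:{sid} ' + header[2:], end='')
# -> # S-ID:doc001-1 KNP:4.2 DATE:2021/01/01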
Example #16
    def parse(self, sentence):
        """
        文字列 sentence を対象として構文解析を行い,構文解析結果オブジェクトを返す.
        """
        assert (isinstance(sentence, six.text_type))
        juman_lines = self.juman.juman_lines(sentence)
        juman_str = "%s%s" % (juman_lines, self.pattern)
        if not self.socket and not self.subprocess:
            if self.server is not None:
                self.socket = Socket(self.server, self.port,
                                     "RUN -tab -normal\n")
            else:
                command = "%s %s" % (self.command, self.option)
                if self.rcfile:
                    command += " -r %s" % self.rcfile
                self.subprocess = Subprocess(command)

        if self.socket:
            knp_lines = self.socket.query(juman_str, pattern=self.pattern)
        else:
            knp_lines = self.subprocess.query(juman_str, pattern=self.pattern)
        return BList(knp_lines, self.pattern)
Example #17
def run():
    parser = create_arg_parser()
    args = parser.parse_args()

    docs = []
    # read each document file in the dataset in turn
    for doc_file in sorted(glob.glob(f"{args.di_repo}/*/*", recursive=True)):
        results = []
        buf = ""
        with open(doc_file) as f:
            # read the sentences in the document and their named-entity labels
            buf = ""
            for line in f:
                buf += line
                if "EOS" in line:
                    result = BList(buf)
                    add_ne_tag_to_mrphs(result)
                    results.append(result)
                    buf = ""
        docs.append(results)

    # shuffle the dataset randomly
    random.shuffle(docs)

    # split the dataset 8:1:1
    num_train = int(0.8 * len(docs))
    num_test = int(0.1 * len(docs))
    train_docs = docs[:num_train]
    validation_docs = docs[num_train:-num_test]
    test_docs = docs[-num_test:]

    # write each split to a file
    os.makedirs(args.dest, exist_ok=True)
    write_file(f"{args.dest}/kwdlc_ner_train.txt", train_docs)
    write_file(f"{args.dest}/kwdlc_ner_validation.txt", validation_docs)
    write_file(f"{args.dest}/kwdlc_ner_test.txt", test_docs)
Example #18
class Sentence:
    """ KWDLC(または Kyoto Corpus)の1文書を扱うクラス

    Attributes:
        blist (BList): KNPのBListオブジェクト
        doc_id (str): 文書ID
        bps (List[BasePhrase]): 含まれる基本句のリスト
    """
    def __init__(
        self,
        knp_string: str,
        dtid_offset: int,
        dmid_offset: int,
        doc_id: str,
    ) -> None:
        """

        Args:
            knp_string(str): KNP format string of this sentence.
            dtid_offset (int): The document-wide tag ID of the previous base phrase.
            dmid_offset (int): The document-wide morpheme ID of the previous morpheme.
            doc_id(str): The document ID of this sentence.
        """

        self.blist = BList(knp_string)
        self.doc_id: str = doc_id

        self.bps: List[BasePhrase] = []
        dtid = dtid_offset
        dmid = dmid_offset
        for tag in self.blist.tag_list():
            base_phrase = BasePhrase(tag, dmid, dtid, self.blist.sid, doc_id)
            self.bps.append(base_phrase)
            dtid += 1
            dmid += len(base_phrase)

        self._mrph2dmid: Dict[Morpheme, int] = dict(
            ChainMap(*(bp.mrph2dmid for bp in self.bps)))

        for bp in self.bps:
            if bp.tag.parent_id >= 0:
                bp.parent = self.bps[bp.tag.parent_id]
            for child in bp.tag.children:
                bp.children.append(self.bps[child.tag_id])

    @property
    def sid(self) -> str:
        """文ID"""
        return self.blist.sid

    @property
    def dtids(self) -> List[int]:
        return [bp.dtid for bp in self.bps]

    @property
    def mrph2dmid(self) -> Dict[Morpheme, int]:
        """形態素とその文書レベルIDを紐付ける辞書"""
        return self._mrph2dmid

    @property
    def surf(self) -> str:
        """表層表現"""
        return ''.join(bp.surf for bp in self.bps)

    def bnst_list(self):
        return self.blist.bnst_list()

    def tag_list(self):
        return self.blist.tag_list()

    def mrph_list(self):
        return self.blist.mrph_list()

    def __len__(self) -> int:
        """含まれる基本句の数"""
        return len(self.bps)

    def __getitem__(self, tid: int) -> Optional[BasePhrase]:
        if 0 <= tid < len(self):
            return self.bps[tid]
        else:
            logger.error(f'base phrase: {tid} out of range')
            return None

    def __iter__(self) -> Iterator[BasePhrase]:
        return iter(self.bps)

    def __eq__(self, other: 'Sentence') -> bool:
        return self.sid == other.sid

    def __str__(self) -> str:
        return self.surf

    def __repr__(self) -> str:
        return f'Sentence(\'{self.surf}\', sid: {self.sid})'
Example #19
 def result(self, input_str):
     return BList(input_str, self.pattern)
Example #20
 def result(self, input_str):
     return BList(input_str, self.EOS)
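Both result() wrappers above turn precomputed KNP output into a BList; the same operation is available on pyknp's KNP class. A minimal sketch, assuming a tab-format file sample.knp containing a single sentence's KNP output exists:

from pyknp import KNP

knp = KNP()
with open('sample.knp') as f:
    blist = knp.result(f.read())
print(''.join(mrph.midasi for mrph in blist.mrph_list()))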
Example #21
 def knp(self, sentence):
     assert isinstance(sentence, str)
     result = BList(self.parse_sentence(sentence))
     return result
Example #22
class Sentence:
    """A class to represent a single sentence.

    Attributes:
        blist (BList): BList object of pyknp.
        doc_id (str): The document ID of this sentence.
        bps (List[BasePhrase]): Base phrases in this sentence.
    """
    def __init__(
        self,
        knp_string: str,
        dtid_offset: int,
        dmid_offset: int,
        doc_id: str,
    ) -> None:
        """

        Args:
            knp_string(str): KNP format string of this sentence.
            dtid_offset (int): The document-wide tag ID of the previous base phrase.
            dmid_offset (int): The document-wide morpheme ID of the previous morpheme.
            doc_id(str): The document ID of this sentence.
        """

        self.blist = BList(knp_string)
        self.doc_id: str = doc_id

        self.bps: List[BasePhrase] = []
        dtid = dtid_offset
        dmid = dmid_offset
        for tag in self.blist.tag_list():
            base_phrase = BasePhrase(tag, dmid, dtid, self.blist.sid, doc_id)
            self.bps.append(base_phrase)
            dtid += 1
            dmid += len(base_phrase)

        self._mrph2dmid: Dict[Morpheme, int] = dict(
            ChainMap(*(bp.mrph2dmid for bp in self.bps)))

        for bp in self.bps:
            if bp.tag.parent_id >= 0:
                bp.parent = self.bps[bp.tag.parent_id]
            for child in bp.tag.children:
                bp.children.append(self.bps[child.tag_id])

    @property
    def sid(self) -> str:
        """A sentence ID."""
        return self.blist.sid

    @property
    def dtids(self) -> List[int]:
        """A document-wide tag ID."""
        return [bp.dtid for bp in self.bps]

    @property
    def mrph2dmid(self) -> Dict[Morpheme, int]:
        """A mapping from morpheme to its document-wide ID."""
        return self._mrph2dmid

    @property
    def surf(self) -> str:
        """A surface expression"""
        return ''.join(bp.surf for bp in self.bps)

    def bnst_list(self):
        """Return list of Bunsetsu object in pyknp."""
        return self.blist.bnst_list()

    def tag_list(self):
        """Return list of Tag object in pyknp."""
        return self.blist.tag_list()

    def mrph_list(self):
        """Return list of Morpheme object in pyknp."""
        return self.blist.mrph_list()

    def __len__(self) -> int:
        """Number of base phrases in this sentence"""
        return len(self.bps)

    def __getitem__(self, tid: int) -> Optional[BasePhrase]:
        if 0 <= tid < len(self):
            return self.bps[tid]
        else:
            logger.error(f'base phrase: {tid} out of range')
            return None

    def __iter__(self) -> Iterator[BasePhrase]:
        return iter(self.bps)

    def __eq__(self, other: 'Sentence') -> bool:
        return self.sid == other.sid

    def __str__(self) -> str:
        return self.surf

    def __repr__(self) -> str:
        return f'Sentence(\'{self.surf}\', sid: {self.sid})'
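A minimal sketch of building Sentence objects for a whole document with the class defined above; the .knp path and document ID are placeholders:

from pathlib import Path

sentences = []
dtid = dmid = 0
buff = ''
for line in Path('doc001.knp').open():
    buff += line
    if line.strip() == 'EOS':
        sentence = Sentence(buff, dtid_offset=dtid, dmid_offset=dmid, doc_id='doc001')
        sentences.append(sentence)
        dtid += len(sentence)                 # base phrases consumed so far
        dmid += len(sentence.mrph_list())     # morphemes consumed so far
        buff = ''
print([repr(s) for s in sentences])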
Example #23
def split_kc(input_dir: Path, output_dir: Path, max_subword_length: int, tokenizer: TokenizeHandlerMeta):
    """
    各文書を,tokenize したあとの長さが max_subword_length 以下になるように複数の文書に分割する.
    1文に分割しても max_subword_length を超えるような長い文はそのまま出力する
    """
    did2sids: Dict[str, List[str]] = defaultdict(list)
    did2cumlens: Dict[str, List[int]] = {}
    sid2knp: Dict[str, str] = {}

    max_all_tokens_len = 0

    for knp_file in input_dir.glob('*.knp'):
        with knp_file.open() as fin:
            did = knp_file.stem
            did2cumlens[did] = [0]
            buff = ''
            for line in fin:
                buff += line
                if line.strip() == 'EOS':
                    blist = BList(buff)
                    did2sids[did].append(blist.sid)
                    all_tokens, *_ = tokenizer.get_tokenized_tokens(list(m.midasi for m in blist.mrph_list()))
                    max_all_tokens_len = max(max_all_tokens_len, len(all_tokens))
                    did2cumlens[did].append(
                        did2cumlens[did][-1] + len(all_tokens)
                        # did2cumlens[did][-1] + len(tokenizer.tokenize(' '.join(m.midasi for m in blist.mrph_list())))
                    )
                    sid2knp[blist.sid] = buff
                    buff = ''

    print(f"max_tokens_length per sentence -> {max_all_tokens_len}")
    # assert max_all_tokens_len <= max_subword_length
    # if max_all_tokens_len > max_subword_length:
    #     raise ValueError(f"max_tokens_length exceeded max_subword_length\n{max_all_tokens_len}>{max_subword_length}")
    document_divide_unit_list = []
    for did, sids in did2sids.items():
        cum: List[int] = did2cumlens[did]
        end = 1
        # search for end
        while end < len(sids) and cum[end+1] - cum[0] <= max_subword_length:
            end += 1

        idx = 0
        while end < len(sids) + 1:
            start = 0
            # search for start
            while cum[end] - cum[start] > max_subword_length:
                start += 1
                if start == end - 1:
                    break
            document_divide_unit_list.append(
                DocumentDivideUnit(did, idx, start, end)
            )
            # with output_dir.joinpath(f'{did}-{idx:02}.knp').open('wt') as fout:
            #     fout.write(''.join(sid2knp[sid] for sid in sids[start:end]))  # write out sentences from start to end
            idx += 1
            end += 1

    _write_partial_document = partial(
        write_partial_document,
        did2sids=did2sids,
        sid2knp=sid2knp,
        output_dir=output_dir
    )
    with Pool() as pool:
        list(pool.imap(_write_partial_document, document_divide_unit_list))
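DocumentDivideUnit and write_partial_document are referenced but not defined in this example; a hypothetical minimal version consistent with how they are used here might look like this:

from pathlib import Path
from typing import Dict, List, NamedTuple


class DocumentDivideUnit(NamedTuple):
    did: str
    idx: int
    start: int
    end: int


def write_partial_document(unit: DocumentDivideUnit,
                           did2sids: Dict[str, List[str]],
                           sid2knp: Dict[str, str],
                           output_dir: Path) -> None:
    """Write sentences sids[start:end] of document `did` to <did>-<idx>.knp."""
    sids = did2sids[unit.did]
    with output_dir.joinpath(f'{unit.did}-{unit.idx:02}.knp').open('wt') as fout:
        fout.write(''.join(sid2knp[sid] for sid in sids[unit.start:unit.end]))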