import glob
import os
import re
from typing import List

# Third-party dependencies: the CoreNLP client from the `stanfordnlp` server
# bindings (matching the stanford-corenlp-full-2018-10-05 release) and the
# BERT tokenizers from `pytorch_pretrained_bert`. These import paths are
# assumptions; adjust them if the project uses `stanza.server` / `transformers`.
from pytorch_pretrained_bert.tokenization import BasicTokenizer, BertTokenizer
from stanfordnlp.server import CoreNLPClient

# Project-local modules (assumed names): `config` exposes `bert_model`, and
# `Token` / `Sentence` are the span containers used below.
import config
from data import Token, Sentence


class Tokenizer:
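    """Sentence-split a document with CoreNLP and tokenize each sentence with
    BERT's BasicTokenizer, keeping character offsets into the original
    document for every token."""
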
    def __init__(self) -> None:
        os.environ['CORENLP_HOME'] = '{}/stanford-corenlp-full-2018-10-05'.format(
            os.environ['HOME'])
        self.client = CoreNLPClient(annotators=['ssplit'])
        self.client.ensure_alive()
        self.do_lower_case = '-cased' not in config.bert_model
        self.basic_tokenizer: BasicTokenizer = BertTokenizer.from_pretrained(
            config.bert_model, do_lower_case=self.do_lower_case).basic_tokenizer

    def tokenize(self, doc: str) -> List[List[Token]]:
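        """Return one list of Tokens per CoreNLP sentence; each Token carries
        its character span in `doc`."""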
        corenlp_annotation = self.client.annotate(doc)
        sentences = []
        for sentence in corenlp_annotation.sentence:
            text = doc[sentence.characterOffsetBegin:
                       sentence.characterOffsetEnd]
            if self.do_lower_case:
                text = text.lower()
            offset = sentence.characterOffsetBegin
            bert_tokens = self.basic_tokenizer.tokenize(text)
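            # Map each basic token back to its character span by scanning
            # forward through the (possibly lower-cased) sentence text.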
            begin = 0
            tokens = []
            for bert_token in bert_tokens:
                word = bert_token
                begin = text.index(word, begin)
                end = begin + len(word)
                tokens.append(Token(word, begin + offset, end + offset))
                begin = end
            if len(tokens) > 0:
                sentences.append(tokens)
        return sentences


class Tokenizer:
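    """Tokenizer that requests CoreNLP annotators per call, repairs sentence
    splits around blank lines (see `fix_split`), and removes stray CoreNLP
    server property files on teardown."""
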
    def __init__(self) -> None:
        os.environ['CORENLP_HOME'] = '{}/stanford-corenlp-full-2018-10-05'.format(
            os.environ['HOME'])
        self.client = CoreNLPClient()
        self.client.ensure_alive()
        self.do_lower_case = '-cased' not in config.bert_model
        self.basic_tokenizer: BasicTokenizer = BertTokenizer.from_pretrained(
            config.bert_model, do_lower_case=self.do_lower_case).basic_tokenizer

    def __del__(self) -> None:
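        # CoreNLPClient writes a corenlp_server-*.props file into the working
        # directory; remove any that were left behind.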
        for p in glob.glob('corenlp_server-*.props'):
            if os.path.isfile(p):
                os.remove(p)

    def tokenize(self, doc: str) -> List[Sentence]:
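        """Split `doc` into Sentences with CoreNLP, repair the split (see
        `fix_split`), and attach BERT basic tokens with document-level
        character offsets to every sentence."""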
        splitter_annotation = self.client.annotate(
            doc, annotators=['ssplit'],
            properties={'tokenize.options': 'ptb3Escaping=false,invertible=true'})
        end = 0
        sentences = []
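        # Recover each sentence's character span in the raw document from the
        # tokens' originalText (preserved by invertible=true).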
        for sentence in splitter_annotation.sentence:
            begin = doc.index(sentence.token[0].originalText, end)
            for token in sentence.token:
                end = (doc.index(token.originalText, end)
                       + len(token.originalText))
            text = doc[begin:end]
            sentences.append(Sentence(text, begin, end))
        sentences = self.fix_split(sentences)
        for sentence in sentences:
            text = sentence.text
            if self.do_lower_case:
                text = text.lower()
            bert_tokens = self.basic_tokenizer.tokenize(text)
            end = 0
            tokens = []
            for bert_token in bert_tokens:
                word = bert_token
                begin = text.index(word, end)
                end = begin + len(word)
                tokens.append(
                    Token(word, sentence.begin + begin, sentence.begin + end))
            assert len(tokens) > 0
            sentence.tokens = tokens
        return sentences

    @staticmethod
    def fix_split(sentences: List[Sentence]) -> List[Sentence]:
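        """Post-process CoreNLP's sentence splits: break sentences at blank
        lines and merge adjacent fragments whose character spans are
        contiguous."""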
        result = []
        i = 0
        while i < len(sentences):
            sentence = sentences[i]
            while True:
                next_sentence = (sentences[i + 1]
                                 if i < len(sentences) - 1 else None)
                if '\n\n' in sentence.text:
                    index = sentence.text.index('\n\n')
                    new_sentence = Sentence(sentence.text[:index],
                                            sentence.begin,
                                            sentence.begin + index)
                    result.append(new_sentence)
                    index += re.search(r'[\n\t ]+',
                                       sentence.text[index:]).end()
                    sentence.text = sentence.text[index:]
                    sentence.begin += index
                elif next_sentence is not None and next_sentence.begin == sentence.end:
                    sentence.text += next_sentence.text
                    sentence.end = next_sentence.end
                    i += 1
                else:
                    result.append(sentence)
                    break
            i += 1
        return result
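

if __name__ == '__main__':
    # Minimal usage sketch (assumes a CoreNLP install under
    # $HOME/stanford-corenlp-full-2018-10-05 and a valid `config.bert_model`).
    tokenizer = Tokenizer()
    doc = 'Stanford University is in California.\n\nIt was founded in 1885.'
    for sentence in tokenizer.tokenize(doc):
        print(sentence.begin, sentence.end, repr(sentence.text))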