コード例 #1
0
ファイル: train.py プロジェクト: nordquists/truecaser
    def __load_sentences(self):
        """Read every sentence from a fresh ``Parser`` and tokenize it.

        Items are pulled one at a time with ``Parser.next()``, which is
        unpacked as a ``(kind, sentence)`` pair. Reading stops at the first
        falsy sentence (e.g. ``None`` or an empty string).

        Sentences whose kind is ``'tweet'`` are tokenized with
        ``twokenize.tokenize``; everything else goes through
        ``word_tokenize``. Each token sequence is handed to
        ``__track_capitalization`` together with the shared mapping before
        being collected.

        Returns:
            A ``(capitalization, tokenized_sentences)`` tuple: the
            set-valued ``defaultdict`` that was passed to
            ``__track_capitalization`` for every sentence (presumably
            filled in by that helper -- confirm against its definition),
            and the list of token sequences in the order they were read.
        """
        casing_seen = defaultdict(partial(set))
        token_lists = []
        source = Parser()

        while True:
            kind, text = source.next()
            if not text:
                # A falsy sentence is treated as the end of the input.
                break

            # Tweets get the Twitter-aware tokenizer; all other kinds use
            # the general-purpose word tokenizer.
            words = (
                twokenize.tokenize(text)
                if kind == 'tweet'
                else word_tokenize(text)
            )

            self.__track_capitalization(casing_seen, words)
            token_lists.append(words)

        return casing_seen, token_lists