Example #1
    def load(cls, path, fields, tokenizer_lang, tokenizer_dir, use_gpu, verbose=True, max_sent_length=math.inf):
        tokenizer = Tokenizer(lang=tokenizer_lang, dir=tokenizer_dir, use_gpu=use_gpu, verbose=verbose)

        sentences = []
        fields = [field if field is not None else Field(str(i))
                  for i, field in enumerate(fields)]

        with open(path, 'r') as f:
            lines = []
            for line in tokenizer.format(tokenizer.predict(f.read())):
                line = line.strip()
                if not line:
                    if len(lines) > max_sent_length:
                        logger.info('Discarded sentence longer than max_sent_length: %d',
                                    len(lines))
                        lines = []
                        continue
                    sentences.append(Sentence(fields, lines))
                    lines = []
                else:
                    if not line.startswith('#'):
                        # pad with fake columns so the line has all CoNLL fields
                        missing = len(CoNLL._fields) - len(line.split('\t'))
                        line += '\t_' * missing
                        assert len(CoNLL._fields) == len(line.split('\t')), \
                            '{} - {} vs {}'.format(line, len(CoNLL._fields), len(line.split('\t')))
                    lines.append(line)

        return cls(fields, sentences)
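A minimal usage sketch for the `load` classmethod above. The owning class name `Corpus`, the ten-element field list, and the path/language values are assumptions for illustration only; `load` returns an instance built from the tokenized sentences.

import math

# Ten placeholder fields, one per CoNLL-U column; real code would pass
# Field objects matching CoNLL._fields where needed.
fields = [None] * 10

corpus = Corpus.load(                      # Corpus is the assumed owning class
    path='data/raw_text.txt',              # plain text file to tokenize
    fields=fields,
    tokenizer_lang='it',                   # language code for the tokenizer model
    tokenizer_dir='tokenizer-models',      # placeholder model directory
    use_gpu=False,
    verbose=True,
    max_sent_length=math.inf,              # keep every sentence
)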
Example #2
    def test_corpus_load(self):
        tokenizer = Tokenizer(**self.args)
        raw_text_file = '/project/piqasso/Collection/IWPT20/train-dev/UD_Italian-ISDT/it_isdt-ud-dev.txt'

        with open(raw_text_file) as fin:
            for line in tokenizer.format(tokenizer.predict(fin.read())):
                if line and not line.startswith('#'):
                    assert len(line.split('\t')) == 2, line
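The test builds the tokenizer from `self.args`, which is not shown. A hedged sketch of a plausible fixture, using only the keyword names seen in the `load` snippets (`lang`, `dir`, `verbose`); the values are placeholders.

    def setUp(self):
        # Hypothetical fixture: keys mirror the Tokenizer keyword arguments
        # used elsewhere on this page; values are placeholders.
        self.args = {
            'lang': 'it',               # Italian, matching the it_isdt dev file
            'dir': 'tokenizer-models',  # placeholder model directory
            'verbose': False,
        }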
Example #3
    def test_corpus_load(self):
        tokenizer = Tokenizer(**self.args)
        sin = io.StringIO(
            "Un corazziere contro Scalfaro. L'attore le disse baciami o torno a riprendermelo."
        )

        for line in tokenizer.format(tokenizer.predict(sin.read())):
            if line and not line.startswith('#'):
                assert len(line.split('\t')) == 10, line
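For reference, the ten tab-separated columns asserted here are the standard CoNLL-U fields (ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC). A tiny sketch of the line shape the assertion expects; the token values are illustrative only, and whether the tokenizer emits 2 or 10 columns evidently depends on its configuration (compare the 2-column assertion in Example #2).

# ID  FORM  LEMMA  UPOS  XPOS  FEATS  HEAD  DEPREL  DEPS  MISC
sample = "1\tUn\tun\t_\t_\t_\t_\t_\t_\t_"
assert len(sample.split('\t')) == 10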
Example #4
    def load(cls,
             path,
             fields,
             tokenizer_lang,
             tokenizer_dir,
             verbose=True,
             max_sent_length=math.inf):
        tokenizer = Tokenizer(lang=tokenizer_lang,
                              dir=tokenizer_dir,
                              verbose=verbose)

        sentences = []
        fields = [
            field if field is not None else Field(str(i))
            for i, field in enumerate(fields)
        ]

        with open(path, 'r') as f:
            lines = []
            for line in tokenizer.format(tokenizer.predict(f.read())):
                line = line.strip()
                if not line:
                    if len(lines) > max_sent_length:
                        logger.info(
                            'Discarded sentence longer than max_sent_length: %d',
                            len(lines))
                        lines = []
                        continue
                    sentences.append(Sentence(fields, lines))
                    lines = []
                else:
                    if not line.startswith('#'):
                        # append empty columns
                        line += '\t_' * (len(CoNLL._fields) -
                                         len(line.split('\t')))
                    lines.append(line)

        return cls(fields, sentences)
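A standalone sketch of the column-padding step used in both `load` variants; the field count of 10 is an assumption matching CoNLL-U, standing in for `len(CoNLL._fields)`.

N_FIELDS = 10                                        # assumed len(CoNLL._fields)
line = '1\tUn'                                       # tokenizer output: ID and FORM only
line += '\t_' * (N_FIELDS - len(line.split('\t')))   # pad the missing columns with '_'
assert len(line.split('\t')) == N_FIELDS             # now a full 10-column CoNLL-U line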