def load(cls, path, fields, tokenizer_lang, tokenizer_dir, use_gpu, verbose=True, max_sent_length=math.inf):
    """Tokenize the raw text file at *path* and build a corpus from it.

    Args:
        path: path to a raw (untokenized) text file.
        fields: iterable of Field objects; ``None`` entries are replaced
            by placeholder ``Field``s named by their position.
        tokenizer_lang: language code passed to the Tokenizer.
        tokenizer_dir: model directory passed to the Tokenizer.
        use_gpu: whether the tokenizer should run on GPU.
        verbose: forwarded to the Tokenizer.
        max_sent_length: sentences with more lines than this are discarded.

    Returns:
        A corpus instance (``cls``) holding the parsed sentences.
    """
    # BUG FIX: was verbose=True, silently ignoring the caller's *verbose* flag.
    tokenizer = Tokenizer(lang=tokenizer_lang, dir=tokenizer_dir, use_gpu=use_gpu, verbose=verbose)
    sentences = []
    fields = [field if field is not None else Field(str(i)) for i, field in enumerate(fields)]
    with open(path, 'r') as f:
        lines = []
        for line in tokenizer.format(tokenizer.predict(f.read())):
            line = line.strip()
            if not line:
                # Blank line terminates a sentence.
                if len(lines) > max_sent_length:
                    # BUG FIX: Logger.info takes %-style lazy arguments and has
                    # no 'file' keyword (that is print()'s signature); the old
                    # call raised TypeError whenever a sentence was discarded.
                    logger.info('Discarded sentence longer than max_sent_length: %d', len(lines))
                    lines = []
                    continue
                sentences.append(Sentence(fields, lines))
                lines = []
            else:
                if not line.startswith('#'):
                    # Pad with '_' columns up to the CoNLL field count.
                    # BUG FIX: the old format()/join() appended a trailing tab
                    # when the line already had all columns (empty join).
                    line += '\t_' * (len(CoNLL._fields) - len(line.split('\t')))
                    assert len(CoNLL._fields) == len(line.split('\t')), '{} - {} vs {}'.format(line, len(CoNLL._fields), len(line.split()))
                    lines.append(line)
        # ROBUSTNESS: flush a trailing sentence when the tokenizer output does
        # not end with a blank line (mirrors the max_sent_length check above).
        if lines and len(lines) <= max_sent_length:
            sentences.append(Sentence(fields, lines))
    return cls(fields, sentences)
def test_corpus_load(self):
    """Tokenizing a raw-text corpus yields two-column lines (id, form)."""
    tokenizer = Tokenizer(**self.args)
    raw_text_file = '/project/piqasso/Collection/IWPT20/train-dev/UD_Italian-ISDT/it_isdt-ud-dev.txt'
    with open(raw_text_file) as fin:
        text = fin.read()
    for row in tokenizer.format(tokenizer.predict(text)):
        # Skip blank separators and '#' comment lines.
        if not row or row.startswith('#'):
            continue
        assert len(row.split('\t')) == 2, row
def test_corpus_load(self):
    """Tokenizing raw text yields full 10-column CoNLL-U lines."""
    tokenizer = Tokenizer(**self.args)
    # IDIOM FIX: the old code wrapped the string in io.StringIO() only to
    # immediately .read() it back — pass the string to predict() directly.
    text = "Un corazziere contro Scalfaro. L'attore le disse baciami o torno a riprendermelo."
    for line in tokenizer.format(tokenizer.predict(text)):
        # Skip blank separators and '#' comment lines.
        if line and not line.startswith('#'):
            assert len(line.split('\t')) == 10, line
def load(cls, path, fields, tokenizer_lang, tokenizer_dir, verbose=True, max_sent_length=math.inf):
    """Tokenize the raw text file at *path* and build a corpus from it.

    Args:
        path: path to a raw (untokenized) text file.
        fields: iterable of Field objects; ``None`` entries are replaced
            by placeholder ``Field``s named by their position.
        tokenizer_lang: language code passed to the Tokenizer.
        tokenizer_dir: model directory passed to the Tokenizer.
        verbose: forwarded to the Tokenizer.
        max_sent_length: sentences with more lines than this are discarded.

    Returns:
        A corpus instance (``cls``) holding the parsed sentences.
    """
    tokenizer = Tokenizer(lang=tokenizer_lang, dir=tokenizer_dir, verbose=verbose)
    sentences = []
    fields = [
        field if field is not None else Field(str(i)) for i, field in enumerate(fields)
    ]
    with open(path, 'r') as f:
        lines = []
        for line in tokenizer.format(tokenizer.predict(f.read())):
            line = line.strip()
            if not line:
                # Blank line terminates a sentence.
                if len(lines) > max_sent_length:
                    # BUG FIX: Logger.info takes %-style lazy arguments and has
                    # no 'file' keyword (that is print()'s signature); the old
                    # call raised TypeError whenever a sentence was discarded.
                    logger.info('Discarded sentence longer than max_sent_length: %d', len(lines))
                    lines = []
                    continue
                sentences.append(Sentence(fields, lines))
                lines = []
            else:
                if not line.startswith('#'):
                    # append empty columns up to the CoNLL field count
                    line += '\t_' * (len(CoNLL._fields) - len(line.split('\t')))
                    lines.append(line)
        # ROBUSTNESS: flush a trailing sentence when the tokenizer output does
        # not end with a blank line (mirrors the max_sent_length check above).
        if lines and len(lines) <= max_sent_length:
            sentences.append(Sentence(fields, lines))
    return cls(fields, sentences)