def test_custom_metadata_parsers(self):
    # Standard CoNLL-U comments plus a CoNLL-U Plus "global.columns" header.
    source = dedent("""\
        # global.columns = ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC
        # newdoc id = mf920901-001
        # newpar id = mf920901-001-p1
        # sent_id = mf920901-001-p1s1A
        # text = Slovenská ústava: pro i proti
        # text_en = Slovak constitution: pros and cons
    """)

    # Without custom parsers, every metadata value stays a plain string.
    _, parsed = parse_token_and_metadata(source)
    self.assertEqual(parsed, Token([
        ("global.columns", "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"),
        ("newdoc id", "mf920901-001"),
        ("newpar id", "mf920901-001-p1"),
        ("sent_id", "mf920901-001-p1s1A"),
        ("text", "Slovenská ústava: pro i proti"),
        ("text_en", "Slovak constitution: pros and cons"),
    ]))

    # A parser registered for "global.columns" turns its value into a list;
    # all other keys keep the default string handling.
    _, parsed = parse_token_and_metadata(
        source,
        metadata_parsers={"global.columns": lambda key, value: (key, value.split())}
    )
    self.assertEqual(parsed, Token([
        ("global.columns", ["ID", "FORM", "LEMMA", "UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"]),
        ("newdoc id", "mf920901-001"),
        ("newpar id", "mf920901-001-p1"),
        ("sent_id", "mf920901-001-p1s1A"),
        ("text", "Slovenská ústava: pro i proti"),
        ("text_en", "Slovak constitution: pros and cons"),
    ]))
def test_invalid_metadata(self):
    # Duplicate keys keep the last value; comments without a proper
    # "key = value" shape ("# meta", "# = data") are dropped entirely.
    source = dedent("""\
        # meta = data2
        # meta = data
        # meta
        # = data
    """)
    _, parsed = parse_token_and_metadata(source)
    self.assertEqual(parsed, OrderedDict([("meta", "data")]))
def test_metadata(self):
    # Parsing followed by serializing must reproduce the input exactly,
    # including the comment lines and the trailing blank line.
    source = dedent("""\
        # data = meta
        # meta = data
        1\tdog

    """)
    sentence = TokenList(*parse_token_and_metadata(source))
    self.assertEqual(serialize(sentence), source)
def parse_single(data, fields=None, field_parsers=None):
    """Parse an opened single-sentence CoNLL file.

    Reads the entire file object and parses its contents as one sentence.
    Returns a one-element list of TokenList so the result has the same
    shape as the multi-sentence ``parse`` function.
    """
    # Renamed from the ambiguous single-letter name `l` (easily confused
    # with `1`/`I`; flagged by linters as E741).
    contents = data.read()
    return [
        TokenList(*parse_token_and_metadata(
            contents, fields=fields, field_parsers=field_parsers))
    ]
def parse(data, fields=None, field_parsers=None):
    """Parse an opened multi-sentence CoNLL file into a list of TokenLists.

    Sentences in CoNLL files are separated by blank lines, so the file
    contents must be split on "\\n\\n" before parsing.

    Bug fix: the previous version iterated directly over ``data.read()``,
    which iterates over a *string* and therefore yielded single characters,
    not sentences. Splitting on blank lines matches the sibling ``parse``
    variants in this file.
    """
    return [
        TokenList(*parse_token_and_metadata(
            sentence, fields=fields, field_parsers=field_parsers))
        for sentence in data.read().split("\n\n")
        if sentence
    ]
def test_custom_fields(self):
    source = dedent("""\
        1\t1\t1
        2\t2\t2
    """)
    tokens, _ = parse_token_and_metadata(source, fields=("id", "id", "id"))
    # Duplicate keys collapse inside a dict literal, so each expected
    # token effectively holds a single "id" entry.
    self.assertEqual(tokens, [
        OrderedDict([("id", 1), ("id", 1), ("id", 1)]),
        OrderedDict([("id", 2), ("id", 2), ("id", 2)]),
    ])
def test_fallback_metadata_parser(self):
    # A "__fallback__" parser handles comment lines that match no
    # registered metadata key.
    source = dedent("""\
        #20191005
    """)
    fallback = {"__fallback__": lambda key, value: ("sentence-id", key)}
    _, parsed = parse_token_and_metadata(source, metadata_parsers=fallback)
    self.assertEqual(parsed, Token([
        ("sentence-id", "20191005"),
    ]))
def parse_incr(in_file, fields=None, field_parsers=None, metadata_parsers=None):
    """Incrementally parse an opened CoNLL-U (Plus) file, yielding TokenLists.

    If ``fields`` is not given, it is detected from the file's
    "global.columns" header via ``parse_conllu_plus_fields``.

    Raises:
        FileNotFoundError: if ``in_file`` is not an opened file-like object
            (e.g. a path string was passed instead).
    """
    # Fail fast on non-file input, matching the other parse_incr variant
    # in this file; without this, a path string would fail later with a
    # confusing AttributeError.
    if not hasattr(in_file, 'read'):
        raise FileNotFoundError(
            "Invalid file, 'parse_incr' needs an opened file as input")

    if not fields:
        fields = parse_conllu_plus_fields(in_file, metadata_parsers=metadata_parsers)

    for sentence in parse_sentences(in_file):
        yield TokenList(
            *parse_token_and_metadata(sentence,
                                      fields=fields,
                                      field_parsers=field_parsers,
                                      metadata_parsers=metadata_parsers))
def test_invalid_metadata(self):
    # Expectations asserted below: duplicate "meta" keeps the last value,
    # bare "# newdoc"/"# newpar" markers are kept with a None value, and
    # malformed comments ("# meta", "# = data") are discarded.
    source = dedent("""\
        # meta = data2
        # meta = data
        # newdoc
        # newpar
        # meta
        # = data
    """)
    _, parsed = parse_token_and_metadata(source)
    self.assertEqual(parsed, Token([
        ("meta", "data"),
        ("newdoc", None),
        ("newpar", None),
    ]))
def test_default_field_parsers_when_undefined(self):
    source = dedent("""\
        1\tfrom
        2\tparis
    """)
    # Only "form" is overridden; "id" must fall back to its default
    # parser and still come out as an int.
    custom_parsers = {
        "form": lambda line, i: line[i].upper(),
    }
    tokens, _ = parse_token_and_metadata(
        source, fields=("id", "form"), field_parsers=custom_parsers)
    self.assertEqual(tokens, [
        Token([("id", 1), ("form", "FROM")]),
        Token([("id", 2), ("form", "PARIS")]),
    ])
def test_newlines_in_sentence(self):
    # A stray blank line inside a sentence block must not split it: all
    # four tokens belong to one sentence with one metadata dict.
    source = dedent("""\
        # meta = data
        1\thej
        2\tdå

        3\thej
        4\tdå
    """)
    tokens, parsed_metadata = parse_token_and_metadata(source)
    self.assertListEqual(tokens, [
        OrderedDict([("id", 1), ("form", "hej")]),
        OrderedDict([("id", 2), ("form", "då")]),
        OrderedDict([("id", 3), ("form", "hej")]),
        OrderedDict([("id", 4), ("form", "då")]),
    ])
    self.assertEqual(parsed_metadata, OrderedDict([("meta", "data")]))
def parse_incr(in_file, fields=None, field_parsers=None, metadata_parsers=None):
    """Incrementally parse an opened CoNLL-U (Plus) file, yielding TokenLists.

    Raises:
        FileNotFoundError: if ``in_file`` lacks a ``read`` method, i.e. it
            is not an opened file-like object.
    """
    if not hasattr(in_file, 'read'):
        raise FileNotFoundError(
            "Invalid file, 'parse_incr' needs an opened file as input")

    # When no explicit fields are given, detect them from the file header.
    fields = fields or parse_conllu_plus_fields(
        in_file, metadata_parsers=metadata_parsers)

    for raw_sentence in parse_sentences(in_file):
        parsed = parse_token_and_metadata(
            raw_sentence,
            fields=fields,
            field_parsers=field_parsers,
            metadata_parsers=metadata_parsers)
        yield TokenList(*parsed)
def test_one_to_many_custom_metadata_parser(self):
    # A single comment line can expand into several metadata pairs when
    # the custom parser returns a list of (key, value) tuples.
    source = dedent("""\
        #\tid='1'-document_id='36:1047'-span='1'
    """)

    def explode_pairs(key, value):
        # Re-join what the default splitter separated, then break the
        # line into its "-"-delimited key='value' pairs.
        reassembled = key + "=" + value
        return [
            (pair.split("=", 1)[0], pair.split("=", 1)[1].strip("'"))
            for pair in reassembled.split("-")
        ]

    _, parsed = parse_token_and_metadata(
        source, metadata_parsers={"id": explode_pairs})
    self.assertEqual(parsed, Token([
        ("id", "1"),
        ("document_id", "36:1047"),
        ("span", "1"),
    ]))
def parse_incr(
    in_file: T.TextIO,
    fields: T.Optional[T.Sequence[str]] = None,
    # Fixed: a None default requires an Optional annotation (PEP 484
    # forbids implicit Optional).
    field_parsers: T.Optional[T.Dict[str, _FieldParserType]] = None,
    metadata_parsers: T.Optional[T.Dict[str, _MetadataParserType]] = None
) -> T.Iterator[TokenList]:
    """Incrementally parse an opened CoNLL-U (Plus) file.

    Args:
        in_file: an opened text file object.
        fields: column names; detected from the file header when omitted.
        field_parsers: per-field overrides for token value parsing.
        metadata_parsers: per-key overrides for comment/metadata parsing.

    Yields:
        One TokenList per sentence.

    Raises:
        FileNotFoundError: if ``in_file`` is not an opened file-like object.
    """
    if not hasattr(in_file, 'read'):
        raise FileNotFoundError(
            "Invalid file, 'parse_incr' needs an opened file as input")

    if not fields:
        fields = parse_conllu_plus_fields(in_file, metadata_parsers=metadata_parsers)

    for sentence in parse_sentences(in_file):
        yield parse_token_and_metadata(sentence,
                                       fields=fields,
                                       field_parsers=field_parsers,
                                       metadata_parsers=metadata_parsers)
def test_custom_field_parsers(self):
    source = dedent("""\
        1\tbackwards\tline
        2\tparis\tsirap
    """)

    def parse_backwards(parts):
        # Reverse the letters of each remaining field, joined by spaces.
        return " ".join(word[::-1] for word in parts)

    # Overriding "id" replaces the default parser, so ids stay strings.
    custom_parsers = {
        "id": lambda line, i: line[i],
        "backwards": lambda line, i: parse_backwards(line[i:len(line)]),
    }
    tokens, _ = parse_token_and_metadata(
        source, fields=("id", "backwards"), field_parsers=custom_parsers)
    self.assertEqual(tokens, [
        Token([("id", '1'), ("backwards", "sdrawkcab enil")]),
        Token([("id", '2'), ("backwards", "sirap paris")]),
    ])
def parse(data, fields=None, CoNLL2009=False):
    """Split raw CoNLL text on blank lines and parse each sentence.

    ``CoNLL2009=True`` is forwarded to the token parser to select the
    CoNLL-2009 column layout.
    """
    results = []
    for sentence in data.split("\n\n"):
        if not sentence:
            continue
        results.append(TokenList(*parse_token_and_metadata(
            sentence, fields=fields, CoNLL2009=CoNLL2009)))
    return results
def test_serialize_tricky_fields(self):
    # Values containing "=" and "|" (FEATS, MISC) must survive a
    # parse/serialize round trip unchanged.
    source = dedent("""\
        5\tjumps\tjump\tVERB\tVBZ\tMood=Ind|Number=Sing\t0\troot\t_\tSpaceAfter=No
    """)
    sentence = TokenList(*parse_token_and_metadata(source))
    self.assertEqual(serialize(sentence).strip(), source.strip())
def test_identity_unicode(self):
    # Non-ASCII token text must round-trip through parse + serialize.
    source = "5\tlängtar\n\n"
    sentence = TokenList(*parse_token_and_metadata(source))
    self.assertEqual(serialize(sentence), source)
def test_empty(self):
    # None instead of a string is rejected with a ParseException.
    with self.assertRaises(ParseException):
        parse_token_and_metadata(None)
def parse(data, fields=None):
    """Split raw CoNLL-U text on blank lines and parse every sentence."""
    sentences = (chunk for chunk in data.split("\n\n") if chunk)
    return [
        TokenList(*parse_token_and_metadata(sentence, fields=fields))
        for sentence in sentences
    ]
def parse_incr(in_file, fields=None, field_parsers=None):
    """Lazily parse an opened CoNLL file, yielding one TokenList per sentence."""
    for raw_sentence in _iter_sents(in_file):
        parsed = parse_token_and_metadata(
            raw_sentence, fields=fields, field_parsers=field_parsers)
        yield TokenList(*parsed)