def parse_comment_line(line, metadata_parsers=None):
    """Parse one ``#`` comment line into a list of ``(key, value)`` pairs.

    The line is stripped, its leading ``#`` is removed and the remainder is
    handed to ``parse_pair_value`` to obtain a key and a value.

    ``metadata_parsers`` (optional mapping) is layered on top of a copy of
    ``DEFAULT_METADATA_PARSERS``. If the merged mapping has an entry for the
    key, or failing that a ``"__fallback__"`` entry, it is called as
    ``parser(key, value)``. A truthy result may be a single ``(key, value)``
    tuple or an iterable of such pairs.

    Returns a list of ``(text(key), value)`` pairs. When no parser produced
    a truthy result, the list is empty if the key or the value is falsy.

    Raises ParseException if the stripped line does not start with ``#``
    (this includes empty and whitespace-only lines).
    """
    line = line.strip()

    # startswith() is safe on an empty string, where indexing line[0] would
    # raise IndexError instead of the documented ParseException.
    if not line.startswith('#'):
        raise ParseException(
            "Invalid comment format, comment must start with '#'")

    key, value = parse_pair_value(line[1:])

    # Start from a copy of the defaults so neither the module-level mapping
    # nor the caller's mapping is modified; caller entries take precedence.
    parsers = DEFAULT_METADATA_PARSERS.copy()
    if metadata_parsers:
        parsers.update(metadata_parsers)

    custom_result = None
    if key in parsers:
        custom_result = parsers[key](key, value)
    elif "__fallback__" in parsers:
        custom_result = parsers["__fallback__"](key, value)

    # Allow returning pair instead of list of pairs from metadata parsers
    if custom_result:
        if isinstance(custom_result, tuple):
            key, value = custom_result
            return [(text(key), value)]
        return [(text(k), v) for k, v in custom_result]

    if not key or not value:
        # Lines without value are invalid by default
        return []

    return [(text(key), value)]
def test_parse_tree(self):
    # parse_tree() on the shared fixture must yield exactly one tree.
    sentences = parse_tree(data)
    self.assertEqual(len(sentences), 1)

    root = sentences[0]
    self.assertEqual(
        text(root),
        "TokenTree<token={id=5, form=jumps}, children=[...]>"
    )

    # The root token carries every parsed column of the "jumps" row.
    self.assertEqual(
        root.token,
        OrderedDict([
            ('id', 5),
            ('form', 'jumps'),
            ('lemma', 'jump'),
            ('upostag', 'VERB'),
            ('xpostag', 'VBZ'),
            ('feats', OrderedDict([
                ("Mood", "Ind"),
                ("Number", "Sing"),
                ("Person", "3"),
                ("Tense", "Pres"),
                ("VerbForm", "Fin"),
            ])),
            ('head', 0),
            ('deprel', 'root'),
            ('deps', None),
            ('misc', None)
        ])
    )

    self.assertEqual(
        [text(child) for child in root.children],
        [
            "TokenTree<token={id=4, form=fox}, children=[...]>",
            "TokenTree<token={id=9, form=dog}, children=[...]>",
            "TokenTree<token={id=10, form=.}, children=None>",
        ]
    )

    self.assertEqual(
        root.metadata["text"],
        "The quick brown fox jumps over the lazy dog."
    )

    # Serializing the tree must reproduce the input text exactly.
    self.assertEqual(root.serialize(), data)

    # One output line per token, nested under its head.
    # NOTE(review): the line breaks and the four-spaces-per-level indent
    # below were restored from a whitespace-collapsed literal; confirm the
    # indent width against print_tree()'s actual output.
    self.assertEqual(
        capture_print(root.print_tree),
        dedent("""\
            (deprel:root) form:jumps lemma:jump upostag:VERB [5]
                (deprel:nsubj) form:fox lemma:fox upostag:NOUN [4]
                    (deprel:det) form:The lemma:the upostag:DET [1]
                    (deprel:amod) form:quick lemma:quick upostag:ADJ [2]
                    (deprel:amod) form:brown lemma:brown upostag:ADJ [3]
                (deprel:nmod) form:dog lemma:dog upostag:NOUN [9]
                    (deprel:case) form:over lemma:over upostag:ADP [6]
                    (deprel:det) form:the lemma:the upostag:DET [7]
                    (deprel:amod) form:lazy lemma:lazy upostag:ADJ [8]
                (deprel:punct) form:. lemma:. upostag:PUNCT [10]
        """)
    )
def serialize_field(field):
    """Serialize a single parsed field value back to its column text.

    * ``None``, an empty ``OrderedDict`` and an empty list all become the
      ``_`` placeholder.
    * ``OrderedDict``: ``key=value`` items joined by ``|``; a ``None`` value
      is written as ``_``.
    * ``tuple``: the items, converted with ``text()``, concatenated with no
      separator.
    * ``list`` of two-item pairs ``(a, b)``: each pair is written as
      ``b:a`` and the pairs are joined by ``|``.
    * anything else: ``"{}".format(field)``.

    Raises ParseException if a list entry does not have exactly two items.
    """
    if field is None:
        return '_'

    if isinstance(field, OrderedDict):
        # An empty mapping would otherwise serialize to an empty column.
        if not field:
            return '_'
        # text() lets non-string values (e.g. ints) through; str.join alone
        # would raise TypeError on them.
        return '|'.join(
            text(key) + '=' + ('_' if value is None else text(value))
            for key, value in field.items()
        )

    if isinstance(field, tuple):
        return "".join([text(item) for item in field])

    if isinstance(field, list):
        # Guard the empty case: there is no first entry to inspect.
        if not field:
            return '_'
        # Check every entry, not only the first, so malformed data always
        # surfaces as ParseException rather than an unpacking ValueError.
        if any(len(pair) != 2 for pair in field):
            raise ParseException(
                "Can't serialize '{}', invalid format".format(field))
        return "|".join(
            [text(value) + ":" + text(key) for key, value in field])

    return "{}".format(field)
def test_parse(self):
    # The shared fixture holds exactly one sentence.
    parsed = parse(data)
    self.assertEqual(len(parsed), 1)

    sentence = parsed[0]
    self.assertEqual(
        text(sentence),
        "TokenList<The, quick, brown, fox, jumps, over, the, lazy, dog, .>"
    )

    # First token: every column of the "The" row.
    expected_first = OrderedDict([
        ('id', 1),
        ('form', 'The'),
        ('lemma', 'the'),
        ('upostag', 'DET'),
        ('xpostag', 'DT'),
        ('feats', OrderedDict([('Definite', 'Def'), ('PronType', 'Art')])),
        ('head', 4),
        ('deprel', 'det'),
        ('deps', None),
        ('misc', None),
    ])
    self.assertEqual(sentence[0], expected_first)

    # Ninth token: the "dog" row, whose misc column is populated.
    expected_ninth = OrderedDict([
        ('id', 9),
        ('form', 'dog'),
        ('lemma', 'dog'),
        ('upostag', 'NOUN'),
        ('xpostag', 'NN'),
        ('feats', OrderedDict([('Number', 'Sing')])),
        ('head', 5),
        ('deprel', 'nmod'),
        ('deps', None),
        ('misc', OrderedDict([("SpaceAfter", "No")])),
    ])
    self.assertEqual(sentence[8], expected_ninth)

    # Word forms come back in sentence order.
    forms = [token["form"] for token in sentence]
    self.assertEqual(
        forms,
        "The quick brown fox jumps over the lazy dog .".split(" ")
    )

    self.assertEqual(
        sentence.metadata["text"],
        "The quick brown fox jumps over the lazy dog."
    )
def parse_line(line, fields, field_parsers=None):
    """Split one token line into columns and map them onto ``fields``.

    The line is split on a tab or on a run of two or more spaces. Column
    ``i`` is assigned to ``fields[i]``; when ``field_parsers`` has an entry
    for that field it is called as ``parser(columns, i)`` and its return
    value is stored, otherwise the raw column text is stored.

    Returns an OrderedDict keyed by ``text(field)``. Fields beyond the last
    available column are left out.

    Raises ParseException if the split yields a single column, or (with the
    field name prepended to the message) if a field parser raises
    ParseException.
    """
    # Be backwards compatible if people called parse_line without
    # field_parsers before
    if not field_parsers:
        field_parsers = DEFAULT_FIELD_PARSERS

    columns = re.split(r"\t| {2,}", line)
    if len(columns) == 1:
        raise ParseException(
            "Invalid line format, line must contain either tabs or two spaces."
        )

    parsed = OrderedDict()

    # zip() stops at the shorter sequence, which allows parsing CoNNL-U
    # files with fewer columns than there are field names.
    for index, field in zip(range(len(columns)), fields):
        if field in field_parsers:
            try:
                value = field_parsers[field](columns, index)
            except ParseException as error:
                raise ParseException(
                    "Failed parsing field '{}': ".format(field) + str(error))
        else:
            value = columns[index]

        parsed[text(field)] = value

    return parsed
def test_multiple_sentences(self):
    # Two identical three-token sentences must parse into two TokenLists.
    # Columns are separated by runs of two or more spaces, because
    # parse_line only splits on tabs or on two-plus spaces.
    # NOTE(review): the blank line between the two blocks is what is
    # expected to separate sentences (CoNLL-U convention); it was restored
    # from a whitespace-collapsed literal, so confirm against parse().
    data = dedent("""\
        1   The   the   DET     DT   Definite=Def|PronType=Art   4   det     _   _
        2   dog   dog   NOUN    NN   Number=Sing                 5   nmod    _   SpaceAfter=No
        3   .     .     PUNCT   .    _                           5   punct   _   _

        1   The   the   DET     DT   Definite=Def|PronType=Art   4   det     _   _
        2   dog   dog   NOUN    NN   Number=Sing                 5   nmod    _   SpaceAfter=No
        3   .     .     PUNCT   .    _                           5   punct   _   _

    """)

    self.assertEqual(
        text(parse(data)),
        "[TokenList<The, dog, .>, TokenList<The, dog, .>]"
    )
def parse_line(line, fields, field_parsers=None):
    """Split one token line into columns and map them onto ``fields``.

    The line is split on a tab or on a run of two or more spaces. Column
    ``i`` is assigned to ``fields[i]``; when a parser is registered for that
    field it is called as ``parser(columns, i)`` and its return value is
    stored, otherwise the raw column text is stored.

    ``xpos``/``xpostag`` and ``upos``/``upostag`` are treated as aliases: a
    parser registered under one spelling is also used for the other. The
    aliases are added to a private copy, so the mapping passed in (or
    ``DEFAULT_FIELD_PARSERS``) is never modified.

    Returns a Token keyed by ``text(field)``. Fields beyond the last
    available column are left out.

    Raises ParseException if the split yields a single column, or (with the
    field name prepended to the message) if a field parser raises
    ParseException.
    """
    # Be backwards compatible if people called parse_line without
    # field_parsers before
    field_parsers = field_parsers or DEFAULT_FIELD_PARSERS

    # Support xpostag/upostag as aliases for xpos/upos (both ways)
    aliases = {}
    for short_name, long_name in (("xpos", "xpostag"), ("upos", "upostag")):
        if long_name not in field_parsers and short_name in field_parsers:
            aliases[long_name] = field_parsers[short_name]
        elif short_name not in field_parsers and long_name in field_parsers:
            aliases[short_name] = field_parsers[long_name]

    if aliases:
        # Copy before adding aliases so the caller's dict and the shared
        # defaults are left untouched.
        field_parsers = dict(field_parsers)
        field_parsers.update(aliases)

    columns = re.split(r"\t| {2,}", line)
    if len(columns) == 1:
        raise ParseException(
            "Invalid line format, line must contain either tabs or two spaces."
        )

    data = Token()

    for i, field in enumerate(fields):
        # Allow parsing CoNNL-U files with fewer columns
        if i >= len(columns):
            break

        if field in field_parsers:
            try:
                value = field_parsers[field](columns, i)
            except ParseException as e:
                raise ParseException(
                    "Failed parsing field '{}': ".format(field) + str(e))
        else:
            value = columns[i]

        data[text(field)] = value

    return data
def __repr__(self):
    """Return a short summary: the token's id and form, plus a children marker.

    The children marker is ``[...]`` when ``self.children`` is truthy and
    ``None`` otherwise; child nodes themselves are not rendered.
    """
    children_repr = '[...]' if self.children else 'None'
    # Both values go through text() so a non-string form cannot make
    # repr() fail with a TypeError during concatenation.
    return (
        'TokenTree<'
        + 'token={id=' + text(self.token['id'])
        + ', form=' + text(self.token['form']) + '}, '
        + 'children=' + children_repr
        + '>'
    )