def test_empty(self):
    """A non-numeric 'id' column must raise with a field-specific message."""
    bad_line = "invalid_id\t_\t_\t_\t_\t_\t_\t_\t_\t"
    with self.assertRaises(ParseException) as assert_context:
        parse_line(bad_line, fields=DEFAULT_FIELDS)
    prefix = "Failed parsing field 'id'"
    self.assertEqual(str(assert_context.exception)[:len(prefix)], prefix)
def test_parse_line_with_spaces(self):
    """A space-separated line is not valid tab-delimited input and must raise."""
    spaced_line = "1 The the DET DT Definite=Def|PronType=Art 4 det _ _"
    with self.assertRaises(ParseException) as assert_context:
        parse_line(spaced_line, fields=DEFAULT_FIELDS)
    prefix = "Invalid line format"
    self.assertEqual(str(assert_context.exception)[:len(prefix)], prefix)
def lazy_parse(text: str, fields: Tuple = DEFAULT_FIELDS):
    """Yield, for each non-empty sentence block, its parsed token lines.

    Sentences are separated by blank lines; comment lines ('#...') and
    empty lines within a sentence are skipped.
    """
    for sentence in text.split("\n\n"):
        if not sentence:
            continue
        parsed = []
        for line in sentence.split("\n"):
            if line and not line.strip().startswith("#"):
                parsed.append(parse_line(line, fields))
        yield parsed
def parse_token_and_metadata(data, fields=None, field_parsers=None):
    """Split raw CoNLL-U text into (tokens, texts).

    Token lines are parsed with ``parse_line``; comment lines are parsed
    with ``parse_comment_line`` and only '# text' comments are collected.
    Raises ParseException when *data* is empty.
    """
    if not data:
        raise ParseException(
            "Can't create TokenList, no data sent to constructor.")
    fields = fields or DEFAULT_FIELDS
    field_parsers = field_parsers or DEFAULT_FIELD_PARSERS

    tokens, texts = [], []
    for raw_line in data.split('\n'):
        stripped = raw_line.strip()
        if not stripped:
            continue
        if stripped.startswith('#'):
            var_name, var_value = parse_comment_line(stripped)
            if var_name == "text":
                texts.append(var_value)
        else:
            tokens.append(parse_line(stripped, fields, field_parsers))
    return tokens, texts
def test_parse_line_nullable_fields(self):
    """Underscore columns map to None for nullable fields, '_' otherwise."""
    all_underscores = "\t".join(["_"] * 10)
    expected = Token([
        ('id', None),
        ('form', '_'),
        ('lemma', '_'),
        ('upos', '_'),
        ('xpos', None),
        ('feats', None),
        ('head', None),
        ('deprel', '_'),
        ('deps', None),
        ('misc', None),
    ])
    self.assertEqual(parse_line(all_underscores, fields=DEFAULT_FIELDS),
                     expected)
def lazy_parse(text: str, fields: Tuple[str, ...] = DEFAULT_FIELDS):
    """Yield (annotation, features) for every non-empty sentence block.

    Comment lines become entries in the ``features`` dict: a '# prompt' line
    stores everything after the first ':' split as 'prompt'; any other
    comment line is split into whitespace-separated 'name:value' pairs.
    Token lines have the shape '<index_label> <columns...> <label>'; the
    middle columns are re-joined with tabs and handed to ``parse_line``.
    """
    for sentence in text.split("\n\n"):
        if not sentence:
            continue
        annotation = []
        features = {}
        for line in sentence.split("\n"):
            stripped = line.strip()
            if stripped.startswith("#"):
                if line[:8] == '# prompt':
                    # NOTE: only the chunk right after the first ':' is kept
                    features['prompt'] = stripped.split(':')[1]
                else:
                    for pair in stripped[1:].split():
                        name, value = pair.split(':')
                        features[name] = value
                continue
            # Same star-unpacking as the original so malformed lines raise
            # the identical ValueError.
            index_label, *middle = stripped.split()
            *columns, label = middle
            output = parse_line('\t'.join(columns), fields)
            output.update(unpack_token_index(index_label))
            output.update({'label': int(label)})
            annotation.append(output)
        yield annotation, features
def test_parse_line_only_id_head(self):
    """Requesting a subset of fields parses only those leading columns."""
    full_line = "1\tThe\tthe\tDET\tDT\tDefinite=Def|PronType=Art\t4\tdet\t_\t_"
    expected = OrderedDict([('id', 1), ('form', 'The')])
    self.assertEqual(parse_line(full_line, fields=["id", "form"]), expected)
def test_parse_line_two_spaces(self):
    """Space-separated columns still parse when only a field subset is asked.

    NOTE(review): the separators here may have been whitespace-collapsed in
    transit; the test name suggests double spaces — confirm against the
    original file.
    """
    spaced_line = "1 The the DET DT Definite=Def|PronType=Art 4 det _ _"
    expected = OrderedDict([('id', 1), ('form', 'The')])
    self.assertEqual(parse_line(spaced_line, fields=["id", "form"]), expected)
def test_parse_line(self):
    """A full tab-separated line parses into a complete Token."""
    tabbed = "1\tThe\tthe\tDET\tDT\tDefinite=Def|PronType=Art\t4\tdet\t_\t_"
    expected = Token([
        ('id', 1),
        ('form', 'The'),
        ('lemma', 'the'),
        ('upos', 'DET'),
        ('xpos', 'DT'),
        ('feats', Token([('Definite', 'Def'), ('PronType', 'Art')])),
        ('head', 4),
        ('deprel', 'det'),
        ('deps', None),
        ('misc', None),
    ])
    self.assertEqual(parse_line(tabbed, fields=DEFAULT_FIELDS), expected)
def test_parse_line_fewer_columns(self):
    """A truncated line yields only the columns actually present."""
    short_line = "1\tThe\tthe\tDET\tDT"
    expected = Token([
        ('id', 1),
        ('form', 'The'),
        ('lemma', 'the'),
        ('upos', 'DET'),
        ('xpos', 'DT'),
    ])
    self.assertEqual(parse_line(short_line, fields=DEFAULT_FIELDS), expected)
def test_parse_line(self):
    """Default parsing (no explicit fields) yields the full OrderedDict."""
    tabbed = "1\tThe\tthe\tDET\tDT\tDefinite=Def|PronType=Art\t4\tdet\t_\t_"
    expected = OrderedDict([
        ('id', 1),
        ('form', 'The'),
        ('lemma', 'the'),
        ('upostag', 'DET'),
        ('xpostag', 'DT'),
        ('feats', OrderedDict([('Definite', 'Def'), ('PronType', 'Art')])),
        ('head', 4),
        ('deprel', 'det'),
        ('deps', None),
        ('misc', None),
    ])
    self.assertEqual(parse_line(tabbed), expected)
def test_parse_custom_fieldparsers(self):
    """A custom parser registered for a field overrides the default one."""
    pair_line = "1\t2"
    parsers = {"id": lambda line, i: line[i] * 5}
    result = parse_line(pair_line, fields=["id"], field_parsers=parsers)
    self.assertEqual(result, Token([('id', "11111")]))
def _lazy_parse(text: str, fields=DEFAULT_FIELDS):
    """
    Reads conllu annotations, yielding unwieldy OrderedDict-like
    objects per sentence. Comment ('#...') and empty lines are skipped.
    """
    for sentence in text.split("\n\n"):
        if not sentence:
            continue
        token_lines = (line for line in sentence.split("\n")
                       if line and not line.strip().startswith("#"))
        yield [parse_line(line, fields) for line in token_lines]
def test_parse_fieldparsers_alias_two_ways(self):
    """Parsers registered under either alias name (xpos/upostag) are found."""
    pair_line = "1\t2"
    parsers = {
        "xpos": lambda line, i: line[i] * 5,
        "upostag": lambda line, i: line[i] * 5,
    }
    result = parse_line(pair_line, fields=["xpostag", "upos"],
                        field_parsers=parsers)
    self.assertEqual(result,
                     Token([('xpostag', "11111"), ('upos', "22222")]))
def test_parse_fieldparsers_doesnt_alias_when_exists(self):
    """An exact-name parser takes precedence over its alias counterpart."""
    pair_line = "1\t2"
    parsers = {
        "xpos": lambda line, i: line[i] * 5,
        "xpostag": lambda line, i: line[i],
        "upos": lambda line, i: line[i] * 5,
        "upostag": lambda line, i: line[i],
    }
    result = parse_line(pair_line, fields=["xpostag", "upostag"],
                        field_parsers=parsers)
    self.assertEqual(result, Token([('xpostag', "1"), ('upostag', "2")]))
def extract_token_info_from_companion_data(self):
    """Parse self.companion rows into tokens, lemmas, POS tags and ranges.

    Each companion row is a sequence of column strings; rows are re-joined
    with tabs and parsed with ``parse_line``. The first 'misc' value is
    split on ':' into an integer tuple — presumably a start:end character
    offset pair (TODO confirm against the companion format).
    """
    parsed_rows = [parse_line('\t'.join(row), DEFAULT_FIELDS)
                   for row in self.companion]
    tokens = [row["form"] for row in parsed_rows if row["form"] is not None]
    lemmas = [row["lemma"] for row in parsed_rows if row["lemma"] is not None]
    pos_tags = [row["upostag"] for row in parsed_rows
                if row["upostag"] is not None]
    token_range = [
        tuple(int(part) for part in list(row["misc"].values())[0].split(':'))
        for row in parsed_rows
    ]
    return {"tokens": tokens,
            "lemmas": lemmas,
            "pos_tags": pos_tags,
            "token_range": token_range}
def lazy_parse(text: str, fields: Tuple = DEFAULT_FIELDS):
    """Yield (annotations, arc_indices, arc_labels) per sentence.

    Heads are 1-indexed in the input; the root token (head == 0) gets no
    arc, and every other head is shifted down by one so arcs are
    (child_index, parent_index) pairs into ``annotations``.
    """
    for sentence in text.split("\n\n"):
        if not sentence:
            continue
        annotations = [
            parse_line(line, fields)
            for line in sentence.split("\n")
            if line and not line.strip().startswith("#")
        ]
        arc_indices = []  # (child, parent/head) pairs
        arc_labels = []   # relation string for each pair
        for child, annotation in enumerate(annotations):
            head = annotation["head"]
            if head == 0:
                # The root token has no incoming arc.
                continue
            arc_indices.append((child, head - 1))
            arc_labels.append(annotation["deprel"])
        yield annotations, arc_indices, arc_labels
def lazy_parse(text: str, fields: Tuple = DEFAULT_FIELDS):
    """Yield the parsed token lines of each non-empty sentence block."""
    for sentence in text.split("\n\n"):
        if not sentence:
            continue
        result = []
        for line in sentence.split("\n"):
            if line and not line.strip().startswith("#"):
                result.append(parse_line(line, fields))
        yield result
def test_parse_line_with_no_tabs(self):
    """Input with no tab separators cannot be parsed with default fields."""
    spaced_line = "1 The the DET DT Definite=Def|PronType=Art 4 det _ _"
    with self.assertRaises(ParseException):
        parse_line(spaced_line)
def test_parse_line_with_spaces(self):
    """Space-separated columns are rejected when all DEFAULT_FIELDS are asked."""
    spaced_line = "1 The the DET DT Definite=Def|PronType=Art 4 det _ _"
    with self.assertRaises(ParseException):
        parse_line(spaced_line, fields=DEFAULT_FIELDS)
def test_empty(self):
    """A non-numeric 'id' column must raise ParseException."""
    bad_line = "invalid_id\t_\t_\t_\t_\t_\t_\t_\t_\t"
    with self.assertRaises(ParseException):
        parse_line(bad_line, fields=DEFAULT_FIELDS)
parser.add_argument("conll", type=str, help="Augment CoNLL file")
parser.add_argument("mrp", type=str, help="Input MRP file")
parser.add_argument("output", type=str, help="Output Augmented file")
args = parser.parse_args()

conll_file = args.conll
mrp_file = args.mrp
out_file = args.output

# Build a map from sentence id -> parsed companion token lines. Each CoNLL
# block's first line is '#<sent_id>'; the remaining lines are token rows.
augs = {}
with open(conll_file, 'r', encoding='utf8') as f_c:
    conlls = f_c.read().split('\n\n')
for conll in conlls:
    sent_id = conll.split('\n')[0][1:]
    augs[sent_id] = [
        parse_line(line, DEFAULT_FIELDS)
        for line in conll.strip().split('\n')[1:]
    ]

with open(mrp_file, 'r', encoding='utf8') as f_m, \
        open(out_file, 'w', encoding='utf8') as fo:
    # BUG FIX: the original primed `line = f_m.readline()` and looped
    # `while line:` without ever advancing the file inside the loop body,
    # so it spun forever on the first record. Iterating the file object
    # reads each MRP line exactly once.
    for line in f_m:
        if not line.strip():
            continue  # skip blank lines that would break json.loads
        mrp = json.loads(line, object_pairs_hook=collections.OrderedDict)
        sent_id = mrp['id']
        if sent_id not in augs:
            print("id:{} not in companion".format(sent_id))
        else:
            mrp['companion'] = dict(sent_id=sent_id, toks=augs[sent_id])
        # NOTE(review): the flattened original appears to write every record
        # (augmented or not) — confirm records without companion data should
        # still be emitted.
        fo.write(json.dumps(mrp) + '\n')