def head_to_token(sentence: 'TokenList') -> T.Dict[int, T.List[Token]]:
    """Group the tokens of *sentence* by the value of their 'head' field.

    Returns a defaultdict mapping each head index to the list of tokens
    governed by it (index 0 collects the root token(s)).

    Raises ParseException when the sentence is empty, when the first
    token has no 'head' field, or when no root token is found.
    """
    if not sentence:
        raise ParseException(
            "Can't parse tree, need a tokenlist as input.")

    if "head" not in sentence[0]:
        raise ParseException("Can't parse tree, missing 'head' field.")

    by_head = defaultdict(list)
    for tok in sentence:
        # Range and decimal IDs (non-int) never participate in the tree.
        if "id" in tok and not isinstance(tok["id"], int):
            continue

        # A missing head excludes the token from the tree; negative heads
        # are sometimes used to mark tokens deliberately left out of it.
        head = tok.get("head")
        if head is None or head < 0:
            continue

        by_head[head].append(tok)

    if not by_head[0]:
        raise ParseException("Found no head node, can't build tree")

    return by_head
def print_tree(
        self, depth: int = 0, indent: int = 4,
        exclude_fields: T.Sequence[str] = DEFAULT_EXCLUDE_FIELDS) -> None:
    """Recursively pretty-print this node and its subtree to stdout.

    Each node is printed on its own line, indented `indent * depth`
    spaces, showing its deprel, every token field not listed in
    *exclude_fields*, and its id in brackets.

    Raises ParseException when the node has no token or the token lacks
    'id' or 'deprel'.
    """
    if not self.token:
        raise ParseException("Can't print, token is None.")

    if "deprel" not in self.token or "id" not in self.token:
        raise ParseException(
            "Can't print, token is missing either the id or deprel fields."
        )

    # Show every field except the excluded ones, preserving token order.
    shown = {
        key: value
        for key, value in self.token.items()
        if key not in exclude_fields
    }
    node_repr = ' '.join(
        '{key}:{value}'.format(key=key, value=value)
        for key, value in shown.items()
    )

    line = '(deprel:{deprel}) {node_repr} [{idx}]'.format(
        deprel=self.token['deprel'],
        node_repr=node_repr,
        idx=self.token['id'],
    )
    print(' ' * indent * depth + line)

    for child in self.children:
        child.print_tree(
            depth=depth + 1, indent=indent, exclude_fields=exclude_fields)
def head_to_token(sentence):
    """Group the tokens of ``sentence`` by the value of their 'head' field.

    Returns a defaultdict mapping each head index to the list of tokens
    governed by it; index 0 holds the single root token.

    Raises ParseException when the sentence is empty, when the first
    token has no 'head' field, or when zero or multiple roots are found.
    """
    if not sentence:
        raise ParseException(
            "Can't parse tree, need a tokenlist as input.")

    if "head" not in sentence[0]:
        raise ParseException("Can't parse tree, missing 'head' field.")

    head_indexed = defaultdict(list)
    for token in sentence:
        # Filter out range and decimal ID:s before building tree
        if "id" in token and not isinstance(token["id"], int):
            continue

        # Only the first token is checked for 'head' above; a later token
        # without the field would otherwise raise KeyError here, so exclude
        # it from the tree instead. Negative heads are sometimes used to
        # mark tokens which should not be included in the tree.
        if token.get("head") is None or token["head"] < 0:
            continue

        head_indexed[token["head"]].append(token)

    if len(head_indexed[0]) == 0:
        raise ParseException("Found no head node, can't build tree")

    if len(head_indexed[0]) > 1:
        raise ParseException(
            "Can't parse tree, found multiple root nodes.")

    return head_indexed
def parse_token_and_metadata(data, fields=None, field_parsers=None,
                             metadata_parsers=None):
    """Split raw CoNLL-U text into a list of tokens and its metadata.

    Comment lines ('#'-prefixed) are routed through parse_comment_line
    into the metadata; every other non-empty line is parsed as a token.

    Raises ParseException when *data* is empty.
    """
    if not data:
        raise ParseException(
            "Can't create TokenList, no data sent to constructor.")

    fields = fields or DEFAULT_FIELDS

    if not field_parsers:
        field_parsers = DEFAULT_FIELD_PARSERS.copy()
    elif sorted(field_parsers) != sorted(fields):
        # Parsers were supplied for only some fields: fall back to the
        # defaults for everything not explicitly overridden.
        merged = DEFAULT_FIELD_PARSERS.copy()
        merged.update(field_parsers)
        field_parsers = merged

    tokens = []
    metadata = Metadata()

    for raw_line in data.split('\n'):
        stripped = raw_line.strip()
        if not stripped:
            continue

        if stripped.startswith('#'):
            pairs = parse_comment_line(
                stripped, metadata_parsers=metadata_parsers)
            for key, value in pairs:
                metadata[key] = value
        else:
            tokens.append(parse_line(stripped, fields, field_parsers))

    return tokens, metadata
def parse_comment_line(line, metadata_parsers=None):
    """Parse one '#'-prefixed comment line into a list of (key, value) pairs.

    Custom metadata parsers (keyed by metadata key, with '__fallback__'
    as a catch-all) may override the default handling; a parser may
    return either a single pair or a list of pairs. Lines without a
    value are dropped (empty list) unless a custom parser handled them.

    Raises ParseException when the line does not start with '#'.
    """
    line = line.strip()

    if line[0] != '#':
        raise ParseException(
            "Invalid comment format, comment must start with '#'")

    key, value = parse_pair_value(line[1:])

    # Merge user-supplied parsers on top of the defaults.
    if metadata_parsers:
        merged = DEFAULT_METADATA_PARSERS.copy()
        merged.update(metadata_parsers)
        metadata_parsers = merged
    else:
        metadata_parsers = DEFAULT_METADATA_PARSERS.copy()

    custom_result = None
    if key in metadata_parsers:
        custom_result = metadata_parsers[key](key, value)
    elif "__fallback__" in metadata_parsers:
        custom_result = metadata_parsers["__fallback__"](key, value)

    if custom_result:
        # Allow returning pair instead of list of pairs from metadata parsers
        if isinstance(custom_result, tuple):
            key, value = custom_result
            return [(text(key), value)]
        return [(text(key), value) for key, value in custom_result]

    # Lines without value are invalid by default
    if not key or not value:
        return []

    return [(text(key), value)]
def serialize_field(field: T.Any) -> str:
    """Serialize a single parsed token field back to its CoNLL-U string form.

    - None             -> '_' (the missing-value marker)
    - dict             -> 'k1=v1|k2=v2' feature sets (None values become '_')
    - tuple            -> parts concatenated, e.g. (1, '-', 2) -> '1-2'
    - list of pairs    -> DEPS-style 'head:deprel|...' entries
    - anything else    -> str() of the value

    Raises ParseException when a list element is not a 2-item pair.
    """
    if field is None:
        return '_'

    if isinstance(field, dict):
        fields = []
        for key, value in field.items():
            if value is None:
                value = "_"

            fields.append('='.join((key, value)))

        return '|'.join(fields)

    if isinstance(field, tuple):
        return "".join([serialize_field(item) for item in field])

    if isinstance(field, list):
        # An empty relation list has no textual form of its own; emit the
        # standard missing marker instead of crashing on field[0].
        if not field:
            return '_'

        if len(field[0]) != 2:
            raise ParseException(
                "Can't serialize '{}', invalid format".format(field))
        return "|".join(
            [serialize_field(value) + ":" + str(key) for key, value in field])

    return "{}".format(field)
def __init__(self, tokens: T.Iterable[Token], metadata: Metadata = None):
    """Build a TokenList from *tokens*, optionally attaching *metadata*.

    Raises ParseException when *tokens* is not a list.
    """
    # Validate before touching the underlying list: raising after
    # super().__init__ would leave a partially constructed object, and
    # the other TokenList constructor in this file checks first as well.
    if not isinstance(tokens, list):
        raise ParseException(
            "Can't create TokenList, tokens is not a list.")

    super(TokenList, self).__init__(tokens)

    self.metadata = metadata or Metadata()
def parse_line(
        line: str,
        fields: T.Sequence[str],
        field_parsers: T.Optional[T.Dict[str, _FieldParserType]] = None) -> Token:
    """Parse one tab-separated token line into a Token.

    Each name in *fields* is matched positionally against the line's
    columns; a field with an entry in *field_parsers* is parsed with it,
    otherwise the raw column string is kept. Lines with fewer columns
    than *fields* are allowed.

    Raises ParseException when the line has no column separators or a
    field parser fails.
    """
    # Be backwards compatible if people called parse_line without field_parsers before.
    # Copy the mapping so the alias entries added below never leak into the
    # caller's dict or into the shared DEFAULT_FIELD_PARSERS module default.
    field_parsers = dict(field_parsers or DEFAULT_FIELD_PARSERS)

    # Support xpostag/upostag as aliases for xpos/upos (both ways)
    for alias, canonical in (("xpostag", "xpos"), ("upostag", "upos")):
        if alias not in field_parsers and canonical in field_parsers:
            field_parsers[alias] = field_parsers[canonical]
        if canonical not in field_parsers and alias in field_parsers:
            field_parsers[canonical] = field_parsers[alias]

    # Columns are separated by a tab or by runs of two-plus spaces.
    line_split = re.split(r"\t| {2,}", line)

    if len(line_split) == 1:
        raise ParseException(
            "Invalid line format, line must contain either tabs or two spaces."
        )

    data = Token()

    for i, field in enumerate(fields):
        # Allow parsing CoNNL-U files with fewer columns
        if i >= len(line_split):
            break

        if field in field_parsers:
            try:
                value = field_parsers[field](line_split, i)
            except ParseException as e:
                raise ParseException(
                    "Failed parsing field '{}': ".format(field) + str(e))
        else:
            value = line_split[i]

        data[str(field)] = value

    return data
def parse_int_value(value):
    """Parse an integer column value; '_' is the missing marker and yields None.

    Raises ParseException when *value* is not a valid integer.
    """
    if value == '_':
        return None

    if fullmatch(INTEGER, value):
        return int(value)

    raise ParseException(
        "'{}' is not a valid value for parse_int_value.".format(value))
def parse_id_value(value):
    """Parse the ID column of a token line.

    Returns None for empty/'_', an int for a plain ID, an (a, '-', b)
    triple for a range ID, or an (a, '.', b) triple for a decimal ID.

    Raises ParseException for anything else (including non-increasing
    ranges).
    """
    if not value or value == '_':
        return None

    if fullmatch(ID_SINGLE, value):
        return int(value)

    if fullmatch(ID_RANGE, value):
        start, _, end = value.partition("-")
        start, end = int(start), int(end)
        # A range must be strictly increasing; otherwise fall through to
        # the error below.
        if end > start:
            return (start, "-", end)
    elif fullmatch(ID_DOT_ID, value):
        major, minor = value.split(".")
        return (int(major), ".", int(minor))

    raise ParseException("'{}' is not a valid ID.".format(value))
def serialize(self):
    """Serialize this tree back to CoNLL-U text.

    Flattens the tree depth-first, sorts the collected tokens by id,
    wraps them in a TokenList carrying this tree's metadata, and hands
    the result to the module-level serialize().

    Raises ParseException when the root token is missing or has no 'id'.
    """
    if not self.token or "id" not in self.token:
        raise ParseException(
            "Could not serialize tree, missing 'id' field.")

    def flatten_tree(root_token, token_list=None):
        # None sentinel instead of a mutable default argument; a fresh
        # accumulator is created per top-level call.
        if token_list is None:
            token_list = []

        token_list.append(root_token.token)

        for child_token in root_token.children:
            flatten_tree(child_token, token_list)

        return token_list

    tokens = sorted(flatten_tree(self), key=lambda t: t['id'])

    tokenlist = TokenList(tokens, self.metadata)

    return serialize(tokenlist)
def __init__(
    self,
    tokens: T.Iterable[Token] = None,
    metadata: Metadata = None,
    default_fields: T.Optional[T.Iterable[str]] = None,
):
    """Build a TokenList, coercing plain dicts to Token as needed.

    Raises ParseException when *tokens* is given but is not a list.
    """
    tokens = tokens or []

    if not isinstance(tokens, list):
        raise ParseException(
            "Can't create TokenList, tokens is not a list.")

    # Coerce plain dicts to Token so the stored items are homogeneous;
    # checking only the first element matches the original contract.
    if tokens and not isinstance(tokens[0], Token):
        tokens = [Token(item) for item in tokens]

    super(TokenList, self).__init__(tokens)

    self.metadata = metadata or Metadata()
    self.default_fields = default_fields