Ejemplo n.º 1
0
    def head_to_token(sentence: 'TokenList') -> T.Dict[int, T.List[Token]]:
        """Group the tokens of a sentence by the id of their head token.

        Returns a defaultdict mapping each head id to the tokens that point
        at it. Raises ParseException on empty input, on a missing 'head'
        field, or when no root (head == 0) token is found.
        """
        if not sentence:
            raise ParseException(
                "Can't parse tree, need a tokenlist as input.")

        if "head" not in sentence[0]:
            raise ParseException("Can't parse tree, missing 'head' field.")

        grouped = defaultdict(list)
        for current in sentence:
            # Range and decimal ids never take part in the tree.
            if "id" in current and not isinstance(current["id"], int):
                continue

            # A missing or negative head marks a token that should be kept
            # out of the tree, so those are skipped as well.
            head = current.get("head")
            if head is None or head < 0:
                continue

            grouped[head].append(current)

        if not grouped[0]:
            raise ParseException("Found no head node, can't build tree")

        return grouped
Ejemplo n.º 2
0
    def print_tree(
            self,
            depth: int = 0,
            indent: int = 4,
            exclude_fields: T.Sequence[str] = DEFAULT_EXCLUDE_FIELDS) -> None:
        """Recursively pretty-print this node and its children to stdout.

        Each node is indented by `indent * depth` spaces; fields listed in
        exclude_fields are hidden from the printed representation. Raises
        ParseException when the node's token is missing or lacks the
        'deprel'/'id' fields needed for the output line.
        """
        if not self.token:
            raise ParseException("Can't print, token is None.")

        if "deprel" not in self.token or "id" not in self.token:
            raise ParseException(
                "Can't print, token is missing either the id or deprel fields."
            )

        # Keep only the fields the caller wants to see.
        visible = {
            key: value
            for key, value in self.token.items()
            if key not in exclude_fields
        }
        node_repr = ' '.join(
            '{key}:{value}'.format(key=key, value=value)
            for key, value in visible.items()
        )

        line = '(deprel:{deprel}) {node_repr} [{idx}]'.format(
            deprel=self.token['deprel'],
            node_repr=node_repr,
            idx=self.token['id'],
        )
        print(' ' * indent * depth + line)

        for child in self.children:
            child.print_tree(depth=depth + 1,
                             indent=indent,
                             exclude_fields=exclude_fields)
Ejemplo n.º 3
0
    def head_to_token(sentence):
        """Group the tokens of a sentence by the id of their head token.

        Returns a defaultdict mapping each head id to the list of tokens
        whose 'head' field points at it. Raises ParseException on empty
        input, on a missing 'head' field in the first token, when no root
        (head == 0) token exists, or when there are multiple roots.
        """
        if not sentence:
            raise ParseException(
                "Can't parse tree, need a tokenlist as input.")

        if "head" not in sentence[0]:
            raise ParseException("Can't parse tree, missing 'head' field.")

        head_indexed = defaultdict(list)
        for token in sentence:
            # Filter out range and decimal ID:s before building tree
            if "id" in token and not isinstance(token["id"], int):
                continue

            # Filter out tokens with negative head, they are sometimes used to
            # specify tokens which should not be included in tree.
            # Also skip tokens whose head is missing or None — comparing
            # None < 0 would otherwise raise a TypeError (and a missing key
            # a KeyError).
            if token.get("head") is None or token["head"] < 0:
                continue

            head_indexed[token["head"]].append(token)

        if len(head_indexed[0]) == 0:
            raise ParseException("Found no head node, can't build tree")

        if len(head_indexed[0]) > 1:
            raise ParseException(
                "Can't parse tree, found multiple root nodes.")

        return head_indexed
Ejemplo n.º 4
0
def parse_token_and_metadata(data,
                             fields=None,
                             field_parsers=None,
                             metadata_parsers=None):
    """Split raw CoNLL-U text into a list of parsed tokens and its metadata.

    Comment lines ('#'-prefixed) feed the Metadata object; all other
    non-empty lines are parsed as tokens. Raises ParseException when no
    data is given.
    """
    if not data:
        raise ParseException(
            "Can't create TokenList, no data sent to constructor.")

    fields = fields or DEFAULT_FIELDS

    if not field_parsers:
        field_parsers = DEFAULT_FIELD_PARSERS.copy()
    elif sorted(field_parsers.keys()) != sorted(fields):
        # Caller overrode only some fields: fill in defaults for the rest.
        merged = DEFAULT_FIELD_PARSERS.copy()
        merged.update(field_parsers)
        field_parsers = merged

    tokens = []
    metadata = Metadata()

    for raw_line in data.split('\n'):
        stripped = raw_line.strip()

        if not stripped:
            continue

        if stripped.startswith('#'):
            # Comment lines carry sentence-level metadata pairs.
            for key, value in parse_comment_line(
                    stripped, metadata_parsers=metadata_parsers):
                metadata[key] = value
        else:
            tokens.append(parse_line(stripped, fields, field_parsers))

    return tokens, metadata
Ejemplo n.º 5
0
def parse_comment_line(line, metadata_parsers=None):
    """Parse one '#'-prefixed comment line into metadata (key, value) pairs.

    Custom metadata_parsers are merged on top of the defaults; a
    '__fallback__' parser, when present, handles keys without a dedicated
    parser. Returns a list of pairs (empty for valueless lines). Raises
    ParseException when the line is not a valid comment.
    """
    line = line.strip()

    # startswith also rejects the empty line, which previously slipped
    # through to line[0] and raised an IndexError instead of ParseException.
    if not line.startswith('#'):
        raise ParseException(
            "Invalid comment format, comment must start with '#'")

    key, value = parse_pair_value(line[1:])

    if not metadata_parsers:
        metadata_parsers = DEFAULT_METADATA_PARSERS.copy()
    else:
        merged_parsers = DEFAULT_METADATA_PARSERS.copy()
        merged_parsers.update(metadata_parsers)
        metadata_parsers = merged_parsers

    custom_result = None
    if key in metadata_parsers:
        custom_result = metadata_parsers[key](key, value)
    elif "__fallback__" in metadata_parsers:
        custom_result = metadata_parsers["__fallback__"](key, value)

    # Allow returning pair instead of list of pairs from metadata parsers
    if custom_result:
        if isinstance(custom_result, tuple):
            key, value = custom_result
            return [(text(key), value)]
        return [(text(key), value) for key, value in custom_result]

    if not key or not value:
        # Lines without value are invalid by default
        return []

    return [(text(key), value)]
Ejemplo n.º 6
0
def serialize_field(field: T.Any) -> str:
    """Serialize a single token field back to its CoNLL-U string form.

    None becomes '_'; dicts become 'key=value|...' feature lists; tuples
    are concatenated (used for range and decimal ids); lists of
    (key, value) pairs become 'value:key|...' deps lists. Anything else is
    stringified. Raises ParseException for a malformed list field.
    """
    if field is None:
        return '_'

    if isinstance(field, dict):
        fields = []
        for key, value in field.items():
            if value is None:
                value = "_"

            fields.append('='.join((key, value)))

        return '|'.join(fields)

    if isinstance(field, tuple):
        return "".join([serialize_field(item) for item in field])

    if isinstance(field, list):
        # An empty deps list serializes to the '_' placeholder instead of
        # crashing with an IndexError on field[0].
        if not field:
            return '_'
        # Validate every pair, not just the first one.
        if any(len(pair) != 2 for pair in field):
            raise ParseException(
                "Can't serialize '{}', invalid format".format(field))
        return "|".join(
            [serialize_field(value) + ":" + str(key) for key, value in field])

    return "{}".format(field)
Ejemplo n.º 7
0
    def __init__(self, tokens: T.Iterable[Token], metadata: Metadata = None):
        """Build a TokenList from a list of tokens and optional metadata.

        Raises ParseException when tokens is not a list. The validation
        runs before the superclass is initialized, so an invalid iterable
        is rejected before any of it is consumed (previously the list
        superclass was initialized first, leaving a partially built
        instance behind the exception).
        """
        if not isinstance(tokens, list):
            raise ParseException(
                "Can't create TokenList, tokens is not a list.")

        super(TokenList, self).__init__(tokens)
        self.metadata = metadata or Metadata()
Ejemplo n.º 8
0
def parse_line(
        line: str,
        fields: T.Sequence[str],
        field_parsers: T.Optional[T.Dict[str,
                                         _FieldParserType]] = None) -> Token:
    """Parse one token line of a CoNLL-U file into a Token.

    Columns are split on tabs or runs of two-plus spaces and handed to the
    matching parser from field_parsers (falling back to the raw string).
    Lines with fewer columns than fields are allowed. Raises ParseException
    when the line has no column separators or a field parser fails.
    """
    # Be backwards compatible if people called parse_line without field_parsers before.
    # Work on a copy so the alias entries added below never mutate the
    # caller's dict or the shared module-level DEFAULT_FIELD_PARSERS.
    field_parsers = (dict(field_parsers) if field_parsers
                     else DEFAULT_FIELD_PARSERS.copy())

    # Support xpostag/upostag as aliases for xpos/upos (both ways)
    if "xpostag" not in field_parsers and "xpos" in field_parsers:
        field_parsers["xpostag"] = field_parsers["xpos"]
    if "xpos" not in field_parsers and "xpostag" in field_parsers:
        field_parsers["xpos"] = field_parsers["xpostag"]

    if "upostag" not in field_parsers and "upos" in field_parsers:
        field_parsers["upostag"] = field_parsers["upos"]
    if "upos" not in field_parsers and "upostag" in field_parsers:
        field_parsers["upos"] = field_parsers["upostag"]

    line_split = re.split(r"\t| {2,}", line)

    if len(line_split) == 1:
        raise ParseException(
            "Invalid line format, line must contain either tabs or two spaces."
        )

    data = Token()

    for i, field in enumerate(fields):
        # Allow parsing CoNNL-U files with fewer columns
        if i >= len(line_split):
            break

        if field in field_parsers:
            try:
                value = field_parsers[field](line_split, i)
            except ParseException as e:
                # Chain the original failure so its traceback is preserved.
                raise ParseException(
                    "Failed parsing field '{}': ".format(field) + str(e)) from e

        else:
            value = line_split[i]

        data[str(field)] = value

    return data
Ejemplo n.º 9
0
def parse_int_value(value):
    """Parse an integer field value, mapping the '_' placeholder to None.

    Raises ParseException when the value is neither '_' nor an integer.
    """
    if value == '_':
        return None

    if not fullmatch(INTEGER, value):
        raise ParseException(
            "'{}' is not a valid value for parse_int_value.".format(value))

    return int(value)
Ejemplo n.º 10
0
def parse_id_value(value):
    """Parse a token id field.

    Accepts a plain integer, a 'from-to' range (returned as
    (from, '-', to) when to > from), or a decimal 'id.sub' id (returned as
    (id, '.', sub)). '_' and empty values map to None; anything else
    raises ParseException.
    """
    if not value or value == '_':
        return None

    if fullmatch(ID_SINGLE, value):
        return int(value)

    if fullmatch(ID_RANGE, value):
        start, end = (int(part) for part in value.split("-"))
        # Only a strictly increasing range is a valid multiword id.
        if end > start:
            return (start, "-", end)
    elif fullmatch(ID_DOT_ID, value):
        major, minor = value.split(".")
        return (int(major), ".", int(minor))

    raise ParseException("'{}' is not a valid ID.".format(value))
Ejemplo n.º 11
0
    def serialize(self):
        """Serialize the tree rooted at this node back to CoNLL-U text.

        Collects every token in the subtree, orders them by id, wraps them
        in a TokenList carrying this node's metadata, and delegates to the
        module-level serialize(). Raises ParseException when the root token
        is missing or has no 'id' field.
        """
        if not self.token or "id" not in self.token:
            raise ParseException(
                "Could not serialize tree, missing 'id' field.")

        # Iterative preorder traversal instead of a recursive helper.
        collected = []
        stack = [self]
        while stack:
            node = stack.pop()
            collected.append(node.token)
            # Reverse so children are visited in their original order.
            stack.extend(reversed(node.children))

        ordered = sorted(collected, key=lambda t: t['id'])
        return serialize(TokenList(ordered, self.metadata))
Ejemplo n.º 12
0
    def __init__(
        self,
        tokens: T.Iterable[Token] = None,
        metadata: Metadata = None,
        default_fields: T.Optional[T.Iterable[str]] = None,
    ):
        """Initialize a TokenList, promoting plain dicts to Token objects.

        Raises ParseException when tokens is neither falsy nor a list.
        """
        # Any falsy value (None, empty iterable) is treated as an empty list.
        tokens = tokens or []

        if not isinstance(tokens, list):
            raise ParseException(
                "Can't create TokenList, tokens is not a list.")

        # Checking only the first element assumes a homogeneous list.
        if tokens and not isinstance(tokens[0], Token):
            tokens = [Token(item) for item in tokens]

        super(TokenList, self).__init__(tokens)

        self.metadata = metadata or Metadata()
        self.default_fields = default_fields