Example #1
0
 def add_dependency(dep_parent, dep_child, dep_label,
                    dep_type, data_pack_):
     """Create a dependency link between two tokens and add it to a data pack.

     Args:
         dep_parent: dependency parent token
         dep_child: dependency child token
         dep_label: dependency label
         dep_type: "primary" or "enhanced" dependency
         data_pack_: data_pack to which the dependency is to be added
     """
     # The entry must be built against the pack it is added to. The previous
     # code referenced a ``data_pack`` name that is not a parameter of this
     # function, so it either failed with NameError or tied the entry to a
     # different pack than the one receiving it.
     dependency = Dependency(data_pack_, dep_parent, dep_child)
     dependency.dep_label = dep_label
     dependency.type = dep_type
     data_pack_.add_or_get_entry(dependency)
Example #2
0
    def _parse_pack(self, doc_lines) -> Iterator[DataPack]:
        """Build one ``DataPack`` from the CoNLL-U style lines of a document.

        Lines whose first whitespace-separated column is a plain integer are
        treated as token lines and become ``Token`` entries. A blank line
        closes the current sentence: the sentence's ``Dependency`` and
        ``EnhancedDependency`` links and its ``Sentence`` entry are created
        at that point, once every token of the sentence is known. The pack
        text is the space-joined word forms of all tokens, and every span
        created here is an offset into that text.

        Args:
            doc_lines: iterable of the raw text lines of a single document.

        Yields:
            The single populated ``DataPack``. Its ``pack_name`` is set only
            when a ``# newdoc`` line carrying a non-empty id was seen.
        """
        token_comp_fields = ["id", "form", "lemma", "pos",
                             "ud_xpos", "ud_features", "head", "label",
                             "enhanced_dependency_relations", "ud_misc"]

        token_multi_fields = ["ud_features", "ud_misc",
                              "enhanced_dependency_relations"]

        token_feature_fields = ["ud_features", "ud_misc"]

        data_pack: DataPack = DataPack()
        doc_sent_begin: int = 0
        doc_offset: int = 0
        doc_id: str = ''
        # Word forms of the whole document, in order; joined once at the end.
        doc_words = []

        # Tokens of the sentence currently being read, keyed by their id
        # column. Cleared every time a sentence is closed.
        sent_tokens: Dict[str, Tuple[Dict[str, Any], Token]] = {}

        # The extra empty line guarantees that a final sentence which is not
        # followed by a blank line is still closed.
        for raw_line in [*doc_lines, ""]:
            line = raw_line.strip()
            line_comps = line.split()

            if line.startswith("# newdoc"):
                # Everything after the first "=" is the id; a bare
                # "# newdoc" leaves the id empty instead of failing.
                doc_id = line.partition("=")[2].strip()

            elif line_comps and line_comps[0].isdigit():
                # token
                token_comps: Dict[str, Any] = {}

                for index, key in enumerate(token_comp_fields):
                    raw_value = line_comps[index]

                    if key not in token_multi_fields:
                        token_comps[key] = raw_value
                        continue

                    # "_" marks an empty multi-valued column
                    values = raw_value.split("|") if raw_value != '_' else []
                    if key in token_feature_fields:
                        # "name=value" pairs; an item without "=" maps to ''
                        token_comps[key] = {
                            name: value
                            for name, _, value in
                            (elem.partition('=') for elem in values)
                        }
                    else:
                        token_comps[key] = values

                word: str = token_comps["form"]
                word_begin = doc_offset
                word_end = word_begin + len(word)

                # add token
                token: Token = Token(data_pack, word_begin, word_end)

                token.lemma = token_comps['lemma']
                token.pos = token_comps['pos']
                token.ud_xpos = token_comps['ud_xpos']
                token.ud_features = token_comps['ud_features']
                token.ud_misc = token_comps['ud_misc']

                sent_tokens[token_comps["id"]] = (token_comps, token)

                doc_words.append(word)
                # +1 accounts for the single space joining words in the text
                doc_offset = word_end + 1

            elif line == "":
                # sentence ends
                if not sent_tokens:
                    # Repeated blank lines (or one before any token) must not
                    # produce an empty sentence.
                    continue

                # add dependencies for a sentence when all the tokens have been
                # added
                for comps, child in sent_tokens.values():
                    # add primary dependency
                    label = comps["label"]
                    if label == "root":
                        child.is_root = True
                    else:
                        child.is_root = False
                        head = sent_tokens[comps["head"]][1]
                        dependency = Dependency(data_pack, head, child)
                        dependency.dep_label = label

                    # add enhanced dependencies
                    for dep in comps["enhanced_dependency_relations"]:
                        head_id, enhanced_label = dep.split(":", 1)
                        if enhanced_label != "root":
                            head = sent_tokens[head_id][1]
                            enhanced_dependency = \
                                EnhancedDependency(data_pack, head, child)
                            enhanced_dependency.dep_label = enhanced_label

                # add sentence; doc_offset is one past the trailing space
                Sentence(data_pack, doc_sent_begin, doc_offset - 1)

                doc_sent_begin = doc_offset
                # Start the next sentence with a clean token table so ids of
                # this sentence cannot leak into the next one.
                sent_tokens = {}

        doc_text = " ".join(doc_words)
        data_pack.set_text(doc_text)

        # add doc to data_pack
        Document(data_pack, 0, len(doc_text))
        if doc_id:
            data_pack.pack_name = doc_id

        yield data_pack