Example #1
def split_sent(parsed, validate=True):
    if validate:
        extract_text(parsed, validate)

    result = []
    tokens = []

    for t in parsed:
        tokens.append(t)
        if 'misc' not in t or t['misc'] is None or 'SentenceBreak' not in t[
                'misc']:
            continue

        assert 'Yes' == t['misc']['SentenceBreak']
        result.append(tokens)
        tokens = []

    if len(tokens):
        result.append(tokens)

    for i in range(len(result)):
        r = result[i][-1]
        if 'misc' not in r or r['misc'] is None or 'SpaceAfter' not in r[
                'misc']:
            continue

        del result[i][-1]['misc']['SpaceAfter']
        if not len(result[i][-1]['misc']):
            result[i][-1]['misc'] = None

    return [TokenList(r) for r in result]
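A minimal usage sketch for split_sent above (not part of the original project): extract_text is project-specific, so validate=False skips it, and the input is an invented two-token fragment carrying SentenceBreak/SpaceAfter annotations in the MISC column.

from conllu import parse

data = ("1\tsalve\tsalve\tINTJ\t_\t_\t0\troot\t_\tSentenceBreak=Yes\n"
        "2\tvale\tvale\tINTJ\t_\t_\t0\troot\t_\tSpaceAfter=No\n\n")

sentences = split_sent(parse(data)[0], validate=False)
print(len(sentences))            # 2: the list is split after the SentenceBreak token
print(sentences[1][-1]["misc"])  # None: the emptied SpaceAfter annotation was cleared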
Example #2
def get_treebank_annotations(urn: str) -> List[TokenList]:
    """ Retrieves annotations from a treebank. """
    cc: CustomCorpus = next(x for x in CustomCorpusService.custom_corpora
                            if x.corpus.source_urn in urn)
    annotations: List[TokenList] = []
    file_name: str = ntpath.basename(cc.file_path)
    cache_file_path: str = os.path.join(Config.TREEBANKS_CACHE_DIRECTORY,
                                        file_name + ".json")
    if os.path.exists(cache_file_path):
        try:
            annotations = [
                TokenList(tokens=x["tokens"], metadata=x["metadata"])
                for x in json.loads(
                    FileService.get_file_content(cache_file_path))
            ]
        except ValueError:
            pass
    if not annotations:
        annotations = conllu.parse(
            FileService.get_file_content(cc.file_path))
        # need to cache the result because the syntax parser is so slow
        with open(cache_file_path, "w+") as f:
            f.write(
                json.dumps([
                    dict(tokens=x.tokens, metadata=x.metadata)
                    for x in annotations
                ]))
    if cc.corpus.source_urn != urn:
        # the given URN points to a sub-graph, so we make a selection from our annotations
        annotations = CustomCorpusService.get_treebank_sub_annotations(
            urn, annotations, cc)
    # add an artificial punctuation sign at the end of each sentence
    for sent in annotations:
        sent.metadata["urn"] = ":".join(
            urn.split(":")[:-1] + [sent.tokens[0]["misc"]["ref"]])
        if sent.tokens[-1]["form"] != ".":
            root_token: OrderedDict = next(
                x for x in sent.tokens if x[Config.AQL_DEPREL] == "root")
            sent.append(
                OrderedDict([
                    ("id", sent.tokens[-1]["id"] + 1), ("form", "."),
                    ("lemma", "."), ("upostag", "PUNCT"),
                    ("xpostag", None), ("feats", None),
                    ("head", root_token["id"] if root_token else 0),
                    ("deps", None),
                    ("misc",
                     OrderedDict([("ref", sent.tokens[0]["misc"]["ref"])]))
                ]))
    # add root dependencies as separate node annotations so we can search for them later
    for token_list in annotations:
        for token in token_list:
            if token["head"] == 0:
                token["deps"] = token[Config.AQL_DEPREL]
    return annotations
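The generic part of Example #2 is the JSON caching round-trip for parsed TokenLists. A small self-contained sketch of that idea, with invented data and list()/dict() copies to keep everything JSON-serializable:

import json
from conllu import parse, TokenList

annotations = parse("1\tsalve\tsalve\tINTJ\t_\t_\t0\troot\t_\t_\n\n")
cached = json.dumps([dict(tokens=list(x), metadata=dict(x.metadata)) for x in annotations])

restored = [TokenList(tokens=x["tokens"], metadata=x["metadata"])
            for x in json.loads(cached)]
print(restored[0][0]["form"])  # salve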
Example #3
def read_conllu_file(file_path: str, seg_threshold: bool = True) -> List[TokenList]:
    document = []
    with open(file_path, "r") as file:
        contents = file.read()
        tokenlists = parse(contents)
        if len(tokenlists) == 0:
            print(f"WARNING: {file_path} is empty--likely conversion error.")
            return []
        for annotation in tokenlists:
            m = annotation.metadata
            if seg_threshold and "segmentation" in m and m["segmentation"] not in ["checked", "gold"]:
                print("Skipping " + file_path + " because its segmentation is not checked or gold.")
                return []
            if len(annotation) > 200:
                logger.info(f"Breaking up huge sentence in {file_path} with length {len(annotation)} "
                            f"into chunks of 200 norms")
                # slice the oversized sentence into consecutive 200-token chunks,
                # giving each chunk a copy of the sentence metadata
                remaining = list(annotation)
                while len(remaining) > 0:
                    subannotation = TokenList(remaining[:200])
                    subannotation.metadata = annotation.metadata.copy()
                    document.append(subannotation)
                    remaining = remaining[200:]
            else:
                document.append(annotation)
    return document
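A usage sketch for read_conllu_file, assuming the function (and its logger) is importable from the module above; the file content is a made-up two-token sentence whose segmentation is marked gold so the seg_threshold check passes.

import tempfile

SAMPLE = ("# sent_id = 1\n"
          "# segmentation = gold\n"
          "1\tHello\thello\tINTJ\t_\t_\t0\troot\t_\t_\n"
          "2\tworld\tworld\tNOUN\t_\t_\t1\tvocative\t_\t_\n\n")

with tempfile.NamedTemporaryFile("w", suffix=".conllu", delete=False) as f:
    f.write(SAMPLE)
    path = f.name

document = read_conllu_file(path, seg_threshold=True)
print(len(document), document[0][0]["form"])  # 1 Hello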
Example #4
def compile_sent(sent):
    """compile a list of sublists, [word, head, full POS] format, into a CoNLL-U format sentence"""
    sent_list = list()
    for i, tok_data in enumerate(sent):
        tok_id = i + 1
        tok = tok_data[0]
        head = tok_data[1]
        pos = get_pos(tok_data[2])
        feats = get_feats(tok_data[2])
        compiled_tok = OrderedDict({
            'id': tok_id,
            'form': tok,
            'lemma': head,
            'upostag': pos,
            'xpostag': None,
            'feats': feats,
            'head': None,
            'deprel': None,
            'deps': None,
            'misc': None
        })
        sent_list.append(compiled_tok)
    sent_list = TokenList(sent_list).serialize()
    return sent_list
Example #5
def handle_exercise_data(ed: ExerciseData, ctx_left: int,
                         ctx_right: int) -> str:
    """ Constructs an SVG image (for POS and syntactic dependencies) from given annotations. """
    conllu_list: List[TokenList] = []
    dep_links: List[LinkMC] = [
        x for x in ed.graph.links
        if x.annis_component_name == Config.GRAPHANNIS_DEPENDENCY_LINK
    ]
    current_sentence_id: str = "-1"
    salt_id_to_conll_id_dict: Dict[str, str] = {}
    for node in ed.graph.nodes:
        new_sentence_id: str = str(AnnotationService.get_sentence_id(node))
        if new_sentence_id != current_sentence_id:
            update_heads(conllu_list=conllu_list,
                         salt_id_to_conll_id_dict=salt_id_to_conll_id_dict)
            conllu_list.append(
                TokenList(tokens=[],
                          metadata=OrderedDict([("sent_id", new_sentence_id)
                                                ])))
            current_sentence_id = new_sentence_id
            salt_id_to_conll_id_dict = {}
        relevant_link: LinkMC = next(
            (x for x in dep_links if x.target == node.id), None)
        salt_id_to_conll_id_dict[node.id] = str(
            len(conllu_list[-1].tokens) + 1)
        conllu_list[-1].tokens.append({
            "id":
            salt_id_to_conll_id_dict[node.id],
            "form":
            node.annis_tok,
            "lemma":
            node.udep_lemma,
            "upostag":
            node.udep_upostag,
            "xpostag":
            node.udep_xpostag,
            "feats":
            node.udep_feats,
            "head":
            "0" if relevant_link is None else relevant_link.source,
            Config.AQL_DEPREL:
            "root" if (relevant_link is None
                       or not hasattr(relevant_link, "udep_deprel")) else
            relevant_link.udep_deprel,
            "deps":
            None,
            "misc":
            None
        })
    update_heads(conllu_list=conllu_list,
                 salt_id_to_conll_id_dict=salt_id_to_conll_id_dict)
    conllu_string: str = "".join(x.tokens.serialize() for x in conllu_list)
    # generate temp file
    (handle, tmp_file_path) = mkstemp(suffix=".conllu")
    with open(tmp_file_path, "w+") as f:
        f.write(conllu_string)
    conllu2svg_path: str = Config.CONLLU2SVG_PATH_OSX if platform == Config.PLATFORM_MACOS else Config.CONLLU2SVG_PATH_LINUX
    html_bytes: bytes = subprocess.check_output("{0} {1}".format(
        conllu2svg_path, tmp_file_path),
                                                shell=True)
    os.remove(tmp_file_path)
    html_string: str = html_bytes.decode('utf-8')
    svg_list: List[str] = re.findall(r"<svg[\s\S]*?svg>", html_string)
    ret_val: str = "".join(svg_list) + "<br>"
    ret_val = re.sub(r' onclick=".*?"', "", ret_val)
    ret_val = re.sub(r' onmouseover=".*?"', "", ret_val)
    ret_val = re.sub(r' onmouseout=".*?"', "", ret_val)
    return highlight_targets(ret_val, ctx_left, ctx_right, ed)
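The SVG post-processing at the end of Example #5, isolated as a small sketch; the SVG string here is invented and stands in for the conllu2svg output.

import re

html_string = '<div><svg width="10" onclick="select(1)"><g onmouseover="hl(2)"/></svg></div>'
svg_list = re.findall(r"<svg[\s\S]*?svg>", html_string)
ret_val = "".join(svg_list) + "<br>"
for attr in ("onclick", "onmouseover", "onmouseout"):
    ret_val = re.sub(' {0}=".*?"'.format(attr), "", ret_val)
print(ret_val)  # <svg width="10"><g/></svg><br>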
Example #6
def conllu_parse(json_file, tok_style=2, tagged_only=True):
    """takes the Wb. Glosses from the .json file
       changes their format to match the Sg. Glosses from the .conllu file"""

    # Extract data from the JSON file format
    parse_list = list()
    for level_0 in json_file:
        fol = level_0.get("folios")
        for level_1 in fol:
            fol_col = level_1.get("folio")
            glosses = level_1.get("glosses")
            for level_2 in glosses:
                glossnum = level_2.get("glossNo")
                gloss_text = level_2.get("glossText")
                gloss_trans = level_2.get("glossTrans")
                gloss_hand = level_2.get("glossHand")
                tokens = level_2.get(f"glossTokens{tok_style}")
                # Check that glosses have been tagged before inclusion by ensuring they contain at least one POS which
                # does not appear in Latin/Greek-only glosses.
                if tagged_only:
                    vernacular_pos = ['ADJ', 'ADP', 'AUX', 'DET', 'NOUN', 'NUM',
                                      'PART', 'PRON', 'PROPN', 'SCONJ', 'VERB']
                    if [i for i in [tok[1] for tok in tokens] if i in vernacular_pos]:
                        parse_list.append([fol_col[3:] + glossnum, gloss_text, gloss_trans, gloss_hand,
                                           [[tok[0], tok[1], tok[2], tok[3]] for tok in tokens]])
                else:
                    parse_list.append([fol_col[3:] + glossnum, gloss_text, gloss_trans, gloss_hand,
                                       [[tok[0], tok[1], tok[2], tok[3]] for tok in tokens]])

    # Compile the data into CoNLL_U file format
    conllu_format = None
    for sentnum, sent in enumerate(parse_list):
        sent_id = sentnum + 1
        this_id = f'# sent_id = {sent_id}'
        gloss_id = sent[0]
        ref = f'# reference = {gloss_id}'
        sent_toks = sent[4]
        full_gloss = f'# text = {sent[1]}'
        full_gloss = "".join(full_gloss.split("<em>"))
        full_gloss = "".join(full_gloss.split("</em>"))
        full_gloss = "".join(full_gloss.split("<sup>"))
        full_gloss = "".join(full_gloss.split("</sup>"))
        translation = f'# translation = {sent[2]}'
        translation = "".join(translation.split("<em>"))
        translation = "".join(translation.split("</em>"))
        translation = "".join(translation.split("<sup>"))
        translation = "".join(translation.split("</sup>"))
        hand = f'# scribe = {sent[3]}'
        meta = f'{this_id}\n{ref}\n{hand}\n{full_gloss}\n{translation}\n'
        sent_list = list()
        for i, tok_data in enumerate(sent_toks):
            tok_id = i + 1
            tok = tok_data[0]
            head = tok_data[2]
            if not head:
                head = "_"
            pos = tok_data[1]
            feats = tok_data[3]
            if pos in ["<Latin>", "<Latin CCONJ>", "<Greek>"] and (not feats or feats == "Foreign=Yes"):
                pos = "X"
                feats = "Foreign=Yes"
            elif pos in ["<Latin>", "<Latin CCONJ>"]:
                raise RuntimeError(f"Latin word found with features: {feats}")
            if feats:
                feats = feats.split("|")
                feats = OrderedDict({i.split("=")[0]: i.split("=")[1] for i in feats})
            compiled_tok = OrderedDict({'id': tok_id, 'form': tok, 'lemma': head, 'upostag': pos, 'xpostag': None,
                                        'feats': feats, 'head': None, 'deprel': None, 'deps': None, 'misc': None})
            sent_list.append(compiled_tok)
        sent_list = TokenList(sent_list).serialize()
        if not conllu_format:
            conllu_format = meta + sent_list
        else:
            conllu_format = conllu_format + meta + sent_list
    conllu_format = conllu_format.strip("\n") + "\n"
    conllu_format = parse(conllu_format)
    return conllu_format
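A condensed sketch of the round-trip Example #6 relies on: metadata comment lines plus a serialized TokenList are concatenated into CoNLL-U text and re-parsed. The gloss data here is invented.

from collections import OrderedDict
from conllu import TokenList, parse

meta = '# sent_id = 1\n# reference = 1a1\n# text = in gloss\n'
tok = OrderedDict({'id': 1, 'form': 'in', 'lemma': 'in', 'upostag': 'ADP', 'xpostag': None,
                   'feats': None, 'head': None, 'deprel': None, 'deps': None, 'misc': None})
conllu_text = (meta + TokenList([tok]).serialize()).strip("\n") + "\n"

parsed = parse(conllu_text)
print(parsed[0].metadata["reference"], parsed[0][0]["form"])  # 1a1 in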
Example #7
                for t in tokenlist:
                    if isinstance(t["id"], tuple):
                        constituent_ids = list(
                            range(t["id"][0], t["id"][2] + 1))
                        t["um_feats"] = merge_attributes([
                            non_contracted_token_dict[x]
                            for x in constituent_ids
                        ], _UNIMORPH_VALUES_ATTRIBUTE)

                        # If this is a contraction, add it
                        final_tokens.append(t)
                    elif t["id"] not in contracted_ids:
                        # Check if this t is part of a contraction
                        final_tokens.append(t)

                final_tokens: TokenList = TokenList(final_tokens)

                # Skip if this would have more than 512 subtokens
                labelled_subwords = subword_tokenize(
                    tokenizer, [t["form"] for t in final_tokens])
                subtoken_indices, subtokens = zip(*labelled_subwords)
                if len(subtokens) >= 512:
                    if "subtoken_count" not in skipped:
                        skipped["subtoken_count"] = 0

                    skipped["subtoken_count"] += 1
                    continue

                if "total_sents" not in skipped:
                    skipped["total_sents"] = 0
Example #8
def remove_foreign(sentence):
    """Divide glosses into separate sentences at points where non-vernacular words occur, return only Irish text"""

    # Create the metadata to be added to each sub-gloss
    meta = get_metadata(sentence)
    this_id = f'# sent_id = {meta.get("sent_id")}'
    ref = f'# reference = {meta.get("reference")}'
    full_gloss = f'# text = {meta.get("text")}'
    translation = f'# translation = {meta.get("translation")}'
    if meta.get("scribe"):
        hand = f'# scribe = {meta.get("scribe")}'
        meta = f'{this_id}\n{ref}\n{hand}\n{full_gloss}\n{translation}\n'
    else:
        meta = f'{this_id}\n{ref}\n{full_gloss}\n{translation}\n'

    split_sents = None
    cur_sent = list()
    tok_id = 0
    # Check each token in a sentence to see if it's non-vernacular
    for tok_num, tok_data in enumerate(sentence):
        tok_id += 1
        tok_data["id"] = tok_id
        feats = tok_data.get("feats")
        if feats:
            # If a non-vernacular word is found (other than latin "et")
            if feats.get("Foreign") == "Yes" and tok_data.get("form") != "et":
                # If a sentence has been compiled up to this point
                if cur_sent:
                    # If all of the words in the compiled sentence are valid Irish words (not exclusions)
                    # add the metadata to the sentence, and concatenate it with any preceding sentence splits
                    if [
                            i for i in
                        [[tok.get("lemma"),
                          tok.get("upos"),
                          tok.get("feats")] for tok in cur_sent]
                            if i not in exclusions
                    ]:
                        # If any split sentence ends with an invalid word (an exclusion) remove it from the end
                        if [
                                cur_sent[-1].get("lemma"),
                                cur_sent[-1].get("upos"),
                                cur_sent[-1].get("feats")
                        ] in exclusions:
                            while [
                                    cur_sent[-1].get("lemma"),
                                    cur_sent[-1].get("upos"),
                                    cur_sent[-1].get("feats")
                            ] in exclusions:
                                cur_sent = cur_sent[:-1]
                        if not split_sents:
                            split_sents = meta + TokenList(
                                cur_sent).serialize()
                        else:
                            split_sents = split_sents + meta + TokenList(
                                cur_sent).serialize()
                    cur_sent = list()
                tok_id = 0
            # If this is the last token in the sentence and it has features add it to the current split
            elif tok_num == len(sentence) - 1:
                cur_sent.append(tok_data)
                # If all of the words in the compiled sentence are valid Irish words (not exclusions)
                # add the metadata to the sentence, and concatenate it with any preceding sentence splits
                if [
                        i for i in
                    [[tok.get("lemma"),
                      tok.get("upos"),
                      tok.get("feats")] for tok in cur_sent]
                        if i not in exclusions
                ]:
                    # If any split sentence ends with an invalid word (an exclusion) remove it from the end
                    if [
                            cur_sent[-1].get("lemma"),
                            cur_sent[-1].get("upos"), cur_sent[-1].get("feats")
                    ] in exclusions:
                        while [
                                cur_sent[-1].get("lemma"),
                                cur_sent[-1].get("upos"),
                                cur_sent[-1].get("feats")
                        ] in exclusions:
                            cur_sent = cur_sent[:-1]
                    if not split_sents:
                        split_sents = meta + TokenList(cur_sent).serialize()
                    else:
                        split_sents = split_sents + meta + TokenList(
                            cur_sent).serialize()
                cur_sent = list()
                tok_id = 0
            else:
                cur_sent.append(tok_data)
        # If this is the last token in the sentence but it has no features add it to the current split
        elif tok_num == len(sentence) - 1:
            cur_sent.append(tok_data)
            # If all of the words in the compiled sentence are valid Irish words (not exclusions)
            # add the metadata to the sentence, and concatenate it with any preceding sentence splits
            if [
                    i for i in
                [[tok.get("lemma"),
                  tok.get("upos"),
                  tok.get("feats")] for tok in cur_sent] if i not in exclusions
            ]:
                # If any split sentence ends with an invalid word (an exclusion) remove it from the end
                if [
                        cur_sent[-1].get("lemma"), cur_sent[-1].get("upos"),
                        cur_sent[-1].get("feats")
                ] in exclusions:
                    while [
                            cur_sent[-1].get("lemma"),
                            cur_sent[-1].get("upos"), cur_sent[-1].get("feats")
                    ] in exclusions:
                        cur_sent = cur_sent[:-1]
                if not split_sents:
                    split_sents = meta + TokenList(cur_sent).serialize()
                else:
                    split_sents = split_sents + meta + TokenList(
                        cur_sent).serialize()
            cur_sent = list()
            tok_id = 0
        else:
            cur_sent.append(tok_data)

    if not split_sents:
        split_sents = []
    else:
        split_sents = split_sents.strip("\n") + "\n"
        split_sents = parse(split_sents)

    # Renumber each substring's sentence ID to separately identify any splits made
    for i, sub_sent in enumerate(split_sents):
        meta = get_metadata(sub_sent)
        meta["sent_id"] = f'{meta.get("sent_id")}.{i}'

    return split_sents
Example #9
def clean_punct(sentence):
    """Remove various forms of punctuation as necessary to split glosses into sentences"""

    # Check if any POS in the sentence identifies punctuation. If not, return the sentence unchanged.
    unknown_check = get_pos(sentence)
    if "PUNCT" not in unknown_check:
        return [sentence]
    else:

        # Create the metadata to be added to each sub-gloss
        meta = get_metadata(sentence)
        this_id = f'# sent_id = {meta.get("sent_id")}'
        ref = f'# reference = {meta.get("reference")}'
        full_gloss = f'# text = {meta.get("text")}'
        translation = f'# translation = {meta.get("translation")}'
        if meta.get("scribe"):
            hand = f'# scribe = {meta.get("scribe")}'
            meta = f'{this_id}\n{ref}\n{hand}\n{full_gloss}\n{translation}\n'
        else:
            meta = f'{this_id}\n{ref}\n{full_gloss}\n{translation}\n'

        # List punctuation types which can be removed without splitting a sentence
        removables = [",", ":"]
        keepables = ["·"]

        split_sents = None
        cur_sent = list()
        tok_id = 0
        # Check each POS in the sentence to see if it identifies punctuation that splits the gloss
        for tok_num, tok_data in enumerate(sentence):
            tok_id += 1
            tok_data["id"] = tok_id
            pos = tok_data.get("upos")
            lemma = tok_data.get("lemma")
            # If a POS identifies punctuation that splits a sentence
            if pos == "PUNCT" and lemma not in removables and lemma not in keepables:
                # If a sentence has been compiled up to this point
                if cur_sent:
                    # If all of the words in the compiled sentence are valid Irish words (not exclusions)
                    # add the metadata to the sentence, and concatenate it with any preceding sentence splits
                    if [
                            i for i in
                        [[tok.get("lemma"),
                          tok.get("upos"),
                          tok.get("feats")] for tok in cur_sent]
                            if i not in exclusions
                    ]:
                        # If any split sentence ends with an invalid word (an exclusion) remove it from the end
                        if [
                                cur_sent[-1].get("lemma"),
                                cur_sent[-1].get("upos"),
                                cur_sent[-1].get("feats")
                        ] in exclusions:
                            while [
                                    cur_sent[-1].get("lemma"),
                                    cur_sent[-1].get("upos"),
                                    cur_sent[-1].get("feats")
                            ] in exclusions:
                                cur_sent = cur_sent[:-1]
                        if not split_sents:
                            split_sents = meta + TokenList(
                                cur_sent).serialize()
                        else:
                            split_sents = split_sents + meta + TokenList(
                                cur_sent).serialize()
                    cur_sent = list()
                tok_id = 0
            # If a POS identifies punctuation that can be removed without splitting a sentence
            elif pos == "PUNCT" and lemma in removables:
                if tok_num == len(sentence) - 1:
                    # If all of the words in the compiled sentence are valid Irish words (not exclusions)
                    # add the metadata to the sentence, and concatenate it with any preceding sentence splits
                    if [
                            i for i in
                        [[tok.get("lemma"),
                          tok.get("upos"),
                          tok.get("feats")] for tok in cur_sent]
                            if i not in exclusions
                    ]:
                        # If any split sentence ends with an invalid word (an exclusion) remove it from the end
                        if [
                                cur_sent[-1].get("lemma"),
                                cur_sent[-1].get("upos"),
                                cur_sent[-1].get("feats")
                        ] in exclusions:
                            while [
                                    cur_sent[-1].get("lemma"),
                                    cur_sent[-1].get("upos"),
                                    cur_sent[-1].get("feats")
                            ] in exclusions:
                                cur_sent = cur_sent[:-1]
                        if not split_sents:
                            split_sents = meta + TokenList(
                                cur_sent).serialize()
                        else:
                            split_sents = split_sents + meta + TokenList(
                                cur_sent).serialize()
                    cur_sent = list()
                    tok_id = 0
                else:
                    tok_id -= 1
            # If this is the last token in the sentence add it to the current split
            elif tok_num == len(sentence) - 1:
                cur_sent.append(tok_data)
                # If all of the words in the compiled sentence are valid Irish words (not exclusions)
                # add the metadata to the sentence, and concatenate it with any preceding sentence splits
                if [
                        i for i in
                    [[tok.get("lemma"),
                      tok.get("upos"),
                      tok.get("feats")] for tok in cur_sent]
                        if i not in exclusions
                ]:
                    # If any split sentence ends with an invalid word (an exclusion) remove it from the end
                    if [
                            cur_sent[-1].get("lemma"),
                            cur_sent[-1].get("upos"), cur_sent[-1].get("feats")
                    ] in exclusions:
                        while [
                                cur_sent[-1].get("lemma"),
                                cur_sent[-1].get("upos"),
                                cur_sent[-1].get("feats")
                        ] in exclusions:
                            cur_sent = cur_sent[:-1]
                    if not split_sents:
                        split_sents = meta + TokenList(cur_sent).serialize()
                    else:
                        split_sents = split_sents + meta + TokenList(
                            cur_sent).serialize()
                cur_sent = list()
                tok_id = 0
            else:
                cur_sent.append(tok_data)

        if not split_sents:
            split_sents = []
        else:
            split_sents = split_sents.strip("\n") + "\n"
            split_sents = parse(split_sents)

        # Renumber each substring's sentence ID to separately identify any splits made
        for i, sub_sent in enumerate(split_sents):
            meta = get_metadata(sub_sent)
            meta["sent_id"] = f'{meta.get("sent_id")}.{i}'

        return split_sents
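Examples #8 and #9 share one pattern: walk the tokens, cut at "splitting" items, drop "removable" ones, then rebuild renumbered sub-sentences. A condensed illustration of that pattern follows; it is not the authors' code (the function name split_at_punct is mine), it skips the exclusions bookkeeping, and it assumes tokens carry upos/lemma fields as in the parsed data above.

from conllu import TokenList

def split_at_punct(sentence, removables=(",", ":"), keepables=("·",)):
    chunks, current = [], []
    for token in sentence:
        if token["upos"] == "PUNCT" and token["lemma"] not in removables + keepables:
            if current:
                chunks.append(current)   # strong punctuation: close the current chunk
            current = []
        elif token["upos"] == "PUNCT" and token["lemma"] in removables:
            continue                     # removable punctuation: drop it, keep the chunk open
        else:
            current.append(token)        # ordinary token (or keepable punctuation)
    if current:
        chunks.append(current)
    result = []
    for i, chunk in enumerate(chunks):
        for new_id, token in enumerate(chunk, start=1):
            token["id"] = new_id         # renumber ids inside each sub-sentence
        sub = TokenList(chunk)
        sub.metadata = dict(sentence.metadata, sent_id=f'{sentence.metadata.get("sent_id")}.{i}')
        result.append(sub)
    return result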
Example #10
    def annotate_sample(example_conllu: conllu.TokenList, lang: str, use_triggers: bool = True) -> List[
        AnnotatedSample]:
        sent, metadata = parse_conllu(example_conllu.serialize())

        assert len(sent) == len(metadata) == 1
        sent = sent[0]
        _ = sent.pop(0)  # for internal use, we remove the stub root-node
        metadata = metadata[0]

        tokens = [node.get_conllu_field("form") for node in sent.values()]
        tags = [node.get_conllu_field("xpos") for node in sent.values()]
        lemmas = [node.get_conllu_field("lemma") for node in sent.values()]
        entities = get_entities(example_conllu)
        chunks = ["O"] * len(tokens)  # chunks - not interesting

        # create networkx graphs from the returned parse: one MultiDiGraph and one plain Graph, for later use
        g = nx.Graph()
        mdg = nx.MultiDiGraph()
        for node in sent.values():
            for parent, label in node.get_new_relations():
                if parent.get_conllu_field("id") == 0:
                    continue

                # TODO: Why (-1)
                g.add_node(parent.get_conllu_field("id") - 1, label=parent.get_conllu_field("form"))
                g.add_node(node.get_conllu_field("id") - 1, label=node.get_conllu_field("form"))
                g.add_edge(parent.get_conllu_field("id") - 1, node.get_conllu_field("id") - 1, label=label)
                mdg.add_node(parent.get_conllu_field("id") - 1, label=parent.get_conllu_field("form"))
                mdg.add_node(node.get_conllu_field("id") - 1, label=node.get_conllu_field("form"))
                mdg.add_edge(parent.get_conllu_field("id") - 1, node.get_conllu_field("id") - 1, label=label)

        # add an annotated sample to the list for each trigger on the path
        rel = example_conllu.metadata['relation']
        ann_samples = []
        trigger_toks = search_triggers(int(example_conllu.metadata['subj_start']),
                                       int(example_conllu.metadata['subj_end']),
                                       int(example_conllu.metadata['obj_start']),
                                       int(example_conllu.metadata['obj_end']),
                                       rel, tokens, lang) if use_triggers else []

        # The AnnotatedSample class expects an exclusive range that starts from 0
        trigger_tokens_fixed = []
        for trigger_range in trigger_toks:
            (start, end) = trigger_range
            trigger_tokens_fixed.append((start - 1, end))

        if len(trigger_tokens_fixed) == 0:
            trigger_tokens_fixed = [None]

        for trigger_tok in trigger_tokens_fixed:
            ann_samples.append(
                AnnotatedSample(
                    " ".join(tokens),
                    " ".join(tokens),
                    rel,
                    example_conllu.metadata['subj_type'].title(),
                    example_conllu.metadata['obj_type'].title(),
                    tokens, tags, entities, chunks, lemmas,
                    # subtract 1 from the start index: AnnotatedSample expects an exclusive, 0-based range,
                    # while the CoNLL-U indices start from 1 and are inclusive
                    (int(example_conllu.metadata['subj_start']) - 1, int(example_conllu.metadata['subj_end'])),
                    (int(example_conllu.metadata['obj_start']) - 1, int(example_conllu.metadata['obj_end'])),
                    trigger_tok,
                    g, mdg))

        assert len(ann_samples) != 0

        return ann_samples
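An alternative sketch of the graph-building step in Example #10, using only conllu and networkx rather than the custom parse_conllu/get_conllu_field API; the sentence is invented, and node ids are shifted to 0-based to mirror the "- 1" offsets above.

import networkx as nx
from conllu import parse

sent = parse("1\tDogs\tdog\tNOUN\t_\t_\t2\tnsubj\t_\t_\n"
             "2\tbark\tbark\tVERB\t_\t_\t0\troot\t_\t_\n\n")[0]

g = nx.Graph()
mdg = nx.MultiDiGraph()
for token in sent:
    head = token["head"]
    if not isinstance(token["id"], int) or head in (None, 0):
        continue  # skip multiword-token ranges and the root attachment itself
    for graph in (g, mdg):
        graph.add_node(head - 1, label=sent[head - 1]["form"])
        graph.add_node(token["id"] - 1, label=token["form"])
        graph.add_edge(head - 1, token["id"] - 1, label=token["deprel"])

print(list(mdg.edges(data=True)))  # [(1, 0, {'label': 'nsubj'})]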