Example #1
0
def lex_table_fix(lex_table, counter,
                  lex_output_path):  # need a better name for this method
    """Deduplicate the lex table, sanity-check word_vars codes and write a
    frequency-annotated lexicon CSV.

    Rows sharing (word_stem, word_lemma, word_standard, word_variant) but
    differing in word_vars are merged into one entry whose DDM tag is the
    union of the individual codes.

    Parameters
    ----------
    lex_table : mapping of column name -> sequence (e.g. a pandas DataFrame)
        Must provide the columns word_stem, word_lemma, word_standard,
        word_variant, word_vars, word_english, POS_tag and word_MHG.
    counter : opaque frequency source handed through to ``get_count``.
    lex_output_path : str
        Path of the CSV file that is (re)created with the header row.

    Side effects: rewrites ``lex_output_path`` with the header row, appends
    data rows via ``append_to_lex`` and logs suspicious rows to several
    ``date``-prefixed check files.
    """
    v_dict = collections.OrderedDict()  # preserves first-seen row order
    # frequency accumulators, one per key column
    stem_c = dict()
    lemma_c = dict()
    standard_c = dict()
    variant_c = dict()
    dict_count_list = [stem_c, lemma_c, standard_c, variant_c]

    # (Re)create the output CSV with just the header row.
    # BUG FIX: the original opened the global ``output_file_lex`` and
    # ignored the ``lex_output_path`` parameter entirely.
    with open(lex_output_path, 'w', newline="") as lex_csv:
        csv_writer = csv.writer(lex_csv,
                                delimiter=',',
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
        csv_writer.writerow([
            'word_stem', 'word_lemma', 'word_standard', 'word_variant',
            'word_vars', 'word_english', 'POS_tag', 'word_MHG',
            'word_stem_freq', 'word_lemma_freq', 'word_standard_freq',
            'word_variant_freq'
        ])
    # (redundant lex_csv.close() removed -- the with-statement closes it)

    def _blank_nan(cells):
        """Map float NaN cells to a single space so they compare uniformly."""
        return tuple(" " if isinstance(c, float) and math.isnan(c) else c
                     for c in cells)

    for r in zip(
            lex_table['word_stem'], lex_table['word_lemma'],
            lex_table['word_standard'], lex_table['word_variant'],
            lex_table['word_vars'], lex_table['word_english'],
            lex_table['POS_tag'], lex_table['word_MHG']
    ):  # later there will not be a pos corr column!!! remember to change the code
        key = _blank_nan(r[:4])    # word_stem, word_lemma, word_standard, word_variant
        value = _blank_nan(r[4:])  # word_vars, word_english, POS_tag, word_MHG
        word_vars = value[0].split()  # drops dangling whitespace, splits multiple codes
        word_std = key[2]
        word_variant = key[3]
        # The '*' wildcard must appear in both the variant and the standard
        # form or in neither; log any one-sided occurrence (the two original
        # if/elif branches did exactly the same thing, so they are merged).
        # NOTE(review): '*' in a file name is illegal on Windows -- kept as-is.
        if ("*" in word_variant) != ("*" in word_std):
            with open(date + "_*_check.txt", "a+") as file:
                print(key[3], key[2], value, file=file)
        # variants containing a diphthong spelling should carry an AIS code
        AIS_check_list = ["ei", "êi", "ôi"]
        if any(ais in key[3] for ais in AIS_check_list):
            if not any("AIS" in wv for wv in word_vars):
                with open(date + "_check_ais.txt", "a+") as file:
                    print(key[3], value, file=file)
        # every var code must end in 's' or 'd'; otherwise log and drop the row
        skip = False
        for wv in word_vars:
            if not wv.endswith(("s", "d")):
                skip = True
                with open(date + "_wrong_word_vars_ds.txt", "a+") as file:
                    print(key[3], value, file=file)
        if skip:
            continue
        # merge rows sharing the same key; keep only distinct word_vars values
        if key not in v_dict:
            v_dict[key] = [value]
        elif any(v[0] == value[0] for v in v_dict[key]):
            print(key, value)  # duplicate word_vars for an existing row
        else:
            v_dict[key].append(value)

    # count every variant once and credit the count to all four key columns
    for key in v_dict:
        variant = key[3]
        variant_pattern = compile_pattern(variant, v_dict[key][0])
        variant_count = get_count(variant_pattern,
                                  counter)  # later can be changed to new_count
        new_count = variant_count[0]  # get_count also returns the word itself
        for key_word, d in zip(key, dict_count_list):  # stem, lemma, standard, variant
            count_update(key_word, d, new_count)

    # emit one merged row per key with the union of its DDM codes
    for k, v in v_dict.items():
        stem_freq = stem_c[k[0]]
        lemma_freq = lemma_c[k[1]]
        standard_freq = standard_c[k[2]]
        variant_freq = variant_c[k[3]]
        tags = [t[0] for t in v]
        DDM_tag = " ".join(set(" ".join(tags).split())) or " "
        line = (*k, DDM_tag, *v[0][1:], stem_freq, lemma_freq, standard_freq,
                variant_freq)
        append_to_lex(*line)
Example #2
0
    def read_from_textgrid(self, file_list):
        """Extract [ANT]/[REL] clauses from the SWG tier of each TextGrid.

        For every file name in ``file_list`` (resolved against
        ``self.tg_path``) the 'SWG' tier is read, its interval annotations
        are split into clauses at punctuation, and each clause containing an
        [ANT] or [REL] tag is DDM- and POS-tagged and written out through
        ``self.output_as_csv``.

        Side effects: queries a CoreNLP POS tagger on localhost:9002 and
        reads the lex table from the module-level ``lex_table_path``.
        """
        pos_tagger = CoreNLPParser('http://localhost:9002', tagtype='pos')
        lex_table = read_lex_table(lex_table_path)
        # compiled variant pattern -> list of lex-table rows for that variant
        variant_match = dict()
        for r in zip(lex_table['word_variant'], lex_table['word_standard'],
                     lex_table['word_vars'], lex_table['POS_tag']):
            # dict with variant as key.
            # if no match tag the thing
            v_pattern = compile_pattern(r[0])
            if v_pattern not in variant_match.keys():
                variant_match[v_pattern] = []
            else:
                print(v_pattern)  # add it? no
            variant_match[v_pattern].append(r)
        # variants whose lemma is 'gehen' are exempt from SAF5 tagging below
        gehen_variants = set()
        locations = lex_table.loc[lex_table['word_lemma'] == 'gehen']
        for gehen_var in zip(locations['word_variant'],
                             locations['word_vars']):
            if "SAF5" not in gehen_var[1]:
                g_pattern = compile_pattern(gehen_var[0])
                gehen_variants.add(g_pattern)
        # for gehen_row in lex_table.loc[lex_table['word_lemma'] == 'gehen']['word_variant']:
        #     # check the word_vars
        #     if not any("SAF5" in wv for wv in lex_table.loc[lex_table['word_variant'] == gehen_row]['word_vars']):
        #         g_pattern = compile_pattern(gehen_row)
        #         gehen_variants.add(g_pattern)
        for each_file_name in file_list:
            # now combine the files of the same speakers
            print(each_file_name)
            interval_num = 0
            file_path = self.tg_path + each_file_name
            try:
                file_textgrid_obj = textgrid.TextGrid.fromFile(file_path)
            except UnicodeDecodeError:
                print(each_file_name +
                      ': the encode is weird, not utf-8 or ansi')
                # NOTE(review): there is no ``continue`` here, so after a
                # decode error the loop keeps going with the previous file's
                # TextGrid object (or raises NameError on the very first
                # file) -- confirm whether a ``continue`` was intended.

            tier_list = file_textgrid_obj.tiers

            # NOTE(review): if no tier is named 'SWG', ``intervals_swg``
            # below stays unbound and raises NameError, which the
            # ``except AttributeError`` does not catch -- confirm the input
            # files always carry an SWG tier.
            for each_tier in tier_list:
                if each_tier.name == 'SWG':  # read from swg tier
                    tier_swg = each_tier
                    intervals_swg = tier_swg.intervals

            try:
                clauses = []  # finished clauses, each ending with a time stamp
                clause_annotation = []  # tokens of the clause being built
                time_segment = dict()  # begin time (hms) -> token list
                skip = False
                begin_tag = ''
                # Pass 1: split all interval annotations into clauses at
                # punctuation, remembering the begin time of each clause.
                for each_annotation in intervals_swg:
                    annotation_mark = each_annotation.mark
                    beg_hms = timestamp_convert(each_annotation.minTime)
                    if not annotation_mark.strip(): continue
                    punct = [',', '.', '!', '?']  # maybe just . ! ?
                    tokens = annotation_mark.split()
                    time_segment[beg_hms] = tokens
                    for token in tokens:
                        if any(p in token for p in punct
                               ):  # function that turn segments into clauses
                            if all(c in string.punctuation for c in token
                                   ):  # this is for token like ... --- and ???
                                if not clause_annotation:
                                    time_stamp = beg_hms
                                clause_annotation.append(token)
                                if len(
                                        token
                                ) > 3 or token in punct:  # why do I do this again, still don't know
                                    clause_annotation.append(time_stamp)
                                    clauses.append(clause_annotation)
                                    clause_annotation = []
                                continue

                            word_punct_split = re.findall(
                                r"[^\w\d\s,.!?]*\w+[^\w\d\s,.!?]*\w*[^\w\d\s,.!?]*\w*[^\w\d\s,.!?]*|[^\w\s]",
                                token,
                                re.UNICODE)  # separate word with punctuation

                            for wp in word_punct_split:  # maybe to split annotations into clauses
                                if not clause_annotation:
                                    time_stamp = beg_hms
                                clause_annotation.append(wp)
                                if all(c in punct for c in wp):
                                    clause_annotation.append(time_stamp)
                                    clauses.append(clause_annotation)
                                    clause_annotation = []
                        else:
                            if not clause_annotation:
                                time_stamp = beg_hms
                            clause_annotation.append(token)
                # Pass 2: tag only clauses that carry an [ANT] or [REL] mark.
                for cl in clauses:
                    if '[ANT]' in cl or '[REL]' in cl:
                        # print("clause", cl)
                        beg_hms = cl[-1]  # last element is the time stamp
                        # print("time", beg_hms)
                        cl = cl[:-1]
                        # print("cl", cl)
                        if cl[0] not in time_segment[
                                beg_hms]:  # closer  remaining is the punctuation problem
                            segment_annotation = []
                            for token in time_segment[beg_hms]:
                                segment_annotation += re.findall(
                                    r"[^\w\d\s,.!?]*\w+[^\w\d\s,.!?]*\w*[^\w\d\s,.!?]*\w*[^\w\d\s,.!?]*|[^\w\s]",
                                    token, re.UNICODE)
                            if cl[0] not in segment_annotation:
                                print(segment_annotation)
                                print(cl[0])
                        else:
                            segment_annotation = time_segment[beg_hms]
                        # 1-based position of the clause's first word
                        sym_seq = segment_annotation.index(cl[0]) + 1
                        words_std = []
                        ddm_tags = []
                        pos_sent = []

                        # get ddm
                        for i, word in enumerate(cl):
                            if word:  # empty word check
                                # match w with word_variant
                                std_list = set()
                                ddm_list = set()
                                pos_list = set()
                                no_match = True
                                rel = False
                                # check for var: REL -- the marker sits on the
                                # *next* token and the current word decides
                                # which REL sub-code applies
                                if i + 1 < len(
                                        cl):  # make sure next word exist
                                    w_next = cl[i + 1]
                                    if "[REL]" in w_next:
                                        rel = True
                                        if "wo" in word:
                                            rel_var = " RELd"
                                        elif "als" in word or word.startswith(
                                                "d") or word.startswith(
                                                    "wel") or word.startswith(
                                                        "jed"):
                                            rel_var = " RELs"
                                        elif ("was" in word) or (
                                                "wie" in word) or ("wer"
                                                                   in word):
                                            rel_var = " RLOs"
                                        else:
                                            rel_var = " UNK"
                                # collect standard forms, DDM codes and POS
                                # tags from every lex-table row whose variant
                                # pattern matches this word
                                for p in variant_match.keys():
                                    if p.search(word) is not None:  # .lower()
                                        no_match = False
                                        for values in variant_match[p]:
                                            swg = values[0].replace("*", "")
                                            # rum[ge]draat
                                            if "ge" in swg and "ge" not in word:
                                                swg = swg.replace(
                                                    "ge", "g"
                                                )  # for gespielt gspielt
                                            std = values[1].replace("*", "")
                                            std_list.add(std)
                                            if isinstance(
                                                    values[2], float
                                            ) and math.isnan(
                                                    values[2]
                                            ):  # check for empty var_code
                                                pass  # do nothing
                                            else:
                                                ddm_list.add(
                                                    values[2])  # should be set
                                            if isinstance(
                                                    values[3],
                                                    float) and math.isnan(
                                                        values[3]):
                                                pos_list.add('*')
                                            else:
                                                pos_list.add(values[3])
                                if no_match:
                                    # unknown word: keep it as its own
                                    # standard form and ask CoreNLP for a POS
                                    standard = word
                                    ddm = "*"
                                    pos = pos_tagger.tag([word])[0][1]
                                    if "$" in pos:
                                        pos = "*"
                                else:
                                    standard = " ".join(std_list)
                                    ddm = " ".join(str(d) for d in ddm_list)
                                    # strip SAF5 codes from gehen-lemma words
                                    if any("SAF5" in d for d in ddm_list):
                                        for g_pattern in gehen_variants:
                                            if g_pattern.search(
                                                    word) is not None:
                                                print(ddm)
                                                print(word)
                                                print(
                                                    "!"
                                                )  # gegang* [ge]gang* will be taged as SAF5
                                                # k as prefix
                                                ddm = ddm.replace("SAF5d", "")
                                                ddm = ddm.replace("SAF5s", "")
                                                print(ddm)
                                    pos = " ".join(str(p) for p in pos_list)
                                if rel:
                                    if ddm != "*":
                                        ddm = ddm + rel_var
                                    else:
                                        ddm = rel_var
                                    ddm = ddm.strip()
                            # NOTE(review): these appends sit outside the
                            # ``if word:`` guard, so an empty word re-appends
                            # the previous word's values (or raises NameError
                            # if the first word is empty) -- confirm intent.
                            words_std.append(standard)
                            ddm_tags.append(ddm)
                            pos_sent.append(pos)
                        # columns
                        self.output_as_csv(
                            each_file_name[each_file_name.rfind("_") + 1:-9],
                            beg_hms, sym_seq, " ".join(cl), " ".join(ddm_tags),
                            " ".join(pos_sent))
            except AttributeError as e:
                print(each_file_name +
                      ': tier words is empty or does not exist ')
                traceback.print_tb(e.__traceback__)
Example #3
0
def create_word_csv(speaker_paths, word_extract_path, lex_table,
                    filename_annotation_map, file_timeseg_map):
    """Tag every annotated word of every speaker's TextGrid files and append
    the results to the word-extract CSV.

    Parameters
    ----------
    speaker_paths : iterable of str
        Directories, one per speaker, scanned for ``*.TextGrid`` files.
    word_extract_path : str
        Output CSV path; created with a header row if it does not exist.
    lex_table : pandas-like table
        Must provide word_variant, word_standard, word_vars, POS_tag and
        word_lemma columns.
    filename_annotation_map : mapping
        file name -> list of word annotations, each ending with a time stamp.
    file_timeseg_map : mapping
        file name -> {begin time (hms) -> original token segment}.

    Side effects: queries a CoreNLP POS tagger on localhost:9002 and appends
    one row per word via ``append_to_word_extract``.
    """
    pos_tagger = CoreNLPParser('http://localhost:9002', tagtype='pos')

    # Create the CSV with its header row the first time around.
    if not os.path.exists(word_extract_path):
        with open(word_extract_path, 'w', newline="") as word_extract_csv:
            csv_writer = csv.writer(word_extract_csv,
                                    delimiter=',',
                                    quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)
            csv_writer.writerow([
                'trans_id', 'beg_hms', 'sym_seq', 'Word_SWG', 'var_code',
                'Word_German', 'POS_tag'
            ])
        # (redundant close() removed -- the with-statement closes the file)

    # compiled variant pattern -> list of lex-table rows for that variant
    variant_match = dict()
    for r in zip(lex_table['word_variant'], lex_table['word_standard'],
                 lex_table['word_vars'], lex_table['POS_tag']):
        v_pattern = compile_pattern(r[0], r[2])
        variant_match.setdefault(v_pattern, []).append(r)
    for w_var, w_val in variant_match.items():
        if len(w_val) > 1:
            print(w_var, w_val)  # report variants with several lex rows

    # check if the word's lemma is gehen. If it is, then don't tag the word as SAF5
    gehen_variants = set()
    locations = lex_table.loc[lex_table['word_lemma'] == 'gehen']
    for gehen_var in zip(locations['word_variant'], locations['word_vars']):
        if "SAF5" not in gehen_var[1]:
            g_pattern = compile_pattern(gehen_var[0], gehen_var[1])
            gehen_variants.add(g_pattern)

    # Word lists delimiting the read-aloud passages to skip later.  Hoisted
    # out of the per-file loop: they are constant.
    word_list1_start = [
        "Finger", "Flüge", "Biene", "Hunger", "immer", "Äpfel",
        "Apfel", "Asche", "zum", "waschen"
    ]
    word_list1_end = [
        "laufen", "Frage", "Linde", "meist", "Haar", "Huhn", "Türe",
        "Kinder", "alle", "Gast"
    ]
    word_list2_start = [
        "Flüge", "Fliege", "Söhne", "Sehne", "können", "kennen",
        "Türe", "Tiere", "vermissen", "vermessen"
    ]
    word_list2_end = [
        "heiter", "heute", "Feuer", "feiern", "Ofen", "oben", "Kreide",
        "Kreuze", "Magen", "sagen"
    ]
    ft_1_start = [
        "Vor", "Zeiten", "war", "ein", "König", "und", "eine",
        "Königin", "die", "sprachen"
    ]
    ft_1_end = [
        "alte", "Frau", "mit", "einer", "Spindel", "und", "spann",
        "emsig", "ihren", "Flachs"
    ]
    ft_2_start = [
        "Es", "war", "einmal", "eine", "alte", "Geiß", "die", "hatte",
        "sieben", "junge"
    ]
    ft_2_end = [
        "er", "in", "seinen", "Rachen", "nur", "das", "jüngste",
        "fand", "er", "nicht"
    ]
    ft_3_start = [
        "In", "den", "alten", "Zeiten", "wo", "das", "Wünschen",
        "noch", "geholfen", "hat"
    ]
    ft_3_end = [
        "bei", "seinesgleichen", "und", "quakt", "und", "kann",
        "keines", "Menschen", "Geselle", "sein"
    ]

    # get speaker file names
    for speaker in speaker_paths:
        file_list = [
            file for file in os.listdir(speaker) if file.endswith('.TextGrid')
        ]
        for file_name in file_list:
            outputs = []
            annotations = filename_annotation_map[file_name]
            # BUG FIX: the original referenced the undefined name
            # ``file_time_seg_map`` (NameError); the parameter is
            # ``file_timeseg_map``.
            time_seg_map = file_timeseg_map[file_name]
            # now time stamps and word_count
            for word_annotation in annotations:
                beg_hms = word_annotation[-1]  # last element is the time stamp
                word_annotation = word_annotation[:-1]
                original_segment = time_seg_map[beg_hms]
                pointer_orgseg = 0
                for i, w in enumerate(word_annotation):
                    if w:  # empty word check
                        # locate w in the original segment to obtain its
                        # 1-based position (sym_seq) within the segment
                        sym_seq = None
                        for org_idx, t in enumerate(original_segment):
                            if sym_seq is not None:
                                break
                            words = word_filter(t)
                            if words:
                                for word in words:
                                    # take the first occurrence at or past
                                    # both the annotation index and the last
                                    # matched position
                                    if (w == word and org_idx >= i
                                            and org_idx >= pointer_orgseg
                                            and sym_seq is None):
                                        sym_seq = org_idx + 1
                                        pointer_orgseg = org_idx
                        # check for var: REL -- the [REL] marker sits on the
                        # *next* token; the current word picks the sub-code
                        rel = False
                        if i + 1 < len(word_annotation):
                            w_next = word_annotation[i + 1]
                            if "[REL]" in w_next:
                                rel = True
                                if "wo" in w:
                                    rel_var = " RELd"
                                elif ("als" in w or w.startswith("d")
                                      or w.startswith("wel")
                                      or w.startswith("jed")):
                                    rel_var = " RELs"
                                elif ("was" in w) or ("wie" in w) or ("wer"
                                                                      in w):
                                    rel_var = " RLOs"
                                else:
                                    rel_var = " UNK"
                        # regular ddm tagging
                        std_list = set()
                        ddm_list = set()
                        pos_list = set()
                        no_match = True
                        for p in variant_match.keys():
                            # stop collecting once an IRV code was gathered
                            if any("IRV" in d for d in ddm_list):
                                break
                            if p.search(w) is not None:  # .lower()
                                no_match = False
                                replace = True
                                for values in variant_match[p]:
                                    w_var = values[0].replace("*", "")  # word variant
                                    w_std = values[1].replace("*", "")  # word standard
                                    if std_list:
                                        # rewrite already-collected standard
                                        # forms with this variant->standard
                                        # substitution
                                        tmp_std = set()
                                        while std_list:
                                            s = std_list.pop()
                                            if p.search(s) is not None:
                                                if replace:
                                                    std = s.replace(
                                                        w_var, w_std)
                                                else:
                                                    std = values[1]
                                                tmp_std.add(std)
                                            else:
                                                tmp_std.add(s)
                                        std_list.update(tmp_std)
                                    else:
                                        if replace:
                                            std = w.replace(w_var, w_std)
                                        else:
                                            std = values[1]
                                        std_list.add(std)
                                    if isinstance(values[2],
                                                  float) and math.isnan(
                                                      values[2]):
                                        ddm_list.add(' ')  # empty var_code cell
                                    else:
                                        ddm_list.add(values[2])  # should be set
                                    # another check for the lex table
                                    # or change the lex table method when reading just ignore the bad word_vars
                                    pos_list.add(values[3])

                        if no_match:
                            # unknown word: fall back to CoreNLP for the POS
                            standard = w
                            ddm = " "
                            pos = pos_tagger.tag([w])[0][1]
                        else:
                            standard = " ".join(std_list)
                            if len(std_list) > 1:
                                print(w, "std: ", standard)
                            ddm = " ".join(str(d) for d in ddm_list)
                            # gehen-lemma variants must not carry SAF5 codes
                            if any("SAF5" in d for d in ddm_list):
                                for g_pattern in gehen_variants:
                                    if g_pattern.search(w) is not None:
                                        ddm = ddm.replace("SAF5d", "")
                                        ddm = ddm.replace("SAF5s", "")
                            pos = " ".join(str(p) for p in pos_list)
                        if rel:
                            ddm = ddm + rel_var
                            ddm = ddm.strip()
                    # NOTE(review): this append sits outside the ``if w:``
                    # guard, so an empty w re-emits the previous word's tags
                    # (or raises NameError on the very first iteration).
                    # Preserved as-is because the sibling method follows the
                    # same pattern -- confirm intent.
                    output = [
                        file_name[file_name.rfind("_") + 1:-9], w, ddm,
                        standard, pos, beg_hms, sym_seq
                    ]
                    outputs.append(output)

            # drop repeated / word-list / fairy-tale read-aloud sections
            outputs = skip_by_tags(outputs, 'r')
            outputs = skip_by_tags(outputs, 'wl')
            outputs = skip_by_tags(outputs, 'wg')
            outputs = skip_word_list(outputs, word_list1_start, word_list1_end,
                                     'wl')
            outputs = skip_word_list(outputs, word_list2_start, word_list2_end,
                                     'wl')
            outputs = skip_word_list(outputs, ft_1_start, ft_1_end, 'ft')
            outputs = skip_word_list(outputs, ft_2_start, ft_2_end, 'ft')
            outputs = skip_word_list(outputs, ft_3_start, ft_3_end, 'ft')
            for output in outputs:
                append_to_word_extract(*output)
Example #4
0
    def read_from_textgrid (self, file_list):
        pos_tagger = CoreNLPParser('http://localhost:9002', tagtype='pos')
        table = str.maketrans(dict.fromkeys(string.punctuation.replace("[\\]","")))
        variant_match = dict()
        for r in zip(self.lex_table['word_variant'], self.lex_table['word_standard'], self.lex_table['word_vars'],
                     self.lex_table['POS_tag'], self.lex_table['word_lemma'], self.lex_table['word_stem']):
            # dict with variant as key.
            # if no match tag the thing
            v_pattern = compile_pattern(r[0], r[2])
            if v_pattern not in variant_match.keys():
                variant_match[v_pattern] = []
            # else:
            #     print(v_pattern)
            variant_match[v_pattern].append(r)

        gehen_variants = set()
        locations = self.lex_table.loc[self.lex_table['word_lemma'] == 'gehen']
        for gehen_var in zip(locations['word_variant'], locations['word_vars']):
            if "SAF5" not in gehen_var[1]:
                g_pattern = compile_pattern(gehen_var[0], gehen_var[1])
                gehen_variants.add(g_pattern)

        words_h = []
        skip_begin = False
        skip_begin_tag = ""
        skip_end_file = ""
        for each_file_name in file_list:
            original_words = read_txt(self.rootpath, each_file_name)
            context = []
            rel = False

            tag_pattern = re.compile("\[[^\[\]]*\]")
            # collect all the tags
            tags = []
            for i, ow in enumerate(original_words):
                find_tag = re.search(tag_pattern, ow)
                # there could be more than one [REL] tag in there.  S016-17-I-1-Manni_692. ['der', '[REL]', 'halt', 'e', 'groß---', '-eh-', 'dessen', '[REL', 'Garten', 'groß']
                if find_tag:
                    tag = find_tag.group(0)
                    tags.append(tag)
                    # print(tag)
                elif "[" in ow or "]" in ow:
                    print("incorrect tag in:", each_file_name)
                    print(ow)
                    print(original_words)
            if tags:
                for tag in tags:
                    if tag == '[REL]':
                        rel = True
                        context.append(original_words[i-1].translate(table))
                    elif tag in self.tags.keys():
                        print(each_file_name)
                        print("Skipping:", tag)
                        skip_begin = True
                        skip_begin_tag = tag
                    elif tag in self.tags.values():
                        if tag == self.tags[skip_begin_tag]:
                            print(each_file_name)
                            print("Skipping:", tag)
                            skip_begin = False  # this will not skip the file which contains the end tag
                            skip_end_file = each_file_name  # skip the file that contains the end tag
                        else:
                            print("Wrong end tag:", tag)
                    # maybe should skip before the Aligner. Just have one that operates on TextGrid and WAV then no skipping in extract
            if skip_begin:
                print("Skipping:", original_words)
                continue
            if each_file_name == skip_end_file:  #
                print("Skipping:", original_words)
                continue
            # print("filename:", each_file_name)
            interval_num = 0

            file_path = self.rootpath + each_file_name
            try:
                file_textgrid_obj = textgrid.TextGrid.fromFile(file_path)
            except ValueError:
                print(each_file_name +'value error has occured')
                os.rename(self.rootpath + +each_file_name, working_directory + 'valueError/' + each_file_name)
                continue
            tier_list = file_textgrid_obj.tiers

            # Pull out the two tiers used below: the word-level intervals and
            # the segment (phone)-level intervals.
            for each_tier in tier_list:
                if each_tier.name == 'words':
                    tier_words = each_tier
                    intervals_words = tier_words.intervals
                elif each_tier.name == 'segments':
                    tier_segments = each_tier
                    intervals_segments = tier_segments.intervals

            count = 0  # number of non-pause words seen in this file
            current_minTime = 0  # start time of the word currently being segmented
            seg_num = 1  # 1-based position of the segment within the current word
            diphthong_num = 0  # index into the word's vowel-bigram list
            # Maps an IPA diphthong mark (3 chars incl. the tie bar) to the
            # orthographic vowel bigrams that may spell it.
            diphthong_dict = {'a͜i': {'ua', 'ai', 'êi', 'ei', 'âi', 'aî', 'ãi'}, 'a͜u': {'au', 'âu'}, 'ɔ͜y': {'ôi', 'eu', 'äu', 'oi', 'êu', 'eü', 'oî'}}
            # print(each_file_name)
            try:
                # For each word interval: recover the original transcript
                # token, classify relative markers, match dialect-variant
                # patterns to derive standard form / var codes / POS / lemma /
                # stem, then emit one CSV row per phone segment inside the
                # word's time span.
                for i, each_word in enumerate(intervals_words):
                    add_rel = False

                    word_start_time = each_word.minTime
                    word_end_time = each_word.maxTime
                    word_mark = each_word.mark
                    # The aligner may have normalised the word; map it back to
                    # the original transcript token when possible.
                    if word_mark not in original_words and "<" not in word_mark:
                        match = [ow.translate(table) for ow in original_words if word_mark == clean_word(ow)]
                        if not match:
                            # No transcript token matches -- log for review.
                            words_h.append((word_mark, original_words, each_file_name))
                            continue  # some words just turned to h. for unknown reason
                            # investigate
                        else:
                            word_mark = match[0].replace("[ge]", "")
                    if rel:
                        # A [REL] tag was seen in the transcript: when the
                        # remembered context word shows up here, classify the
                        # relative marker into a var-code suffix.
                        if word_mark == context[0] or word_mark == clean_word(context[0]):
                            add_rel = True  # maybe not do it here is better
                            rel = False  # avoid
                            if "wo" in word_mark:
                                rel_var = " RELd"
                            elif "als" in word_mark or word_mark.startswith("d") or word_mark.startswith("wel") or word_mark.startswith("jed"):
                                rel_var = " RELs"
                            elif ("was" in word_mark) or ("wie" in word_mark) or ("wer" in word_mark):
                                rel_var = " RLOs"
                            else:
                                rel_var = " UNK"

                    # Accumulators for all variant patterns that match this word.
                    std_list = set()
                    ddm_list = set()
                    pos_list = set()
                    lemma_list = set()
                    stem_list = set()
                    no_match = True

                    for p in variant_match.keys():
                        if p.search(word_mark) is not None:
                            # An IRV code already collected ends the matching.
                            if any("IRV" in d for d in ddm_list):
                                # print(" ".join(ddm_list))
                                break
                            no_match = False
                            replace = True
                            for values in variant_match[p]:
                                # '*' on the variant but not on the standard
                                # means the standard form replaces the whole
                                # word rather than substituting a substring.
                                if "*" in values[0] and "*" not in values[1]:
                                    replace = False
                                w_var = values[0].replace("*", "")  # word variant
                                w_std = values[1].replace("*", "")  # word standard
                                if std_list:
                                    # Re-apply this pattern to the standard
                                    # forms produced by earlier patterns.
                                    tmp_std = set()
                                    while std_list:
                                        s = std_list.pop()
                                        if p.search(s) is not None:
                                            if replace:
                                                std = s.replace(w_var, w_std)
                                            else:
                                                std = values[1]
                                            tmp_std.add(std)
                                        else:
                                            tmp_std.add(s)
                                    std_list.update(tmp_std)
                                else:
                                    if replace:
                                        std = word_mark.replace(w_var, w_std)
                                    else:
                                        std = values[1]
                                    std_list.add(std)
                                lemma = values[4]
                                stem = values[5]
                                lemma_list.add(lemma)
                                stem_list.add(stem)
                                # if "SAF5" in values[2]:
                                #     print(word_mark)
                                #     if "ge"
                                if isinstance(values[2], float) and math.isnan(values[2]):  # check for empty var_code
                                    ddm_list.add(' ')  # do nothing
                                else:
                                    ddm_list.add(values[2])  # should be set
                                pos_list.add(values[3])
                    if no_match:
                        # No variant pattern matched: keep the word as-is and
                        # let the POS tagger supply the tag.
                        word_german = word_mark
                        var_code = " "
                        pos_tag = pos_tagger.tag([word_german])[0][1]
                        word_lemma = word_german
                        word_stem = " "
                    else:
                        var_code = " ".join(str(d) for d in ddm_list)
                        # Words matching a 'gehen' variant must not carry the
                        # SAF5 codes -- strip them from the joined code string.
                        if any("SAF5" in d for d in ddm_list):
                            for g_pattern in gehen_variants:
                                if g_pattern.search(word_mark) is not None:
                                    var_code = var_code.replace("SAF5d", "")
                                    var_code = var_code.replace("SAF5s", "")
                        word_german = " ".join(std_list)
                        if len(std_list) > 1:
                            print(word_mark, "std: ", word_german)
                        word_lemma = " ".join(lemma_list)
                        word_stem = " ".join(stem_list)
                        pos_tag = " ".join(str(p) for p in pos_list)
                    if add_rel:
                        # Append the relative-marker suffix decided above.
                        var_code = var_code + rel_var
                        var_code = var_code.strip()
                    try:
                        vowel_orthography = find_two_vowel(word_mark)
                        # Consume every segment interval that lies entirely
                        # within this word's time span.
                        while (intervals_segments[interval_num].minTime >= word_start_time) & \
                                (intervals_segments[interval_num].maxTime <= word_end_time):
                            segment_start_time = intervals_segments[interval_num].minTime
                            segment_end_time = intervals_segments[interval_num].maxTime
                            segment_mark = intervals_segments[interval_num].mark

                            diphthong_orthography = " "
                            # A 3-char mark without '_' or the length mark 'ː'
                            # is treated as a diphthong; find which orthographic
                            # vowel bigram of the word spells it.
                            if len(segment_mark) == 3 and "_" not in segment_mark and "ː" not in segment_mark:
                                # print(segment_mark)
                                # print(word_mark)
                                if vowel_orthography[diphthong_num].lower() in diphthong_dict[segment_mark]:
                                    diphthong_orthography = vowel_orthography[diphthong_num]
                                elif any(vow_bigram.lower() in diphthong_dict[segment_mark] for vow_bigram in vowel_orthography):
                                    # Positional match failed; fall back to any
                                    # bigram of the word that fits this mark.
                                    for vow_bigram in vowel_orthography:
                                        if vow_bigram.lower() in diphthong_dict[segment_mark]:
                                            diphthong_orthography = vow_bigram
                                else:
                                    print(vowel_orthography)
                                    print(vowel_orthography[diphthong_num])
                                    print(word_mark)
                                    print(segment_mark)
                                diphthong_num += 1
                            # New word started: reset the per-word counters.
                            if word_start_time > current_minTime:
                                seg_num = 1
                                diphthong_num = 0
                                current_minTime = word_start_time

                            # Decide whether this row should be written: the
                            # original token must pass word_filter, or the word
                            # must carry a non-empty var code.
                            output_flag = False
                            if word_mark not in original_words:
                                match_ow = [ow for ow in original_words if word_mark == clean_word(ow)]
                                if match_ow:
                                    word_original = match_ow[0]
                                    if word_filter(word_original)[0]:
                                        output_flag = True
                            else:
                                output_flag = True
                            if var_code.strip():  # quick fix
                                output_flag = True
                            if output_flag:
                                self.output_as_csv(each_file_name[:-9], word_start_time, word_end_time, word_mark,
                                                   seg_num, segment_start_time, segment_end_time, segment_mark,
                                                   diphthong_orthography, var_code, word_german, word_lemma, word_stem,
                                                   pos_tag)
                            else:
                                if "<" not in word_mark:
                                    print("not a word: ", each_file_name[:-9], word_start_time, word_end_time, word_mark, var_code, word_german)
                            seg_num += 1
                            interval_num += 1
                    except IndexError:
                        # Ran past the end of the segments tier; restart the
                        # segment index for the next file.
                        interval_num = 0
                    if word_mark != '<P>':
                        count += 1

            except AttributeError as e:
                # intervals_words was never bound (no 'words' tier found).
                print(each_file_name+': tier words is empty or does not exist ')
                traceback.print_tb(e.__traceback__)
        # Persist every word that could not be matched back to its transcript
        # token, one tuple per line, for later manual review.
        with open('words_tran_error.txt', mode='w', newline="\n") as f:
            f.writelines(str(entry) + "\n" for entry in words_h)