def __init__(self, corpus_dir, output_dir):
    self.corpus_dir = corpus_dir
    self.noun_out_path = os.path.join(output_dir, 'nouns.csv')
    self.np_out_path = os.path.join(output_dir, 'noun_phrases.csv')
    self.parser = CoreNLPParser(url='http://localhost:9000')
    self.n_count = {}
    self.np_count = {}
Example #2
        "quels","qui","sa","sans","ses","si","sien","son","sont","sous","sur","ta","tandis","tellement","tels","tes","ton","tous",
        "tout","trop","très","tu","votre","vous","vu","ça","sa", "son", "ses", "de", "a"]

en_stopwords = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", 
                "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", 
                "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", 
                "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", 
                "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", 
                "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", 
                "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", 
                "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further",
                "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", 
                "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", 
                "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now", "even"]

pos_tagger = CoreNLPParser('http://localhost:9004', tagtype='pos')
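
# A minimal usage sketch for the tagger and the stopword list above (an assumption,
# not part of the original snippet): the CoreNLP server on port 9004 must already be
# running, and the sample sentence is purely illustrative.
sample = "The quick brown fox jumps over the lazy dog".split()
tagged = pos_tagger.tag(sample)  # [('The', 'DT'), ('quick', 'JJ'), ...]
content_words = [(w, t) for w, t in tagged if w.lower() not in en_stopwords]
print(content_words)  # stopwords such as 'the' and 'over' are dropped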


class PairOfEntitiesFeatures:
    """
    For a given pair of entities in a sentence, find the features between them.
    Features for now include:
            - surface form of entity 1
            - surface form of entity 2
            - type of entity 1 (PER, ORG, LOC...)
            - type of entity 2 (PER, ORG, LOC...)
            - words between the entities
            - x words before entity 1
            - x words after entity 2
            - shortest dependency path between the two entities
    """
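
# The class body is cut off in this example. The sketch below (an assumption, not the
# original implementation) only illustrates the last feature listed above: the shortest
# dependency path between two tokens, computed with nltk's CoreNLPDependencyParser and
# networkx against a server assumed to listen on port 9000.
import networkx as nx
from nltk.parse.corenlp import CoreNLPDependencyParser

dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')

def shortest_dependency_path(tokens, idx1, idx2):
    """Token addresses (1-based) on the path between tokens idx1 and idx2."""
    graph = next(dep_parser.parse(tokens))
    edges = []
    for node in graph.nodes.values():
        for dep_address in sum(node['deps'].values(), []):
            edges.append((node['address'], dep_address))
    # undirected graph, so the path can pass through the common governor
    return nx.shortest_path(nx.Graph(edges), source=idx1, target=idx2)

# e.g. shortest_dependency_path("John works for Acme in Berlin".split(), 1, 4)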
Example #3
def create_word_csv(speaker_paths, word_extract_path, lex_table,
                    filename_annotation_map, file_timeseg_map):
    variant_match = dict()
    pos_tagger = CoreNLPParser('http://localhost:9002', tagtype='pos')
    # if the csv does not exist yet, create it with a header row
    if not os.path.exists(word_extract_path):
        with open(word_extract_path, 'w', newline="") as word_extrac_csv:
            csv_writer = csv.writer(word_extrac_csv,
                                    delimiter=',',
                                    quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)
            csv_writer.writerow([
                'trans_id', 'beg_hms', 'sym_seq', 'Word_SWG', 'var_code',
                'Word_German', 'POS_tag'
            ])
    for r in zip(lex_table['word_variant'], lex_table['word_standard'],
                 lex_table['word_vars'], lex_table['POS_tag']):
        # dict with variant as key.
        # if no match tag the thing
        v_pattern = compile_pattern(r[0], r[2])
        if v_pattern not in variant_match.keys():
            variant_match[v_pattern] = []

        variant_match[v_pattern].append(r)
    for w_var, w_val in variant_match.items():
        if len(w_val) > 1:
            print(w_var, w_val)
    # check if the word's lemma is gehen. If it is, then don't tag the word as SAF5
    gehen_variants = set()
    locations = lex_table.loc[lex_table['word_lemma'] == 'gehen']
    for gehen_var in zip(locations['word_variant'], locations['word_vars']):
        if "SAF5" not in gehen_var[1]:
            g_pattern = compile_pattern(gehen_var[0], gehen_var[1])
            gehen_variants.add(g_pattern)

    # get speaker file names
    for speaker in speaker_paths:
        file_list = [
            file for file in os.listdir(speaker) if file.endswith('.TextGrid')
        ]
        for file_name in file_list:
            outputs = []
            annotations = filename_annotation_map[file_name]
            time_seg_map = file_timeseg_map[file_name]
            # now time stamps and word_count
            for word_annotation in annotations:
                beg_hms = word_annotation[-1]
                word_annotation = word_annotation[:-1]
                original_segment = time_seg_map[beg_hms]
                pointer_orgseg = 0
                for i, w in enumerate(word_annotation):
                    if w:  # empty word check
                        # print(w)
                        sym_seq = None
                        for org_idx, t in enumerate(original_segment):  # this is for the word count
                            if sym_seq is not None:
                                break
                            words = word_filter(t)
                            if words:
                                for word in words:  # word,word2 word word2 index?
                                    # if same words, take the later one? or there should be a check
                                    if (w == word) and (org_idx >= i) and (org_idx >= pointer_orgseg) and (sym_seq is None):
                                        # this is not good, need to clean this original segment again; make it a helper method
                                        sym_seq = org_idx + 1
                                        # print(sym_seq)
                                        pointer_orgseg = org_idx
                        # check for var: REL
                        rel = False
                        if i + 1 < len(word_annotation):  # make sure the next word exists
                            w_next = word_annotation[i + 1]
                            if "[REL]" in w_next:
                                rel = True
                                if "wo" in w:
                                    rel_var = " RELd"
                                elif "als" in w or w.startswith(
                                        "d") or w.startswith(
                                            "wel") or w.startswith("jed"):
                                    rel_var = " RELs"
                                elif ("was" in w) or ("wie" in w) or ("wer"
                                                                      in w):
                                    rel_var = " RLOs"
                                else:
                                    rel_var = " UNK"
                        # regular ddm tagging
                        std_list = set()
                        ddm_list = set()
                        pos_list = set()
                        no_match = True

                        for p in variant_match:  # could make this a separate method
                            if any("IRV" in d for d in ddm_list):
                                # print(" ".join(ddm_list))
                                break
                            if p.search(w) is not None:  # .lower()
                                no_match = False
                                replace = True
                                for values in variant_match[p]:
                                    w_var = values[0].replace(
                                        "*", "")  # word variant
                                    w_std = values[1].replace(
                                        "*", "")  # word standard
                                    if std_list:
                                        tmp_std = set()
                                        while std_list:
                                            s = std_list.pop()
                                            if p.search(s) is not None:
                                                if replace:
                                                    std = s.replace(
                                                        w_var, w_std)
                                                else:
                                                    std = values[1]
                                                tmp_std.add(std)
                                            else:
                                                tmp_std.add(s)
                                        std_list.update(tmp_std)
                                    else:
                                        if replace:
                                            std = w.replace(w_var, w_std)
                                        else:
                                            std = values[1]
                                        std_list.add(std)
                                    if isinstance(values[2], float) and math.isnan(values[2]):  # check for empty var_code
                                        ddm_list.add(' ')  # do nothing
                                    else:
                                        ddm_list.add(values[2])  # should be set
                                    # another check for the lex table
                                    # or change the lex table method when reading just ignore the bad word_vars
                                    pos_list.add(values[3])

                        if no_match:
                            standard = w
                            ddm = " "
                            pos = pos_tagger.tag([w])[0][1]
                        else:
                            standard = " ".join(std_list)
                            if len(std_list) > 1:
                                print(w, "std: ", standard)
                            ddm = " ".join(str(d) for d in ddm_list)
                            # maybe here is the problem
                            if any("SAF5" in d for d in ddm_list):
                                # print(ddm) # right, this
                                for g_pattern in gehen_variants:
                                    if g_pattern.search(w) is not None:
                                        ddm = ddm.replace("SAF5d", "")
                                        ddm = ddm.replace("SAF5s", "")
                            pos = " ".join(str(p) for p in pos_list)
                        if rel:
                            ddm = ddm + rel_var
                            ddm = ddm.strip()
                        # emit a row only for non-empty words (ddm/standard/pos are set above)
                        output = [
                            file_name[file_name.rfind("_") + 1:-9], w, ddm,
                            standard, pos, beg_hms, sym_seq
                        ]
                        # print(output)
                        outputs.append(output)

            outputs = skip_by_tags(outputs, 'r')
            outputs = skip_by_tags(outputs, 'wl')
            outputs = skip_by_tags(outputs, 'wg')
            word_list1_start = [
                "Finger", "Flüge", "Biene", "Hunger", "immer", "Äpfel",
                "Apfel", "Asche", "zum", "waschen"
            ]
            word_list1_end = [
                "laufen", "Frage", "Linde", "meist", "Haar", "Huhn", "Türe",
                "Kinder", "alle", "Gast"
            ]
            word_list2_start = [
                "Flüge", "Fliege", "Söhne", "Sehne", "können", "kennen",
                "Türe", "Tiere", "vermissen", "vermessen"
            ]
            word_list2_end = [
                "heiter", "heute", "Feuer", "feiern", "Ofen", "oben", "Kreide",
                "Kreuze", "Magen", "sagen"
            ]
            ft_1_start = [
                "Vor", "Zeiten", "war", "ein", "König", "und", "eine",
                "Königin", "die", "sprachen"
            ]
            ft_1_end = [
                "alte", "Frau", "mit", "einer", "Spindel", "und", "spann",
                "emsig", "ihren", "Flachs"
            ]
            ft_2_start = [
                "Es", "war", "einmal", "eine", "alte", "Geiß", "die", "hatte",
                "sieben", "junge"
            ]
            ft_2_end = [
                "er", "in", "seinen", "Rachen", "nur", "das", "jüngste",
                "fand", "er", "nicht"
            ]
            ft_3_start = [
                "In", "den", "alten", "Zeiten", "wo", "das", "Wünschen",
                "noch", "geholfen", "hat"
            ]
            ft_3_end = [
                "bei", "seinesgleichen", "und", "quakt", "und", "kann",
                "keines", "Menschen", "Geselle", "sein"
            ]
            outputs = skip_word_list(outputs, word_list1_start, word_list1_end,
                                     'wl')
            outputs = skip_word_list(outputs, word_list2_start, word_list2_end,
                                     'wl')
            outputs = skip_word_list(outputs, ft_1_start, ft_1_end, 'ft')
            outputs = skip_word_list(outputs, ft_2_start, ft_2_end, 'ft')
            outputs = skip_word_list(outputs, ft_3_start, ft_3_end, 'ft')
            for output in outputs:
                append_to_word_extract(*output)
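
# A minimal invocation sketch (an assumption, not part of the original project):
# create_word_csv expects a list of speaker directories containing .TextGrid files,
# an output CSV path, a pandas DataFrame holding the lexicon columns read above
# (word_variant, word_standard, word_vars, POS_tag, word_lemma), and the two lookup
# maps. All file names below are placeholders, and build_maps() is a hypothetical helper.
import pandas as pd

speaker_paths = ['data/speakers/S001', 'data/speakers/S002']
lex_table = pd.read_csv('lexicon.csv')
filename_annotation_map, file_timeseg_map = build_maps(speaker_paths)  # hypothetical
create_word_csv(speaker_paths, 'word_extract.csv', lex_table,
                filename_annotation_map, file_timeseg_map)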
Example #4
class np_extractor:
    def __init__(self, corpus_dir, output_dir):
        self.corpus_dir = corpus_dir
        self.noun_out_path = os.path.join(output_dir, 'nouns.csv')
        self.np_out_path = os.path.join(output_dir, 'noun_phrases.csv')
        self.parser = CoreNLPParser(url='http://localhost:9000')
        self.n_count = {}
        self.np_count = {}

    def get_parse_tree(self, sentence):
        return next(self.parser.raw_parse(sentence))

    def get_nps(self, tree):
        nps = []
        np_trees = list(tree.subtrees(filter=lambda x: x.label() == 'NP'))
        for np_tree in np_trees:
            np_str = " ".join(np_tree.leaves())
            np_str = re.sub("^[\d\s]+", "", np_str)
            np_str = re.sub("[^a-zA-Z\s]+", "", np_str)
            np_str = np_str.lower()
            if len(np_str) > 2:
                nps.append(np_str)
        return nps

    def get_Nouns(self, tree):
        nouns = []
        for tag in tree.pos():
            if tag[1] == "NN" and len(tag[0]) > 2:
                nouns.append(tag[0].lower())
        return nouns

    def extract(self, sentence):
        try:
            tree = self.get_parse_tree(sentence)
            nps = self.get_nps(tree)
            nouns = self.get_Nouns(tree)
            return nps, nouns
        except StopIteration:
            return [], []

    def process_corpus(self):
        assert os.path.isdir(self.corpus_dir)
        for root, sub, files in os.walk(self.corpus_dir):
            for file in files:
                file_path = os.path.join(root, file)
                print("processing {}".format(file))
                with open(file_path, encoding='utf8', errors="ignore") as fin:
                    for line in fin:
                        nps, nouns = self.extract(line)
                        for np in nps:
                            if np not in self.np_count:
                                self.np_count[np] = 0
                            self.np_count[np] += 1
                        for noun in nouns:
                            if noun not in self.n_count:
                                self.n_count[noun] = 0
                            self.n_count[noun] += 1
        sort_nouns = sorted(self.n_count.items(),
                            key=lambda x: x[1],
                            reverse=True)
        sort_nps = sorted(self.np_count.items(),
                          key=lambda x: x[1],
                          reverse=True)

        with open(self.noun_out_path, 'w') as fout:
            for (noun, count) in sort_nouns:
                fout.write("{},{}\n".format(noun, count))

        with open(self.np_out_path, 'w') as fout:
            for (np, count) in sort_nps:
                fout.write("{},{}\n".format(np, count))

    def start_standford(self):
        stanforNLP_server_cmd = " java -mx4g -cp * edu.stanford.nlp.pipeline.StanfordCoreNLPServer -preload tokenize,ssplit,pos,lemma,parse,depparse  -status_port 9000 -port 9000 -timeout 15000"
        self.start_server = Popen(
            stanforNLP_server_cmd.split(),
            cwd="G:\lib\stanford-corenlp-full-2016-10-31",
            stderr=PIPE,
            stdout=PIPE,
            shell=True)
        while True:
            line = str(self.start_server.stderr.readline())
            print(line)
            success_mark = 'StanfordCoreNLPServer listening at'
            except_mark = 'Address already in use'
            if success_mark in line:
                print("server started...")
                break
            elif except_mark in line:
                print("server already started or port occupied...")
                break
        self.start_server.stderr.close()
        self.start_server.stdout.close()
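
# Usage sketch (assumed, not part of the original snippet): point the extractor at a
# corpus directory and an output directory; a CoreNLP server must already be listening
# on port 9000, or be launched via start_standford() on a Windows machine with the
# hard-coded path above. Both directories here are placeholders.
extractor = np_extractor('corpus/', 'output/')
extractor.process_corpus()  # writes output/nouns.csv and output/noun_phrases.csv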
Example #5
from colorama import init, Fore, Back, Style
from nltk import CoreNLPParser
init()

# Read the input file to get the list of users
input_data = []
user_data = {}

INPUT_FILE = './ios_vs_android.csv'
INPUT_DIR = './crawled_data'
OUTPUT_DIR = './nlp_data'

NLP_URL = 'http://nlp:9000'

# Set up the parser beforehand
parser = CoreNLPParser(url=NLP_URL, encoding='utf8', tagtype='pos')

# Kind of lines we would like to ignore
IGNORE_LIST = [
    u'&nbsp;',
    u'**%100$ percent dollars**',
    u'Don’t waste your money. I spent $10 for this feature and only works %10 of the time, usually just breaks the page loading at all requiring whitelisting instead.',
    u'She’s using a 4S??? Get her the phone. %100',
    u'Edit: also worth to note, if you choose “don’t trust” when plugging in it will only charge at .5 because it doesn’t report itself as an iPhone, but that should still be way faster than %10 in three hours',
    u'It was horrendously buggy for me. I never had an issue with the actual function of sync seemed to keep my stuff in order between my iPad and iPhone, but man I could immeaditly tell when the sync started because my CPU would ramp up to %100 and my battery would start draining like crazy, and the messages app was so laggy I couldn’t even type, and would be like this for like 15+ minutes.',
    u'[I got the mandelbrot.](https://anvaka.github.io/pplay/?tx=0&ty=0&scale=1&fc=vec4%20get_color%28vec2%10p%29%20%7B%0A%20%20float%20t%20%3D%200.%3B%0A%20%20vec2%20z%20%3D%20p%3B%0A%20%20vec2%20c%20%3D%20vec2%280.60891%2C%200.89098%29%3B%0A%20%20float%20frames%20%3D%20600.%3B%0A%20%20float%20a%20%3D%203.14*%202.%20*%20bease%28mod%28iFrame%2C%20frames%29%2Fframes%29%3B%0A%0A%0A%20%20for%28int%20i%20%3D%200%3B%20i%20%3C%2032%3B%20%2B%2Bi%29%20%7B%0A%20%20%20%20if%20%28length%28z%29%20%3E%202.%29%20break%3B%0A%20%20%20%20z%20%3D%20c_mul%28c_exp%28z%29%20*%20sin%28a%29%2C%20z%29%20%2B%20c%3B%0A%20%20%20%20t%20%3D%20float%28i%29%3B%0A%20%20%7D%0A%0A%20%20return%20vec4%28length%28z%29%20*%20t%20*%20vec3%281.%2F64.%2C%201.%2F32.%2C%201.%2F16.%29%2C%201.0%29%3B%0A%7D)',
]
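
# Sketch of how the ignore list and the parser above might be applied while reading the
# crawled comments (an assumption: the CSV layout with a 'body' column is a guess for
# illustration, not the project's real schema).
import csv

def pos_tag_comments(csv_path):
    tagged = []
    with open(csv_path, encoding='utf8') as fin:
        for row in csv.DictReader(fin):
            text = row.get('body', '').strip()
            if not text or text in IGNORE_LIST:
                continue  # skip empty lines and known boilerplate
            tagged.append(parser.tag(text.split()))
    return tagged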


def user_id_to_input_filename(user_id):
    ''' Construct a file name from user_id
Example #6
from nltk import pos_tag
from nltk import CoreNLPParser

text = 'The foods are eaten'.split()
print(pos_tag(text))

parser = CoreNLPParser()
# parser.parse() returns an iterator of parse trees, so take the first tree
print(next(parser.parse(text)))
Example #7
    def read_from_textgrid(self, file_list):
        pos_tagger = CoreNLPParser('http://localhost:9002', tagtype='pos')
        table = str.maketrans(dict.fromkeys(string.punctuation.replace("[\\]","")))
        variant_match = dict()
        for r in zip(self.lex_table['word_variant'], self.lex_table['word_standard'], self.lex_table['word_vars'],
                     self.lex_table['POS_tag'], self.lex_table['word_lemma'], self.lex_table['word_stem']):
            # dict with variant as key.
            # if no match tag the thing
            v_pattern = compile_pattern(r[0], r[2])
            if v_pattern not in variant_match.keys():
                variant_match[v_pattern] = []
            # else:
            #     print(v_pattern)
            variant_match[v_pattern].append(r)

        gehen_variants = set()
        locations = self.lex_table.loc[self.lex_table['word_lemma'] == 'gehen']
        for gehen_var in zip(locations['word_variant'], locations['word_vars']):
            if "SAF5" not in gehen_var[1]:
                g_pattern = compile_pattern(gehen_var[0], gehen_var[1])
                gehen_variants.add(g_pattern)

        words_h = []
        skip_begin = False
        skip_begin_tag = ""
        skip_end_file = ""
        for each_file_name in file_list:
            original_words = read_txt(self.rootpath, each_file_name)
            context = []
            rel = False

            tag_pattern = re.compile(r"\[[^\[\]]*\]")
            # collect all the tags
            tags = []
            for i, ow in enumerate(original_words):
                find_tag = re.search(tag_pattern, ow)
                # there could be more than one [REL] tag in there.  S016-17-I-1-Manni_692. ['der', '[REL]', 'halt', 'e', 'groß---', '-eh-', 'dessen', '[REL', 'Garten', 'groß']
                if find_tag:
                    tag = find_tag.group(0)
                    tags.append(tag)
                    # print(tag)
                elif "[" in ow or "]" in ow:
                    print("incorrect tag in:", each_file_name)
                    print(ow)
                    print(original_words)
            if tags:
                for tag in tags:
                    if tag == '[REL]':
                        rel = True
                        context.append(original_words[i-1].translate(table))
                    elif tag in self.tags.keys():
                        print(each_file_name)
                        print("Skipping:", tag)
                        skip_begin = True
                        skip_begin_tag = tag
                    elif tag in self.tags.values():
                        if tag == self.tags[skip_begin_tag]:
                            print(each_file_name)
                            print("Skipping:", tag)
                            skip_begin = False  # this will not skip the file which contains the end tag
                            skip_end_file = each_file_name  # skip the file that contains the end tag
                        else:
                            print("Wrong end tag:", tag)
                    # maybe should skip before the Aligner. Just have one that operates on TextGrid and WAV then no skipping in extract
            if skip_begin:
                print("Skipping:", original_words)
                continue
            if each_file_name == skip_end_file:
                print("Skipping:", original_words)
                continue
            # print("filename:", each_file_name)
            interval_num = 0

            file_path = self.rootpath + each_file_name
            try:
                file_textgrid_obj = textgrid.TextGrid.fromFile(file_path)
            except ValueError:
                print(each_file_name + ': value error has occurred')
                os.rename(self.rootpath + each_file_name, working_directory + 'valueError/' + each_file_name)
                continue
            tier_list = file_textgrid_obj.tiers

            for each_tier in tier_list:
                if each_tier.name == 'words':
                    tier_words = each_tier
                    intervals_words = tier_words.intervals
                elif each_tier.name == 'segments':
                    tier_segments = each_tier
                    intervals_segments = tier_segments.intervals

            count = 0
            current_minTime = 0
            seg_num = 1
            diphthong_num = 0
            diphthong_dict = {'a͜i': {'ua', 'ai', 'êi', 'ei', 'âi', 'aî', 'ãi'}, 'a͜u': {'au', 'âu'}, 'ɔ͜y': {'ôi', 'eu', 'äu', 'oi', 'êu', 'eü', 'oî'}}
            # print(each_file_name)
            try:
                for i, each_word in enumerate(intervals_words):
                    add_rel = False

                    word_start_time = each_word.minTime
                    word_end_time = each_word.maxTime
                    word_mark = each_word.mark
                    if word_mark not in original_words and "<" not in word_mark:
                        match = [ow.translate(table) for ow in original_words if word_mark == clean_word(ow)]
                        if not match:
                            words_h.append((word_mark, original_words, each_file_name))
                            continue  # some words just turned to h. for unknown reason
                            # investigate
                        else:
                            word_mark = match[0].replace("[ge]", "")
                    if rel:
                        if word_mark == context[0] or word_mark == clean_word(context[0]):
                            add_rel = True  # maybe not do it here is better
                            rel = False  # avoid
                            if "wo" in word_mark:
                                rel_var = " RELd"
                            elif "als" in word_mark or word_mark.startswith("d") or word_mark.startswith("wel") or word_mark.startswith("jed"):
                                rel_var = " RELs"
                            elif ("was" in word_mark) or ("wie" in word_mark) or ("wer" in word_mark):
                                rel_var = " RLOs"
                            else:
                                rel_var = " UNK"

                    std_list = set()
                    ddm_list = set()
                    pos_list = set()
                    lemma_list = set()
                    stem_list = set()
                    no_match = True

                    for p in variant_match.keys():
                        if p.search(word_mark) is not None:
                            if any("IRV" in d for d in ddm_list):
                                # print(" ".join(ddm_list))
                                break
                            no_match = False
                            replace = True
                            for values in variant_match[p]:
                                if "*" in values[0] and "*" not in values[1]:
                                    replace = False
                                w_var = values[0].replace("*", "")  # word variant
                                w_std = values[1].replace("*", "")  # word standard
                                if std_list:
                                    tmp_std = set()
                                    while std_list:
                                        s = std_list.pop()
                                        if p.search(s) is not None:
                                            if replace:
                                                std = s.replace(w_var, w_std)
                                            else:
                                                std = values[1]
                                            tmp_std.add(std)
                                        else:
                                            tmp_std.add(s)
                                    std_list.update(tmp_std)
                                else:
                                    if replace:
                                        std = word_mark.replace(w_var, w_std)
                                    else:
                                        std = values[1]
                                    std_list.add(std)
                                lemma = values[4]
                                stem = values[5]
                                lemma_list.add(lemma)
                                stem_list.add(stem)
                                # if "SAF5" in values[2]:
                                #     print(word_mark)
                                #     if "ge"
                                if isinstance(values[2], float) and math.isnan(values[2]):  # check for empty var_code
                                    ddm_list.add(' ')  # do nothing
                                else:
                                    ddm_list.add(values[2])  # should be set
                                pos_list.add(values[3])
                    if no_match:
                        word_german = word_mark
                        var_code = " "
                        pos_tag = pos_tagger.tag([word_german])[0][1]
                        word_lemma = word_german
                        word_stem = " "
                    else:
                        var_code = " ".join(str(d) for d in ddm_list)
                        if any("SAF5" in d for d in ddm_list):
                            for g_pattern in gehen_variants:
                                if g_pattern.search(word_mark) is not None:
                                    var_code = var_code.replace("SAF5d", "")
                                    var_code = var_code.replace("SAF5s", "")
                        word_german = " ".join(std_list)
                        if len(std_list) > 1:
                            print(word_mark, "std: ", word_german)
                        word_lemma = " ".join(lemma_list)
                        word_stem = " ".join(stem_list)
                        pos_tag = " ".join(str(p) for p in pos_list)
                    if add_rel:
                        var_code = var_code + rel_var
                        var_code = var_code.strip()
                    try:
                        vowel_orthography = find_two_vowel(word_mark)
                        while (intervals_segments[interval_num].minTime >= word_start_time) & \
                                (intervals_segments[interval_num].maxTime <= word_end_time):
                            segment_start_time = intervals_segments[interval_num].minTime
                            segment_end_time = intervals_segments[interval_num].maxTime
                            segment_mark = intervals_segments[interval_num].mark

                            diphthong_orthography = " "
                            if len(segment_mark) == 3 and "_" not in segment_mark and "ː" not in segment_mark:
                                # print(segment_mark)
                                # print(word_mark)
                                if vowel_orthography[diphthong_num].lower() in diphthong_dict[segment_mark]:
                                    diphthong_orthography = vowel_orthography[diphthong_num]
                                elif any(vow_bigram.lower() in diphthong_dict[segment_mark] for vow_bigram in vowel_orthography):
                                    for vow_bigram in vowel_orthography:
                                        if vow_bigram.lower() in diphthong_dict[segment_mark]:
                                            diphthong_orthography = vow_bigram
                                else:
                                    print(vowel_orthography)
                                    print(vowel_orthography[diphthong_num])
                                    print(word_mark)
                                    print(segment_mark)
                                diphthong_num += 1
                            if word_start_time > current_minTime:
                                seg_num = 1
                                diphthong_num = 0
                                current_minTime = word_start_time

                            output_flag = False
                            if word_mark not in original_words:
                                match_ow = [ow for ow in original_words if word_mark == clean_word(ow)]
                                if match_ow:
                                    word_original = match_ow[0]
                                    if word_filter(word_original)[0]:
                                        output_flag = True
                            else:
                                output_flag = True
                            if var_code.strip():  # quick fix
                                output_flag = True
                            if output_flag:
                                self.output_as_csv(each_file_name[:-9], word_start_time, word_end_time, word_mark,
                                                   seg_num, segment_start_time, segment_end_time, segment_mark,
                                                   diphthong_orthography, var_code, word_german, word_lemma, word_stem,
                                                   pos_tag)
                            else:
                                if "<" not in word_mark:
                                    print("not a word: ", each_file_name[:-9], word_start_time, word_end_time, word_mark, var_code, word_german)
                            seg_num += 1
                            interval_num += 1
                    except IndexError:
                        interval_num = 0
                    if word_mark != '<P>':
                        count += 1

            except AttributeError as e:
                print(each_file_name+': tier words is empty or does not exist ')
                traceback.print_tb(e.__traceback__)
        with open('words_tran_error.txt', mode='w', newline="\n") as f:
            for item in words_h:
                f.write(str(item) + "\n")

def __init__(self, tokenizer=CoreNLPParser()):
    super(ByteDataPipe, self).__init__(tokenizer)
    self.tokenizer = tokenizer
    self.partten_1 = re.compile(r'[a-z]\.[A-Z]')
    self.partten_2 = re.compile('[a-z][A-Z]')