import collections
import csv
import math
import os
import re
import string
import traceback

import textgrid
from nltk.parse.corenlp import CoreNLPParser

# Helpers such as compile_pattern, get_count, count_update, append_to_lex,
# read_lex_table, read_txt, clean_word, word_filter, find_two_vowel,
# timestamp_convert, skip_by_tags, skip_word_list and append_to_word_extract,
# as well as globals such as date, lex_table_path and working_directory,
# are assumed to be defined elsewhere in the project.


def lex_table_fix(lex_table, counter, lex_output_path):
    # TODO: find a better name for this function
    """Take the lex table and check for duplicated rows with different word_vars,
    POS-tag the word in word_standard, and generate search patterns for DDM tagging."""
    v_dict = collections.OrderedDict()  # ordered output
    stem_c = dict()  # maybe there is a better data structure
    lemma_c = dict()
    standard_c = dict()
    variant_c = dict()
    dict_count_list = [stem_c, lemma_c, standard_c, variant_c]

    # write the header row of the output lex table
    with open(lex_output_path, 'w', newline="") as lex_csv:
        csv_writer = csv.writer(lex_csv, delimiter=',', quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
        csv_writer.writerow([
            'word_stem', 'word_lemma', 'word_standard', 'word_variant',
            'word_vars', 'word_english', 'POS_tag', 'word_MHG',
            'word_stem_freq', 'word_lemma_freq', 'word_standard_freq',
            'word_variant_freq'
        ])

    for r in zip(lex_table['word_stem'], lex_table['word_lemma'],
                 lex_table['word_standard'], lex_table['word_variant'],
                 lex_table['word_vars'], lex_table['word_english'],
                 lex_table['POS_tag'], lex_table['word_MHG']):
        # later there will not be a POS corr column!!! remember to change the code
        key = tuple(" " if isinstance(i, float) and math.isnan(i) else i
                    for i in r[:4])   # key: word_stem, word_lemma, word_standard, word_variant
        value = tuple(" " if isinstance(i, float) and math.isnan(i) else i
                      for i in r[4:])  # value: word_vars, word_english, POS_tag, word_MHG
        # is there a way to avoid repeated tagging of the same standard words?

        # pre-process word_vars, checking for word_vars that do not end with 's' or 'd'
        skip = False
        word_vars = value[0].split()  # drop dangling whitespace and split multiple word_vars
        word_std = key[2]
        word_variant = key[3]
        if "*" in word_variant and "*" not in word_std:
            with open(date + "_*_check.txt", "a+") as file:
                print(key[3], key[2], value, file=file)
        elif "*" in word_std and "*" not in word_variant:
            with open(date + "_*_check.txt", "a+") as file:
                print(key[3], key[2], value, file=file)

        AIS_check_list = ["ei", "êi", "ôi"]
        if any(ais in key[3] for ais in AIS_check_list):
            if not any("AIS" in wv for wv in word_vars):
                with open(date + "_check_ais.txt", "a+") as file:
                    print(key[3], value, file=file)

        for wv in word_vars:
            if not (wv.endswith("s") or wv.endswith("d")):
                skip = True
                with open(date + "_wrong_word_vars_ds.txt", "a+") as file:
                    print(key[3], value, file=file)
        if skip:
            continue

        # update the word_vars dictionary
        if key not in v_dict:
            v_dict[key] = [value]
        else:  # the key already exists in the dictionary
            append = True
            for v in v_dict[key]:  # check whether this value is already present
                if v[0] == value[0]:  # only compare the word_vars field
                    append = False
                    print(key, value)
                    break
            if append:
                v_dict[key].append(value)

    # TODO: factor the block below into a function that takes the four key fields,
    # then call it with *key to make things easier
    for key in v_dict.keys():
        variant = key[3]
        variant_pattern = compile_pattern(variant, v_dict[key][0])
        variant_count = get_count(variant_pattern, counter)
        new_count = variant_count[0]  # can go once get_count no longer needs to return the word
        for key_word, d in zip(list(key), dict_count_list):  # stem, lemma, standard, variant
            count_update(key_word, d, new_count)  # don't know how well this will work

    for k, v in v_dict.items():
        stem_freq = stem_c[k[0]]
        lemma_freq = lemma_c[k[1]]
        standard_freq = standard_c[k[2]]
        variant_freq = variant_c[k[3]]
        tags = [t[0] for t in v]
        DDM_tag = " ".join(set(" ".join(tags).split()))
        if not DDM_tag:
            DDM_tag = " "
        line = (*k, DDM_tag, *v[0][1:], stem_freq, lemma_freq,
                standard_freq, variant_freq)
        append_to_lex(*line)
def read_from_textgrid(self, file_list):
    pos_tagger = CoreNLPParser('http://localhost:9002', tagtype='pos')
    lex_table = read_lex_table(lex_table_path)

    # dict with the variant pattern as key; unmatched words are tagged later
    variant_match = dict()
    for r in zip(lex_table['word_variant'], lex_table['word_standard'],
                 lex_table['word_vars'], lex_table['POS_tag']):
        v_pattern = compile_pattern(r[0])
        if v_pattern not in variant_match:
            variant_match[v_pattern] = []
        else:
            print(v_pattern)  # add it? no
        variant_match[v_pattern].append(r)

    gehen_variants = set()
    locations = lex_table.loc[lex_table['word_lemma'] == 'gehen']
    for gehen_var in zip(locations['word_variant'], locations['word_vars']):
        if "SAF5" not in gehen_var[1]:
            g_pattern = compile_pattern(gehen_var[0])
            gehen_variants.add(g_pattern)
    # for gehen_row in lex_table.loc[lex_table['word_lemma'] == 'gehen']['word_variant']:
    #     # check the word_vars
    #     if not any("SAF5" in wv for wv in
    #                lex_table.loc[lex_table['word_variant'] == gehen_row]['word_vars']):
    #         g_pattern = compile_pattern(gehen_row)
    #         gehen_variants.add(g_pattern)

    for each_file_name in file_list:
        # now combine the files of the same speakers
        print(each_file_name)
        interval_num = 0
        file_path = self.tg_path + each_file_name
        try:
            file_textgrid_obj = textgrid.TextGrid.fromFile(file_path)
        except UnicodeDecodeError:
            print(each_file_name + ': unexpected encoding, not utf-8 or ansi')
            continue  # skip files that cannot be decoded
        tier_list = file_textgrid_obj.tiers
        for each_tier in tier_list:
            if each_tier.name == 'SWG':  # read from the SWG tier
                tier_swg = each_tier
                intervals_swg = tier_swg.intervals
        try:
            clauses = []
            clause_annotation = []
            time_segment = dict()
            skip = False
            begin_tag = ''
            for each_annotation in intervals_swg:
                annotation_mark = each_annotation.mark
                beg_hms = timestamp_convert(each_annotation.minTime)
                if not annotation_mark.strip():
                    continue
                punct = [',', '.', '!', '?']  # maybe just . ! ?
                tokens = annotation_mark.split()
                time_segment[beg_hms] = tokens
                for token in tokens:
                    if any(p in token for p in punct):
                        # turn segments into clauses
                        if all(c in string.punctuation for c in token):
                            # tokens like ... --- and ???
                            if not clause_annotation:
                                time_stamp = beg_hms
                            clause_annotation.append(token)
                            if len(token) > 3 or token in punct:
                                # why do I do this again, still don't know
                                clause_annotation.append(time_stamp)
                                clauses.append(clause_annotation)
                                clause_annotation = []
                            continue
                        # separate words from punctuation
                        word_punct_split = re.findall(
                            r"[^\w\d\s,.!?]*\w+[^\w\d\s,.!?]*\w*[^\w\d\s,.!?]*\w*[^\w\d\s,.!?]*|[^\w\s]",
                            token, re.UNICODE)
                        for wp in word_punct_split:
                            # split annotations into clauses
                            if not clause_annotation:
                                time_stamp = beg_hms
                            clause_annotation.append(wp)
                            if all(c in punct for c in wp):
                                clause_annotation.append(time_stamp)
                                clauses.append(clause_annotation)
                                clause_annotation = []
                    else:
                        if not clause_annotation:
                            time_stamp = beg_hms
                        clause_annotation.append(token)

            for cl in clauses:
                if '[ANT]' in cl or '[REL]' in cl:
                    # print("clause", cl)
                    beg_hms = cl[-1]
                    # print("time", beg_hms)
                    cl = cl[:-1]
                    # print("cl", cl)
                    if cl[0] not in time_segment[beg_hms]:
                        # what remains to fix is the punctuation problem
                        segment_annotation = []
                        for token in time_segment[beg_hms]:
                            segment_annotation += re.findall(
                                r"[^\w\d\s,.!?]*\w+[^\w\d\s,.!?]*\w*[^\w\d\s,.!?]*\w*[^\w\d\s,.!?]*|[^\w\s]",
                                token, re.UNICODE)
                        if cl[0] not in segment_annotation:
                            print(segment_annotation)
                            print(cl[0])
                    else:
                        segment_annotation = time_segment[beg_hms]
                    sym_seq = segment_annotation.index(cl[0]) + 1

                    words_std = []
                    ddm_tags = []
                    pos_sent = []
                    # get DDM tags
                    for i, word in enumerate(cl):
                        if word:  # skip empty words
                            # match the word against the word_variant patterns
                            std_list = set()
                            ddm_list = set()
                            pos_list = set()
                            no_match = True
                            rel = False
                            # check for var: REL
                            if i + 1 < len(cl):  # make sure the next word exists
                                w_next = cl[i + 1]
                                if "[REL]" in w_next:
                                    rel = True
                                    if "wo" in word:
                                        rel_var = " RELd"
                                    elif "als" in word or word.startswith("d") or \
                                            word.startswith("wel") or word.startswith("jed"):
                                        rel_var = " RELs"
                                    elif ("was" in word) or ("wie" in word) or ("wer" in word):
                                        rel_var = " RLOs"
                                    else:
                                        rel_var = " UNK"
                            for p in variant_match.keys():
                                if p.search(word) is not None:  # .lower()
                                    no_match = False
                                    for values in variant_match[p]:
                                        swg = values[0].replace("*", "")  # rum[ge]draat
                                        if "ge" in swg and "ge" not in word:
                                            swg = swg.replace("ge", "g")  # gespielt -> gspielt
                                        std = values[1].replace("*", "")
                                        std_list.add(std)
                                        if isinstance(values[2], float) and math.isnan(values[2]):
                                            pass  # empty var_code, do nothing
                                        else:
                                            ddm_list.add(values[2])  # should be a set
                                        if isinstance(values[3], float) and math.isnan(values[3]):
                                            pos_list.add('*')
                                        else:
                                            pos_list.add(values[3])
                            if no_match:
                                standard = word
                                ddm = "*"
                                pos = pos_tagger.tag([word])[0][1]
                                if "$" in pos:
                                    pos = "*"
                            else:
                                standard = " ".join(std_list)
                                ddm = " ".join(str(d) for d in ddm_list)
                                if any("SAF5" in d for d in ddm_list):
                                    for g_pattern in gehen_variants:
                                        if g_pattern.search(word) is not None:
                                            print(ddm)
                                            print(word)
                                            print("!")
                                            # gegang* / [ge]gang* would otherwise be tagged
                                            # as SAF5; k as prefix
                                            ddm = ddm.replace("SAF5d", "")
                                            ddm = ddm.replace("SAF5s", "")
                                            print(ddm)
                                pos = " ".join(str(p) for p in pos_list)
                            if rel:
                                if ddm != "*":
                                    ddm = ddm + rel_var
                                else:
                                    ddm = rel_var
                            ddm = ddm.strip()
                            words_std.append(standard)
                            ddm_tags.append(ddm)
                            pos_sent.append(pos)
                    # columns
                    self.output_as_csv(
                        each_file_name[each_file_name.rfind("_") + 1:-9],
                        beg_hms, sym_seq, " ".join(cl), " ".join(ddm_tags),
                        " ".join(pos_sent))
        except AttributeError as e:
            print(each_file_name + ': tier words is empty or does not exist')
            traceback.print_tb(e.__traceback__)
def create_word_csv(speaker_paths, word_extract_path, lex_table,
                    filename_annotation_map, file_timeseg_map):
    variant_match = dict()
    pos_tagger = CoreNLPParser('http://localhost:9002', tagtype='pos')

    # create the csv if it does not exist yet
    if not os.path.exists(word_extract_path):
        with open(word_extract_path, 'w', newline="") as word_extract_csv:
            csv_writer = csv.writer(word_extract_csv, delimiter=',', quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)
            csv_writer.writerow([
                'trans_id', 'beg_hms', 'sym_seq', 'Word_SWG', 'var_code',
                'Word_German', 'POS_tag'
            ])

    # dict with the variant pattern as key; unmatched words are tagged later
    for r in zip(lex_table['word_variant'], lex_table['word_standard'],
                 lex_table['word_vars'], lex_table['POS_tag']):
        v_pattern = compile_pattern(r[0], r[2])
        if v_pattern not in variant_match:
            variant_match[v_pattern] = []
        variant_match[v_pattern].append(r)
    for w_var, w_val in variant_match.items():
        if len(w_val) > 1:
            print(w_var, w_val)

    # if the word's lemma is gehen, do not tag the word as SAF5
    gehen_variants = set()
    locations = lex_table.loc[lex_table['word_lemma'] == 'gehen']
    for gehen_var in zip(locations['word_variant'], locations['word_vars']):
        if "SAF5" not in gehen_var[1]:
            g_pattern = compile_pattern(gehen_var[0], gehen_var[1])
            gehen_variants.add(g_pattern)

    # get the speaker file names
    for speaker in speaker_paths:
        file_list = [file for file in os.listdir(speaker)
                     if file.endswith('.TextGrid')]
        for file_name in file_list:
            outputs = []
            annotations = filename_annotation_map[file_name]
            time_seg_map = file_timeseg_map[file_name]
            # now the time stamps and the word count
            for word_annotation in annotations:
                beg_hms = word_annotation[-1]
                word_annotation = word_annotation[:-1]
                original_segment = time_seg_map[beg_hms]
                pointer_orgseg = 0
                for i, w in enumerate(word_annotation):
                    if w:  # skip empty words
                        # print(w)
                        sym_seq = None
                        for org_idx, t in enumerate(original_segment):  # for the word count
                            if sym_seq is not None:
                                break
                            words = word_filter(t)
                            if words:
                                for word in words:
                                    # word,word2 word word2 index?
                                    # if the words are the same, take the later one?
                                    # or there should be a check
                                    if (w == word) and (org_idx >= i) and \
                                            (org_idx >= pointer_orgseg) and (sym_seq is None):
                                        # not good: the original segment should be cleaned
                                        # again; make that into a helper method
                                        sym_seq = org_idx + 1
                                        # print(sym_seq)
                                        pointer_orgseg = org_idx

                        # check for var: REL
                        rel = False
                        if i + 1 < len(word_annotation):  # make sure the next word exists
                            w_next = word_annotation[i + 1]
                            if "[REL]" in w_next:
                                rel = True
                                if "wo" in w:
                                    rel_var = " RELd"
                                elif "als" in w or w.startswith("d") or \
                                        w.startswith("wel") or w.startswith("jed"):
                                    rel_var = " RELs"
                                elif ("was" in w) or ("wie" in w) or ("wer" in w):
                                    rel_var = " RLOs"
                                else:
                                    rel_var = " UNK"

                        # regular DDM tagging
                        std_list = set()
                        ddm_list = set()
                        pos_list = set()
                        no_match = True
                        for p in variant_match.keys():  # could become a separate method
                            if any("IRV" in d for d in ddm_list):
                                # print(" ".join(ddm_list))
                                break
                            if p.search(w) is not None:  # .lower()
                                no_match = False
                                replace = True
                                for values in variant_match[p]:
                                    w_var = values[0].replace("*", "")  # word variant
                                    w_std = values[1].replace("*", "")  # word standard
                                    if std_list:
                                        tmp_std = set()
                                        while std_list:
                                            s = std_list.pop()
                                            if p.search(s) is not None:
                                                if replace:
                                                    std = s.replace(w_var, w_std)
                                                else:
                                                    std = values[1]
                                                tmp_std.add(std)
                                            else:
                                                tmp_std.add(s)
                                        std_list.update(tmp_std)
                                    else:
                                        if replace:
                                            std = w.replace(w_var, w_std)
                                        else:
                                            std = values[1]
                                        std_list.add(std)
                                    if isinstance(values[2], float) and math.isnan(values[2]):
                                        # empty var_code
                                        ddm_list.add(' ')
                                    else:
                                        ddm_list.add(values[2])  # should be a set
                                    # another check for the lex table, or ignore the bad
                                    # word_vars already when reading the lex table
                                    pos_list.add(values[3])

                        if no_match:
                            standard = w
                            ddm = " "
                            pos = pos_tagger.tag([w])[0][1]
                        else:
                            standard = " ".join(std_list)
                            if len(std_list) > 1:
                                print(w, "std: ", standard)
                            ddm = " ".join(str(d) for d in ddm_list)  # maybe here is the problem
                            if any("SAF5" in d for d in ddm_list):
                                # print(ddm)
                                for g_pattern in gehen_variants:
                                    if g_pattern.search(w) is not None:
                                        ddm = ddm.replace("SAF5d", "")
                                        ddm = ddm.replace("SAF5s", "")
                            pos = " ".join(str(p) for p in pos_list)
                        if rel:
                            ddm = ddm + rel_var
                        ddm = ddm.strip()
                        output = [file_name[file_name.rfind("_") + 1:-9], w, ddm,
                                  standard, pos, beg_hms, sym_seq]
                        # print(output)
                        outputs.append(output)

            outputs = skip_by_tags(outputs, 'r')
            outputs = skip_by_tags(outputs, 'wl')
            outputs = skip_by_tags(outputs, 'wg')
            word_list1_start = ["Finger", "Flüge", "Biene", "Hunger", "immer",
                                "Äpfel", "Apfel", "Asche", "zum", "waschen"]
            word_list1_end = ["laufen", "Frage", "Linde", "meist", "Haar",
                              "Huhn", "Türe", "Kinder", "alle", "Gast"]
            word_list2_start = ["Flüge", "Fliege", "Söhne", "Sehne", "können",
                                "kennen", "Türe", "Tiere", "vermissen", "vermessen"]
            word_list2_end = ["heiter", "heute", "Feuer", "feiern", "Ofen",
                              "oben", "Kreide", "Kreuze", "Magen", "sagen"]
            ft_1_start = ["Vor", "Zeiten", "war", "ein", "König", "und", "eine",
                          "Königin", "die", "sprachen"]
            ft_1_end = ["alte", "Frau", "mit", "einer", "Spindel", "und",
                        "spann", "emsig", "ihren", "Flachs"]
            ft_2_start = ["Es", "war", "einmal", "eine", "alte", "Geiß", "die",
                          "hatte", "sieben", "junge"]
            ft_2_end = ["er", "in", "seinen", "Rachen", "nur", "das", "jüngste",
                        "fand", "er", "nicht"]
            ft_3_start = ["In", "den", "alten", "Zeiten", "wo", "das",
                          "Wünschen", "noch", "geholfen", "hat"]
            ft_3_end = ["bei", "seinesgleichen", "und", "quakt", "und", "kann",
                        "keines", "Menschen", "Geselle", "sein"]
            outputs = skip_word_list(outputs, word_list1_start, word_list1_end, 'wl')
            outputs = skip_word_list(outputs, word_list2_start, word_list2_end, 'wl')
            outputs = skip_word_list(outputs, ft_1_start, ft_1_end, 'ft')
            outputs = skip_word_list(outputs, ft_2_start, ft_2_end, 'ft')
            outputs = skip_word_list(outputs, ft_3_start, ft_3_end, 'ft')
            for output in outputs:
                append_to_word_extract(*output)
def read_from_textgrid(self, file_list):
    pos_tagger = CoreNLPParser('http://localhost:9002', tagtype='pos')
    # translation table that strips punctuation but keeps the bracket characters used by tags
    table = str.maketrans(dict.fromkeys(string.punctuation.replace("[\\]", "")))

    # dict with the variant pattern as key; unmatched words are tagged later
    variant_match = dict()
    for r in zip(self.lex_table['word_variant'], self.lex_table['word_standard'],
                 self.lex_table['word_vars'], self.lex_table['POS_tag'],
                 self.lex_table['word_lemma'], self.lex_table['word_stem']):
        v_pattern = compile_pattern(r[0], r[2])
        if v_pattern not in variant_match:
            variant_match[v_pattern] = []
        # else:
        #     print(v_pattern)
        variant_match[v_pattern].append(r)

    gehen_variants = set()
    locations = self.lex_table.loc[self.lex_table['word_lemma'] == 'gehen']
    for gehen_var in zip(locations['word_variant'], locations['word_vars']):
        if "SAF5" not in gehen_var[1]:
            g_pattern = compile_pattern(gehen_var[0], gehen_var[1])
            gehen_variants.add(g_pattern)

    words_h = []  # words that could not be matched back to the transcript
    skip_begin = False
    skip_begin_tag = ""
    skip_end_file = ""
    for each_file_name in file_list:
        original_words = read_txt(self.rootpath, each_file_name)
        context = []
        rel = False
        tag_pattern = re.compile(r"\[[^\[\]]*\]")
        # collect all the tags
        tags = []
        for i, ow in enumerate(original_words):
            find_tag = re.search(tag_pattern, ow)
            # there can be more than one [REL] tag, e.g. S016-17-I-1-Manni_692:
            # ['der', '[REL]', 'halt', 'e', 'groß---', '-eh-', 'dessen', '[REL', 'Garten', 'groß']
            if find_tag:
                tag = find_tag.group(0)
                tags.append(tag)
                # print(tag)
            elif "[" in ow or "]" in ow:
                print("incorrect tag in:", each_file_name)
                print(ow)
                print(original_words)
            if tags:
                for tag in tags:
                    if tag == '[REL]':
                        rel = True
                        context.append(original_words[i - 1].translate(table))
                    elif tag in self.tags.keys():
                        print(each_file_name)
                        print("Skipping:", tag)
                        skip_begin = True
                        skip_begin_tag = tag
                    elif tag in self.tags.values():
                        if tag == self.tags[skip_begin_tag]:
                            print(each_file_name)
                            print("Skipping:", tag)
                            # this alone does not skip the file which contains the end tag,
                            # so remember that file and skip it below
                            skip_begin = False
                            skip_end_file = each_file_name
                        else:
                            print("Wrong end tag:", tag)
        # maybe the skipping should happen before the Aligner: have one step that operates
        # on TextGrid and WAV, then no skipping is needed in extract
        if skip_begin:
            print("Skipping:", original_words)
            continue
        if each_file_name == skip_end_file:
            # print("Skipping:", original_words)
            continue

        # print("filename:", each_file_name)
        interval_num = 0
        file_path = self.rootpath + each_file_name
        try:
            file_textgrid_obj = textgrid.TextGrid.fromFile(file_path)
        except ValueError:
            print(each_file_name + ': a value error has occurred')
            os.rename(self.rootpath + each_file_name,
                      working_directory + 'valueError/' + each_file_name)
            continue
        tier_list = file_textgrid_obj.tiers
        for each_tier in tier_list:
            if each_tier.name == 'words':
                tier_words = each_tier
                intervals_words = tier_words.intervals
            elif each_tier.name == 'segments':
                tier_segments = each_tier
                intervals_segments = tier_segments.intervals
        count = 0
        current_minTime = 0
        seg_num = 1
        diphthong_num = 0
        diphthong_dict = {'a͜i': {'ua', 'ai', 'êi', 'ei', 'âi', 'aî', 'ãi'},
                          'a͜u': {'au', 'âu'},
                          'ɔ͜y': {'ôi', 'eu', 'äu', 'oi', 'êu', 'eü', 'oî'}}
        # print(each_file_name)
        try:
            for i, each_word in enumerate(intervals_words):
                add_rel = False
                word_start_time = each_word.minTime
                word_end_time = each_word.maxTime
                word_mark = each_word.mark
                if word_mark not in original_words and "<" not in word_mark:
                    match = [ow.translate(table) for ow in original_words
                             if word_mark == clean_word(ow)]
                    if not match:
                        # some words just turned to h for an unknown reason; investigate
                        words_h.append((word_mark, original_words, each_file_name))
                        continue
                    else:
                        word_mark = match[0].replace("[ge]", "")
                if rel:
                    if word_mark == context[0] or word_mark == clean_word(context[0]):
                        add_rel = True  # maybe it is better not to do it here
                        rel = False  # avoid tagging later matches again
                        if "wo" in word_mark:
                            rel_var = " RELd"
                        elif "als" in word_mark or word_mark.startswith("d") or \
                                word_mark.startswith("wel") or word_mark.startswith("jed"):
                            rel_var = " RELs"
                        elif ("was" in word_mark) or ("wie" in word_mark) or ("wer" in word_mark):
                            rel_var = " RLOs"
                        else:
                            rel_var = " UNK"

                std_list = set()
                ddm_list = set()
                pos_list = set()
                lemma_list = set()
                stem_list = set()
                no_match = True
                for p in variant_match.keys():
                    if p.search(word_mark) is not None:
                        if any("IRV" in d for d in ddm_list):
                            # print(" ".join(ddm_list))
                            break
                        no_match = False
                        replace = True
                        for values in variant_match[p]:
                            if "*" in values[0] and "*" not in values[1]:
                                replace = False
                            w_var = values[0].replace("*", "")  # word variant
                            w_std = values[1].replace("*", "")  # word standard
                            if std_list:
                                tmp_std = set()
                                while std_list:
                                    s = std_list.pop()
                                    if p.search(s) is not None:
                                        if replace:
                                            std = s.replace(w_var, w_std)
                                        else:
                                            std = values[1]
                                        tmp_std.add(std)
                                    else:
                                        tmp_std.add(s)
                                std_list.update(tmp_std)
                            else:
                                if replace:
                                    std = word_mark.replace(w_var, w_std)
                                else:
                                    std = values[1]
                                std_list.add(std)
                            lemma = values[4]
                            stem = values[5]
                            lemma_list.add(lemma)
                            stem_list.add(stem)
                            # if "SAF5" in values[2]:
                            #     print(word_mark)
                            # if "ge"
                            if isinstance(values[2], float) and math.isnan(values[2]):
                                # empty var_code
                                ddm_list.add(' ')
                            else:
                                ddm_list.add(values[2])  # should be a set
                            pos_list.add(values[3])

                if no_match:
                    word_german = word_mark
                    var_code = " "
                    pos_tag = pos_tagger.tag([word_german])[0][1]
                    word_lemma = word_german
                    word_stem = " "
                else:
                    var_code = " ".join(str(d) for d in ddm_list)
                    if any("SAF5" in d for d in ddm_list):
                        for g_pattern in gehen_variants:
                            if g_pattern.search(word_mark) is not None:
                                var_code = var_code.replace("SAF5d", "")
                                var_code = var_code.replace("SAF5s", "")
                    word_german = " ".join(std_list)
                    if len(std_list) > 1:
                        print(word_mark, "std: ", word_german)
                    word_lemma = " ".join(lemma_list)
                    word_stem = " ".join(stem_list)
                    pos_tag = " ".join(str(p) for p in pos_list)
                if add_rel:
                    var_code = var_code + rel_var
                var_code = var_code.strip()

                try:
                    vowel_orthography = find_two_vowel(word_mark)
                    while (intervals_segments[interval_num].minTime >= word_start_time) and \
                            (intervals_segments[interval_num].maxTime <= word_end_time):
                        segment_start_time = intervals_segments[interval_num].minTime
                        segment_end_time = intervals_segments[interval_num].maxTime
                        segment_mark = intervals_segments[interval_num].mark
                        diphthong_orthography = " "
                        if len(segment_mark) == 3 and "_" not in segment_mark and "ː" not in segment_mark:
                            # print(segment_mark)
                            # print(word_mark)
                            if vowel_orthography[diphthong_num].lower() in diphthong_dict[segment_mark]:
                                diphthong_orthography = vowel_orthography[diphthong_num]
                            elif any(vow_bigram.lower() in diphthong_dict[segment_mark]
                                     for vow_bigram in vowel_orthography):
                                for vow_bigram in vowel_orthography:
                                    if vow_bigram.lower() in diphthong_dict[segment_mark]:
                                        diphthong_orthography = vow_bigram
                            else:
                                print(vowel_orthography)
                                print(vowel_orthography[diphthong_num])
                                print(word_mark)
                                print(segment_mark)
                            diphthong_num += 1
                        if word_start_time > current_minTime:
                            seg_num = 1
                            diphthong_num = 0
                            current_minTime = word_start_time
                        output_flag = False
                        if word_mark not in original_words:
                            match_ow = [ow for ow in original_words
                                        if word_mark == clean_word(ow)]
                            if match_ow:
                                word_original = match_ow[0]
                                if word_filter(word_original)[0]:
                                    output_flag = True
                        else:
                            output_flag = True
                        if var_code.strip():  # quick fix
                            output_flag = True
                        if output_flag:
                            self.output_as_csv(each_file_name[:-9], word_start_time,
                                               word_end_time, word_mark, seg_num,
                                               segment_start_time, segment_end_time,
                                               segment_mark, diphthong_orthography,
                                               var_code, word_german, word_lemma,
                                               word_stem, pos_tag)
                        else:
                            if "<" not in word_mark:
                                print("not a word: ", each_file_name[:-9],
                                      word_start_time, word_end_time, word_mark,
                                      var_code, word_german)
                        seg_num += 1
                        interval_num += 1
                except IndexError:
                    interval_num = 0
                if word_mark != '<P>':
                    count += 1
        except AttributeError as e:
            print(each_file_name + ': tier words is empty or does not exist')
            traceback.print_tb(e.__traceback__)

    with open('words_tran_error.txt', mode='w', newline="\n") as f:
        for item in words_h:
            f.write(str(item) + "\n")
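

# The functions above all rely on compile_pattern(word_variant, word_vars=None) returning a
# compiled, hashable regex whose .search() matches a token against a lex-table variant.
# Its real implementation is not shown in this excerpt; the sketch below is only an
# assumption based on how it is called (variants may carry "*" wildcards and bracket
# markers such as "[ge]"), not the project's actual code.
def compile_pattern_sketch(word_variant, word_vars=None):
    """Hypothetical stand-in for compile_pattern, for illustration only."""
    cleaned = re.sub(r"\[[^\[\]]*\]", "", word_variant)   # drop bracket markers like [ge]
    escaped = re.escape(cleaned).replace(r"\*", r"\w*")   # treat * as a word-character wildcard
    # word_vars is accepted only to mirror the assumed call signature; unused here
    return re.compile(escaped, re.UNICODE)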