def make_files():
    """Create every derived resource file from the combined spreadsheet.

    Ensures "SG. Combined Data.xlsx" exists (building it if necessary),
    then runs each file-creation step in dependency order, printing the
    status message each step returns.

    Returns an empty string.
    """
    # Build the combined spreadsheet first if it is not already on disk.
    # NOTE(review): modules imported at file level may also create or delete
    # this file as a side effect — confirmed only for Reassign_POS per the
    # original comment.
    if not path.exists("SG. Combined Data.xlsx"):
        print(create_data_combo())
    analyses = list_xlsx("SG. Combined Data", "Sheet 1")
    # Each step creates one or more .pkl/.conllu files and returns a status
    # string. Later steps read files written by earlier ones (e.g. the
    # compile_SGG calls open the pickles saved by save_poslist), so the
    # order below must not change. Lambdas keep the calls lazy so each one
    # runs — and prints — before the next begins.
    steps = (
        # 'Clean_GlossDict.pkl' and 'Clean_WordDict.pkl'
        create_clean_glossdict,
        create_clean_worddict,
        # 'A1 List.pkl', 'A2 List.pkl', 'A3 List.pkl',
        # 'Active_Passive List.pkl', 'Relative Options List.pkl',
        # and 'Translations List.pkl'
        lambda: save_sorted_tags(sort_tag_levels(list_tag_levels(analyses))),
        # 'All POS Combos Used.pkl' and 'POS_taglist.pkl'
        lambda: save_all_pos_combos_list(analyses),
        create_pos_taglist,
        # 'Gloss_List.pkl' and 'Words_List.pkl'
        lambda: create_glosslist(analyses),
        lambda: create_wordlist(analyses),
        # 'SG POS-tagged combined.pkl' and 'SG POS-tagged separated.pkl'
        lambda: save_poslist(True),
        lambda: save_poslist(False),
        # 'sga_dipsgg-ud-test1.conllu' and 'sga_dipsgg-ud-test2.conllu'
        lambda: compile_SGG(open_obj("SG POS-tagged combined.pkl"), True),
        lambda: compile_SGG(open_obj("SG POS-tagged separated.pkl"), False),
    )
    for step in steps:
        print(step())
    return ""
def create_clean_worddict():
    """Create 'Clean_WordDict.pkl', mapping each distinct word to its cleaned form.

    Reads the combined spreadsheet, collects every unique, non-empty word
    (row index 2), cleans each one exactly once with clean_word(), and
    pickles the resulting mapping.

    Returns a status message naming the created file.
    """
    sgData = list_xlsx("SG. Combined Data", "Sheet 1")
    # Deduplicate with dict membership (O(1) per lookup) instead of the
    # original `word not in wordlist` list scan (O(n) per word, O(n^2)
    # overall). Dict insertion order preserves first-seen order, so the
    # resulting mapping is identical.
    worddict = {}
    for row in sgData:
        word = row[2]
        # Skip empty/falsy cells, as the original truthiness check did.
        if word and word not in worddict:
            worddict[word] = clean_word(word)
    save_obj("Clean_WordDict", worddict)
    return "Created file: 'Clean_WordDict.pkl'"
def create_clean_glossdict():
    """Create 'Clean_GlossDict.pkl', mapping each gloss to its cleaned form.

    Reads the combined spreadsheet, collects glosses (row index 10), cleans
    each distinct gloss once with clean_gloss(), and pickles the mapping.

    Returns a status message naming the created file.
    """
    sgData = list_xlsx("SG. Combined Data", "Sheet 1")
    glossdict = {}
    # The original compared each gloss only against its immediate
    # predecessor, so a gloss recurring later in the sheet was appended —
    # and cleaned — again (the duplicate dict assignment produced the same
    # value). The lastgloss comparison is kept (it also skips a gloss equal
    # to the initial "" sentinel, preserving that edge case); the added
    # dict-membership test ensures clean_gloss() runs once per gloss.
    lastgloss = ""
    for row in sgData:
        thisgloss = row[10]
        if thisgloss != lastgloss and thisgloss not in glossdict:
            glossdict[thisgloss] = clean_gloss(thisgloss)
        lastgloss = thisgloss
    save_obj("Clean_GlossDict", glossdict)
    return "Created file: 'Clean_GlossDict.pkl'"
Match tokenisation of OI material in gloss-lists to words
Implement appropriate spacing in gloss-lists based on tokenisation
Sequence gloss-lists
"""

from Clean_ExcelLists import create_data_combo
from Pickle import open_obj
from OpenXlsx import list_xlsx
from Clean_Glosses import clean_gloss, clean_word, clean_lemma
from Reassign_POS import clean_analysis, clean_onetag, create_glosslist, create_wordlist

# Load the combined spreadsheet; if it is not on disk yet, build it first
# with create_data_combo() and print that function's status message.
try:
    analyses = list_xlsx("SG. Combined Data", "Sheet 1")
except FileNotFoundError:
    print(create_data_combo())
    analyses = list_xlsx("SG. Combined Data", "Sheet 1")


# # Run the functions below to create the following .pkl files from spreadsheet, "SG. Combined Data"

# Load the pickled gloss and word lists, creating both .pkl files from the
# spreadsheet data first if either is missing (both are regenerated together).
try:
    glosslist = open_obj("Gloss_List.pkl")
    wordslist = open_obj("Words_List.pkl")
except FileNotFoundError:
    print(create_glosslist(analyses))
    print(create_wordlist(analyses))
    glosslist = open_obj("Gloss_List.pkl")
    wordslist = open_obj("Words_List.pkl")


# Map a word-separated gloss from the Hofman corpus to a list of POS-tagged words from the Bauer corpus
    # NOTE(review): this is the tail of a NaN-cleaning helper (presumably the
    # clean_nan() called below) whose def line lies above this chunk — the
    # visible loop copies each row, substituting False for NaN cells.
    for i in somelist:
        tempi = []
        for j in i:
            # math.isnan() raises TypeError for non-numeric cells (e.g.
            # strings); those are kept unchanged.
            try:
                if math.isnan(j):
                    # Replace a NaN placeholder cell with False
                    tempi.append(False)
                else:
                    tempi.append(j)
            except TypeError:
                tempi.append(j)
        templist.append(tempi)
    return templist


# Gets only required fields from gloss spreadsheet, puts them in preferable
# order, and replaces NaN instances
glosslist = [gloss_keeplist] + list_xlsx("glosses_full", "glosses", gloss_droptup)
glosslist = [[g[0], g[1], g[3], g[2], g[5], g[4], g[6]] for g in glosslist]
glosslist = clean_nan(glosslist)
# Substitute a placeholder string wherever a gloss row has no translation
# (reordered column 4); rows with a translation pass through unchanged.
fix_trans_list = list()
for i in glosslist:
    if i[4]:
        fix_trans_list.append(i)
    else:
        i[4] = "* no translation available *"
        fix_trans_list.append(i)
glosslist = fix_trans_list

# Gets only required fields from analysis spreadsheet, puts them in preferable
# order, and replaces NaN instances
wordlist = [word_keeplist] + list_xlsx("glosses_words", "words", word_droptup)
wordlist = [[w[0], w[1], w[8], w[4], w[2], w[3], w[5], w[6], w[7]] for w in wordlist]
wordlist = clean_nan(wordlist)