def step1():
    import csv, re
    import twol.cfg as cfg
    version = cfg.timestamp(__file__)

    import argparse
    argparser = argparse.ArgumentParser(
        "python3 paratab2segcsv.py",
        description="Converts a tabular CSV paradigm into a"
        " one-example-per-row CSV file. Version {}".format(version))
    argparser.add_argument(
        "input", default="ksk-paradigms.csv",
        help="Paradigm table as a CSV file")
    argparser.add_argument(
        "output", default="ksk-seg-examp.csv",
        help="One-example-per-row paradigm as a CSV file")
    argparser.add_argument(
        "-s", "--morph-separator", default=".",
        help="Boundary between the morphs in a table cell")
    argparser.add_argument(
        "-d", "--csv-delimiter", default=",",
        help="CSV delimiter between the two fields, default is ','")
    argparser.add_argument(
        "-n", "--name-separator", default=".",
        help="Separator between morpheme names"
        " in the morpheme list, default is '.'")
    argparser.add_argument(
        "-z", "--zero-symbol", default="Ø",
        help="Symbol inserted into word forms in order to"
        " align them, default is Ø. Changing it is discouraged.")
    args = argparser.parse_args()

    out_file = open(args.output, "w")
    writer = csv.DictWriter(out_file,
                            ["MORPHEMES", "MORPHS"],
                            delimiter=args.csv_delimiter)
    writer.writeheader()
    d = {}
    with open(args.input, "r") as csvfile:
        reader = csv.DictReader(csvfile,
                                delimiter=args.csv_delimiter,
                                skipinitialspace=True)
        for row in reader:
            if row["ID"].startswith("?"):
                continue
            # process each cell of the row
            for column_label, words in row.items():
                if (not words) or (column_label in {"ID", "KSK"}) \
                   or ("STM" not in column_label):
                    continue
                morpheme_list = column_label.split(args.name_separator)
                if morpheme_list[0] == 'STM':
                    # substitute the lexeme ID for the generic stem name
                    morpheme_list[0] = row['ID']
                words_clean = re.sub(r'[][()]', '', words)
                word_list = re.split(r"\s+", words_clean)
                for morphs in word_list:
                    # skip empty cells and starred (unattested) forms
                    if not morphs or '*' in morphs:
                        continue
                    d["MORPHEMES"] = \
                        args.name_separator.join(morpheme_list).strip()
                    d["MORPHS"] = morphs
                    writer.writerow(d)
    out_file.close()
    return
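# Illustration (not part of the original module): a minimal sketch of how
# step1() expands one paradigm-table row into one-example-per-row records.
# The row contents and column labels below are invented for demonstration.
def _demo_step1_cell():
    import re
    row = {"ID": "KOTA", "STM.SG.NOM": "kota", "STM.SG.ADE": "koda.lla"}
    for column_label, words in row.items():
        if column_label in {"ID", "KSK"} or "STM" not in column_label:
            continue
        morpheme_list = column_label.split(".")
        if morpheme_list[0] == "STM":
            morpheme_list[0] = row["ID"]      # STM.SG.ADE -> KOTA.SG.ADE
        for morphs in re.split(r"\s+", re.sub(r"[][()]", "", words)):
            if morphs and "*" not in morphs:
                print(".".join(morpheme_list), morphs, sep=",")
# _demo_step1_cell() would print:
#   KOTA.SG.NOM,kota
#   KOTA.SG.ADE,koda.lla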
def main():
    import re
    import csv
    import argparse
    import twol.cfg as cfg
    version = cfg.timestamp(__file__)

    argparser = argparse.ArgumentParser(
        "python3 raw2named.py",
        description="Renames raw morphophonemes. Version {}".format(version))
    argparser.add_argument(
        "input", default="demo-raw.csv",
        help="aligned examples as a CSV file")
    argparser.add_argument(
        "output", default="demo-renamed.pstr",
        help="renamed examples as space-separated pair symbol strings")
    argparser.add_argument(
        "names", default="demo-renaming.csv",
        help="mapping from raw to neat morphophonemes as a CSV file")
    argparser.add_argument(
        "-d", "--delimiter", default=",",
        help="delimiter between the raw name and new name fields,"
        " default is ','")
    argparser.add_argument(
        "-n", "--name-separator", default=".",
        help="separator between morpheme names in the morpheme list,"
        " default is '.'")
    argparser.add_argument(
        "-F", "--add-features", default=False, action="store_true",
        help="add affix morpheme names to the pair string representation")
    argparser.add_argument(
        "-v", "--verbosity", default=0, type=int,
        help="level of diagnostic and debugging output")
    args = argparser.parse_args()

    cfg.verbosity = args.verbosity

    mphon_name = {}
    # Read in the name file, a CSV file whose rows contain three fields:
    # 1. the raw (old) name of the morphophoneme
    # 2. a neat (new) name for the morphophoneme
    # 3. comments documenting typical occurrences of the morphophoneme
    with open(args.names) as namefile:
        reader = csv.reader(namefile,
                            delimiter=args.delimiter,
                            skipinitialspace=True)
        for row in reader:
            if not row or (not row[0].strip()):
                continue
            if len(row) < 2:
                print("*** TOO FEW FIELDS IN:", row)
                continue
            if row[1].strip():
                mphon_name[row[0].strip()] = row[1].strip()
    # print(mphon_name)

    outfil = open(args.output, "w")
    with open(args.input) as csvfile:
        reader = csv.DictReader(csvfile,
                                delimiter=args.delimiter,
                                skipinitialspace=True)
        for row in reader:
            zero_filled_str = row["ZEROFILLED"].strip().replace(".", "")
            raw_str = row["RAW"].strip()
            raw_lst = raw_str.split(" ")
            pairsym_lst = []
            if cfg.verbosity >= 20:
                print(row)
                print("raw_lst:", raw_lst)
            if len(raw_lst) != len(zero_filled_str):
                print("** LENGTHS DISAGREE **", raw_lst, zero_filled_str)
                continue
            for raw_insym, outsym in zip(raw_lst, zero_filled_str):
                if raw_insym == outsym:
                    psym = raw_insym
                else:
                    clean_insym = mphon_name.get(raw_insym, raw_insym)
                    psym = clean_insym + ":" + outsym
                pairsym_lst.append(psym)
            if args.add_features:
                morpheme_lst = \
                    row["MORPHEMES"].strip().split(args.name_separator)
                for morpheme in morpheme_lst[1:]:
                    pairsym_lst.append(morpheme + ":Ø")
            pairsym_str = " ".join(pairsym_lst)
            print(pairsym_str, file=outfil)
    return
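# Illustration (not part of the original module): the core renaming step of
# main() above, run on hand-made data.  The raw and neat names are invented.
def _demo_rename():
    mphon_name = {"{tdd}": "{tD}"}        # raw name -> neat name
    raw_lst = ["k", "o", "{tdd}", "a"]    # RAW field, split on spaces
    zero_filled_str = "kota"              # ZEROFILLED field, dots removed
    pairsym_lst = []
    for raw_insym, outsym in zip(raw_lst, zero_filled_str):
        if raw_insym == outsym:
            pairsym_lst.append(raw_insym)     # identity pairs stay plain
        else:
            pairsym_lst.append(
                mphon_name.get(raw_insym, raw_insym) + ":" + outsym)
    print(" ".join(pairsym_lst))          # -> "k o {tD}:t a"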
def main():
    import twol.cfg as cfg
    version = cfg.timestamp(__file__)

    import argparse
    argparser = argparse.ArgumentParser(
        "python3 zerofilled2raw.py",
        description="Forms raw morphophonemes out of zero-filled"
        " morphs and produces a space-separated pair string"
        " representation for the word, suitable for"
        " twol-comp or twol-discov. Version {}".format(version))
    argparser.add_argument(
        "input",
        help="zero-filled example words as a CSV file")
    argparser.add_argument(
        "output",
        help="the output file in CSV format, with a new column"
        " where the words are represented with raw"
        " morphophonemes from zero-filling")
    argparser.add_argument(
        "affix_info",
        help="principal forms and morphophonemic affixes as a CSV file")
    argparser.add_argument(
        "-d", "--csv-delimiter", default=",",
        help="delimiter between the fields, default=','")
    argparser.add_argument(
        "-s", "--morph-separator", default=".",
        help="separator between morphs in the word form, default='.'")
    argparser.add_argument(
        "-n", "--name-separator", default=".",
        help="separator between morpheme names in the morpheme list")
    argparser.add_argument(
        "-z", "--zero-symbol", default="Ø",
        help="symbol inserted in word forms to align them")
    argparser.add_argument(
        "-v", "--verbosity", default=0, type=int,
        help="level of diagnostic and debugging output")
    args = argparser.parse_args()

    import re
    import csv
    import collections

    principal_lst = []
    """List of principal forms or principal parts, i.e. the forms which
    uniquely determine the morphophonemic variations that may occur
    within the stem.
    """

    feat2mphons = {}

    # Read in the feature combinations of principal forms and
    # the morphophonemic representations of affix features
    with open(args.affix_info, "r") as afffil:
        affrdr = csv.reader(afffil,
                            delimiter=args.csv_delimiter,
                            skipinitialspace=True)
        for row in affrdr:
            if row[1] == '+':
                feat = row[0]
                if feat not in principal_lst:
                    principal_lst.append(feat)
            else:
                feat2mphons[row[0]] = row[1]
    if args.verbosity >= 10:
        print("principal_lst =", principal_lst)
        print("feat2mphons =", feat2mphons)

    # Read in the morpheme names and the zero-filled morphs
    stem_morpheme_data = collections.OrderedDict()
    """Indexed by stem morpheme name, the value is a list of the original
    data for that stem morpheme.  Each item is a tuple of fields
    (MORPHEMES, MORPHS, ALIGNED) from the original data.
    """
    with open(args.input, "r") as infil:
        rdr = csv.DictReader(infil,
                             delimiter=args.csv_delimiter,
                             skipinitialspace=True)
        for row in rdr:
            names = row["MORPHEMES"].strip()
            orig_morphs = row["MORPHS"].strip()
            zerof_morphs = row["ZEROFILLED"].strip()
            if (not names) or (not zerof_morphs):
                continue
            name_lst = names.split(args.name_separator, maxsplit=1)
            stem_name = name_lst[0]
            form_name = ".".join(name_lst[1:]) if len(name_lst) > 1 else ""
            zerof_morph_lst = zerof_morphs.split(args.morph_separator,
                                                 maxsplit=1)
            if stem_name not in stem_morpheme_data:
                stem_morpheme_data[stem_name] = []
            stem_morpheme_data[stem_name].append(
                (form_name, orig_morphs, zerof_morph_lst))

    ofil = open(args.output, "w")
    writer = csv.DictWriter(
        ofil, fieldnames=["MORPHEMES", "MORPHS", "ZEROFILLED", "RAW"])
    writer.writeheader()

    for stem_morpheme, data_lst in stem_morpheme_data.items():
        princ_zstem_lst = []
        if args.verbosity >= 10:
            print("*** stem_morpheme, data_lst:", stem_morpheme, data_lst)
        # select the principal forms of this stem morpheme
        for data in data_lst:
            form_name, orig_morphs, zerof_morph_lst = data
            if form_name in principal_lst:
                princ_zstem_lst.append(zerof_morph_lst[0])
        # form the raw morphophonemes by combining corresponding symbols
        if args.verbosity >= 10:
            print("*** princ_zstem_lst:", princ_zstem_lst)
        lgth = len(princ_zstem_lst[0])
        zstem_rawsym_lst = []
        for i in range(lgth):
            lst = []
            for princ_zstem in princ_zstem_lst:
                lst.append(princ_zstem[i])
            raw_seq = "".join(lst)
            if re.match(r"^(.)(\1)*$", raw_seq):
                raw_sym = raw_seq[0]          # abbreviate if all identical
            else:
                raw_sym = "{" + raw_seq + "}"
            zstem_rawsym_lst.append(raw_sym)
        zstem_pairsym_str = " ".join(zstem_rawsym_lst)
        # Output the data augmented with the raw morphophoneme
        # representation
        for data in data_lst:
            form_name, orig_morphs, zerof_morph_lst = data
            out_row = {}
            form_part = args.name_separator + form_name if form_name else ""
            out_row["MORPHEMES"] = (stem_morpheme + form_part).strip()
            out_row["MORPHS"] = orig_morphs
            orig_zerof_morphs = args.morph_separator.join(zerof_morph_lst)
            out_row["ZEROFILLED"] = orig_zerof_morphs
            raw_lst = [zstem_pairsym_str]
            feat_lst = form_name.split(args.name_separator)
            for feat in feat_lst:
                raw_lst.append(feat2mphons[feat])
            out_row["RAW"] = " ".join(raw_lst)
            writer.writerow(out_row)
    return
def main():
    import twol.cfg as cfg
    import twol.twexamp as twexamp
    version = cfg.timestamp(__file__)

    import argparse
    arpar = argparse.ArgumentParser(
        "twol-discov",
        description="Deduces two-level rules out of"
        " a file of examples.  The file must consist of"
        " lines of space-separated pair strings.  Such a file"
        " can be produced e.g. by the twol-raw2renamed program."
        " Version {}".format(version))
    arpar.add_argument(
        "examples",
        help="example pair strings file",
        default="test.pstr")
    arpar.add_argument(
        "-s", "--symbol",
        help="input symbol for which to find rules."
        " If not given, rules are proposed for"
        " all morphophonemes in the example file",
        default="")
    arpar.add_argument(
        "-v", "--verbosity",
        help="level of diagnostic output, default is 5.  Set to"
        " 0 to omit the printing of relevant examples for the rules",
        type=int, default=5)
    args = arpar.parse_args()

    cfg.verbosity = args.verbosity
    twexamp.read_examples(filename=args.examples, build_fsts=False)
    if cfg.verbosity >= 10:
        print("--- all examples read in ---")

    pair_symbols_for_input = {}   # pair symbols keyed by their input symbol
    for insym in cfg.input_symbol_set:
        pair_symbols_for_input[insym] = set()
    for insym, outsym in cfg.symbol_pair_set:
        pair_symbol = cfg.sympair2pairsym(insym, outsym)
        pair_symbols_for_input[insym].add(pair_symbol)

    if args.symbol:
        if args.symbol in pair_symbols_for_input:
            pair_set = pair_symbols_for_input[args.symbol]
            pair_lst = []
            for pairsym in pair_set:
                insym, outsym = cfg.pairsym2sympair(pairsym)
                pair_lst.append((insym, outsym))
            if cfg.verbosity >= 10:
                print("pair_lst:", pair_lst)
        else:
            print("Symbol {} not in the input alphabet of examples".format(
                args.symbol))
            lst = [insym for insym in pair_symbols_for_input.keys()
                   if len(insym) > 2]
            print("The morphophonemes in the examples are:",
                  " ".join(sorted(lst)))
            exit("")
    else:
        pair_lst = sorted(cfg.symbol_pair_set)

    # relevant_contexts, minimal_contexts, print_rule and
    # context_to_output_str are helpers defined elsewhere in this module
    for insym, outsym in pair_lst:
        if len(pair_symbols_for_input[insym]) <= 1:
            continue   # no alternation for this input symbol, no rule needed
        pair_symbol = cfg.sympair2pairsym(insym, outsym)
        posi_contexts, nega_contexts = relevant_contexts(pair_symbol)
        pos_contexts, neg_contexts = minimal_contexts(pair_symbol,
                                                      posi_contexts.copy(),
                                                      nega_contexts.copy())
        if len(pos_contexts) <= len(neg_contexts) or cfg.verbosity > 0:
            print_rule(pair_symbol, "=>", pos_contexts)
        else:
            print_rule(pair_symbol, "/<=", neg_contexts)
        if args.verbosity >= 5:
            for lc, rc in posi_contexts:
                l_str = context_to_output_str(lc)
                r_str = context_to_output_str(rc)
                print("!{:>29}<{}>{}".format(l_str, outsym, r_str))
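# Illustration (not part of the original module): the shape of the
# pair_symbols_for_input index built above, on an invented symbol pair set.
# The pair-symbol notation here mimics cfg.sympair2pairsym under the
# assumption that identity pairs stay plain and others become insym:outsym.
def _demo_pair_index():
    symbol_pair_set = {("{tD}", "t"), ("{tD}", "d"), ("a", "a")}
    pair_symbols_for_input = {}
    for insym, outsym in symbol_pair_set:
        pairsym = insym if insym == outsym else insym + ":" + outsym
        pair_symbols_for_input.setdefault(insym, set()).add(pairsym)
    print(pair_symbols_for_input)
    # e.g. -> {'{tD}': {'{tD}:t', '{tD}:d'}, 'a': {'a'}}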
def main():
    import sys
    import fileinput
    import hfst
    import twol.cfg as cfg
    import twol.twbt as twbt
    import twol.twexamp as twexamp
    import twol.twrule as twrule
    version = cfg.timestamp(__file__)

    import argparse
    arpar = argparse.ArgumentParser(
        description="A compiler and tester for two-level rules."
        " Version {}."
        " See https://pytwolc.readthedocs.io/en/latest/index.html"
        " or https://github.com/koskenni/twol"
        " for more information.".format(version))
    arpar.add_argument(
        "-e", "--examples", action='store', nargs='+',
        help="""Either one name of a FST file that contains the examples
        or a list of names of files which contain the PSTR form examples
        used for compiling the rules.""",
        default=[None])
    arpar.add_argument(
        "-r", "--rules", action='store', nargs='+',
        help="""One or more files which contain the rules, either just
        one rule file or a file of defines as the first one and a part
        of the whole rule set as the second.""",
        default=[None])
    arpar.add_argument(
        "-o", "--output",
        help="file to which the compiled rules are written, if a name"
        " is given",
        default="")
    arpar.add_argument(
        "-l", "--lost",
        help="file to which the examples not accepted by all rules"
        " are written -- it is written as a FST",
        default="")
    arpar.add_argument(
        "-w", "--wrong",
        help="file to which the wrong strings accepted by all rules"
        " are written -- it is written as a FST",
        default="")
    arpar.add_argument(
        "-t", "--thorough",
        help="test each rule separately: 0 if no testing is desired,"
        " 1 if against positive examples,"
        " 2 against both positive and negative examples."
        " Default is 2.",
        type=int, choices=[0, 1, 2], default=2)
    arpar.add_argument(
        "--recursion",
        help="set the limit for recursion depth",
        type=int)
    arpar.add_argument(
        "-v", "--verbosity",
        help="level of diagnostic output",
        type=int, default=0)
    args = arpar.parse_args()

    cfg.verbosity = args.verbosity
    if args.recursion:
        sys.setrecursionlimit(args.recursion)

    if len(args.examples) == 1 and args.examples[0].endswith(".fst"):
        twexamp.read_fst(args.examples[0])
    else:
        twexamp.read_examples(args.examples)
    if cfg.verbosity >= 30:
        twbt.ppfst(cfg.examples_fst, title="examples_fst")

    # twparser_init, parse_rule and print_raw_paths are helpers defined
    # elsewhere in this module
    parser = twparser_init()

    examples_fsa = hfst.fst_to_fsa(cfg.examples_fst, separator="^")

    examples_up_fsa = cfg.examples_fst.copy()
    examples_up_fsa.input_project()
    if cfg.verbosity >= 30:
        twbt.ppfst(examples_up_fsa, title="examples_up_fsa")

    twrule.init()

    i = 0
    skip = False
    all_rules_fst_lst = []
    line_lst = []

    for line_nl in fileinput.input(args.rules):
        i += 1
        if not line_lst:
            line_nl_lst = []
        line_nl_lst.append(line_nl)
        line = line_nl.split('!', maxsplit=1)[0].strip()
        if line == "START":
            skip = False
            continue
        elif line == "STOP":
            skip = True
        if skip or (not line) or line.startswith("!"):
            continue
        line_lst.append(line)
        if not line.endswith(";"):
            continue
        else:
            rule_str = " ".join(line_lst)
            line_lst = []

        op, left, right = parse_rule(parser, rule_str, i, line_nl_lst)
        if op == "?" or not (left and right):
            continue
        if (args.thorough > 0 and op != "=") or cfg.verbosity > 0:
            print("\n")
            print(rule_str)
        if op == "=":
            if cfg.verbosity >= 10:
                print(left, op)
                twbt.ppfst(right)
            continue
        elif op == "=>":
            R, selector_fst, MIXe = twrule.rightarrow(line, left, *right)
        elif op == "<=":
            R, selector_fst, MIXe = twrule.output_coercion(line, left, *right)
        elif op == "<--":
            R, selector_fst, MIXe = twrule.input_coercion(line, left, *right)
        elif op == "<=>":
            R, selector_fst, MIXe = twrule.doublearrow(line, left, *right)
        elif op == "/<=":
            R, selector_fst, MIXe = twrule.center_exclusion(line, left,
                                                            *right)
        else:
            print("Error: not a valid type of a rule", op)
            continue
        if cfg.verbosity >= 10:
            twbt.ppfst(R)
        if args.lost or args.wrong or args.output:
            all_rules_fst_lst.append(R)
        if args.thorough > 0:
            selector_fst.intersect(cfg.examples_fst)
            # selector_fst.n_best(5)
            selector_fst.minimize()
            if cfg.verbosity >= 20:
                paths = selector_fst.extract_paths(output='raw')
                print_raw_paths(paths[0:20])
            passed_pos_examples_fst = selector_fst.copy()
            passed_pos_examples_fst.intersect(R)
            if passed_pos_examples_fst.compare(selector_fst):
                print("All positive examples accepted")
            else:
                lost_examples_fst = selector_fst.copy()
                lost_examples_fst.minus(passed_pos_examples_fst)
                lost_examples_fst.minimize()
                print("** Some positive examples were rejected:")
                lost_paths = lost_examples_fst.extract_paths(output='raw')
                print_raw_paths(lost_paths[0:20])
        if args.thorough > 1 and op in {"=>", "<=", "<=>", "<--"}:
            neg_examples_fsa = examples_fsa.copy()
            neg_examples_fsa.compose(MIXe)
            neg_examples_fsa.output_project()
            neg_examples_fst = hfst.fsa_to_fst(neg_examples_fsa,
                                               separator="^")
            neg_examples_fst.minus(cfg.examples_fst)
            NG = examples_up_fsa.copy()
            NG.compose(neg_examples_fst)
            npaths = NG.extract_paths(output='raw')
            # print_raw_paths(npaths)
            passed_neg_examples_fst = NG.copy()
            passed_neg_examples_fst.intersect(R)
            if passed_neg_examples_fst.compare(hfst.empty_fst()):
                print("All negative examples rejected")
            else:
                print("** Some negative examples accepted:")
                npaths = passed_neg_examples_fst.extract_paths(output='raw')
                print_raw_paths(npaths[0:20])

    if args.lost or args.wrong:
        RESU = examples_up_fsa.copy()
        print(RESU.number_of_arcs(), "arcs in RESU")
        RESU.compose_intersect(tuple(all_rules_fst_lst))
        RESU.minimize()
    if args.lost:
        lost_positive_examples_fst = cfg.examples_fst.copy()
        lost_positive_examples_fst.minus(RESU)
        lost_positive_examples_fst.minimize()
        lost_stream = hfst.HfstOutputStream(filename=args.lost)
        lost_stream.write(lost_positive_examples_fst)
        lost_stream.flush()
        lost_stream.close()
        print("wrote lost examples to", args.lost)
    if args.wrong:
        WRONG = RESU.copy()
        WRONG.subtract(cfg.examples_fst)
        WRONG.minimize()
        wrong_stream = hfst.HfstOutputStream(filename=args.wrong)
        wrong_stream.write(WRONG)
        wrong_stream.flush()
        wrong_stream.close()
        print("wrote wrongly accepted examples to", args.wrong)
    if args.output:
        outstream = hfst.HfstOutputStream(filename=args.output)
        for fst in all_rules_fst_lst:
            outstream.write(fst)
        outstream.flush()
        outstream.close()
        print("wrote {} rule transducers to {}".format(
            len(all_rules_fst_lst), args.output))
    return
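# Illustration (not part of the original module): the rule-file
# preprocessing used above, i.e. how comments are stripped and lines are
# joined until a ';' completes a rule.  The rule text is invented.
def _demo_rule_lines():
    raw_lines = [
        "! a two-level grammar fragment (invented)",
        "{aØ}:Ø <=>",
        "    _ {ij}: ;   ! stem-final vowel disappears",
    ]
    line_lst = []
    for line_nl in raw_lines:
        line = line_nl.split("!", maxsplit=1)[0].strip()  # drop comments
        if not line:
            continue
        line_lst.append(line)
        if line.endswith(";"):
            print(" ".join(line_lst))   # -> "{aØ}:Ø <=> _ {ij}: ;"
            line_lst = []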
def main():
    import sys
    import hfst
    import twol.cfg as cfg
    import twol.multialign as multialign
    version = cfg.timestamp(__file__)

    import argparse
    arpar = argparse.ArgumentParser(
        "twol-aligner",
        description="""Aligns pairs of words separated by a delimiter.
        See https://pytwolc.readthedocs.io/en/latest/alignment.html
        for detailed instructions. Version {}""".format(version))
    arpar.add_argument(
        "metrics",
        help="FST computed with twol-metric from an alphabet file."
        " The FST contains weights for phoneme correspondences.")
    arpar.add_argument(
        "-d", "--delimiter",
        help="separates the two cognates, default is ' '",
        default=" ")
    arpar.add_argument(
        "-l", "--layout",
        choices=["vertical", "list", "horizontal"],
        help="output layout",
        default="vertical")
    arpar.add_argument(
        "-c", "--comment-separator",
        help="""Comment separator.  Comments in the input after this
        character are just copied to the output.  Input words are then
        also copied to the end of the comments.  The default separator
        is '' i.e. no comments.  Comments reach the output only in the
        horizontal layout.""",
        default="")
    arpar.add_argument(
        "-w", "--weights",
        help="also print the weight of each alignment."
        " The default is not to print it."
        " Works only if a comment separator is also set.",
        action="store_true")
    arpar.add_argument(
        "-n", "--number",
        help="number of best results to be printed, default is 1",
        type=int, default=1)
    arpar.add_argument(
        "-v", "--verbosity",
        help="level of diagnostic information to be printed, default is 0",
        type=int, default=0)
    args = arpar.parse_args()

    cfg.verbosity = args.verbosity

    algfile = hfst.HfstInputStream(args.metrics)
    aligner_fst = algfile.read()

    for line in sys.stdin:
        if args.comment_separator:
            pair, comm, comments = \
                line.strip().partition(args.comment_separator)
        else:
            pair, comm, comments = line.strip(), "", ""
        if args.verbosity > 0:
            print(pair, args.comment_separator, comm)
        in_word, sep, out_word = pair.strip().partition(args.delimiter)
        if not out_word:
            out_word = in_word
        # align_two_words and print_result are helpers defined elsewhere
        # in this module
        raw_paths = align_two_words(in_word, out_word, aligner_fst,
                                    "Ø", args.number)
        for aligned_result in raw_paths:
            print_result(aligned_result, comments, args.weights,
                         layout=args.layout)
    return
def main():
    import sys
    import twol.cfg as cfg
    version = cfg.timestamp(__file__)

    import argparse
    arpar = argparse.ArgumentParser(
        "twol-multialign",
        description="""Aligns lists of words separated by a DELIMITER.
        See https://pytwolc.readthedocs.io/en/latest/alignment.html
        for detailed instructions. Version {}""".format(version))
    arpar.add_argument(
        "alphabet",
        help="an alphabet definition file with features and similarity sets")
    arpar.add_argument(
        "-d", "--delimiter",
        help="separates the cognates, default is ' '",
        default=" ")
    arpar.add_argument(
        "-l", "--layout",
        choices=["vertical", "list", "horizontal"],
        help="output layout",
        default="vertical")
    arpar.add_argument(
        "-c", "--comment-separator",
        help="""Comment separator.  Comments in the input after this
        character are just copied to the output.  Input words are then
        also copied to the end of the comments.  The default separator
        is '' i.e. no comments.  Comments reach the output only in the
        horizontal layout.""",
        default="")
    arpar.add_argument(
        "-w", "--weights",
        help="also print the weight of each alignment."
        " The default is not to print it."
        " Works only if a comment separator is also set.",
        action="store_true")
    arpar.add_argument(
        "-x", "--extra-zeros", default=0, type=int,
        help="number of extra zeros to be tried in the alignment")
    arpar.add_argument(
        "-n", "--number",
        help="number of best results to be printed, default is 1",
        type=int, default=1)
    arpar.add_argument(
        "-v", "--verbosity",
        help="level of diagnostic information to be printed, default is 0",
        type=int, default=0)
    args = arpar.parse_args()

    if args.verbosity:
        cfg.verbosity = args.verbosity

    # init and multialign are module-level functions of this module
    init(args.alphabet, all_zero_weight=1000)

    for line in sys.stdin:
        if args.comment_separator:
            word_str, comm, comments = \
                line.strip().partition(args.comment_separator)
        else:
            word_str, comm, comments = line.strip(), "", ""
        if args.verbosity > 0:
            print(word_str, args.comment_separator, comm)
        word_lst = word_str.strip().split(args.delimiter)
        aligned_results_lst = multialign(word_lst,
                                         zero="Ø",
                                         max_zeros=args.extra_zeros,
                                         best_count=args.number)
        if cfg.verbosity >= 10:
            print("aligned_results_lst:", aligned_results_lst)
        for aligned_result in aligned_results_lst:
            print_result(aligned_result, comments, args.weights,
                         layout=args.layout)
    return
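# Illustration (not part of the original module): how both aligner CLIs
# above split an input line into the word part and a trailing comment.
# The comment separator '!' and the words are invented for this example.
def _demo_line_parsing():
    line = "saata saada ! an invented cognate pair"
    word_str, _, comments = line.strip().partition("!")
    word_lst = word_str.strip().split(" ")
    print(word_lst, "--", comments.strip())
    # -> ['saata', 'saada'] -- an invented cognate pair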
def main():
    import twol.cfg as cfg
    version = cfg.timestamp(__file__)

    import argparse
    argparser = argparse.ArgumentParser(
        "python3 words2zerofilled.py",
        description="Aligns a set of word forms with morph boundaries."
        " Version {}".format(version))
    argparser.add_argument(
        "input", default="ksk-seg-examp.csv",
        help="morpheme names and segmented example words as a CSV file")
    argparser.add_argument(
        "output", default="ksk-alig-examp.csv",
        help="example words plus zero-filled aligned forms as a CSV file")
    argparser.add_argument(
        "alphabet", default="alphabet-test.text",
        help="an alphabet definition which determines"
        " the weights for morphophonemes")
    argparser.add_argument(
        "-s", "--morph-separator", default=".",
        help="separator between morphs in the word form, default is '.'")
    argparser.add_argument(
        "-d", "--csv-delimiter", default=",",
        help="delimiter between the fields")
    argparser.add_argument(
        "-n", "--name-separator", default=".",
        help="separator between morpheme names"
        " in the morpheme list, default is '.'")
    argparser.add_argument(
        "-z", "--zero-symbol", default="Ø",
        help="symbol to be inserted in word forms to align them")
    argparser.add_argument(
        "-x", "--extra-zeros", default=0, type=int,
        help="number of extra zeros to be tried in the alignment")
    argparser.add_argument(
        "-v", "--verbosity", default=0, type=int,
        help="level of diagnostic and debugging output")
    args = argparser.parse_args()

    import csv
    import collections

    cfg.verbosity = args.verbosity

    # STEP 1:
    # Read in the segmented words and collect the allomorphs of each morpheme

    morphs_of_morpheme = {}
    """A dict into which the allomorphs of each morpheme are collected:
    morphs_of_morpheme[morpheme_name] is an ordered list of its unique
    allomorphs.
    """
    seg_example_list = []
    """A list into which all example words are collected.  Each word is
    represented as a list of (morpheme, morph) pairs.
    """
    stem_name_set = set()
    """Set of stem morphemes, i.e. the names of stem morphemes."""

    csvfile = open(args.input)
    reader = csv.DictReader(csvfile,
                            delimiter=args.csv_delimiter,
                            skipinitialspace=True)
    i = 0
    for row in reader:
        morpheme_list = row["MORPHEMES"].strip().split(args.name_separator)
        morph_list = row["MORPHS"].strip().split(args.morph_separator)
        if args.verbosity >= 25:
            print(row["MORPHEMES"])
            print(morpheme_list)
            print(row["MORPHS"])
            print(morph_list)
        i = i + 1
        if len(morpheme_list) != len(morph_list):
            print("** line", i, ":", row["MORPHEMES"],
                  "is incompatible with", row["MORPHS"])
            continue
        if not morpheme_list:
            continue
        stem_name_set.add(morpheme_list[0])
        name_morph_pair_lst = list(zip(morpheme_list, morph_list))
        if args.verbosity >= 10:
            print("name_morph_pair_lst", name_morph_pair_lst)
        seg_example_list.append(name_morph_pair_lst)
        for morpheme, morph in name_morph_pair_lst:
            if args.verbosity >= 10:
                print("morpheme, morph:", morpheme, morph)
            morph = morph.strip()
            if morpheme not in morphs_of_morpheme:
                morphs_of_morpheme[morpheme] = [morph]
            elif morph not in morphs_of_morpheme[morpheme]:
                morphs_of_morpheme[morpheme].append(morph)
    if args.verbosity >= 5:
        print("morphs_of_morpheme", morphs_of_morpheme)
    csvfile.close()

    print("-- STEP 1 COMPLETED (seg_example_list, stem_name_set,"
          " morphs_of_morpheme done) --")

    # STEP 2:
    # Align the allomorphs of each morpheme

    import twol.multialign as multialign
    multialign.init(args.alphabet, all_zero_weight=1)

    alignments = {}
    """All aligned morphs.  Index: morpheme name; value: the list of
    zero-filled (aligned) allomorphs of that morpheme.
    """
    for morpheme in sorted(morphs_of_morpheme.keys()):
        morphs = morphs_of_morpheme[morpheme]
        if len(morphs) == 1 and len(morphs[0]) == 0:
            aligned_morphs_lst = []
        else:
            if args.verbosity >= 5:
                print("morphs:", morphs)
            aligned_results_lst = \
                multialign.multialign(morphs,
                                      max_zeros=args.extra_zeros,
                                      best_count=1)
            if aligned_results_lst:
                weight, aligned_morphs_lst = aligned_results_lst[0]
            else:
                aligned_morphs_lst = []
            if args.verbosity >= 5:
                print("aligned_results_lst:", aligned_results_lst)
        alignments[morpheme] = aligned_morphs_lst

    print("-- STEP 2 COMPLETED (alignments done) --")

    # STEP 3:
    # Compute the zero-filled morphs out of the sequences of aligned symbols

    aligned_morphs = {}
    """Index: morpheme name; value: an ordered dict mapping each original
    morph to its zero-filled form.
    """
    for morpheme, aligned_morphs_lst in alignments.items():
        # e.g. "KOTA", ['kota', 'koda', 'kotØ', 'kodØ']
        if args.verbosity >= 5:
            print("aligned_morphs_lst:", aligned_morphs_lst)
        if morpheme not in aligned_morphs:
            aligned_morphs[morpheme] = collections.OrderedDict()
        if aligned_morphs_lst:
            original_morphs = [x.replace("Ø", "")
                               for x in aligned_morphs_lst]
            for origm, zerofm in zip(original_morphs, aligned_morphs_lst):
                aligned_morphs[morpheme][origm] = zerofm
        else:
            aligned_morphs[morpheme] = {"": ""}
    if args.verbosity >= 5:
        print("aligned_morphs", aligned_morphs)

    print("-- STEP 3 COMPLETED (aligned_morphs done) --")

    # STEP 4:
    # Write out the example word forms plus their zero-filled morphs

    out_file = open(args.output, "w", newline="")
    writer = csv.DictWriter(out_file,
                            ["MORPHEMES", "MORPHS", "ZEROFILLED"],
                            delimiter=args.csv_delimiter)
    writer.writeheader()
    forms_of_morphs = {}
    d = {}
    for seg_example in seg_example_list:
        if args.verbosity >= 20:
            print("seg_example:", seg_example)
        morpheme_lst = [morpheme for morpheme, morph in seg_example]
        morph_lst = [morph for morpheme, morph in seg_example]
        zero_filled_morph_lst = \
            [aligned_morphs[morpheme].get(morph.replace("Ø", ""), "")
             for (morpheme, morph) in seg_example]
        if args.verbosity >= 20:
            print("zero_filled_morph_lst:", zero_filled_morph_lst)
        d["MORPHEMES"] = args.name_separator.join(morpheme_lst)
        d["MORPHS"] = args.morph_separator.join(morph_lst)
        d["ZEROFILLED"] = args.morph_separator.join(zero_filled_morph_lst)
        writer.writerow(d)
        if morph_lst[0] not in forms_of_morphs:
            forms_of_morphs[morph_lst[0]] = set()
        forms_of_morphs[morph_lst[0]].add(" ".join(morpheme_lst[1:]))

    print("-- STEP 4 COMPLETED (zero-filled morphs and the CSV file done) --")
    return
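# Illustration (not part of the original module): steps 2-4 above in
# miniature.  Zero-filled allomorphs are keyed by the original, zero-free
# morph and then used to zero-fill a segmented example word.  All data
# here is invented.
def _demo_zerofill():
    # step 2 output shape: zero-filled allomorphs per morpheme
    alignments = {"KOTA": ["kota", "koda", "kotØ"], "ADE": ["lla"]}
    # step 3: key each zero-filled form by its original, zero-free morph
    aligned_morphs = {
        morpheme: {z.replace("Ø", ""): z for z in zerofilled_lst}
        for morpheme, zerofilled_lst in alignments.items()
    }
    # step 4: zero-fill one segmented example word
    seg_example = [("KOTA", "kot"), ("ADE", "lla")]
    print(".".join(aligned_morphs[m].get(f.replace("Ø", ""), "")
                   for m, f in seg_example))   # -> "kotØ.lla"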