def _build_guess_argparser():
    """Return the command-line parser used by :func:`main`.

    ``--fields`` is declared with ``type=int`` so that a value given on the
    command line can be compared with the column count of each input row.
    """
    ap = argparse.ArgumentParser(description="Guess more data for Finnish TSV databases")
    # --quiet and --verbose share the same destination; --quiet stores False.
    ap.add_argument(
        "--quiet",
        "-q",
        action="store_false",
        dest="verbose",
        default=False,
        help="do not print output to stdout while processing",
    )
    ap.add_argument(
        "--verbose", "-v", action="store_true", default=False, help="print each step to stdout while processing"
    )
    ap.add_argument("--input", "-i", action="store", required=True, metavar="IFILE", help="read data from IFILE")
    # NOTE(review): no version= string is supplied here; confirm -V behaves
    # as intended.
    ap.add_argument("--version", "-V", action="version")
    ap.add_argument("--output", "-o", action="store", required=True, metavar="OFILE", help="write data to OFILE")
    ap.add_argument(
        "--fields", "-f", action="store", type=int, default=2, metavar="N", help="read N fields from master"
    )
    ap.add_argument("--join", "-j", action="store", required=True, metavar="JFILE", help="read join fields from JFILE")
    ap.add_argument(
        "--stub", "-c", action="store", required=True, metavar="SFILE", help="read stub expressions from SFILE"
    )
    ap.add_argument("--separator", "-s", action="store", default="\t", metavar="SEP", help="use SEP as separator")
    # NOTE: --comment is parsed but main() never reads args.comment.
    ap.add_argument(
        "--comment",
        "-C",
        action="append",
        default=["#"],
        metavar="COMMENT",
        help="skip lines starting with COMMENT that do not have SEPs",
    )
    ap.add_argument("--strip", "-S", action="store", metavar="STRIP", help="strip STRIP characters")
    return ap


def main():
    """Extend every row of a lexical TSV database with joined and guessed data.

    The join file (``--join``) and the stub file (``--stub``) are loaded into
    lookup tables keyed by their ``new_para`` column.  Each row of ``--input``
    is turned into a wordmap, extended with the matching join row, passed
    through ``stub_all_new_para`` and ``guess_bound_morphs`` and written to
    ``--output`` as one delimited row.

    The function always terminates the process: status 1 when a row names a
    ``new_para`` missing from the join data or ``stub_all_new_para`` returns
    a false value, status 2 when a ``kotus_tn`` join value cannot be
    converted to ``int``, and the default status otherwise.
    """
    args = _build_guess_argparser().parse_args()

    # A quote character given via --strip switches the csv module to quoted
    # mode; anything else disables quoting altogether.
    if args.strip == '"' or args.strip == "'":
        quoting = csv.QUOTE_ALL
        quotechar = args.strip
    else:
        quoting = csv.QUOTE_NONE
        quotechar = None

    errors = False

    # read joins from file if any
    joinmap = dict()
    with open(args.join, "r", newline="") as joins:
        join_reader = csv.DictReader(joins, delimiter=args.separator, quoting=quoting, escapechar="\\", strict=True)
        for join_parts in join_reader:
            if len(join_parts) < 3:
                print("Must have at leas N separators in joins; skipping", join_parts)
                continue
            joinmap[join_parts["new_para"]] = join_parts

    stubmap = dict()
    with open(args.stub, "r", newline="") as stubs:
        stub_reader = csv.DictReader(stubs, delimiter=args.separator, quoting=quoting, escapechar="\\", strict=True)
        for stub_parts in stub_reader:
            if len(stub_parts) < 2:
                print("Must have at least N separators in stubbings; skipping", stub_parts)
                continue
            stubmap[stub_parts["new_para"]] = stub_parts["deletion"]

    # read from csv files
    with open(args.output, "w", newline="") as output:
        tsv_writer = csv.DictWriter(
            output,
            fieldnames=get_wordmap_fieldnames(),
            delimiter=args.separator,
            quoting=quoting,
            escapechar="%",
            quotechar=quotechar,
            strict=True,
        )
        tsv_writer.writeheader()
        with open(args.input, "r", newline="") as infile:
            tsv_reader = csv.reader(infile, delimiter=args.separator, quoting=quoting, escapechar="\\", strict=True)
            linecount = 0
            for tsv_parts in tsv_reader:
                linecount += 1
                if args.verbose and (linecount % 10000 == 0):
                    print(linecount, "...", sep="", end="\r")
                if len(tsv_parts) < args.fields:
                    print(
                        "Must have at least N separators on each",
                        "non-comment non-empty line; skipping:",
                        tsv_parts,
                        file=stderr,
                    )
                    continue
                # here starts the guessworks
                # the aim is to fill dict wordmap with data necessary to
                # generate a lexc line
                wordmap = init_wordmap()
                wordmap = parse_defaults_from_tsv(wordmap, tsv_parts)
                wordmap = parse_extras_from_tsv(wordmap, tsv_parts)

                # Extend from known new paras
                joinkey = wordmap["new_para"]
                if joinkey not in joinmap:
                    print(
                        "\033[93mMissing!\033[0m",
                        "new para not in join data:",
                        joinkey,
                        "\n\033[92mExplanation:\033[0m" "add paradigm to morphophonology.tsv and carry on",
                    )
                    errors = True
                    continue
                for k, v in joinmap[joinkey].items():
                    if k == "new_para":
                        continue
                    # The join file spells False/None as text; map them back
                    # to the Python objects.
                    if v == "False":
                        wordmap[k] = False
                    elif v == "None":
                        wordmap[k] = None
                    elif k == "kotus_tn":
                        try:
                            wordmap[k] = int(v)
                        except (TypeError, ValueError):
                            # TypeError covers a missing cell (None) in a
                            # short join row.
                            print("FAIL", k, v, tsv_parts)
                            exit(2)
                    else:
                        wordmap[k] = v

                wordmap = stub_all_new_para(wordmap, stubmap)
                if not wordmap:
                    errors = True
                    continue
                # suffixes can be id'd by the - in beginning. They need an own
                # lexicon
                wordmap = guess_bound_morphs(wordmap)
                if wordmap["is_suffix"]:
                    wordmap["real_pos"] = wordmap["pos"]
                    wordmap["pos"] = "SUFFIX"
                if "PCLE_HAH" == wordmap["new_para"]:
                    wordmap["real_pos"] = wordmap["pos"]
                    wordmap["pos"] = "INTERJECTION"
                tsv_writer.writerow(wordmap)

    if errors:
        print("you must fix database integrity or hack the scripts", "before continuing")
        exit(1)
    exit()
def main():
    """Guess more data for a lexical TSV database and write the result.

    Loads the join file (``--join``) and the stub file (``--stub``) into
    dictionaries keyed by ``new_para``, then reads ``--input`` row by row.
    Every row becomes a wordmap that is extended with the matching join row,
    stubbed with ``stub_all_new_para``, checked with ``guess_bound_morphs``
    and written to ``--output``.

    The process is always terminated at the end: with status 1 if any row
    had a ``new_para`` absent from the join data or ``stub_all_new_para``
    returned a false value, with status 2 as soon as a ``kotus_tn`` join
    value is not convertible to ``int``, and with the default status
    otherwise.
    """
    # initialise argument parser
    ap = argparse.ArgumentParser(
        description="Guess more data for Finnish TSV databases")
    ap.add_argument("--quiet", "-q", action="store_false", dest="verbose",
                    default=False,
                    help="do not print output to stdout while processing")
    ap.add_argument("--verbose", "-v", action="store_true", default=False,
                    help="print each step to stdout while processing")
    ap.add_argument("--input", "-i", action="store", required=True,
                    metavar="IFILE", help="read data from IFILE")
    # NOTE(review): action="version" is used without a version= string;
    # confirm that -V does what is expected.
    ap.add_argument("--version", "-V", action="version")
    ap.add_argument("--output", "-o", action="store", required=True,
                    metavar="OFILE", help="write data to OFILE")
    # type=int: the value is compared with len() of each input row below, so
    # a command-line value must not stay a string.
    ap.add_argument("--fields", "-f", action="store", type=int, default=2,
                    metavar="N", help="read N fields from master")
    ap.add_argument("--join", "-j", action="store", required=True,
                    metavar="JFILE", help="read join fields from JFILE")
    ap.add_argument("--stub", "-c", action="store", required=True,
                    metavar="SFILE", help="read stub expressions from SFILE")
    ap.add_argument("--separator", "-s", action="store", default="\t",
                    metavar="SEP", help="use SEP as separator")
    # NOTE: args.comment is never read in this function.
    ap.add_argument("--comment", "-C", action="append", default=["#"],
                    metavar="COMMENT",
                    help="skip lines starting with COMMENT that "
                    "do not have SEPs")
    ap.add_argument("--strip", "-S", action="store",
                    metavar="STRIP", help="strip STRIP characters")
    args = ap.parse_args()

    # Quoted csv mode is only enabled when --strip names a quote character.
    if args.strip == '"' or args.strip == "'":
        quoting = csv.QUOTE_ALL
        quotechar = args.strip
    else:
        quoting = csv.QUOTE_NONE
        quotechar = None

    errors = False
    joinmap = dict()
    # read joins from file if any
    with open(args.join, 'r', newline='') as joins:
        join_reader = csv.DictReader(joins, delimiter=args.separator,
                                     quoting=quoting, escapechar='\\',
                                     strict=True)
        for join_parts in join_reader:
            if len(join_parts) < 3:
                print("Must have at leas N separators in joins; skipping",
                      join_parts)
                continue
            key = join_parts['new_para']
            joinmap[key] = join_parts
    stubmap = dict()
    with open(args.stub, 'r', newline='') as stubs:
        stub_reader = csv.DictReader(stubs, delimiter=args.separator,
                                     quoting=quoting, escapechar='\\',
                                     strict=True)
        for stub_parts in stub_reader:
            if len(stub_parts) < 2:
                print("Must have at least N separators in stubbings; skipping",
                      stub_parts)
                continue
            key = stub_parts['new_para']
            stubmap[key] = stub_parts['deletion']

    # read from csv files
    with open(args.output, 'w', newline='') as output:
        tsv_writer = csv.DictWriter(output,
                                    fieldnames=get_wordmap_fieldnames(),
                                    delimiter=args.separator,
                                    quoting=quoting, escapechar='%',
                                    quotechar=quotechar, strict=True)
        tsv_writer.writeheader()
        with open(args.input, 'r', newline='') as infile:
            tsv_reader = csv.reader(infile, delimiter=args.separator,
                                    quoting=quoting, escapechar='\\',
                                    strict=True)
            linecount = 0
            for tsv_parts in tsv_reader:
                linecount += 1
                if args.verbose and (linecount % 10000 == 0):
                    print(linecount, "...", sep='', end='\r')
                if len(tsv_parts) < args.fields:
                    print("Must have at least N separators on each",
                          "non-comment non-empty line; skipping:",
                          tsv_parts, file=stderr)
                    continue
                # here starts the guessworks
                # the aim is to fill dict wordmap with data necessary to
                # generate a lexc line
                wordmap = init_wordmap()
                wordmap = parse_defaults_from_tsv(wordmap, tsv_parts)
                wordmap = parse_extras_from_tsv(wordmap, tsv_parts)
                # Extend from known new paras
                joinkey = wordmap['new_para']
                if joinkey in joinmap:
                    for k, v in joinmap[joinkey].items():
                        if k != 'new_para':
                            # "False" and "None" in the join file stand for
                            # the Python objects of the same name.
                            if v == "False":
                                wordmap[k] = False
                            elif v == "None":
                                wordmap[k] = None
                            elif k == 'kotus_tn':
                                try:
                                    wordmap[k] = int(v)
                                except (TypeError, ValueError):
                                    # TypeError: the cell was missing (None)
                                    # because the join row was short.
                                    print("FAIL", k, v, tsv_parts)
                                    exit(2)
                            else:
                                wordmap[k] = v
                else:
                    print("\033[93mMissing!\033[0m",
                          "new para not in join data:", joinkey,
                          "\n\033[92mExplanation:\033[0m"
                          "add paradigm to morphophonology.tsv and carry on")
                    errors = True
                    continue

                wordmap = stub_all_new_para(wordmap, stubmap)
                if not wordmap:
                    errors = True
                    continue
                # suffixes can be id'd by the - in beginning. They need an own
                # lexicon
                wordmap = guess_bound_morphs(wordmap)
                if wordmap['is_suffix']:
                    wordmap['real_pos'] = wordmap['pos']
                    wordmap['pos'] = 'SUFFIX'
                if "PCLE_HAH" == wordmap['new_para']:
                    wordmap['real_pos'] = wordmap['pos']
                    wordmap['pos'] = 'INTERJECTION'
                tsv_writer.writerow(wordmap)
    if errors:
        print("you must fix database integrity or hack the scripts",
              "before continuing")
        exit(1)

    exit()
def main():
    """Convert lexical data from kotus-csv lines to newpara-tsv lines.

    Reads ``--input`` (standard input when not given) line by line, skips
    lines that start with ``#``, contain ``<-HEADERS`` or have fewer than
    four ``","``-separated fields, builds a wordmap for every remaining line,
    runs the guessers on it and prints ``lemma``, the guessed ``new_para``
    and any extra fields to ``--output`` (standard output when not given).

    When ``--plt-file`` is given, each of its lines after the one containing
    ``HEADERS`` is split at the last comma; the left part is the lookup key
    and the right part, without surrounding quotes and line ending, is the
    plurale tantum value.

    Both streams are closed before the process is terminated with
    ``exit()``, also when the conversion raises.
    """
    ap = argparse.ArgumentParser(description="Converts Omorfi's lexical data from old kotus-csv format to newpara-tsv "
                                 "with possible attribute fields")
    ap.add_argument("--input", "-i", metavar="INFILE",
                    help="read data from INFILE")
    ap.add_argument('--output', '-o', metavar="OUTFILE",
                    help="write converted stuff to OUTFILE")
    ap.add_argument('--plt-file', '-p', metavar="PLTFILE",
                    help="read plurale tantum info (csv) from PLTFILE")
    ap.add_argument('--verbose', '-v', action="store_true",
                    help="Print verbosely while processing")
    # NOTE(review): no version= string is given; confirm -V works as intended.
    ap.add_argument("--version", "-V", action="version")
    args = ap.parse_args()

    # Unknown keys read as False.
    plt_info = defaultdict(lambda: False)
    if args.plt_file:
        if args.verbose:
            print("Reading plurale tantum data from", args.plt_file,
                  file=stderr)
        with open(args.plt_file, 'r', newline='') as plt_in:
            headers_skipped = False
            for line in plt_in:
                if headers_skipped:
                    lex, plt = line.rsplit(',', 1)
                    # Drop the line ending first; otherwise the closing
                    # quote is not at the end and survives strip('"').
                    plt_info[lex] = plt.strip().strip('"')
                elif line.find('HEADERS') >= 0:
                    headers_skipped = True

    infile = open(args.input, 'r', newline='') if args.input else stdin
    try:
        outfile = open(args.output, 'w', newline='') if args.output else stdout
        try:
            for line in infile:
                if line.startswith('#') or line.find('<-HEADERS') >= 0:
                    continue
                fields = line.strip('"\n').split('","')
                if len(fields) < 4:
                    # str.split always yields at least one element, so every
                    # short line is reported (when verbose) and skipped.
                    if args.verbose:
                        print("Skipping too short line:", line, file=stderr)
                    continue
                wordmap = init_wordmap()
                wordmap['stub'] = wordmap['lemma'] = fields[0]
                if args.verbose:
                    print(wordmap['lemma'])
                wordmap['kotus_tn'] = int(fields[1])
                wordmap['kotus_av'] = fields[2]
                if wordmap['kotus_av'] == '0':
                    wordmap['kotus_av'] = False
                wordmap['pos'] = fields[3]
                wordmap = expand_pos(wordmap)
                if plt_info:
                    # The key is the first four fields re-quoted and joined
                    # with commas, i.e. what precedes the last comma of a
                    # PLTFILE line.
                    wordmap['plurale_tantum'] = plt_info[
                        '"' + '","'.join(fields[0:4]) + '"']
                for i in range(4, len(fields)):
                    if fields[i].startswith('plt='):
                        wordmap['plurale_tantum'] = fields[i]
                    elif fields[i].startswith('boundaries='):
                        # The rewritten field is stored back so that it also
                        # ends up in 'extras' below.
                        fields[i] = fields[i].replace('|', '{WB}')
                        wordmap['stub'] = wordmap['boundaries'] = fields[i]
                wordmap = guess_stem_features_ktn(wordmap)
                wordmap = guess_pronunciation(wordmap)
                wordmap = guess_grade_dir_from_ktn(wordmap)
                wordmap = guess_harmony(wordmap)
                wordmap = guess_new_class(wordmap)
                wordmap['extras'] = '\t'.join(fields[4:])
                if wordmap['extras']:
                    wordmap['extras'] = '\t' + wordmap['extras']
                if args.verbose:
                    print("Guessed new para: %(new_para)r" % (wordmap))
                print("%(lemma)s\t%(new_para)r%(extras)s" % (wordmap),
                      file=outfile)
        finally:
            outfile.close()
    finally:
        infile.close()
    exit()