def write_to_cumulog(clargs=None): """Write config infos and accuracy measures to cumulative log""" inf = {} inf["run_id"] = prep.find_run_id() try: inf["revnum"] = prep.find_git_revnum() except OSError: print "- Can't get git revision number (OSError)" inf["revnum"] = "XXXXX" if clargs.comment is not None and clargs.comment != "": inf["run_comment"] = clargs.comment else: inf["run_comment"] = tc.COMMENT if clargs.maxdista is not None: inf["maxdista"] = clargs.maxdsita else: inf["maxdista"] = tc.maxdista if clargs.distaw is not None: inf["distaw"] = clargs.distaw else: inf["distaw"] = tc.distaw if clargs.lmw is not None: inf["lmw"] = clargs.lmw else: inf["lmw"] = tc.lmw if clargs.lmpath is not None: inf["lmpath"] = os.path.basename(clargs.lmpath) else: inf["lmpath"] = os.path.basename(tc.lmpath) if clargs.lm_window is not None: inf["lm_window"] = tc.lm_window else: inf["lm_window"] = tc.lm_window inf["increment_norm"] = tc.increment_norm inf["accept_all_IV_regex_outputs"] = tc.accept_all_IV_regex_outputs outhead = "Run ID [{0}], RevNum [{1}] {2}\n".format( inf["run_id"], inf["revnum"], "=" * 50) with codecs.open(tc.EVALFN.format(prep.find_run_id()), "r", "utf8") as done_res: with codecs.open(tc.CUMULOG, "a", "utf8") as cumu_res: cumu_res.write(outhead) cumu_res.write("RunComment: {}\n".format(inf["run_comment"])) for key in [ "maxdista", "distaw", "lmw", "lmpath", "increment_norm" ]: cumu_res.write("{}: {}\n".format(key, inf[key])) cumu_res.write("".join(done_res.readlines()[-4:]))
def write_out(corr_dico):
    """Write out the final hash in a format that matches reference output format"""
    # Reference output must follow the tweet-id order given in tc.id_order.
    with codecs.open(tc.id_order, "r", "utf8") as idor:
        ordered_ids = [line.strip() for line in idor.readlines()]
    with codecs.open(tc.OUTFN.format(prep.find_run_id()),
                     "w", "utf8") as outfh:
        for tweet_id in ordered_ids:
            if tweet_id not in corr_dico:
                continue
            outfh.write("%s\n" % tweet_id)
            # Each correction pair is (OOV token, chosen correction).
            for pair in corr_dico[tweet_id]:
                outfh.write("\t%s\t%s\n" % (pair[0], pair[1]))
def preliminary_preps():
    """Set up logger and read command line arguments.

    Returns a (logger, log-file-handler, parsed-args) tuple and applies
    any command-line overrides to the tnconfig (tc) module.
    """
    # logger
    logfile_name = os.path.join(tc.LOGDIR, "run_%s.log" % prep.find_run_id())
    lgr, lfh = prep.set_log(__name__, logfile_name, False)
    # cl options
    clargs = set_option_parser()
    # NOTE(review): the second branch tests "clargs is not None", which
    # looks like it was meant to be "clargs.tag is not None" — confirm.
    if clargs.tag is not None and clargs.tag:
        tc.TAG = True
    elif clargs is not None and not clargs.tag:
        tc.TAG = False
    # Fixed: these used to be "elif" branches chained onto the tag checks
    # above. Since one of the tag branches always matches, the option
    # overrides were unreachable (cf. the old TODO note). Independent
    # "if" statements let them actually update tc.
    if clargs.maxdista is not None:
        tc.maxdista = clargs.maxdista
    if clargs.distaw is not None:
        tc.distaw = clargs.distaw
    if clargs.lmw is not None:
        tc.lmw = clargs.lmw
    return lgr, lfh, clargs
# -*- coding: utf-8 -*- import codecs from collections import defaultdict import os import re import tnconfig as tc import preparation as prep # logging logfile_name = os.path.join(tc.LOGDIR, "run_%s.log" % prep.find_run_id()) lgr, lfh = prep.set_log(__name__, logfile_name, False) class EdScoreMatrix: """Methods to read cost matrix from module in arg cost_module and to find costs for individual character-edits.""" def __init__(self, cost_module): self.costm = cost_module row_names = None col_names = None matrix_conts = None accented_chars = [u'\xe1', u'\xe9', u'\xed', u'\xf1', u'\xf3', u'\xfa', u'\xfc'] matrix_stats = {"max" : None, "min" : None, "ave" : None} def read_cost_matrix(self): """Read cost matrix into a hash. Set instance values for them""" row_names = self.costm.row_names.strip().split("\t") col_names = self.costm.col_names.strip().split("\t") costs = self.costm.costs
import codecs
import inspect
import os
import logging
import psutil
import sys
import time

import preparation as prep
import tnconfig as tc

# logging: one log file per run; increase=True bumps the run id counter
logfile_name = os.path.join(tc.LOGDIR,
                            "run_%s.log" % prep.find_run_id(increase=True))
lgr, lfh = prep.set_log(__name__, logfile_name, False)


def check_server(port):
    """Check if Freeling server is running on port"""
    listening = False
    # Candidate processes: anything whose name starts with "analyze"
    # (the Freeling analyzer binary).
    # NOTE(review): get_process_list / get_connections / local_address are
    # pre-2.0 psutil APIs, removed in modern psutil — confirm the pinned
    # psutil version before upgrading.
    flprocs = [p for p in psutil.get_process_list()
               if p.name.startswith("analyze")]
    for flp in flprocs:
        flpcons = flp.get_connections()
        # local_address is an (ip, port) pair; compare the port only.
        if len(flpcons) > 0 and flpcons[0].local_address[1] == port:
            listening = True
            break
    if listening:
        return True
    return False


def start_server(servtype="default"):
def main():
    """Run the full normalization pipeline: load resources, normalize
    each tweet's OOV tokens, write results and evaluate against the
    reference annotations."""
    global lgr
    global tweet
    global clargs
    global ref_OOVs  # debug
    global all_tweets  # debug
    global safe_rules
    global rerules
    global abbrules
    global rinrules
    global ivs
    global ent_hash
    global entmgr
    global ppro
    global edimgr
    global stpwords
    global outdico
    global all_tweeto
    all_tweets = []  # debug
    # prep ---------------------------------------------------------------------
    lgr, lfh, clargs = preliminary_preps()
    # processing ---------------------------------------------------------------
    # Check if need to delete in-memory IV and entities dicos (if just changed config)
    #ok = raw_input("Need to reset the IV dictionary (if changed tc.merged_iv_and_entities)? [y] to reset\n")
    #if ok == "y":
    #    print "- Deleting 'ivs' (Imerged IV + ent) in current scope"
    #    delattr(sys.modules[__name__], "ivs")
    #    if "ivs_only" in dir(sys.modules["__main__"]):
    #        print "- Deleting 'ivs_only' (IV) in current scope"
    #        delattr(sys.modules[__name__], "ivs_only")
    corpusname = {True: "test", False: "dev"}
    print "Corpus: {0}".format(corpusname[tc.EVAL])
    print "Comment: {0}".format(tc.COMMENT)
    print "Start {0}".format(time.asctime(time.localtime()))
    print "Run ID: %s" % prep.find_run_id()
    try:
        lgr.info("Run {0} START | Rev [{1}] {2}".format(
            tc.RUNID, prep.find_git_revnum(), "=" * 60))
    except OSError:
        # No git available: log a placeholder revision instead.
        lgr.info("Run {0} START | Rev [{1}] {2}".format(
            tc.RUNID, "XXXX", "=" * 60))
    print "= main: preliminary preps"
    id_order = prep.find_id_order()
    ref_OOVs = prep.find_ref_OOVs(tc.ANNOTS)
    textdico = prep.grab_texts(tc.TEXTS)
    call_freeling(textdico)
    print "= main: load analyzers"
    ppro, safe_rules, rerules, abbrules, rinrules = load_preprocessing()
    ent_hash = load_entities()
    if tc.merge_iv_and_entities:
        # NOTE(review): "ivs" is read here before any assignment in this
        # function; it must already exist as a module-level global — confirm.
        ivs = merge_iv_and_entities(ivs, ent_hash)
    edimgr = load_distance_editor()
    slmmgr, binslm = load_lm()
    # NOTE(review): "lmmgr" is not defined in this function; load_lm()
    # returns "slmmgr". Possibly a stale module global or a typo — confirm.
    entmgr = load_entity_manager(ent_hash, ivs, edimgr, lmmgr)
    stpwords = stopwords.words('english')
    print "= twittero: creating Tweet instances"
    all_tweeto, outdico = parse_tweets(textdico)
    print "= main: create baseline"
    baseline_dico = get_baseline_results(all_tweeto)
    if not tc.BASELINE:
        print "= main: NORMALIZATION"
        x = 0  # number of tweets processed, for progress reporting
        for tid in all_tweeto:
            lgr.debug("NORMALIZING, TID [{0}]".format(tid))
            tweet = all_tweeto[tid]
            for tok in tweet.toks:
                # Only OOV tokens are normalized.
                if not isinstance(tok, OOV):
                    continue
                oov = tok  # easier label
                if tc.activate_prepro:
                    # separate prepro components switched on/off inside preprocess(oov)
                    preprocess(oov)
                if tc.use_ed:
                    create_edit_candidates(oov)
                find_lm_scores(oov)
                rank_candidates(oov)
                rank_before_entities(oov)
                if tc.use_entities:
                    cf_with_ent(oov)
            x += 1
            #if x == 10: break #debug
            if x % 100 == 0:
                print("Done {0} tweets, {1}".format(
                    x, time.asctime(time.localtime())))
        # Extra step to add more entity candidates
        if tc.use_lmall:
            print "= Adding extra entities, {0}".format(
                time.asctime(time.localtime()))
            add_extra_entities()
            print "= Done"
    #outdico = populate_outdico(all_tweeto, outdico) # old, now use populate_easy
    if tc.generic_workflow or tc.use_entities:
        # Doesn't cover all cases. Enough for paper-tests
        wf = "aft"
    else:
        wf = "bef"
    outdico = populate_easy(all_tweeto, outdico, wf)
    # write-out ----------------------------------------------------------------
    print "= writer"
    lgr.info("Writing out")
    if tc.BASELINE:
        chosen_outdico = baseline_dico
    else:
        chosen_outdico = outdico
    write_out(chosen_outdico)
    # evaluation ---------------------------------------------------------------
    print "= evaluation"
    lgr.info("Running evaluation")
    neval.main(tc.ANNOTS, tc.OUTFN.format(prep.find_run_id()))
    write_to_cumulog(clargs=clargs)
    lgr.removeHandler(lfh)
    print "End {0}".format(time.asctime(time.localtime()))
def write_to_cumulog(clargs=None): """Write config infos and accuracy measures to cumulative log""" global golden_set_res global all_tweeto inf = {} inf["run_id"] = prep.find_run_id() try: inf["revnum"] = prep.find_git_revnum() except OSError: print "- Can't get git revision number (OSError)" inf["revnum"] = "XXXXX" if clargs.comment is not None and clargs.comment != "": inf["run_comment"] = clargs.comment else: inf["run_comment"] = tc.COMMENT inf["generic_lev"] = tc.generic_lev if clargs.maxdista is not None: inf["maxdista"] = clargs.maxdsita else: inf["maxdista"] = tc.maxdista if clargs.distaw is not None: inf["distaw"] = clargs.distaw else: inf["distaw"] = tc.distaw if clargs.lmw is not None: inf["lmw"] = clargs.lmw else: inf["lmw"] = tc.lmw if clargs.lmpath is not None: inf["lmpath"] = os.path.basename(clargs.lmpath) else: inf["lmpath"] = os.path.basename(tc.lmpath) if clargs.lm_window is not None: inf["lm_window"] = tc.lm_window else: inf["lm_window"] = tc.lm_window inf["increment_norm"] = tc.increment_norm inf["accept_all_IV_regex_outputs"] = tc.accept_all_IV_regex_outputs inf["merge_iv_and_entities"] = tc.merge_iv_and_entities inf["accent_check_in_regexes"] = tc.accent_check_in_regexes if tc.EVAL: inf["corpus"] = "test" else: inf["corpus"] = "dev" golden_set_res = tnstats.hash_gold_standard(tc.ANNOTS) coverage_info, coverage_stats = tnstats.get_upper_bound( golden_set_res, all_tweeto.values()) envs_dico = {"W": "work", "H": "home", "S": "hslt-server"} inf["enviro"] = envs_dico[tc.ENV] wf_dico = {True: "lm_all", False: "lm_one"} inf["lm_app"] = wf_dico[tc.use_lmall] outhead = "== Run ID [{0}], RevNum [{1}] {2}\n".format( inf["run_id"], inf["revnum"], "=" * 48) with codecs.open(tc.EVALFN.format(prep.find_run_id()), "r", "utf8") as done_res: with codecs.open(tc.CUMULOG, "a", "utf8") as cumu_res: cumu_res.write(outhead) cumu_res.write("RunComment: {0}\n".format(inf["run_comment"])) for key in [ "enviro", "corpus", "lm_app", "generic_lev", "maxdista", "distaw", 
"accent_check_in_regexes", "lmw", "lmpath", "increment_norm", "accept_all_IV_regex_outputs", "merge_iv_and_entities" ]: cumu_res.write("- {0}: {1}\n".format(key, inf[key])) iso_cumu_settings_list = [ 'tc.no_postprocessing', 'tc.activate_prepro', 'tc.safelist_end', 'tc.abbrev_end', 'tc.use_regexes', 'tc.use_ed', 'tc.context_sens_ed', 'tc.use_entities' ] iso_cumu_settings_dict = dict( (name, eval(name)) for name in iso_cumu_settings_list) cumu_res.write("+ Isolating/Cumulative Module Settings +\n") for setting in iso_cumu_settings_dict: cumu_res.write("- {0}: {1}\n".format( setting, iso_cumu_settings_dict[setting])) cumu_res.write("+ Upper Bound +\n") for stat in coverage_stats: cumu_res.write("- {0}: {1}\n".format(stat, coverage_stats[stat])) cumu_res.write("".join(done_res.readlines()[-4:])) done_res.seek(0, 0) print "+ Results +" print "".join(done_res.readlines()[-4:])
def main():
    """Run the (simpler) normalization pipeline: load resources,
    normalize each tweet's OOV tokens, write results and evaluate
    against the reference annotations."""
    global lgr
    global tweet
    global clargs
    global ref_OOVs  # debug
    global all_tweets  # debug
    global safe_rules
    global rerules
    global ppro
    global edimgr
    global outdico
    all_tweets = []  # debug
    # prep ---------------------------------------------------------------------
    lgr, lfh, clargs = preliminary_preps()
    # processing ---------------------------------------------------------------
    print "Start {0}".format(time.asctime(time.localtime()))
    print "Run ID: %s" % prep.find_run_id()
    # NOTE(review): unlike the other pipeline driver, this call is not
    # wrapped in try/except OSError — it will fail if git is unavailable.
    lgr.info("Run {0} START | Rev [{1}] {2}".format(tc.RUNID,
                                                    prep.find_git_revnum(),
                                                    "=" * 60))
    print "= main: preliminary preps"
    id_order = prep.find_id_order()
    ref_OOVs = prep.find_ref_OOVs(tc.ANNOTS)
    textdico = prep.grab_texts(tc.TEXTS)
    call_freeling(textdico)
    print "= main: load analyzers"
    ppro, safe_rules, rerules = load_preprocessing()
    edimgr = load_distance_editor()
    slmmgr, binslm = load_lm()
    print "= twittero: creating Tweet instances"
    all_tweeto, outdico = parse_tweets(textdico)
    print "= main: create baseline"
    baseline_dico = get_baseline_results(all_tweeto)
    print "= main: NORMALIZATION"
    x = 0  # number of tweets processed, for progress reporting
    for tid in all_tweeto:
        lgr.debug("NORMALIZING, TID [{0}]".format(tid))
        tweet = all_tweeto[tid]
        for tok in tweet.toks:
            # Only OOV tokens are normalized.
            if not isinstance(tok, OOV):
                continue
            oov = tok  # easier label
            preprocess(oov)
            create_edit_candidates(oov)
            find_lm_scores(oov)
            rank_candidates(oov)
        x += 1
        if x % 100 == 0:
            print("Done {0} tweets, {1}".format(x, time.asctime(
                time.localtime())))
    outdico = populate_outdico(all_tweeto, outdico)
    # write-out ----------------------------------------------------------------
    print "= writer"
    lgr.info("Writing out")
    if tc.BASELINE:
        chosen_outdico = baseline_dico
    else:
        chosen_outdico = outdico
    write_out(chosen_outdico)
    # evaluation ---------------------------------------------------------------
    print "= evaluation"
    lgr.info("Running evaluation")
    neval.main(tc.ANNOTS, tc.OUTFN.format(prep.find_run_id()))
    write_to_cumulog(clargs=clargs)
    lgr.removeHandler(lfh)
    print "End {0}".format(time.asctime(time.localtime()))