def __init__(self, grmr: str, tmpl: str, limit: int, parser: AbstractFileParserClient, evt_handler: AbstractStatEventHandler = None):
    """
    Initialize the grammar tester.

    :param grmr: Grammar root directory path.
    :param tmpl: Template directory path.
    :param limit: Linkage limit handed over to the parser.
    :param parser: Mandatory file parser client instance.
    :param evt_handler: Optional statistics event handler.
    :raises GrammarTestError: If 'parser' is missing or either argument has the wrong type.
    """
    # 'parser' is mandatory and must implement the expected client interface.
    if parser is None:
        raise GrammarTestError("GrammarTestError: 'parser' argument can not be None.")

    if not isinstance(parser, AbstractFileParserClient):
        raise GrammarTestError("GrammarTestError: 'parser' is not an instance of AbstractFileParserClient")

    # 'evt_handler' is optional, but when supplied it must have the proper type.
    if evt_handler is not None and not isinstance(evt_handler, AbstractStatEventHandler):
        raise GrammarTestError("ArgumentError: 'evt_handler' is not an instance of AbstractStatEventHandler")

    self._parser = parser
    self._event_handler = evt_handler
    self._grammar_root = grmr
    self._template_dir = tmpl
    self._linkage_limit = limit
    self._options = 0               # option bit field
    self._is_dir_corpus = False
    self._is_dir_dict = False

    # Running totals accumulated across processed files/dictionaries.
    self._total_metrics = ParseMetrics()
    self._total_quality = ParseQuality()
    self._total_files = 0
    self._total_dicts = 0
def eval_parses(test_parses: list, ref_parses: list, verbose: bool, ofile=sys.stdout) -> ParseQuality:
    """
    Compare test parses against reference parses link by link, counting errors.

    :param test_parses: List of test parses in the format prepared by get_parses.
    :param ref_parses: List of reference parses.
    :param verbose: When True, per-sentence quality is printed to stdout.
    :param ofile: Output file handle for the summary line.
    :return: ParseQuality class instance filled with the result data.
    :raises EvalError: If the two lists differ in length or sentence order.
    """
    sentence_total = len(ref_parses)    # number of linkages in the gold standard
    accumulated = ParseQuality()

    if sentence_total != len(test_parses):
        raise EvalError("Error: files don't contain same parses. "
                        "Number of sentences missmatch. Ref={}, Test={}".format(
                            len(ref_parses), len(test_parses)))

    for ref_parse, test_parse in zip(ref_parses, test_parses):
        # Both files must list the very same sentences in the same order.
        if ref_parse[PARSE_SENTENCE] != test_parse[PARSE_SENTENCE]:
            raise EvalError("Error: Something went wrong. Sentences missmatch."
                            + ref_parse[PARSE_SENTENCE] + "\n" + test_parse[PARSE_SENTENCE])

        sentence_quality = parse_quality(test_parse[PARSE_LINK_SET], ref_parse[PARSE_LINK_SET])
        sentence_quality.ignored += test_parse[PARSE_IGNORED]

        if verbose:
            print(ParseQuality.text(sentence_quality), file=sys.stdout)

        accumulated += sentence_quality

    # Average over all sentences when there is more than one.
    if sentence_total > 1:
        accumulated /= float(sentence_total)

    print(ParseQuality.text(accumulated), file=ofile)

    return accumulated
def _on_dict_file(self, dict_file_path: str, args: list) -> None:
    """
    Callback method which is called for each dictionary file.

    :param dict_file_path: Path to a .dict file.
    :param args: Argument list, indexed by the DICT_ARG_* constants
                 (corpus path, output path, dictionary root path).
    :return: None
    """
    # Reset per-dictionary totals before each run.
    self._total_metrics, self._total_quality = ParseMetrics(), ParseQuality()
    self._total_files = 0

    try:
        dict_path = os.path.split(dict_file_path)[0]

        corp_path = args[DICT_ARG_CORP]
        dest_path = args[DICT_ARG_OUTP]

        # Mirror the dictionary's subdirectory structure under the destination path.
        dest_path += str(dict_path[len(args[DICT_ARG_DICT]):])

        # If BIT_LOC_LANG is set the language subdirectory is created in destination directory
        grmr_path = dest_path if self._options & BIT_LOC_LANG else self._grammar_root

        # Create new LG dictionary using .dict file and template directory with the rest of mandatory files.
        lang_path = create_grammar_dir(dict_file_path, grmr_path, self._template_dir, self._options)

        # The corpus argument may be a single file or a whole directory tree.
        if os.path.isfile(corp_path):
            self._on_corpus_file(corp_path, [dest_path, lang_path] + args)
        elif os.path.isdir(corp_path):
            traverse_dir_tree(corp_path, "",
                              [self._on_corpus_file, dest_path, lang_path] + args,
                              [self._on_corp_dir, dest_path, lang_path] + args, True)

        # If output format is set to ULL
        if not self._options & BIT_OUTPUT:
            # stat_suffix = "2" if (self._options & BIT_LG_EXE) == BIT_LG_EXE else ""
            stat_path = dest_path + "/" + os.path.split(corp_path)[1] + ".stat"  # + stat_suffix

            # Write statistics summary to a file
            self._save_stat(stat_path, self._total_metrics, self._total_quality)

        # Invoke on_statistics() event handler
        if self._is_dir_dict and self._event_handler is not None:
            self._event_handler.on_statistics((dict_path.split("/"))[::-1],
                                              self._total_metrics, self._total_quality)

    except Exception as err:
        # Best effort: log and continue so one broken dictionary does not stop the batch.
        print("_on_dict_file(): " + str(type(err)) + ": " + str(err))

    self._total_dicts += 1
def test_on_statistics(self):
    """Feed one metrics/quality pair into the dashboard under test."""
    metrics = ParseMetrics()
    quality = ParseQuality()

    metrics.sentences = 10
    quality.sentences = 10
    metrics.average_parsed_ratio = Decimal("0.6")
    quality.quality = Decimal("0.4")

    node_path = [
        "connectors-DRK-connectors",
        "LG_ANY_all_parses",
        "POC-English-NoAmb-LEFT-WALL+period",
    ]

    self.dboard.on_statistics(node_path, metrics, quality)
def on_statistics(self, nodes: list, metrics: ParseMetrics, quality: ParseQuality): # Return if dashboard is not configured. if self._config is None: return # row_key, col_key = None, None row_ind, col_ind = None, None try: # Get row and column keys row_key = self._config[CONF_ROW_KEY].format(*nodes) col_key = self._config[CONF_COL_KEY].format(*nodes) # Get row and column indexes row_ind = self._config[CONF_ROW_IND][row_key] col_ind = self._config[CONF_COL_IND][col_key] except IndexError as err: print("on_statatistics(): IndexError: " + str(err)) return except KeyError as err: print("on_statatistics(): KeyError: " + str(err)) return for row in row_ind: for col in col_ind: val_str = None try: # Get value key string by column index val_str = self._config[CONF_VAL_KEYS][col].format( nodes=nodes, parseability=metrics.parseability_str(metrics), parsequality=quality.parse_quality_str(quality), PQA=PQA_str(metrics, quality)) except IndexError as err: print("on_statatistics():2: IndexError: " + str(err)) continue except KeyError as err: print("on_statatistics():2: KeyError: " + str(err)) continue # Put value into the table self.set_cell_by_indexes(row, col, val_str)
def test_grammar_cfg(conf_path: str) -> (Decimal, Decimal, Decimal):
    """
    Test grammar using configuration(s) from a JSON file.

    :param conf_path: Path to a configuration file.
    :return: Tuple (parseability, parse quality, PQA) of the last processed test.
    """
    pm, pq = ParseMetrics(), ParseQuality()

    try:
        cfgman = JsonFileConfigManager(conf_path)

        # Dashboard is optional: create one only when it is configured.
        # dboard = HTMLFileDashboard(cfgman)
        dboard = TextFileDashboard(cfgman) if len(cfgman.get_config("", "dash-board")) else None

        parser = LGInprocParser()

        # Get configuration parameters
        config = cfgman.get_config("", "grammar-tester")

        # Create GrammarTester instance
        tester = GrammarTester(handle_path_string(config[0][CONF_GRMR_PATH]),
                               handle_path_string(config[0][CONF_TMPL_PATH]),
                               config[0][CONF_LNK_LIMIT], parser, dboard)

        # Config file may have multiple configurations for one component
        for cfg in config:
            # Run grammar test
            pm, pq = tester.test(handle_path_string(cfg[CONF_DICT_PATH]),
                                 handle_path_string(cfg[CONF_CORP_PATH]),
                                 handle_path_string(cfg[CONF_DEST_PATH]),
                                 handle_path_string(cfg[CONF_REFR_PATH]),
                                 get_options(cfg))

        # Save dashboard data to whatever source the dashboard is bounded to.
        # Fixed: the call was unguarded although 'dboard' may be None.
        if dboard is not None:
            dboard.update_dashboard()

    except Exception as err:
        print(str(err))

    finally:
        # Deliberately swallow the exception and return the last obtained results.
        # Fixed: parse quality was computed from 'pm' instead of 'pq'.
        return pm.parseability(pm), pq.parse_quality(pq), PQA(pm, pq)
def _save_stat(stat_path: str, metrics: ParseMetrics, quality: ParseQuality) -> None: """ Save statistic estimation results into a file. :param stat_path: Path to file. :param metrics: ParseMetrics class pointer. :param quality: ParseQulality class pointer. :return: None """ stat_file_handle = None try: stat_file_handle = sys.stdout if stat_path is None else open( stat_path, "w", encoding="utf-8") print(ParseMetrics.text(metrics), file=stat_file_handle) print(ParseQuality.text(quality), file=stat_file_handle) print("PQA:\t{0:2.2f}%".format(( metrics.average_parsed_ratio / metrics.sentences * quality.quality / quality.sentences * Decimal('100.0')) if metrics.sentences else 0.0), file=stat_file_handle) except IOError as err: print("IOError: " + str(err)) except FileNotFoundError as err: print("FileNotFoundError: " + str(err)) except OSError as err: print("OSError: " + str(err)) except Exception as err: print("Exception: " + str(err)) finally: if stat_file_handle is not None and stat_file_handle != sys.stdout: stat_file_handle.close()
def parse_quality(test_set: set, ref_set: set) -> ParseQuality:
    """
    Estimate parse quality by comparing a link set against a reference link set.

    :param test_set: Set of links being tested.
    :param ref_set: Reference set of links.
    :return: ParseQuality instance filled with calculated values.
    """
    quality = ParseQuality()
    reference_size = len(ref_set)

    # An empty reference set leaves every counter at its default value.
    if reference_size:
        quality.total = reference_size
        quality.missing = len(ref_set.difference(test_set))
        quality.extra = len(test_set.difference(ref_set))
        quality.quality = Decimal(len(ref_set.intersection(test_set))) / Decimal(reference_size)

    return quality
def parse(self, dict_path: str, corpus_path: str, output_path: str, ref_path: str, options: int) \
        -> (ParseMetrics, ParseQuality):
    """
    Link Grammar API parser invokation routine.

    :param dict_path: Dictionary file or directory path.
    :param corpus_path: Corpus file or directory path.
    :param output_path: Output file or directory path; None redirects parses to stdout.
    :param ref_path: Reference file or directory path (used only with BIT_PARSE_QUALITY).
    :param options: Bit field. See `optconst.py` for details
    :return: Tuple (ParseMetrics, ParseQuality)
    """
    input_file_handle = None
    output_file_handle = None

    ref_parses = []

    # Sentence statistics variables
    total_metrics, total_quality = ParseMetrics(), ParseQuality()

    sentence_count = 0                  # number of sentences in the corpus

    print("Info: Parsing a corpus file: '" + corpus_path + "'")
    print("Info: Using dictionary: '" + dict_path + "'")

    if output_path is not None:
        print("Info: Parses are saved in: '" + output_path + get_output_suffix(options) + "'")
    else:
        print("Info: Output file name is not specified. Parses are redirected to 'stdout'.")

    try:
        # Load reference parses up front when parse quality estimation is requested.
        if options & BIT_PARSE_QUALITY and ref_path is not None:
            try:
                data = load_ull_file(ref_path)
                ref_parses = get_parses(data, (options & BIT_NO_LWALL) == BIT_NO_LWALL, False)
            except Exception as err:
                # Quality estimation degrades gracefully: parsing proceeds without it.
                print("Exception: " + str(err))

        # Lines starting with a digit are link lines in ULL-formatted input.
        link_line = re.compile(r"\A[0-9].+")

        po = ParseOptions(min_null_count=0, max_null_count=999)
        po.linkage_limit = self._linkage_limit

        di = Dictionary(dict_path)

        input_file_handle = open(corpus_path)
        output_file_handle = sys.stdout if output_path is None \
            else open(output_path + get_output_suffix(options), "w")

        for line in input_file_handle:

            # Filter out links when ULL parses are used as input
            if options & BIT_ULL_IN > 0 and link_line.match(line):
                continue

            # Skip empty lines to get proper statistics estimation and skip commented lines
            if len(line.strip()) < 1:  # or line.startswith("#"):
                continue

            # Tokenize and parse the sentence
            sent = Sentence(line, di, po)
            linkages = sent.parse()

            sent_metrics, sent_quality = ParseMetrics(), ParseQuality()
            linkage_count = 0

            for linkage in linkages:

                # Only the first linkage is counted.
                if linkage_count == 1:
                    break

                # Emit the linkage in whichever output format the option bits select.
                if (options & BIT_OUTPUT_DIAGRAM) == BIT_OUTPUT_DIAGRAM:
                    print(linkage.diagram(), file=output_file_handle)
                elif (options & BIT_OUTPUT_POSTSCRIPT) == BIT_OUTPUT_POSTSCRIPT:
                    print(linkage.postscript(), file=output_file_handle)
                elif (options & BIT_OUTPUT_CONST_TREE) == BIT_OUTPUT_CONST_TREE:
                    print(linkage.constituent_tree(), file=output_file_handle)
                elif not (options & BIT_OUTPUT):
                    tokens, links = parse_postscript(linkage.postscript().replace("\n", ""),
                                                     options, output_file_handle)

                    # Print ULL formated parses
                    print_output(tokens, links, options, output_file_handle)

                    # Calculate parseability
                    sent_metrics += parse_metrics(prepare_tokens(tokens, options))

                    # Calculate parse quality if the option is set
                    if options & BIT_PARSE_QUALITY and len(ref_parses):
                        sent_quality += parse_quality(get_link_set(tokens, links, options),
                                                      ref_parses[sentence_count][1])

                linkage_count += 1

            assert sent_metrics.average_parsed_ratio <= 1.0, "sent_metrics.average_parsed_ratio > 1.0"
            assert sent_quality.quality <= 1.0, "sent_quality.quality > 1.0"

            total_metrics += sent_metrics
            total_quality += sent_quality

            # if not linkage_count:
            #     sent_metrics.completely_unparsed_ratio += 1

            sentence_count += 1

        total_metrics.sentences = sentence_count
        total_quality.sentences = sentence_count

        # Prevent interleaving "Dictionary close" messages
        ParseOptions(verbosity=0)

    except LG_DictionaryError as err:
        print("LG_DictionaryError: " + str(err))
    except LG_Error as err:
        print("LG_Error: " + str(err))
    except IOError as err:
        print("IOError: " + str(err))
    except FileNotFoundError as err:
        print("FileNotFoundError: " + str(err))
    finally:
        # Always release file handles, but never close the shared stdout.
        if input_file_handle is not None:
            input_file_handle.close()

        if output_file_handle is not None and output_file_handle != sys.stdout:
            output_file_handle.close()

        return total_metrics, total_quality
def compare_ull_files(test_path, ref_file, verbose, ignore_left_wall) -> ParseQuality:
    """
    Initiate evaluation process for one or multiple files.

    :param test_path: Path to file(s) to be tested.
    :param ref_file: Path to reference file(s).
    :param verbose: Boolean value which enables intermediate result output if set to True.
    :param ignore_left_wall: Boolean value which tells the script to ignore LEFT-WALL
                             and period links if set to True.
    :return: ParseQuality class instance holding parse quality results for the whole corpus
             (all files if test_path is a directory name).
    """
    total_parse_quality = ParseQuality()
    total_file_count = 0

    def evaluate(test_file):
        """ Callback evaluation function """
        nonlocal total_parse_quality
        nonlocal total_file_count

        print("\nComparing parses:")
        print("-----------------")
        print("File being tested: '" + test_file + "'")
        print("Reference file : '" + ref_file + "'")

        suffix = "" if test_file[-1] != "2" else "2"
        out_file = test_file + ".stat" + suffix
        print("Result file : '" + out_file + "'")

        # Append to an existing .stat file, create it otherwise.
        mode = "a" if os.path.isfile(out_file) else "w"

        try:
            ref_data = load_ull_file(ref_file)
            ref_parses = get_parses(ref_data, ignore_left_wall)

            test_data = load_ull_file(test_file)
            test_parses = get_parses(test_data, ignore_left_wall)

            with open(out_file, mode) as ofile:
                print("Reference file : '" + ref_file + "'", file=ofile)
                total_parse_quality += eval_parses(test_parses, ref_parses, verbose, ofile)

            total_file_count += 1

        except IOError as err:
            print("IOError: " + str(err))
        except Exception as err:
            print("Exception: " + str(err))

    try:
        # If specified name is a file.
        if os.path.isfile(test_path):
            evaluate(test_path)

        # If specified name is a directory.
        elif os.path.isdir(test_path):
            traverse_dir(test_path, ".ull2", evaluate, None, True)

        # If file or directory does not exist.
        else:
            # Fixed: previously raised a plain string, which itself raises
            # "TypeError: exceptions must derive from BaseException" and loses the message.
            raise FileNotFoundError("Error: File or directory '" + test_path + "' does not exist.")

        # Average quality over all evaluated files.
        if total_file_count > 1:
            total_parse_quality /= float(total_file_count)

    except IOError as err:
        print("IOError: " + str(err))
    except Exception as err:
        print("Exception: " + str(err))
    finally:
        # Deliberately swallow errors and return whatever was accumulated.
        return total_parse_quality