# Esempio n. 1 (Example no. 1) — scraped example marker; score: 0
    def test_on_statistics(self):
        """Exercise the dashboard's on_statistics() event handler with sample data."""
        metrics = ParseMetrics()
        quality = ParseQuality()

        metrics.sentences = 10
        quality.sentences = 10

        metrics.average_parsed_ratio = Decimal("0.6")
        quality.quality = Decimal("0.4")

        node_list = [
            "connectors-DRK-connectors", "LG_ANY_all_parses",
            "POC-English-NoAmb-LEFT-WALL+period"
        ]
        self.dboard.on_statistics(node_list, metrics, quality)
    def __init__(self,
                 grmr: str,
                 tmpl: str,
                 limit: int,
                 parser: AbstractFileParserClient,
                 evt_handler: AbstractStatEventHandler = None):
        """
        Initialize the grammar tester.

        :param grmr:        Grammar root directory path.
        :param tmpl:        Template directory path with the rest of the mandatory files.
        :param limit:       Linkage limit handed to the parser.
        :param parser:      Parser instance (must be an AbstractFileParserClient).
        :param evt_handler: Optional statistics event handler.
        :raises GrammarTestError: If 'parser' is missing/invalid or 'evt_handler' is invalid.
        """
        if parser is None:
            raise GrammarTestError(
                "GrammarTestError: 'parser' argument can not be None.")

        if not isinstance(parser, AbstractFileParserClient):
            raise GrammarTestError(
                "GrammarTestError: 'parser' is not an instance of AbstractFileParserClient"
            )

        if evt_handler is not None and not isinstance(
                evt_handler, AbstractStatEventHandler):
            # Message prefix fixed: it previously said "ArgumentError:" even
            # though a GrammarTestError is raised (siblings above say
            # "GrammarTestError:").
            raise GrammarTestError(
                "GrammarTestError: 'evt_handler' is not an instance of AbstractStatEventHandler"
            )

        self._parser = parser
        self._event_handler = evt_handler
        self._grammar_root = grmr
        self._template_dir = tmpl
        self._linkage_limit = limit
        self._options = 0  # option bit field
        self._is_dir_corpus = False
        self._is_dir_dict = False
        self._total_metrics = ParseMetrics()
        self._total_quality = ParseQuality()
        self._total_files = 0
        self._total_dicts = 0
def test_grammar_cfg(conf_path: str) -> (Decimal, Decimal, Decimal):
    """
    Test grammar using configuration(s) from a JSON file

    :param conf_path:   Path to a configuration file
    :return:            Tuple (parseability, parse quality, PQA) of the last processed test.
    """
    pm, pq = ParseMetrics(), ParseQuality()

    try:
        cfgman = JsonFileConfigManager(conf_path)

        # Dashboard is optional: created only when the config file actually
        # defines a "dash-board" section.
        dboard = TextFileDashboard(cfgman) if len(
            cfgman.get_config("", "dash-board")) else None

        parser = LGInprocParser()

        # Get configuration parameters
        config = cfgman.get_config("", "grammar-tester")

        # Create GrammarTester instance
        tester = GrammarTester(handle_path_string(config[0][CONF_GRMR_PATH]),
                               handle_path_string(config[0][CONF_TMPL_PATH]),
                               config[0][CONF_LNK_LIMIT], parser, dboard)

        # Config file may have multiple configurations for one component
        for cfg in config:

            # Run grammar test
            pm, pq = tester.test(handle_path_string(cfg[CONF_DICT_PATH]),
                                 handle_path_string(cfg[CONF_CORP_PATH]),
                                 handle_path_string(cfg[CONF_DEST_PATH]),
                                 handle_path_string(cfg[CONF_REFR_PATH]),
                                 get_options(cfg))

        # Save dashboard data to whatever source the dashboard is bounded to.
        # Guarded: previously an unconditional call that raised
        # AttributeError whenever no dashboard was configured.
        if dboard is not None:
            dboard.update_dashboard()

    except Exception as err:
        print(str(err))

    # Return moved out of 'finally': a return there silently swallowed every
    # in-flight exception, including KeyboardInterrupt/SystemExit.
    # Also fixed: parse_quality was called with the metrics object (pm)
    # instead of the quality object (pq).
    return pm.parseability(pm), pq.parse_quality(pq), PQA(pm, pq)
    def _on_dict_file(self, dict_file_path: str, args: list) -> None:
        """
        Callback method which is called for each dictionary file.

        Resets the per-dictionary totals, generates an LG grammar directory
        from the .dict file, runs parsing over the corpus (single file or
        directory tree) and, for ULL output mode, writes a statistics
        summary and fires the on_statistics() event handler.

        :param dict_file_path: Path to a .dict file.
        :param args: Argument list (indexed by DICT_ARG_* constants).
        :return: None
        """
        # Reset totals accumulated for the previously processed dictionary.
        self._total_metrics, self._total_quality = ParseMetrics(
        ), ParseQuality()
        self._total_files = 0

        try:
            dict_path = os.path.split(dict_file_path)[0]
            corp_path = args[DICT_ARG_CORP]
            dest_path = args[DICT_ARG_OUTP]

            # Mirror the dictionary subdirectory structure underneath the
            # destination directory.
            dest_path += str(dict_path[len(args[DICT_ARG_DICT]):])

            # If BIT_LOC_LANG is set the language subdirectory is created in destination directory
            grmr_path = dest_path if self._options & BIT_LOC_LANG else self._grammar_root

            # Create new LG dictionary using .dict file and template directory with the rest of mandatory files.
            lang_path = create_grammar_dir(dict_file_path, grmr_path,
                                           self._template_dir, self._options)

            # The corpus may be a single file or a whole directory tree.
            if os.path.isfile(corp_path):
                self._on_corpus_file(corp_path, [dest_path, lang_path] + args)

            elif os.path.isdir(corp_path):
                traverse_dir_tree(
                    corp_path, "",
                    [self._on_corpus_file, dest_path, lang_path] + args,
                    [self._on_corp_dir, dest_path, lang_path] + args, True)

            # If output format is set to ULL
            if not self._options & BIT_OUTPUT:
                # stat_suffix = "2" if (self._options & BIT_LG_EXE) == BIT_LG_EXE else ""
                stat_path = dest_path + "/" + os.path.split(
                    corp_path)[1] + ".stat"  #+ stat_suffix

                # Write statistics summary to a file
                self._save_stat(stat_path, self._total_metrics,
                                self._total_quality)

                # Invoke on_statistics() event handler
                if self._is_dir_dict and self._event_handler is not None:

                    # Path components are passed leaf-first (reversed split).
                    self._event_handler.on_statistics(
                        (dict_path.split("/"))[::-1], self._total_metrics,
                        self._total_quality)

        except Exception as err:
            # Broad catch keeps a multi-dictionary traversal going after a
            # single failure; the error is only reported to stdout.
            print("_on_dict_file(): " + str(type(err)) + ": " + str(err))

        self._total_dicts += 1
# Esempio n. 5 (Example no. 5) — scraped example marker; score: 0
    def on_statistics(self, nodes: list, metrics: ParseMetrics,
                      quality: ParseQuality):
        """
        Fill dashboard cells with parse statistics of a completed test.

        Row/column keys are rendered from 'nodes' with the configured format
        strings, mapped to index lists, and every addressed cell receives the
        value string produced by the configured value-key template.

        :param nodes:   List of path components used to build row/column keys.
        :param metrics: ParseMetrics of the completed test.
        :param quality: ParseQuality of the completed test.
        """

        # Return if dashboard is not configured.
        if self._config is None:
            return

        row_ind, col_ind = None, None

        try:
            # Get row and column keys
            row_key = self._config[CONF_ROW_KEY].format(*nodes)
            col_key = self._config[CONF_COL_KEY].format(*nodes)

            # Get row and column indexes
            row_ind = self._config[CONF_ROW_IND][row_key]
            col_ind = self._config[CONF_COL_IND][col_key]

        # Diagnostics prefix fixed: messages were misspelled "on_statatistics".
        except IndexError as err:
            print("on_statistics(): IndexError: " + str(err))
            return

        except KeyError as err:
            print("on_statistics(): KeyError: " + str(err))
            return

        for row in row_ind:
            for col in col_ind:

                val_str = None

                try:
                    # Get value key string by column index
                    val_str = self._config[CONF_VAL_KEYS][col].format(
                        nodes=nodes,
                        parseability=metrics.parseability_str(metrics),
                        parsequality=quality.parse_quality_str(quality),
                        PQA=PQA_str(metrics, quality))

                except IndexError as err:
                    print("on_statistics():2: IndexError: " + str(err))
                    continue

                except KeyError as err:
                    print("on_statistics():2: KeyError: " + str(err))
                    continue

                # Put value into the table
                self.set_cell_by_indexes(row, col, val_str)
    def _save_stat(stat_path: str, metrics: ParseMetrics,
                   quality: ParseQuality) -> None:
        """
        Save statistic estimation results into a file.

        NOTE(review): declared without 'self' — presumably intended as a
        @staticmethod (it is invoked as self._save_stat(path, m, q));
        confirm the decorator is present in the class body.

        :param stat_path:   Path to file, or None to print to stdout.
        :param metrics:     ParseMetrics class pointer.
        :param quality:     ParseQuality class pointer.
        :return:            None
        """
        stat_file_handle = None

        try:
            stat_file_handle = sys.stdout if stat_path is None else open(
                stat_path, "w", encoding="utf-8")

            print(ParseMetrics.text(metrics), file=stat_file_handle)
            print(ParseQuality.text(quality), file=stat_file_handle)

            # Guard both divisors: the formula divides by quality.sentences
            # too, which the original check ignored.
            print("PQA:\t{0:2.2f}%".format((
                metrics.average_parsed_ratio / metrics.sentences *
                quality.quality / quality.sentences *
                Decimal('100.0'))
                  if metrics.sentences and quality.sentences else 0.0),
                  file=stat_file_handle)

        # Handler order fixed: in Python 3, IOError is an alias of OSError,
        # so the original leading 'except IOError' made the more specific
        # FileNotFoundError/OSError handlers unreachable dead code.
        except FileNotFoundError as err:
            print("FileNotFoundError: " + str(err))

        except OSError as err:
            print("OSError: " + str(err))

        except Exception as err:
            print("Exception: " + str(err))

        finally:
            # Never close the shared stdout stream.
            if stat_file_handle is not None and stat_file_handle != sys.stdout:
                stat_file_handle.close()
def parse_metrics(tokens: list) -> ParseMetrics:
    """
    Calculate percentage of successfully linked tokens. Token in square brackets considered to be unlinked.

    :param tokens: List of tokens.
    :return: ParseMetrics
    """
    pm = ParseMetrics()

    end_token = len(tokens)

    # Nothing to calculate if no tokens found
    if end_token == 0:
        return pm

    # Skip a leading "###...###" wall token if present.
    start_token = 0 if not tokens[0].startswith("###") else 1

    # Strip trailing wall/period tokens. The 'end_token > start_token' guard
    # is a fix: without it a token list consisting solely of wall/period
    # tokens made the index go negative, wrap around, and eventually raise
    # IndexError.
    while end_token > start_token and (
            tokens[end_token - 1].startswith("###")
            or tokens[end_token - 1] == "."
            or tokens[end_token - 1] == "[.]"):
        end_token -= 1

    total = end_token - start_token

    # Only wall/period tokens were present - nothing to estimate.
    if not total:
        return pm

    # Count unlinked tokens; they are rendered in square brackets.
    unlinked = sum(1 for i in range(start_token, end_token)
                   if tokens[i].startswith("["))

    if unlinked == 0:
        # Every token is linked - the sentence is completely parsed.
        pm.completely_parsed_ratio = Decimal("1.0")
        pm.average_parsed_ratio = Decimal("1.0")
    else:
        pm.average_parsed_ratio = Decimal(
            "1.0") - Decimal(unlinked) / Decimal(total)

    if total == unlinked:
        # Not a single token is linked.
        pm.completely_unparsed_ratio = Decimal("1.0")

    return pm
    def test_string_keys(self):
        """Sanity-check the str.format() keying styles used for dashboard keys."""
        nodes = ["AFC", "BZ"]
        print(nodes)
        print(tuple(nodes))

        # Positional indexing vs. unpacked positional arguments.
        print("{0[0]}{0[1]}".format(nodes))
        print("{1}--{0}".format(*nodes))

        metrics = ParseMetrics()
        metrics.average_parsed_ratio = Decimal("0.66")
        metrics.sentences = 3

        # Keyword arguments: repeated keys and indexed keyword sequences.
        print("{sentences}>>{sentences}>>{parseability}".format(
            parseability=metrics.parseability(metrics),
            sentences=metrics.sentences))
        print("{nodes[2]}{nodes[1]}{nodes[0]}".format(nodes=["A", "B", "C"]))
        print("{nodes[1]}>>{sentences}>>{parseability}".format(
            parseability=metrics.parseability(metrics),
            sentences=metrics.sentences,
            nodes=["A", "B", "C"]))
# Esempio n. 9 (Example no. 9) — scraped example marker; score: 0
    def parse(self, dict_path: str, corpus_path: str, output_path: str, ref_path: str, options: int) \
            -> (ParseMetrics, ParseQuality):
        """
        Link Grammar API parser invocation routine.

        Parses every sentence of the corpus file with the given dictionary,
        prints parses in the format selected by 'options' and accumulates
        per-corpus parseability/parse-quality statistics.

        :param dict_path:       Dictionary file or directory path.
        :param corpus_path:     Corpus file or directory path.
        :param output_path:     Output file or directory path.
        :param ref_path:        Reference file or directory path.
        :param options:         Bit field. See `optconst.py` for details
        :return:                Tuple (ParseMetrics, ParseQuality)
        """
        input_file_handle = None
        output_file_handle = None

        # Reference parses used for parse-quality estimation (BIT_PARSE_QUALITY).
        ref_parses = []

        # Sentence statistics variables
        total_metrics, total_quality = ParseMetrics(), ParseQuality()

        sentence_count = 0                  # number of sentences in the corpus

        print("Info: Parsing a corpus file: '" + corpus_path + "'")
        print("Info: Using dictionary: '" + dict_path + "'")

        if output_path is not None:
            print("Info: Parses are saved in: '" + output_path+get_output_suffix(options) + "'")
        else:
            print("Info: Output file name is not specified. Parses are redirected to 'stdout'.")

        try:
            # Load reference parses only when quality estimation is requested
            # and a reference file was provided; failures are non-fatal.
            if options & BIT_PARSE_QUALITY and ref_path is not None:
                try:
                    data = load_ull_file(ref_path)
                    ref_parses = get_parses(data, (options & BIT_NO_LWALL) == BIT_NO_LWALL, False)

                except Exception as err:
                    print("Exception: " + str(err))

            # Matches ULL link lines, which start with a digit.
            link_line = re.compile(r"\A[0-9].+")

            po = ParseOptions(min_null_count=0, max_null_count=999)
            po.linkage_limit = self._linkage_limit

            di = Dictionary(dict_path)

            input_file_handle = open(corpus_path)
            output_file_handle = sys.stdout if output_path is None \
                                            else open(output_path+get_output_suffix(options), "w")

            for line in input_file_handle:

                # Filter out links when ULL parses are used as input
                if options & BIT_ULL_IN > 0 and link_line.match(line):
                    continue

                # Skip empty lines to get proper statistics estimation and skip commented lines
                if len(line.strip()) < 1:  # or line.startswith("#"):
                    continue

                # Tokenize and parse the sentence
                sent = Sentence(line, di, po)
                linkages = sent.parse()

                # Per-sentence accumulators, folded into the totals below.
                sent_metrics, sent_quality = ParseMetrics(), ParseQuality()
                linkage_count = 0

                for linkage in linkages:

                    # Only the first linkage is counted.
                    if linkage_count == 1:
                        break

                    if (options & BIT_OUTPUT_DIAGRAM) == BIT_OUTPUT_DIAGRAM:
                        print(linkage.diagram(), file=output_file_handle)

                    elif (options & BIT_OUTPUT_POSTSCRIPT) == BIT_OUTPUT_POSTSCRIPT:
                        print(linkage.postscript(), file=output_file_handle)

                    elif (options & BIT_OUTPUT_CONST_TREE) == BIT_OUTPUT_CONST_TREE:
                        print(linkage.constituent_tree(), file=output_file_handle)

                    # No explicit output format bit set: ULL output mode.
                    elif not (options & BIT_OUTPUT):

                        tokens, links = parse_postscript(linkage.postscript().replace("\n", ""), options,
                                                         output_file_handle)

                        # Print ULL formated parses
                        print_output(tokens, links, options, output_file_handle)

                        # Calculate parseability
                        sent_metrics += parse_metrics(prepare_tokens(tokens, options))

                        # Calculate parse quality if the option is set
                        # NOTE(review): indexes ref_parses by sentence_count —
                        # assumes the reference file and corpus are aligned
                        # line-for-line; a shorter reference raises IndexError.
                        if options & BIT_PARSE_QUALITY and len(ref_parses):
                            sent_quality += parse_quality(get_link_set(tokens, links, options),
                                                          ref_parses[sentence_count][1])

                    linkage_count += 1

                # Sanity bounds on per-sentence ratios (Decimal vs float compare).
                assert sent_metrics.average_parsed_ratio <= 1.0, "sent_metrics.average_parsed_ratio > 1.0"
                assert sent_quality.quality <= 1.0, "sent_quality.quality > 1.0"

                total_metrics += sent_metrics
                total_quality += sent_quality

                # if not linkage_count:
                #     sent_metrics.completely_unparsed_ratio += 1

                sentence_count += 1

            total_metrics.sentences = sentence_count
            total_quality.sentences = sentence_count

            # Prevent interleaving "Dictionary close" messages
            ParseOptions(verbosity=0)

        except LG_DictionaryError as err:
            print("LG_DictionaryError: " + str(err))

        except LG_Error as err:
            print("LG_Error: " + str(err))

        except IOError as err:
            print("IOError: " + str(err))

        # NOTE(review): in Python 3 FileNotFoundError is a subclass of
        # OSError/IOError, so this handler is unreachable after the one above.
        except FileNotFoundError as err:
            print("FileNotFoundError: " + str(err))

        finally:
            if input_file_handle is not None:
                input_file_handle.close()

            if output_file_handle is not None and output_file_handle != sys.stdout:
                output_file_handle.close()

            # NOTE(review): returning from 'finally' swallows any in-flight
            # exception raised above — confirm this best-effort behavior is
            # intentional.
            return total_metrics, total_quality