Example #1
def main(corpus, rules, options, iteration, display):
    """
    Finds all tokens that match a given rule, using that to rate the rules
    performance. Rates rule performance by totalling up the tokens that
    match the rule, and comparing that to how many of the tokens in that set
    are considered to be a PN.
    Args:
        corpus = Set of all lines from the corpus.
        rules = RuleSet object of all currently used rules.
        options = values pulled from the configuration file.

    Returns:
        None

    Raises:
        None

    """

    alpha = options.alpha
    k = options.k
    accept_threshold = options.accept_threshold

    for i, rule in enumerate(rules, start=1):
        names = namesfromrule.main(corpus, rule)
        rateRulePerformance(names, rule, alpha, k, accept_threshold)

        display.update_progress_bar((len(rules) * (iteration - 1)) + i,
                                    len(rules) * options.iterations)
Example #2
def get_new_names(corpus, names, rules):
    """
    Meant to use the provided ruleset to scan the corpus for new names.
    It will then return the names in quesiton, which will be used
    to generate more rules.

	Basically, it grabs all tokens from the corpus matching the rules in
	question and then return them as a set.  The names parameter lets you
	specify tokens that are already recognized as names, allowing you to
	retrieve only new name results.

    Args:
        corpus (set): Set of Token objects representing the entire Garshana
            corpus.
        names (set): Set of Tokens already recognized as names.
        rules (set): Set of Rule objects used to find new names.

    Returns:
        new_names (set): Set of Token objects not already present in names.

    Raises:
        None

    """

    new_names = set()

    for rule in rules:
        results = namesfromrule.main(corpus, rule)
        for name in results:
            if name not in names:
                new_names.add(name)

    return new_names
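For context, a minimal usage sketch for get_new_names, borrowing the Display object and the ner.import_corpus / ner.import_seed_rules helpers that appear in Example #4 (the paths are the test fixtures used there):

display = Display()

display.start()
corpus = ner.import_corpus('tests/test_corpus.csv', display)
display.finish()

display.start()
seed_rules = ner.import_seed_rules('tests/test_seed_rules.csv', display)
display.finish()

# No names are known yet, so every rule match comes back as new.
known_names = set()
new_names = get_new_names(corpus, known_names, seed_rules)

# On later passes, only tokens outside the known set are returned.
known_names |= new_names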
Example #3
def main(tokens, rules):
    """
    This will update the strength of all tokens it is given, using
        the rules it is given.
    Args:
        tokens (set): Set object of Token objects.
        rules (set): Set object of Rule objects.

    Returns:
        None

    Raises:
        ValueError

    """

    for rule in rules:
        names = namesfromrule.main(tokens, rule)

        for name in names:
            # Combine probabilities assuming the rules are independent:
            # the token remains a non-name only if the current rule and
            # all previously applied rules are wrong, so the new
            # probability is 1 minus the product of the failure
            # probabilities.
            initialprob = name.name_probability

            if ((initialprob < 0) or (initialprob > 1)):
                raise ValueError(
                    "Token \"" + str(name) +
                    "\" has impossible name probability (v < 0 or v > 1): " +
                    str(initialprob))

            addedprob = rule.strength

            if ((addedprob < 0) or (addedprob > 1)):
                raise ValueError(
                    "Rule \"" + str(rule) +
                    "\" has impossible strength rating (str < 0 or str > 1): "
                    + str(addedprob))

            newprob = 1 - ((1 - initialprob) * (1 - addedprob))

            if ((newprob < 0) or (newprob > 1)):
                raise ValueError("Generated impossible name probability: " +
                                 str(newprob))

            name.name_probability = newprob
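The update above is a noisy-OR combination: treating each rule as an independent piece of evidence, the token stays a non-name only if every applicable rule is wrong. A small standalone sketch with made-up numbers:

initialprob = 0.5   # token currently has a 50% chance of being a name
addedprob = 0.3     # the current rule has a strength of 0.3

# Both "failure" chances must occur for the token to remain a non-name:
# (1 - 0.5) * (1 - 0.3) = 0.35, so the name probability rises to 0.65.
newprob = 1 - ((1 - initialprob) * (1 - addedprob))
print(newprob)  # ~0.65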
Example #4
def test_namesfromrule_main():
    """

    """

    corpus_file = 'tests/test_corpus.csv'
    seed_rules_file = 'tests/test_seed_rules.csv'

    display = Display()

    display.start()
    corpus = ner.import_corpus(corpus_file, display)
    display.finish()

    display.start()
    seed_rules = ner.import_seed_rules(seed_rules_file, display)
    display.finish()

    expected = Token('aa-aa', 'bb-bb', 'cc-cc', Token.Type.personal_name)
    for rule in seed_rules:
        results = namesfromrule.main(corpus, rule)
        for name in results:
            assert name == expected
Example #5
def assess_strength(rules, corpus, config):
    """
	Evaluates the accuracy of the strength rating of the passed-in rules.  This
	is useful because the ner model will generate rules in an unsupervised
	fashion.  This function gets used to evaluate the performance of that
	process.

    Args:
        rules (set): A set of Rule objects to be evaluated
        corpus (set): A set of Token objects, representing the entire Garshana
					  corpus.

    Returns:
        None

    Raises:
        None

    """

    bad_rules = 0
    bad_context = 0
    bad_spelling = 0

    total_context = 0
    total_spelling = 0
    total_delta = 0

    print("rule performance:")
    print("calculating...", end='\r')

    i = 0
    cols = {
        'Iteration'    : [],
        'Rule'         : [],
        'Type'         : [],
        'Strength'     : [],
        'True Strength': [],
        'Occurrences'  : []
    }
    output = pd.DataFrame(data=cols)

    x_vals = []
    y_vals = []
    rule_num = 1

    for rule in rules:
        names = namesfromrule.main(corpus, rule)
        real_names = 0
        total_names = len(names)

        for token in names:
            if token.type == Token.Type.personal_name:
                real_names += 1

        if total_names == 0:
            true_strength = 0
        else:
            true_strength = real_names / total_names

        delta = abs(true_strength - rule.strength)
        total_delta += delta

        x_vals.append(rule_num)
        rule_num += 1
        y_vals.append(delta)

        if rule.type == Rule.Type.spelling:
            total_spelling += 1
        else:
            total_context += 1

        # If a rule's strength is more than 0.2 from its true value, it is 'bad'.
        if delta > 0.2:
            bad_rules += 1
            if rule.type == Rule.Type.spelling:
                bad_spelling += 1
            else:
                bad_context += 1

        i += 1
        output.loc[i, 'Iteration']     = rule.iteration
        output.loc[i, 'Rule']          = rule.contents
        output.loc[i, 'Type']          = rule.type.name
        output.loc[i, 'Strength']      = rule.strength
        output.loc[i, 'True Strength'] = true_strength
        output.loc[i, 'Occurrences']   = rule.occurrences

    output_path = config['path'].format(
        'ner',
        time.strftime('%Y%m%d_%H%M'),
        'output.csv'
    )
    output.to_csv(path_or_buf=output_path)

    print("               ", end='\r')
    print("percentage of bad rules:    {}%".format(
        100 * bad_rules / len(rules)
    ))
    print("percentage of bad context:  {}%".format(
        100 * bad_context / total_context
    ))
    print("percentage of bad spelling: {}%".format(
        100 * bad_spelling / total_spelling
    ))
    print("average delta value:        {}%".format(
        100 * total_delta / len(rules)
    ))

    plt.xlabel('Rules')
    plt.ylabel('Delta')
    plt.title('Plot of Delta per Rule')
    plt.plot(x_vals, y_vals, 'ro')
    plt.axis([min(x_vals), max(x_vals), min(y_vals), max(y_vals)])
    plt.show()

    sort_y = sorted(y_vals)
    plt.xlabel('')
    plt.ylabel('Delta')
    plt.title('Delta Sorted')
    plt.plot(x_vals, sort_y, 'ro')
    plt.axis([min(x_vals), max(x_vals), min(sort_y), max(sort_y)])
    plt.show()
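The heart of the evaluation is the per-rule delta: the gap between the strength the model assigned and the strength actually observed on the corpus. A toy calculation with made-up counts:

real_names = 8     # matched tokens that are personal names
total_names = 10   # all tokens the rule matched

true_strength = real_names / total_names   # 0.8
assigned_strength = 0.55                   # hypothetical model rating

delta = abs(true_strength - assigned_strength)
print(delta)  # ~0.25, which exceeds 0.2, so this rule counts as 'bad'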