Esempio n. 1
0
def evaluate_oov(data, source_attr, target_attr, forms_attr, oov_test_file,
                 oov_part):
    """\
    Out-of-vocabulary evaluation.

    Loads the known lemmas and forms from the training file (oov_test_file,
    optionally truncated to its first oov_part fraction), marks each instance
    in `data` whose form/lemma was not seen in training, and prints the
    prediction accuracy restricted to those OOV instances.
    """
    log_info('Loading known lemmas and forms from: ' + oov_test_file)
    train = DataSet()
    train.load_from_arff(oov_test_file)
    if oov_part < 1:
        log_info('Using only %f-part of the file.' % oov_part)
        train = train.subset(0, int(round(oov_part * len(train))), copy=False)
    # both known-sets are lower-cased, so all membership tests below must
    # lower-case the tested value as well
    known_forms = {i[target_attr].lower() for i in train}
    known_lemmas = {i[source_attr].lower() for i in train}
    oov_forms = [
        1 if i[target_attr].lower() not in known_forms else 0 for i in data
    ]
    # bug fix: the lemma was previously compared without .lower(), so any
    # lemma containing an uppercase letter was always counted as OOV
    oov_lemmas = [
        1 if i[source_attr].lower() not in known_lemmas else 0 for i in data
    ]
    data.add_attrib(Attribute('OOV_FORM', 'numeric'), oov_forms)
    data.add_attrib(Attribute('OOV_LEMMA', 'numeric'), oov_lemmas)
    oov_forms_count = sum(oov_forms)
    oov_forms_good = count_correct(data, target_attr, forms_attr,
                                   lambda i: i['OOV_FORM'])
    print_score(oov_forms_good, oov_forms_count, 'OOV forms')
    oov_lemmas_count = sum(oov_lemmas)
    oov_lemmas_good = count_correct(data, target_attr, forms_attr,
                                    lambda i: i['OOV_LEMMA'])
    print_score(oov_lemmas_good, oov_lemmas_count, 'OOV lemmas')
Esempio n. 2
0
def test_models(file_in, file_out, model_files, source_attr, target_attr,
                oov_test_file, oov_part, pos_attr, test_indiv):
    """\
    Test all the given models on the selected file and save the target.

    If oov_test_file is set, performs also OOV evaluation.
    If test_pos is True, prints detailed results for various POSs.
    """
    # read the evaluation data set
    log_info('Loading data: ' + file_in)
    data = DataSet()
    data.load_from_arff(file_in)
    forms = data[source_attr]
    # run each model in turn; every model consumes the forms produced by
    # the previous one and stores its own rules + inflected forms in data
    model_num = 0
    for model_file in model_files:
        model_num += 1
        model = Model.load_from_file(model_file)
        log_info('Applying model: ' + model_file)
        rules = model.classify(data)
        output_attr = 'OUTPUT_M' + str(model_num)
        data.add_attrib(Attribute(output_attr, 'string'), rules)
        if test_indiv:
            # per-model classification accuracy
            good = count_correct(data, model.class_attr, output_attr)
            print_score(good, len(data), 'Model accuracy')
        forms = [inflect(form, rule) for form, rule in zip(forms, rules)]
        forms_attr = 'FORMS_M' + str(model_num)
        data.add_attrib(Attribute(forms_attr, 'string'), forms)
    # overall accuracy of the final model's output
    # NOTE(review): forms_attr presumably assumes model_files is non-empty —
    # confirm against callers
    log_info('Evaluating...')
    good = count_correct(data, target_attr, forms_attr)
    print_score(good, len(data), 'ALL')
    # accuracy ignoring punctuation
    evaluate_nopunct(data, source_attr, target_attr, forms_attr)
    # accuracy on forms that differ from their lemma
    evaluate_nolemma(data, source_attr, target_attr, forms_attr)
    # out-of-vocabulary evaluation, if a training file was given
    if oov_test_file:
        evaluate_oov(data, source_attr, target_attr, forms_attr, oov_test_file,
                     oov_part)
    # per-POS breakdown, if a POS attribute was given
    if pos_attr:
        evaluate_poses(data, target_attr, forms_attr, pos_attr)
    # write out the annotated data set
    log_info('Saving data: ' + file_out)
    data.save_to_arff(file_out)
Esempio n. 3
0
def inflect(data, attrib_lemma, attrib_infl, attrib_form):
    """\
    Given a data set with the lemma and inflection rules attributes, this will
    add the resulting inflected forms as a new attribute.
    """
    # resolve attribute specifiers (name or number) to plain names
    lemma_name = data.get_attrib(attrib_lemma).name
    infl_name = data.get_attrib(attrib_infl).name
    # apply the inflection rule of each instance to its lemma
    forms = []
    for inst in data:
        forms.append(flect.flect.inflect(inst[lemma_name], inst[infl_name]))
    data.add_attrib(Attribute(attrib_form, 'string'), forms)
Esempio n. 4
0
def add_substr_attributes(data, sub_len, attrib):
    """\
    Add substrings of the given attribute as separate attributes.
    The sub_len parameter controls the maximum length and
    the position of the substrings (negative = end, positive = beginning)
    """
    attrib = data.get_attrib(attrib).name
    for l in xrange(1, abs(sub_len) + 1):
        values = []
        for i, inst in enumerate(data):
            try:
                # prefix of length l for positive sub_len, suffix for negative
                # (bug fix: the suffix branch previously sliced the literal
                # list [attrib] instead of the instance's attribute value,
                # which raised AttributeError on list.lower())
                if sub_len > 0:
                    values.append(inst[attrib][:l].lower())
                else:
                    values.append(inst[attrib][-l:].lower())
            except Exception as e:
                log_warn('Fatal error at instance %d : ' % i + unicode(inst))
                raise e

        new_name = attrib + '_SUBSTR' + ('+' if sub_len > 0 else '-') + str(l)
        data.add_attrib(Attribute(new_name, 'string'), values)
Esempio n. 5
0
def main():
    """\
    Main application entry: parse command line and run the test.
    """
    options, filenames = getopt.getopt(sys.argv[1:], 'g:p:ai')
    show_help = False
    gold = None
    predicted = 'PREDICTED'
    annot_errors = False
    ignore_case = False
    for option, value in options:
        if option == '-g':
            gold = value
        elif option == '-p':
            predicted = value
        elif option == '-a':
            annot_errors = True
        elif option == '-i':
            ignore_case = True
    # bail out with a usage message on an invalid invocation
    if show_help or not gold or len(filenames) != 2:
        display_usage()
        sys.exit(1)
    # load the input data
    filename_in, filename_out = filenames
    data = DataSet()
    log_info('Loading data: ' + filename_in)
    data.load_from_arff(filename_in)
    # pick the gold-vs-predicted comparison (case-sensitive or not)
    if ignore_case:
        def cmp_func(a, b):
            return a.lower() != b.lower()
    else:
        def cmp_func(a, b):
            return a != b
    if annot_errors:
        # keep all instances, flag the mispredicted ones
        log_info('Annotating errors...')
        err_ind = [
            'ERR' if cmp_func(i[gold], i[predicted]) else '' for i in data
        ]
        data.add_attrib(Attribute('ERROR_IND', 'string'), err_ind)
    else:
        # keep only the mispredicted instances
        log_info('Selecting errors...')
        data = data[lambda _, i: cmp_func(i[gold], i[predicted])]
    log_info('Saving data: ' + filename_out)
    data.save_to_arff(filename_out)
Esempio n. 6
0
def add_neighbor_attributes(data, shift, attribs):
    """\
    Add the given attributes of a neighbor in the distance given by shift.
    The values are set as unknown if the neighbor does not exist within the
    same sentence.
    """
    data_len = len(data)
    for attrib in [data.get_attrib(a).name for a in attribs]:
        column = []
        for pos, inst in enumerate(data):
            ngb_pos = pos + shift
            value = None
            # only take the neighbor's value if it exists and belongs to
            # the same sentence; otherwise leave the value unknown
            if 0 <= ngb_pos < data_len:
                neighbor = data[ngb_pos]
                if neighbor[SENT_ID_ATTR] == inst[SENT_ID_ATTR]:
                    value = neighbor[attrib]
            column.append(value)
        new_name = ('NEIGHBOR' + ('+' if shift > 0 else '') + str(shift) +
                    '_' + attrib)
        data.add_attrib(Attribute(new_name, 'string'), column)
Esempio n. 7
0
def concat_attrib(data, attribs, new_name=None, divider='', nonempty=False):
    """\
    Concatenate the selected attributes to form a new one (attributes
    should be specified by name or number).

    If an attribute is not defined, its value is replaced by '?'.
    """
    names = [data.get_attrib(a).name for a in attribs]
    if new_name is None:
        new_name = '+'.join(names)
    # undefined values become '' in nonempty mode (so the instance is
    # skipped below), '?' otherwise
    placeholder = '' if nonempty else '?'
    values = []
    for inst in data:
        parts = [inst[a] if inst[a] is not None else placeholder
                 for a in names]
        if nonempty and '' in parts:
            # nonempty mode: no value at all for instances with a gap
            values.append(None)
        else:
            values.append(divider.join(parts))
    data.add_attrib(Attribute(new_name, 'string'), values)