def evaluate_oov(data, source_attr, target_attr, forms_attr,
                 oov_test_file, oov_part):
    """\
    Out-of-vocabulary evaluation.

    Loads training data from oov_test_file (optionally only the first
    oov_part fraction of it), collects the known lower-cased forms and
    lemmas, marks every instance of `data` as OOV or not (new numeric
    attributes OOV_FORM / OOV_LEMMA), and prints inflection accuracy on
    both OOV subsets.

    @param data: the evaluated data set (modified in place)
    @param source_attr: name of the lemma attribute
    @param target_attr: name of the gold form attribute
    @param forms_attr: name of the predicted form attribute
    @param oov_test_file: ARFF file with the training data
    @param oov_part: fraction (0-1] of the training file to use
    """
    log_info('Loading known lemmas and forms from: ' + oov_test_file)
    train = DataSet()
    train.load_from_arff(oov_test_file)
    if oov_part < 1:
        log_info('Using only %f-part of the file.' % oov_part)
        train = train.subset(0, int(round(oov_part * len(train))), copy=False)
    known_forms = {i[target_attr].lower() for i in train}
    known_lemmas = {i[source_attr].lower() for i in train}
    oov_forms = [1 if i[target_attr].lower() not in known_forms else 0
                 for i in data]
    # BUG FIX: lemmas must be lower-cased before the lookup, since
    # known_lemmas holds lower-cased values — otherwise any capitalized
    # lemma was always counted as OOV.
    oov_lemmas = [1 if i[source_attr].lower() not in known_lemmas else 0
                  for i in data]
    data.add_attrib(Attribute('OOV_FORM', 'numeric'), oov_forms)
    data.add_attrib(Attribute('OOV_LEMMA', 'numeric'), oov_lemmas)
    oov_forms_count = sum(oov_forms)
    oov_forms_good = count_correct(data, target_attr, forms_attr,
                                   lambda i: i['OOV_FORM'])
    print_score(oov_forms_good, oov_forms_count, 'OOV forms')
    oov_lemmas_count = sum(oov_lemmas)
    oov_lemmas_good = count_correct(data, target_attr, forms_attr,
                                    lambda i: i['OOV_LEMMA'])
    print_score(oov_lemmas_good, oov_lemmas_count, 'OOV lemmas')
def test_models(file_in, file_out, model_files, source_attr, target_attr, oov_test_file, oov_part, pos_attr, test_indiv): """\ Test all the given models on the selected file and save the target. If oov_test_file is set, performs also OOV evaluation. If test_pos is True, prints detailed results for various POSs. """ # load testing data log_info('Loading data: ' + file_in) data = DataSet() data.load_from_arff(file_in) forms = data[source_attr] # apply all models for model_num, model_file in enumerate(model_files, start=1): model = Model.load_from_file(model_file) log_info('Applying model: ' + model_file) rules = model.classify(data) output_attr = 'OUTPUT_M' + str(model_num) data.add_attrib(Attribute(output_attr, 'string'), rules) if test_indiv: good = count_correct(data, model.class_attr, output_attr) print_score(good, len(data), 'Model accuracy') forms = [inflect(form, rule) for form, rule in zip(forms, rules)] forms_attr = 'FORMS_M' + str(model_num) data.add_attrib(Attribute(forms_attr, 'string'), forms) # test the final performance log_info('Evaluating...') good = count_correct(data, target_attr, forms_attr) print_score(good, len(data), 'ALL') # evaluate without punctuation evaluate_nopunct(data, source_attr, target_attr, forms_attr) # evaluate forms different from lemma evaluate_nolemma(data, source_attr, target_attr, forms_attr) # load training data for OOV tests, evaluate on OOV if oov_test_file: evaluate_oov(data, source_attr, target_attr, forms_attr, oov_test_file, oov_part) # test on different POSes if pos_attr: evaluate_poses(data, target_attr, forms_attr, pos_attr) # save the classification results log_info('Saving data: ' + file_out) data.save_to_arff(file_out)
def inflect(data, attrib_lemma, attrib_infl, attrib_form):
    """\
    Inflect all instances of a data set and store the results.

    Looks up the lemma and inflection-rule attributes (given by name or
    number), applies the inflection to every instance, and adds the
    resulting forms to the data set as a new string attribute.
    """
    lemma_attr = data.get_attrib(attrib_lemma).name
    infl_attr = data.get_attrib(attrib_infl).name
    inflected = [flect.flect.inflect(inst[lemma_attr], inst[infl_attr])
                 for inst in data]
    data.add_attrib(Attribute(attrib_form, 'string'), inflected)
def add_substr_attributes(data, sub_len, attrib):
    """\
    Add substrings of the given attribute as separate attributes.

    The sub_len parameter controls the maximum length and the position
    of the substrings (negative = end, positive = beginning).  One new
    attribute is added per substring length, named
    <attrib>_SUBSTR+<l> or <attrib>_SUBSTR-<l>.

    @param data: the data set (modified in place)
    @param sub_len: maximum substring length; sign selects prefix/suffix
    @param attrib: the source attribute (name or number)
    """
    attrib = data.get_attrib(attrib).name
    for l in xrange(1, abs(sub_len) + 1):
        values = []
        for i, inst in enumerate(data):
            try:
                # BUG FIX: the suffix branch previously sliced the
                # one-element list [attrib] instead of the instance value
                # inst[attrib], so .lower() raised AttributeError for any
                # negative sub_len.
                values.append(inst[attrib][:l].lower() if sub_len > 0
                              else inst[attrib][-l:].lower())
            except Exception as e:
                log_warn('Fatal error at instance %d : ' % i + unicode(inst))
                raise e
        new_name = attrib + '_SUBSTR' + ('+' if sub_len > 0 else '-') + str(l)
        data.add_attrib(Attribute(new_name, 'string'), values)
def main():
    """\
    Main application entry: parse command line and run the test.

    Options: -g <gold attr> (required), -p <predicted attr>
    (default 'PREDICTED'), -a (annotate errors instead of filtering),
    -i (ignore case when comparing).
    """
    opts, filenames = getopt.getopt(sys.argv[1:], 'g:p:ai')
    show_help = False
    annot_errors = False
    ignore_case = False
    gold = None
    predicted = 'PREDICTED'
    for opt, arg in opts:
        if opt == '-g':
            gold = arg
        elif opt == '-p':
            predicted = arg
        elif opt == '-a':
            annot_errors = True
        elif opt == '-i':
            ignore_case = True
    # bail out on an invalid invocation
    if len(filenames) != 2 or not gold or show_help:
        display_usage()
        sys.exit(1)
    filename_in, filename_out = filenames
    data = DataSet()
    log_info('Loading data: ' + filename_in)
    data.load_from_arff(filename_in)
    # comparator returning True when gold and prediction differ
    differs = ((lambda a, b: a.lower() != b.lower()) if ignore_case
               else (lambda a, b: a != b))
    if annot_errors:
        # keep all instances, flag the wrong ones in a new attribute
        log_info('Annotating errors...')
        err_ind = ['ERR' if differs(i[gold], i[predicted]) else ''
                   for i in data]
        data.add_attrib(Attribute('ERROR_IND', 'string'), err_ind)
    else:
        # keep only the erroneous instances
        log_info('Selecting errors...')
        data = data[lambda _, i: differs(i[gold], i[predicted])]
    log_info('Saving data: ' + filename_out)
    data.save_to_arff(filename_out)
def add_neighbor_attributes(data, shift, attribs):
    """\
    Add the given attributes of a neighbor at the distance given by shift.

    The value is left unknown (None) when the neighbor position falls
    outside the data set or the neighbor belongs to a different sentence
    (compared via SENT_ID_ATTR).  New attributes are named
    NEIGHBOR<shift>_<attr>.
    """
    resolved_names = [data.get_attrib(a).name for a in attribs]
    data_len = len(data)
    for attr_name in resolved_names:
        values = []
        for pos, inst in enumerate(data):
            ngb_pos = pos + shift
            value = None
            if 0 <= ngb_pos < data_len:
                neighbor = data[ngb_pos]
                # only take the value if the neighbor is in the same sentence
                if neighbor[SENT_ID_ATTR] == inst[SENT_ID_ATTR]:
                    value = neighbor[attr_name]
            values.append(value)
        new_name = ('NEIGHBOR' + ('+' if shift > 0 else '') +
                    str(shift) + '_' + attr_name)
        data.add_attrib(Attribute(new_name, 'string'), values)
def concat_attrib(data, attribs, new_name=None, divider='', nonempty=False):
    """\
    Concatenate the selected attributes to form a new one (attributes
    should be specified by name or number).

    By default, an undefined attribute value is replaced by '?'.  With
    nonempty=True, the concatenation becomes None whenever any part is
    undefined or empty.  The new attribute name defaults to the source
    names joined by '+'.
    """
    names = [data.get_attrib(a).name for a in attribs]
    if new_name is None:
        new_name = '+'.join(names)
    # undefined values map to '' in nonempty mode (to be detected below),
    # to a visible '?' otherwise
    placeholder = '' if nonempty else '?'
    values = []
    for inst in data:
        parts = [placeholder if inst[a] is None else inst[a] for a in names]
        if nonempty and '' in parts:
            values.append(None)
        else:
            values.append(divider.join(parts))
    data.add_attrib(Attribute(new_name, 'string'), values)