def evaluate_oov(data, source_attr, target_attr, forms_attr, oov_test_file, oov_part):
    """\
    Out-of-vocabulary evaluation.

    Loads the data set stored in `oov_test_file` and collects the lower-cased
    values of `target_attr` (known forms) and `source_attr` (known lemmas)
    from it. Every instance of `data` is then flagged (1/0) as having an
    unknown form and/or an unknown lemma; the flags are stored in `data` as
    the new numeric attributes 'OOV_FORM' and 'OOV_LEMMA'. Finally, scores
    for both flagged groups are passed to print_score.

    Both membership tests are case-insensitive.

    @param data: the evaluated data set (gets two new attributes)
    @param source_attr: name of the attribute holding lemmas
    @param target_attr: name of the attribute holding the gold forms
    @param forms_attr: name of the attribute holding the predicted forms
    @param oov_test_file: ARFF file with the known lemmas and forms
    @param oov_part: if lower than 1, only this leading portion of \
        `oov_test_file` is used
    """
    log_info('Loading known lemmas and forms from: ' + oov_test_file)
    train = DataSet()
    train.load_from_arff(oov_test_file)
    if oov_part < 1:
        log_info('Using only %f-part of the file.' % oov_part)
        train = train.subset(0, int(round(oov_part * len(train))), copy=False)
    known_forms = {inst[target_attr].lower() for inst in train}
    known_lemmas = {inst[source_attr].lower() for inst in train}
    oov_forms = [1 if inst[target_attr].lower() not in known_forms else 0
                 for inst in data]
    # the known lemmas are lower-cased, so the looked-up lemma must be
    # lower-cased as well (otherwise every capitalized lemma counts as OOV)
    oov_lemmas = [1 if inst[source_attr].lower() not in known_lemmas else 0
                  for inst in data]
    data.add_attrib(Attribute('OOV_FORM', 'numeric'), oov_forms)
    data.add_attrib(Attribute('OOV_LEMMA', 'numeric'), oov_lemmas)
    # score the instances with unknown forms
    oov_forms_count = sum(oov_forms)
    oov_forms_good = count_correct(data, target_attr, forms_attr,
                                   lambda i: i['OOV_FORM'])
    print_score(oov_forms_good, oov_forms_count, 'OOV forms')
    # score the instances with unknown lemmas
    oov_lemmas_count = sum(oov_lemmas)
    oov_lemmas_good = count_correct(data, target_attr, forms_attr,
                                    lambda i: i['OOV_LEMMA'])
    print_score(oov_lemmas_good, oov_lemmas_count, 'OOV lemmas')
def convert(in_file, out_file, feat_no, use_feat_names, cpos_chars):
    """\
    This does the conversion to ARFF.

    Reads `in_file` line by line as tab-separated CoNLL-style data, where the
    2nd, 3rd, 5th and 7th columns are taken as form, lemma, POS and
    morphological features. Each empty line starts a new sentence. For every
    token, one instance is created with the attributes: sent_id, word_id,
    Lemma, Form, the lemma-to-form edit script (LemmaFormDiff_Front/_Back),
    lemma suffixes of lengths 1-8 (LemmaSuff_N), Tag_POS, Tag_CPOS and the
    individual features. The result is saved to `out_file`.

    @param in_file: input file name (also used as the prefix of sentence IDs)
    @param out_file: output ARFF file name
    @param feat_no: the number of '|'-separated features to produce; \
        shorter lists are padded with empty strings and the last feature \
        holds any unsplit rest
    @param use_feat_names: if True, features are expected as 'name=value' \
        and stored as 'Tag_<name>'; entries without '=' (padding, or the \
        '_' placeholder) are skipped. Otherwise they are stored by position \
        as 'Tag_FEAT<N>'.
    @param cpos_chars: the number of leading POS characters used as coarse POS
    """
    fh_in = file_stream(in_file)
    buf = []
    sent_id = 1
    word_id = 1
    suff_lengths = range(1, 9)

    for line in fh_in:
        line = line.rstrip('\r\n')
        # an empty line is a sentence boundary
        if not line:
            sent_id += 1
            word_id = 1
            continue
        # split the CoNLL format, removing unwanted stuff
        _, form, lemma, _, pos, _, feat, _ = line.split('\t', 7)
        # copy attributes
        inst = {'Form': form,
                'Lemma': lemma,
                'Tag_POS': pos,
                'word_id': word_id,
                'sent_id': in_file + '-' + str(sent_id)}
        # computing form-lemma diff (edit script)
        escr_front, escr_midback = edit_script(lemma, form)
        inst['LemmaFormDiff_Front'] = escr_front
        inst['LemmaFormDiff_Back'] = escr_midback
        # lemma suffixes
        for suff_len in suff_lengths:
            inst['LemmaSuff_' + str(suff_len)] = lemma[-suff_len:]
        # coarse POS
        inst['Tag_CPOS'] = pos[:cpos_chars]
        # POS features (a negative maxsplit, i.e. feat_no == 0, means
        # "no limit" for str.split)
        feats = feat.split('|', feat_no - 1)
        feats += [''] * (feat_no - len(feats))
        for feat_ord, feat_item in enumerate(feats, start=1):
            if not use_feat_names:
                inst['Tag_FEAT' + str(feat_ord)] = feat_item
            elif '=' in feat_item:
                feat_name, feat_val = feat_item.split('=', 1)
                inst['Tag_' + feat_name] = feat_val
            # else: a nameless entry cannot be stored under a feature name
        # increase word number
        word_id += 1
        # save the instance to the buffer
        buf.append(inst)

    # write this all out as ARFF
    data = DataSet()
    attr_order = ['sent_id', 'word_id', 'Lemma', 'Form',
                  'LemmaFormDiff_Front', 'LemmaFormDiff_Back']
    attr_order.extend('LemmaSuff_' + str(suff_len) for suff_len in suff_lengths)
    attr_order.extend(['Tag_POS', 'Tag_CPOS'])
    data.load_from_dict(buf, {'word_id': 'numeric'}, attr_order)
    data.save_to_arff(out_file)
def get_stats(data_file, train_file, source_attr, target_attr):
    """\
    Load `data_file` and report several instance selections via print_feat.

    The selections, in this order, are: all instances ('total'), instances
    whose `source_attr` value does not start with a punctuation character
    ('excluding punctuation'), and instances whose `source_attr` and
    `target_attr` values differ when lower-cased ('inflected forms').
    If `train_file` is given, instances whose lower-cased `target_attr`
    value does not occur in `train_file` are reported as well ('unknown').

    @param data_file: ARFF file with the examined data
    @param train_file: ARFF file with the known data, or None
    @param source_attr: name of the source attribute
    @param target_attr: name of the target attribute
    """
    def select_all(_, inst):
        return True

    def starts_without_punct(_, inst):
        return not regex.match(r'^\p{P}', inst[source_attr])

    def is_inflected(_, inst):
        return inst[source_attr].lower() != inst[target_attr].lower()

    log_info('Loading data from %s...' % data_file)
    data = DataSet()
    data.load_from_arff(data_file)

    print_feat(data, select_all, 'total')
    print_feat(data, starts_without_punct, 'excluding punctuation')
    print_feat(data, is_inflected, 'inflected forms')

    # the remaining statistic needs the known data
    if train_file is None:
        return

    log_info('Loading known data from %s...' % train_file)
    train = DataSet()
    train.load_from_arff(train_file)
    known = {inst[target_attr].lower() for inst in train}

    def is_unknown(_, inst):
        return inst[target_attr].lower() not in known

    print_feat(data, is_unknown, 'unknown')
def main():
    """\
    Main application entry: parse the command line, add the requested
    attributes to the input data set and save the result.

    Two file names (input and output ARFF) and at least one of the
    following options are required; otherwise, usage is displayed and
    the program exits with status 1:

        -c              combine tag attributes (combine_tag_num_gen_cas)
        -s LEN:ATTR     add substrings of ATTR up to abs(LEN) characters,
                        from the beginning if LEN > 0, else from the end
        -a SIZE:ATTRS   combine up to SIZE attributes from ATTRS
        -n SHIFT:ATTRS  add the attributes ATTRS of the neighbor at SHIFT
        -h              display usage and exit

    ATTRS is a list of attribute names separated by commas and/or spaces.
    The -s, -a and -n options may be repeated.
    """
    try:
        opts, filenames = getopt.getopt(sys.argv[1:], 'hca:s:n:')
    except getopt.GetoptError as err:
        # unknown option or missing argument: report it, then show usage
        sys.stderr.write(str(err) + '\n')
        display_usage()
        sys.exit(1)

    show_help = False
    combine_cng = False
    subsets = []
    neighbors = []
    substrs = []
    for opt, arg in opts:
        if opt == '-h':
            show_help = True
        elif opt == '-c':
            combine_cng = True
        elif opt == '-s':
            sub_len, attr = arg.split(':', 1)
            substrs.append((int(sub_len), attr))
        elif opt == '-a':
            size, attrs = arg.split(':', 1)
            subsets.append((int(size), re.split(r'[, ]+', attrs)))
        elif opt == '-n':
            shift, attrs = arg.split(':', 1)
            neighbors.append((int(shift), re.split(r'[, ]+', attrs)))

    # display help and exit
    nothing_to_do = not (combine_cng or substrs or subsets or neighbors)
    if len(filenames) != 2 or nothing_to_do or show_help:
        display_usage()
        sys.exit(1)

    # run the conversion
    filename_in, filename_out = filenames
    data = DataSet()
    log_info('Loading data: ' + filename_in)
    data.load_from_arff(filename_in)

    for sub_len, attr in substrs:
        log_info(('Adding substrings from the %s of %s ' +
                  'up to %d characters long ...') %
                 (('beginning' if sub_len > 0 else 'end'),
                  attr, abs(sub_len)))
        add_substr_attributes(data, sub_len, attr)
    if combine_cng:
        log_info('Combining case, number, gender ...')
        combine_tag_num_gen_cas(data)
    for set_size, set_attrs in subsets:
        log_info('Combining up to %d attributes from [%s] ...' %
                 (set_size, ','.join(set_attrs)))
        combine_subsets(data, set_attrs, set_size)
    for shift, attrs in neighbors:
        log_info('Adding neighbor %d\'s attributes [%s] ...' %
                 (shift, ','.join(attrs)))
        add_neighbor_attributes(data, shift, attrs)

    log_info('Saving data: ' + filename_out)
    data.save_to_arff(filename_out)
def test_models(file_in, file_out, model_files, source_attr, target_attr,
                oov_test_file, oov_part, pos_attr, test_indiv):
    """\
    Test all the given models on the selected file and save the results.

    The models are applied one after another: each model classifies the
    data and its output is stored as the attribute 'OUTPUT_M<N>'; the
    output is then used to inflect the forms produced so far (starting
    from the values of `source_attr`), which are stored as 'FORMS_M<N>'.
    The forms produced by the last model are evaluated against
    `target_attr`. With no models at all, the unchanged `source_attr`
    values are evaluated.

    If oov_test_file is set, performs also OOV evaluation.
    If pos_attr is set, performs also an evaluation for various POSs.
    If test_indiv is True, reports also each model's own accuracy
    (its output against the model's class attribute).

    @param file_in: input ARFF file with the testing data
    @param file_out: output ARFF file for the classification results
    @param model_files: list of model file names, in application order
    @param source_attr: name of the attribute holding lemmas
    @param target_attr: name of the attribute holding the gold forms
    @param oov_test_file: ARFF file with known data for OOV tests (or None)
    @param oov_part: portion of `oov_test_file` to be used
    @param pos_attr: name of the POS attribute (or None)
    @param test_indiv: evaluate the individual models?
    """
    # load testing data
    log_info('Loading data: ' + file_in)
    data = DataSet()
    data.load_from_arff(file_in)
    forms = data[source_attr]
    # until a model is applied, the "predicted" forms are the source values;
    # this also keeps forms_attr defined if model_files is empty
    forms_attr = source_attr

    # apply all models
    for model_num, model_file in enumerate(model_files, start=1):
        model = Model.load_from_file(model_file)
        log_info('Applying model: ' + model_file)
        rules = model.classify(data)
        output_attr = 'OUTPUT_M' + str(model_num)
        data.add_attrib(Attribute(output_attr, 'string'), rules)
        if test_indiv:
            good = count_correct(data, model.class_attr, output_attr)
            print_score(good, len(data), 'Model accuracy')
        # each model works on the forms left by the previous one
        forms = [inflect(form, rule) for form, rule in zip(forms, rules)]
        forms_attr = 'FORMS_M' + str(model_num)
        data.add_attrib(Attribute(forms_attr, 'string'), forms)

    # test the final performance
    log_info('Evaluating...')
    good = count_correct(data, target_attr, forms_attr)
    print_score(good, len(data), 'ALL')

    # evaluate without punctuation
    evaluate_nopunct(data, source_attr, target_attr, forms_attr)
    # evaluate forms different from lemma
    evaluate_nolemma(data, source_attr, target_attr, forms_attr)
    # load training data for OOV tests, evaluate on OOV
    if oov_test_file:
        evaluate_oov(data, source_attr, target_attr, forms_attr,
                     oov_test_file, oov_part)
    # test on different POSes
    if pos_attr:
        evaluate_poses(data, target_attr, forms_attr, pos_attr)

    # save the classification results
    log_info('Saving data: ' + file_out)
    data.save_to_arff(file_out)
def main():
    """\
    Main application entry: parse the command line, then annotate or select
    the instances where the predicted value differs from the gold one.

    Two file names (input and output ARFF) and the -g option are required;
    otherwise, usage is displayed and the program exits with status 1:

        -g ATTR     name of the gold attribute (required)
        -p ATTR     name of the predicted attribute (default: 'PREDICTED')
        -a          keep all instances and add the string attribute
                    'ERROR_IND' ('ERR' for errors, '' otherwise) instead
                    of selecting only the erroneous instances
        -i          ignore case when comparing the values
        -h          display usage and exit
    """
    try:
        opts, filenames = getopt.getopt(sys.argv[1:], 'hg:p:ai')
    except getopt.GetoptError as err:
        # unknown option or missing argument: report it, then show usage
        sys.stderr.write(str(err) + '\n')
        display_usage()
        sys.exit(1)

    show_help = False
    annot_errors = False
    gold = None
    predicted = 'PREDICTED'
    ignore_case = False
    for opt, arg in opts:
        if opt == '-h':
            show_help = True
        elif opt == '-g':
            gold = arg
        elif opt == '-p':
            predicted = arg
        elif opt == '-a':
            annot_errors = True
        elif opt == '-i':
            ignore_case = True

    # display help and exit
    if len(filenames) != 2 or not gold or show_help:
        display_usage()
        sys.exit(1)

    # load the data
    filename_in, filename_out = filenames
    data = DataSet()
    log_info('Loading data: ' + filename_in)
    data.load_from_arff(filename_in)

    # the comparison returns True for an error, i.e. for differing values
    if ignore_case:
        def differs(val_a, val_b):
            return val_a.lower() != val_b.lower()
    else:
        def differs(val_a, val_b):
            return val_a != val_b

    if annot_errors:
        log_info('Annotating errors...')
        err_ind = ['ERR' if differs(inst[gold], inst[predicted]) else ''
                   for inst in data]
        data.add_attrib(Attribute('ERROR_IND', 'string'), err_ind)
    else:
        log_info('Selecting errors...')
        data = data[lambda _, inst: differs(inst[gold], inst[predicted])]

    log_info('Saving data: ' + filename_out)
    data.save_to_arff(filename_out)