def print_score(good, total, label):
    """\
    Log the accuracy score given the number of correctly predicted
    and all items, plus a label.

    good  -- number of correctly predicted items
    total -- number of all evaluated items
    label -- text identifying the evaluated subset in the log message

    If `total` is zero (an empty subset was evaluated), the score is
    reported as NaN instead of raising ZeroDivisionError.
    """
    # float() forces true division even with Python 2 integer operands
    score = good / float(total) if total else float('nan')
    log_info('Score (%s): %d / %d = %f' % (label, good, total, score))
def evaluate_oov(data, source_attr, target_attr, forms_attr, oov_test_file, oov_part):
    """\
    Out-of-vocabulary evaluation.

    Loads the known data from `oov_test_file` (only the
    subset(0, round(oov_part * size)) portion if `oov_part` < 1), adds
    two numeric attributes to `data`:

      OOV_FORM  -- 1 if the lower-cased `target_attr` value does not occur
                   in the known data, 0 otherwise
      OOV_LEMMA -- 1 if the lower-cased `source_attr` value does not occur
                   in the known data, 0 otherwise

    and reports the scores on both subsets through print_score, comparing
    `target_attr` with `forms_attr`.
    """
    log_info('Loading known lemmas and forms from: ' + oov_test_file)
    train = DataSet()
    train.load_from_arff(oov_test_file)
    if oov_part < 1:
        log_info('Using only %f-part of the file.' % oov_part)
        train = train.subset(0, int(round(oov_part * len(train))), copy=False)
    known_forms = {i[target_attr].lower() for i in train}
    known_lemmas = {i[source_attr].lower() for i in train}
    # Both known sets are lower-cased, so both lookups must be lower-cased
    # too; otherwise any capitalised lemma would be flagged as OOV.
    oov_forms = [1 if i[target_attr].lower() not in known_forms else 0
                 for i in data]
    oov_lemmas = [1 if i[source_attr].lower() not in known_lemmas else 0
                  for i in data]
    data.add_attrib(Attribute('OOV_FORM', 'numeric'), oov_forms)
    data.add_attrib(Attribute('OOV_LEMMA', 'numeric'), oov_lemmas)
    # the last argument of count_correct selects the instances to count
    # (presumably those for which it returns a true value -- verify
    # against count_correct)
    oov_forms_count = sum(oov_forms)
    oov_forms_good = count_correct(data, target_attr, forms_attr,
                                   lambda i: i['OOV_FORM'])
    print_score(oov_forms_good, oov_forms_count, 'OOV forms')
    oov_lemmas_count = sum(oov_lemmas)
    oov_lemmas_good = count_correct(data, target_attr, forms_attr,
                                    lambda i: i['OOV_LEMMA'])
    print_score(oov_lemmas_good, oov_lemmas_count, 'OOV lemmas')
def print_score(good, total, label):
    """\
    Log the accuracy score given the number of correctly predicted
    and all items, plus a label.

    good  -- number of correctly predicted items
    total -- number of all evaluated items
    label -- text identifying the evaluated subset in the log message

    If `total` is zero (an empty subset was evaluated), the score is
    reported as NaN instead of raising ZeroDivisionError.
    """
    # float() forces true division even with Python 2 integer operands
    score = good / float(total) if total else float('nan')
    log_info('Score (%s): %d / %d = %f' % (label, good, total, score))
def evaluate_oov(data, source_attr, target_attr, forms_attr, oov_test_file, oov_part):
    """\
    Out-of-vocabulary evaluation.

    Loads the known data from `oov_test_file` (only the
    subset(0, round(oov_part * size)) portion if `oov_part` < 1), adds
    two numeric attributes to `data`:

      OOV_FORM  -- 1 if the lower-cased `target_attr` value does not occur
                   in the known data, 0 otherwise
      OOV_LEMMA -- 1 if the lower-cased `source_attr` value does not occur
                   in the known data, 0 otherwise

    and reports the scores on both subsets through print_score, comparing
    `target_attr` with `forms_attr`.
    """
    log_info('Loading known lemmas and forms from: ' + oov_test_file)
    train = DataSet()
    train.load_from_arff(oov_test_file)
    if oov_part < 1:
        log_info('Using only %f-part of the file.' % oov_part)
        train = train.subset(0, int(round(oov_part * len(train))), copy=False)
    known_forms = {i[target_attr].lower() for i in train}
    known_lemmas = {i[source_attr].lower() for i in train}
    # Both known sets are lower-cased, so both lookups must be lower-cased
    # too; otherwise any capitalised lemma would be flagged as OOV.
    oov_forms = [1 if i[target_attr].lower() not in known_forms else 0
                 for i in data]
    oov_lemmas = [1 if i[source_attr].lower() not in known_lemmas else 0
                  for i in data]
    data.add_attrib(Attribute('OOV_FORM', 'numeric'), oov_forms)
    data.add_attrib(Attribute('OOV_LEMMA', 'numeric'), oov_lemmas)
    # the last argument of count_correct selects the instances to count
    # (presumably those for which it returns a true value -- verify
    # against count_correct)
    oov_forms_count = sum(oov_forms)
    oov_forms_good = count_correct(data, target_attr, forms_attr,
                                   lambda i: i['OOV_FORM'])
    print_score(oov_forms_good, oov_forms_count, 'OOV forms')
    oov_lemmas_count = sum(oov_lemmas)
    oov_lemmas_good = count_correct(data, target_attr, forms_attr,
                                    lambda i: i['OOV_LEMMA'])
    print_score(oov_lemmas_good, oov_lemmas_count, 'OOV lemmas')
def combine_subsets(data, attribs, up_to_size):
    """\
    Concatenate every combination of the listed attributes that has
    between 2 and `up_to_size` members (inclusive).

    Each combination is passed to concat_attrib together with `data`,
    using '|' as the divider and nonempty=True. Nothing happens when
    `up_to_size` is below 2.
    """
    # smaller combinations first; inside one size the order is the one
    # produced by itertools.combinations
    all_groups = (group
                  for group_size in range(2, up_to_size + 1)
                  for group in combinations(attribs, group_size))
    for group in all_groups:
        joined_names = '+'.join(group)
        log_info('Combining %s...' % joined_names)
        concat_attrib(data, group, divider='|', nonempty=True)
def combine_subsets(data, attribs, up_to_size):
    """\
    Concatenate every combination of the listed attributes that has
    between 2 and `up_to_size` members (inclusive).

    Each combination is passed to concat_attrib together with `data`,
    using '|' as the divider and nonempty=True. Nothing happens when
    `up_to_size` is below 2.
    """
    # smaller combinations first; inside one size the order is the one
    # produced by itertools.combinations
    all_groups = (group
                  for group_size in range(2, up_to_size + 1)
                  for group in combinations(attribs, group_size))
    for group in all_groups:
        joined_names = '+'.join(group)
        log_info('Combining %s...' % joined_names)
        concat_attrib(data, group, divider='|', nonempty=True)
def get_features(model_file, output_file):
    """\
    Write the non-zero feature coefficients of a stored model to a
    UTF-8 text file.

    model_file  -- path passed to Model.load_from_file
    output_file -- path of the text file to (over)write

    For every label of the model's class attribute, the output contains
    a 'LABEL == <label>' line, followed by one '<feature> - <coef>' line
    per non-zero coefficient (highest coefficient first) and three
    empty lines.
    """
    m = Model.load_from_file(model_file)
    labels = m.data_headers.get_attrib(m.class_attr).labels
    feats = m.vectorizer.get_feature_names()
    # The context manager closes the file even if an exception is raised
    # while enumerating the features. fh.write() with an explicit newline
    # produces the same bytes as the former `print >> fh` statements.
    with codecs.open(output_file, 'w', 'UTF-8') as fh:
        for i, label in enumerate(labels):
            log_info('Enumerating features for label %d (\'%s\')' % (i, label))
            # assumes classifier.coef_ holds one row of coefficients per
            # label, aligned with `feats` -- TODO confirm for binary models
            coefs = m.classifier.coef_[i]
            nonzero = [(f, c) for (f, c) in zip(feats, coefs) if c != 0]
            fh.write('LABEL == %s\n' % label)
            for f, c in sorted(nonzero, key=itemgetter(1), reverse=True):
                fh.write('%s - %f\n' % (f, c))
            # separator: "\n\n" plus the newline print used to append
            fh.write("\n\n\n")
def get_features(model_file, output_file):
    """\
    Write the non-zero feature coefficients of a stored model to a
    UTF-8 text file.

    model_file  -- path passed to Model.load_from_file
    output_file -- path of the text file to (over)write

    For every label of the model's class attribute, the output contains
    a 'LABEL == <label>' line, followed by one '<feature> - <coef>' line
    per non-zero coefficient (highest coefficient first) and three
    empty lines.
    """
    m = Model.load_from_file(model_file)
    labels = m.data_headers.get_attrib(m.class_attr).labels
    feats = m.vectorizer.get_feature_names()
    # The context manager closes the file even if an exception is raised
    # while enumerating the features. fh.write() with an explicit newline
    # produces the same bytes as the former `print >> fh` statements.
    with codecs.open(output_file, 'w', 'UTF-8') as fh:
        for i, label in enumerate(labels):
            log_info('Enumerating features for label %d (\'%s\')' % (i, label))
            # assumes classifier.coef_ holds one row of coefficients per
            # label, aligned with `feats` -- TODO confirm for binary models
            coefs = m.classifier.coef_[i]
            nonzero = [(f, c) for (f, c) in zip(feats, coefs) if c != 0]
            fh.write('LABEL == %s\n' % label)
            for f, c in sorted(nonzero, key=itemgetter(1), reverse=True):
                fh.write('%s - %f\n' % (f, c))
            # separator: "\n\n" plus the newline print used to append
            fh.write("\n\n\n")
def run_training(work_dir, config_file, train_file, model_file,
                 test_file=None, classif_file=None, memory=MEMORY, name='train'):
    """\
    Run the model training.

    work_dir     -- working directory; used to resolve paths when the
                    configuration is not a pickle
    config_file  -- configuration: either a '.pickle' file (loaded with
                    pickle) or a file handed to Config
    train_file   -- training data passed to the model's train()
    model_file   -- where the trained model is saved
    test_file    -- optional evaluation data (used only together with
                    `classif_file`)
    classif_file -- optional output file for the evaluation
    memory       -- passed on to create_job / SplitModel.train
    name         -- prefix of the job names created when unfolding

    If the configuration has a true 'unfold_pattern' entry, it is unfolded
    into several configurations, a job is created for each of them through
    create_job, and the function returns without training anything itself.
    """
    # initialization from the configuration file
    _, ext = os.path.splitext(config_file)
    # a pickled configuration means we're already in the working directory;
    # otherwise paths have to be made relative to work_dir
    in_work_dir = ext == '.pickle'
    if in_work_dir:
        # NOTE(security): unpickling executes arbitrary code -- the pickle
        # must come from a trusted source.
        # `with` closes the file even if unpickling fails.
        with open(config_file, mode='rb') as fh:
            cfg = pickle.load(fh)
        demarshal_lambda(cfg, 'filter_attr')
        demarshal_lambda(cfg, 'postprocess')
    # load by running Python code (make paths relative to working directory)
    else:
        config_file = os.path.join(work_dir, config_file)
        cfg = Config(config_file)
    # training
    if cfg.get('unfold_pattern'):
        pattern = cfg['unfold_pattern']
        del cfg['unfold_pattern']
        unfold_key = cfg.get('unfold_key', 'unfold_key')
        for sub_cfg in cfg.unfold_lists(pattern, unfold_key):
            # keep only characters that are safe in a job name
            key = re.sub(r'[^A-Za-z0-9_]', '', sub_cfg[unfold_key])
            create_job(sub_cfg, name + '-' + key, work_dir, train_file,
                       model_file, test_file, classif_file, memory)
        return
    if cfg.get('divide_func'):
        model = SplitModel(cfg)
        model.train(train_file, work_dir, memory)
    else:
        model = Model(cfg)
        model.train(train_file)
    # evaluation
    if test_file is not None and classif_file is not None:
        if not in_work_dir:
            classif_file = os.path.join(work_dir, classif_file)
        log_info('Evaluation on file: ' + test_file)
        score = model.evaluate(test_file, classif_file=classif_file)
        log_info('Score: ' + str(score))
    # save the model
    if not in_work_dir:
        # we need to make the path relative to work_dir
        model_file = os.path.join(work_dir, model_file)
    model.save_to_file(model_file)
def get_stats(data_file, train_file, source_attr, target_attr):
    """\
    Report statistics about a data set through print_feat.

    The data loaded from `data_file` is reported for all instances, for
    instances whose `source_attr` value does not start with a punctuation
    character, and for instances whose `source_attr` and `target_attr`
    values differ when compared case-insensitively. If `train_file` is
    given, instances whose lower-cased `target_attr` value does not occur
    in that file are reported as well.
    """
    data = DataSet()
    log_info('Loading data from %s...' % data_file)
    data.load_from_arff(data_file)

    def everything(_, inst):
        return True

    def not_punctuation(_, inst):
        return not regex.match(r'^\p{P}', inst[source_attr])

    def inflected(_, inst):
        return inst[source_attr].lower() != inst[target_attr].lower()

    print_feat(data, everything, 'total')
    print_feat(data, not_punctuation, 'excluding punctuation')
    print_feat(data, inflected, 'inflected forms')

    if train_file is None:
        return

    log_info('Loading known data from %s...' % train_file)
    train = DataSet()
    train.load_from_arff(train_file)
    known = set(inst[target_attr].lower() for inst in train)

    def unseen(_, inst):
        return inst[target_attr].lower() not in known

    print_feat(data, unseen, 'unknown')
def pairwise_bootstrap(file1, file2, gold_attr, pred_attr, cmp_func, iters):
    """\
    Compare the predictions stored in two files by repeated resampling.

    The gold values (`gold_attr`) are taken from `file1` only; the
    predictions (`pred_attr`) are taken from both files. In each of the
    `iters` rounds, as many indexes as there are gold values are drawn
    through rnd.randint and the number of drawn items for which
    cmp_func(gold, prediction) is true is counted for each file. Every
    round is logged, and the numbers of rounds won by each file and of
    ties are printed at the end.
    """
    d1, d2 = DataSet(), DataSet()
    log_info('Loading File1: %s' % file1)
    d1.load_from_arff(file1)
    log_info('Loading File2: %s' % file2)
    d2.load_from_arff(file2)
    gold = d1.attrib_as_vect(gold_attr)
    p1 = d1.attrib_as_vect(pred_attr)
    p2 = d2.attrib_as_vect(pred_attr)
    size = len(gold)
    wins1 = wins2 = draws = 0
    for round_no in xrange(iters):
        # presumably numpy-style randint(low, high, size) with an
        # exclusive upper bound -- verify against the `rnd` import
        sample = rnd.randint(0, size, size)
        hits1 = sum(1 for idx in sample if cmp_func(gold[idx], p1[idx]))
        hits2 = sum(1 for idx in sample if cmp_func(gold[idx], p2[idx]))
        percent1 = float(hits1) / size * 100
        percent2 = float(hits2) / size * 100
        log_info('Round %d: File1 - %2.2f vs. File2 - %2.2f' %
                 (round_no, percent1, percent2))
        if hits1 > hits2:
            wins1 += 1
        elif hits2 > hits1:
            wins2 += 1
        else:
            draws += 1
    summary = ('File1 better: %d (%2.2f) | File2 better: %d (%2.2f) |' +
               ' ties: %d (%2.2f)')
    print(summary % (wins1, float(wins1) / iters * 100,
                     wins2, float(wins2) / iters * 100,
                     draws, float(draws) / iters * 100,))
def test_models(file_in, file_out, model_files, source_attr, target_attr,
                oov_test_file, oov_part, pos_attr, test_indiv):
    """\
    Test all the given models on the selected file and save the target.

    The models are applied in sequence: the rules predicted by each model
    are applied (through inflect) to the forms produced by the previous
    one, starting from the `source_attr` values. The output of the last
    model is evaluated against `target_attr`.

    If test_indiv is true, also prints the accuracy of each individual
    model. If oov_test_file is set, performs also OOV evaluation.
    If pos_attr is set, prints detailed results for various POSs.

    Raises ValueError if `model_files` is empty (there would be no output
    to evaluate).
    """
    # Fail early with a clear message; otherwise the evaluation below would
    # hit an unbound `forms_attr` only after all the data had been loaded.
    model_files = list(model_files)
    if not model_files:
        raise ValueError('At least one model file is required.')
    # load testing data
    log_info('Loading data: ' + file_in)
    data = DataSet()
    data.load_from_arff(file_in)
    forms = data[source_attr]
    # apply all models
    for model_num, model_file in enumerate(model_files, start=1):
        model = Model.load_from_file(model_file)
        log_info('Applying model: ' + model_file)
        rules = model.classify(data)
        output_attr = 'OUTPUT_M' + str(model_num)
        data.add_attrib(Attribute(output_attr, 'string'), rules)
        if test_indiv:
            good = count_correct(data, model.class_attr, output_attr)
            print_score(good, len(data), 'Model accuracy')
        # each model works on the output of the previous one
        forms = [inflect(form, rule) for form, rule in zip(forms, rules)]
        forms_attr = 'FORMS_M' + str(model_num)
        data.add_attrib(Attribute(forms_attr, 'string'), forms)
    # test the final performance (forms_attr is the last model's output)
    log_info('Evaluating...')
    good = count_correct(data, target_attr, forms_attr)
    print_score(good, len(data), 'ALL')
    # evaluate without punctuation
    evaluate_nopunct(data, source_attr, target_attr, forms_attr)
    # evaluate forms different from lemma
    evaluate_nolemma(data, source_attr, target_attr, forms_attr)
    # load training data for OOV tests, evaluate on OOV
    if oov_test_file:
        evaluate_oov(data, source_attr, target_attr, forms_attr,
                     oov_test_file, oov_part)
    # test on different POSes
    if pos_attr:
        evaluate_poses(data, target_attr, forms_attr, pos_attr)
    # save the classification results
    log_info('Saving data: ' + file_out)
    data.save_to_arff(file_out)
def test_models(file_in, file_out, model_files, source_attr, target_attr,
                oov_test_file, oov_part, pos_attr, test_indiv):
    """\
    Test all the given models on the selected file and save the target.

    The models are applied in sequence: the rules predicted by each model
    are applied (through inflect) to the forms produced by the previous
    one, starting from the `source_attr` values. The output of the last
    model is evaluated against `target_attr`.

    If test_indiv is true, also prints the accuracy of each individual
    model. If oov_test_file is set, performs also OOV evaluation.
    If pos_attr is set, prints detailed results for various POSs.

    Raises ValueError if `model_files` is empty (there would be no output
    to evaluate).
    """
    # Fail early with a clear message; otherwise the evaluation below would
    # hit an unbound `forms_attr` only after all the data had been loaded.
    model_files = list(model_files)
    if not model_files:
        raise ValueError('At least one model file is required.')
    # load testing data
    log_info('Loading data: ' + file_in)
    data = DataSet()
    data.load_from_arff(file_in)
    forms = data[source_attr]
    # apply all models
    for model_num, model_file in enumerate(model_files, start=1):
        model = Model.load_from_file(model_file)
        log_info('Applying model: ' + model_file)
        rules = model.classify(data)
        output_attr = 'OUTPUT_M' + str(model_num)
        data.add_attrib(Attribute(output_attr, 'string'), rules)
        if test_indiv:
            good = count_correct(data, model.class_attr, output_attr)
            print_score(good, len(data), 'Model accuracy')
        # each model works on the output of the previous one
        forms = [inflect(form, rule) for form, rule in zip(forms, rules)]
        forms_attr = 'FORMS_M' + str(model_num)
        data.add_attrib(Attribute(forms_attr, 'string'), forms)
    # test the final performance (forms_attr is the last model's output)
    log_info('Evaluating...')
    good = count_correct(data, target_attr, forms_attr)
    print_score(good, len(data), 'ALL')
    # evaluate without punctuation
    evaluate_nopunct(data, source_attr, target_attr, forms_attr)
    # evaluate forms different from lemma
    evaluate_nolemma(data, source_attr, target_attr, forms_attr)
    # load training data for OOV tests, evaluate on OOV
    if oov_test_file:
        evaluate_oov(data, source_attr, target_attr, forms_attr,
                     oov_test_file, oov_part)
    # test on different POSes
    if pos_attr:
        evaluate_poses(data, target_attr, forms_attr, pos_attr)
    # save the classification results
    log_info('Saving data: ' + file_out)
    data.save_to_arff(file_out)
def main():
    """\
    Main application entry: parse command line, then annotate or select
    the instances whose predicted value differs from the gold one.

    Options:
      -g ATTR  name of the gold attribute (required)
      -p ATTR  name of the predicted attribute (default: PREDICTED)
      -a       keep all instances and add an ERROR_IND attribute ('ERR' for
               errors, empty otherwise) instead of selecting the errors only
      -i       compare the values case-insensitively

    Exactly two file names (input, output) must follow the options. On any
    invalid invocation, the usage is displayed and the program exits
    with status 1.
    """
    try:
        opts, filenames = getopt.getopt(sys.argv[1:], 'g:p:ai')
    except getopt.GetoptError:
        # unknown option or missing option argument: treat it like any
        # other invalid invocation instead of dying with a traceback
        display_usage()
        sys.exit(1)
    annot_errors = False
    gold = None
    predicted = 'PREDICTED'
    ignore_case = False
    for opt, arg in opts:
        if opt == '-g':
            gold = arg
        elif opt == '-p':
            predicted = arg
        elif opt == '-a':
            annot_errors = True
        elif opt == '-i':
            ignore_case = True
    # display help and exit
    if len(filenames) != 2 or not gold:
        display_usage()
        sys.exit(1)
    # load the data
    filename_in, filename_out = filenames
    data = DataSet()
    log_info('Loading data: ' + filename_in)
    data.load_from_arff(filename_in)
    # cmp_func returns True when the two values DIFFER (i.e. on an error)
    if ignore_case:
        def cmp_func(a, b):
            return a.lower() != b.lower()
    else:
        def cmp_func(a, b):
            return a != b
    if annot_errors:
        log_info('Annotating errors...')
        err_ind = ['ERR' if cmp_func(i[gold], i[predicted]) else ''
                   for i in data]
        data.add_attrib(Attribute('ERROR_IND', 'string'), err_ind)
    else:
        log_info('Selecting errors...')
        data = data[lambda _, i: cmp_func(i[gold], i[predicted])]
    log_info('Saving data: ' + filename_out)
    data.save_to_arff(filename_out)
def main():
    """\
    Main application entry: parse command line, then annotate or select
    the instances whose predicted value differs from the gold one.

    Options:
      -g ATTR  name of the gold attribute (required)
      -p ATTR  name of the predicted attribute (default: PREDICTED)
      -a       keep all instances and add an ERROR_IND attribute ('ERR' for
               errors, empty otherwise) instead of selecting the errors only
      -i       compare the values case-insensitively

    Exactly two file names (input, output) must follow the options. On any
    invalid invocation, the usage is displayed and the program exits
    with status 1.
    """
    try:
        opts, filenames = getopt.getopt(sys.argv[1:], "g:p:ai")
    except getopt.GetoptError:
        # unknown option or missing option argument: treat it like any
        # other invalid invocation instead of dying with a traceback
        display_usage()
        sys.exit(1)
    annot_errors = False
    gold = None
    predicted = "PREDICTED"
    ignore_case = False
    for opt, arg in opts:
        if opt == "-g":
            gold = arg
        elif opt == "-p":
            predicted = arg
        elif opt == "-a":
            annot_errors = True
        elif opt == "-i":
            ignore_case = True
    # display help and exit
    if len(filenames) != 2 or not gold:
        display_usage()
        sys.exit(1)
    # load the data
    filename_in, filename_out = filenames
    data = DataSet()
    log_info("Loading data: " + filename_in)
    data.load_from_arff(filename_in)
    # cmp_func returns True when the two values DIFFER (i.e. on an error)
    if ignore_case:
        def cmp_func(a, b):
            return a.lower() != b.lower()
    else:
        def cmp_func(a, b):
            return a != b
    if annot_errors:
        log_info("Annotating errors...")
        err_ind = ["ERR" if cmp_func(i[gold], i[predicted]) else ""
                   for i in data]
        data.add_attrib(Attribute("ERROR_IND", "string"), err_ind)
    else:
        log_info("Selecting errors...")
        data = data[lambda _, i: cmp_func(i[gold], i[predicted])]
    log_info("Saving data: " + filename_out)
    data.save_to_arff(filename_out)
def main():
    """\
    Main application entry: parse command line and add the requested
    derived attributes to a data set.

    Options (at least one is required; -s, -a and -n may be repeated):
      -c             combine case, number and gender
                     (combine_tag_num_gen_cas)
      -s LEN:ATTR    add substrings of ATTR up to abs(LEN) characters long,
                     from the beginning if LEN > 0, from the end otherwise
      -a SIZE:ATTRS  combine all subsets of ATTRS up to SIZE attributes
      -n SHIFT:ATTRS add the ATTRS of the neighbor at the given SHIFT

    ATTRS is a list of attribute names separated by commas and/or spaces.
    Exactly two file names (input, output) must follow the options. On any
    invalid invocation, the usage is displayed and the program exits
    with status 1.
    """
    combine_cng = False
    subsets = []
    neighbors = []
    substrs = []
    try:
        opts, filenames = getopt.getopt(sys.argv[1:], 'ca:s:n:')
        for opt, arg in opts:
            if opt == '-c':
                combine_cng = True
            elif opt == '-s':
                sub_len, attr = arg.split(':', 1)
                substrs.append((int(sub_len), attr))
            elif opt == '-a':
                size, attrs = arg.split(':', 1)
                subsets.append((int(size), re.split(r'[, ]+', attrs)))
            elif opt == '-n':
                shift, attrs = arg.split(':', 1)
                neighbors.append((int(shift), re.split(r'[, ]+', attrs)))
    except (getopt.GetoptError, ValueError):
        # GetoptError: unknown option / missing option argument;
        # ValueError: option argument without ':' or with a non-integer
        # number. Both are invalid invocations, not program errors.
        display_usage()
        sys.exit(1)
    # display help and exit
    if len(filenames) != 2 or not (combine_cng or substrs or subsets or neighbors):
        display_usage()
        sys.exit(1)
    # load the data
    filename_in, filename_out = filenames
    data = DataSet()
    log_info('Loading data: ' + filename_in)
    data.load_from_arff(filename_in)
    # the operations always run in this fixed order, regardless of the
    # order of the options on the command line
    for (sub_len, attr) in substrs:
        log_info(('Adding substrings from the %s of %s ' +
                  'up to %d characters long ...') %
                 (('beginning' if sub_len > 0 else 'end'), attr, abs(sub_len)))
        add_substr_attributes(data, sub_len, attr)
    if combine_cng:
        log_info('Combining case, number, gender ...')
        combine_tag_num_gen_cas(data)
    for (set_size, set_attrs) in subsets:
        log_info('Combining up to %d attributes from [%s] ...' %
                 (set_size, ','.join(set_attrs)))
        combine_subsets(data, set_attrs, set_size)
    for (shift, attrs) in neighbors:
        log_info('Adding neighbor %d\'s attributes [%s] ...' %
                 (shift, ','.join(attrs)))
        add_neighbor_attributes(data, shift, attrs)
    log_info('Saving data: ' + filename_out)
    data.save_to_arff(filename_out)
def main():
    """\
    Main application entry: parse command line and add the requested
    derived attributes to a data set.

    Options (at least one is required; -s, -a and -n may be repeated):
      -c             combine case, number and gender
                     (combine_tag_num_gen_cas)
      -s LEN:ATTR    add substrings of ATTR up to abs(LEN) characters long,
                     from the beginning if LEN > 0, from the end otherwise
      -a SIZE:ATTRS  combine all subsets of ATTRS up to SIZE attributes
      -n SHIFT:ATTRS add the ATTRS of the neighbor at the given SHIFT

    ATTRS is a list of attribute names separated by commas and/or spaces.
    Exactly two file names (input, output) must follow the options. On any
    invalid invocation, the usage is displayed and the program exits
    with status 1.
    """
    combine_cng = False
    subsets = []
    neighbors = []
    substrs = []
    try:
        opts, filenames = getopt.getopt(sys.argv[1:], 'ca:s:n:')
        for opt, arg in opts:
            if opt == '-c':
                combine_cng = True
            elif opt == '-s':
                sub_len, attr = arg.split(':', 1)
                substrs.append((int(sub_len), attr))
            elif opt == '-a':
                size, attrs = arg.split(':', 1)
                subsets.append((int(size), re.split(r'[, ]+', attrs)))
            elif opt == '-n':
                shift, attrs = arg.split(':', 1)
                neighbors.append((int(shift), re.split(r'[, ]+', attrs)))
    except (getopt.GetoptError, ValueError):
        # GetoptError: unknown option / missing option argument;
        # ValueError: option argument without ':' or with a non-integer
        # number. Both are invalid invocations, not program errors.
        display_usage()
        sys.exit(1)
    # display help and exit
    if len(filenames) != 2 or not (combine_cng or substrs or subsets or neighbors):
        display_usage()
        sys.exit(1)
    # load the data
    filename_in, filename_out = filenames
    data = DataSet()
    log_info('Loading data: ' + filename_in)
    data.load_from_arff(filename_in)
    # the operations always run in this fixed order, regardless of the
    # order of the options on the command line
    for (sub_len, attr) in substrs:
        log_info(('Adding substrings from the %s of %s ' +
                  'up to %d characters long ...') %
                 (('beginning' if sub_len > 0 else 'end'), attr, abs(sub_len)))
        add_substr_attributes(data, sub_len, attr)
    if combine_cng:
        log_info('Combining case, number, gender ...')
        combine_tag_num_gen_cas(data)
    for (set_size, set_attrs) in subsets:
        log_info('Combining up to %d attributes from [%s] ...' %
                 (set_size, ','.join(set_attrs)))
        combine_subsets(data, set_attrs, set_size)
    for (shift, attrs) in neighbors:
        log_info('Adding neighbor %d\'s attributes [%s] ...' %
                 (shift, ','.join(attrs)))
        add_neighbor_attributes(data, shift, attrs)
    log_info('Saving data: ' + filename_out)
    data.save_to_arff(filename_out)