Ejemplo n.º 1
0
def print_score(good, total, label):
    """\
    Log the accuracy score given the number of correctly predicted and
    all items, plus a label.

    If `total` is zero (e.g. the evaluated subset is empty), the score
    is reported as NaN instead of raising ZeroDivisionError.
    """
    # float() keeps true division on Python 2; an empty subset has no
    # meaningful accuracy, hence NaN rather than 0
    score = good / float(total) if total else float('nan')
    log_info('Score (%s): %d / %d = %f' % (label, good, total, score))
Ejemplo n.º 2
0
def evaluate_oov(data, source_attr, target_attr, forms_attr, oov_test_file,
                 oov_part):
    """\
    Out-of-vocabulary evaluation.

    Loads the known lemmas (`source_attr`) and forms (`target_attr`)
    from the ARFF file `oov_test_file`; if `oov_part` is lower than 1,
    only a subset starting at index 0 and covering that portion of the
    file is used. Every instance in `data` whose form / lemma is not
    among the known ones (compared case-insensitively) is marked with 1
    in the new numeric attributes 'OOV_FORM' / 'OOV_LEMMA', and the
    score of `forms_attr` against `target_attr` is logged for both
    groups.
    """
    log_info('Loading known lemmas and forms from: ' + oov_test_file)
    train = DataSet()
    train.load_from_arff(oov_test_file)
    if oov_part < 1:
        log_info('Using only %f-part of the file.' % oov_part)
        train = train.subset(0, int(round(oov_part * len(train))), copy=False)
    known_forms = {i[target_attr].lower() for i in train}
    known_lemmas = {i[source_attr].lower() for i in train}
    oov_forms = [
        1 if i[target_attr].lower() not in known_forms else 0 for i in data
    ]
    # lowercase the lemma as well: the known set holds lowercased strings,
    # so a raw comparison would flag every capitalized lemma as unknown
    oov_lemmas = [
        1 if i[source_attr].lower() not in known_lemmas else 0 for i in data
    ]
    data.add_attrib(Attribute('OOV_FORM', 'numeric'), oov_forms)
    data.add_attrib(Attribute('OOV_LEMMA', 'numeric'), oov_lemmas)
    # the lambdas presumably restrict counting to the flagged instances
    # (verify against count_correct)
    oov_forms_count = sum(oov_forms)
    oov_forms_good = count_correct(data, target_attr, forms_attr,
                                   lambda i: i['OOV_FORM'])
    print_score(oov_forms_good, oov_forms_count, 'OOV forms')
    oov_lemmas_count = sum(oov_lemmas)
    oov_lemmas_good = count_correct(data, target_attr, forms_attr,
                                    lambda i: i['OOV_LEMMA'])
    print_score(oov_lemmas_good, oov_lemmas_count, 'OOV lemmas')
Ejemplo n.º 3
0
def print_score(good, total, label):
    """\
    Log the accuracy score given the number of correctly predicted and
    all items, plus a label.

    If `total` is zero (e.g. the evaluated subset is empty), the score
    is reported as NaN instead of raising ZeroDivisionError.
    """
    # float() keeps true division on Python 2; an empty subset has no
    # meaningful accuracy, hence NaN rather than 0
    score = good / float(total) if total else float('nan')
    log_info('Score (%s): %d / %d = %f' % (label, good, total, score))
Ejemplo n.º 4
0
def evaluate_oov(data, source_attr, target_attr, forms_attr,
                 oov_test_file, oov_part):
    """\
    Out-of-vocabulary evaluation.

    Loads the known lemmas (`source_attr`) and forms (`target_attr`)
    from the ARFF file `oov_test_file`; if `oov_part` is lower than 1,
    only a subset starting at index 0 and covering that portion of the
    file is used. Every instance in `data` whose form / lemma is not
    among the known ones (compared case-insensitively) is marked with 1
    in the new numeric attributes 'OOV_FORM' / 'OOV_LEMMA', and the
    score of `forms_attr` against `target_attr` is logged for both
    groups.
    """
    log_info('Loading known lemmas and forms from: ' + oov_test_file)
    train = DataSet()
    train.load_from_arff(oov_test_file)
    if oov_part < 1:
        log_info('Using only %f-part of the file.' % oov_part)
        train = train.subset(0, int(round(oov_part * len(train))), copy=False)
    known_forms = {i[target_attr].lower() for i in train}
    known_lemmas = {i[source_attr].lower() for i in train}
    oov_forms = [1 if i[target_attr].lower() not in known_forms else 0
                 for i in data]
    # lowercase the lemma as well: the known set holds lowercased strings,
    # so a raw comparison would flag every capitalized lemma as unknown
    oov_lemmas = [1 if i[source_attr].lower() not in known_lemmas else 0
                  for i in data]
    data.add_attrib(Attribute('OOV_FORM', 'numeric'), oov_forms)
    data.add_attrib(Attribute('OOV_LEMMA', 'numeric'), oov_lemmas)
    # the lambdas presumably restrict counting to the flagged instances
    # (verify against count_correct)
    oov_forms_count = sum(oov_forms)
    oov_forms_good = count_correct(data, target_attr, forms_attr,
                                   lambda i: i['OOV_FORM'])
    print_score(oov_forms_good, oov_forms_count, 'OOV forms')
    oov_lemmas_count = sum(oov_lemmas)
    oov_lemmas_good = count_correct(data, target_attr, forms_attr,
                                    lambda i: i['OOV_LEMMA'])
    print_score(oov_lemmas_good, oov_lemmas_count, 'OOV lemmas')
Ejemplo n.º 5
0
def combine_subsets(data, attribs, up_to_size):
    """\
    Concatenate, within `data`, every combination of 2 up to
    `up_to_size` attributes taken from the `attribs` list.
    """
    sizes = range(2, up_to_size + 1)
    # smaller combinations come first, exactly as with two nested loops
    all_combos = (combo for n in sizes for combo in combinations(attribs, n))
    for combo in all_combos:
        log_info('Combining %s...' % '+'.join(combo))
        concat_attrib(data, combo, divider='|', nonempty=True)
Ejemplo n.º 6
0
def combine_subsets(data, attribs, up_to_size):
    """\
    Concatenate, within `data`, every combination of 2 up to
    `up_to_size` attributes taken from the `attribs` list.
    """
    sizes = range(2, up_to_size + 1)
    # smaller combinations come first, exactly as with two nested loops
    all_combos = (combo for n in sizes for combo in combinations(attribs, n))
    for combo in all_combos:
        log_info('Combining %s...' % '+'.join(combo))
        concat_attrib(data, combo, divider='|', nonempty=True)
Ejemplo n.º 7
0
def get_features(model_file, output_file):
    """\
    Write the non-zero feature coefficients of a model to a text file.

    Loads the model from `model_file` and, for each label of its class
    attribute, writes to `output_file` (UTF-8) a 'LABEL == <label>'
    line followed by one '<feature> - <coefficient>' line per feature
    with a non-zero coefficient, sorted by descending coefficient.
    """
    m = Model.load_from_file(model_file)
    labels = m.data_headers.get_attrib(m.class_attr).labels
    feats = m.vectorizer.get_feature_names()
    # the context manager closes the file even if writing fails midway
    with codecs.open(output_file, 'w', 'UTF-8') as fh:
        for i, label in enumerate(labels):
            log_info('Enumerating features for label %d (\'%s\')' % (i, label))
            coefs = m.classifier.coef_[i]
            nonzero = [(f, c) for (f, c) in zip(feats, coefs) if c != 0]
            # explicit write() calls produce the same output as the
            # `print >> fh` statement but also work on Python 3
            fh.write('LABEL == %s\n' % label)
            for f, c in sorted(nonzero, key=itemgetter(1), reverse=True):
                fh.write('%s - %f\n' % (f, c))
            # blank lines separating the labels (print added its own newline)
            fh.write("\n\n\n")
Ejemplo n.º 8
0
def get_features(model_file, output_file):
    """\
    Write the non-zero feature coefficients of a model to a text file.

    Loads the model from `model_file` and, for each label of its class
    attribute, writes to `output_file` (UTF-8) a 'LABEL == <label>'
    line followed by one '<feature> - <coefficient>' line per feature
    with a non-zero coefficient, sorted by descending coefficient.
    """
    m = Model.load_from_file(model_file)
    labels = m.data_headers.get_attrib(m.class_attr).labels
    feats = m.vectorizer.get_feature_names()
    # the context manager closes the file even if writing fails midway
    with codecs.open(output_file, 'w', 'UTF-8') as fh:
        for i, label in enumerate(labels):
            log_info('Enumerating features for label %d (\'%s\')' % (i, label))
            coefs = m.classifier.coef_[i]
            nonzero = [(f, c) for (f, c) in zip(feats, coefs) if c != 0]
            # explicit write() calls produce the same output as the
            # `print >> fh` statement but also work on Python 3
            fh.write('LABEL == %s\n' % label)
            for f, c in sorted(nonzero, key=itemgetter(1), reverse=True):
                fh.write('%s - %f\n' % (f, c))
            # blank lines separating the labels (print added its own newline)
            fh.write("\n\n\n")
Ejemplo n.º 9
0
def run_training(work_dir, config_file, train_file, model_file,
                 test_file=None, classif_file=None, memory=MEMORY,
                 name='train'):
    """\
    Run the model training.

    The configuration is read from `config_file`: a file with the
    '.pickle' extension is unpickled and all paths are used as given
    (we are expected to be in the working directory already); any other
    file is loaded through `Config`, and `config_file`, `classif_file`
    and `model_file` are then joined with `work_dir`.

    If the configuration has 'unfold_pattern' set, it is unfolded into
    several configurations, a job named `name`-<key> is created for
    each of them and the function returns without training anything
    itself. Otherwise a model (a `SplitModel` if 'divide_func' is set,
    a plain `Model` otherwise) is trained on `train_file`, evaluated if
    both `test_file` and `classif_file` are given, and saved to
    `model_file`.

    NOTE: unpickling can execute arbitrary code -- only pass pickled
    configurations that come from a trusted source.
    """
    # initialization from the configuration file
    _, ext = os.path.splitext(config_file)
    in_work_dir = ext == '.pickle'
    # load configuration from a pickle (we're already in the working directory)
    if in_work_dir:
        # `with` closes the file even if unpickling fails
        with open(config_file, mode='rb') as fh:
            cfg = pickle.load(fh)
        demarshal_lambda(cfg, 'filter_attr')
        demarshal_lambda(cfg, 'postprocess')
    # load by running Python code (make paths relative to working directory)
    else:
        config_file = os.path.join(work_dir, config_file)
        cfg = Config(config_file)
    # unfolding: delegate the training to one job per unfolded configuration
    if cfg.get('unfold_pattern'):
        pattern = cfg['unfold_pattern']
        del cfg['unfold_pattern']
        unfold_key = cfg.get('unfold_key', 'unfold_key')
        for sub_cfg in cfg.unfold_lists(pattern, unfold_key):
            # keep only characters that are safe in a job name
            key = re.sub(r'[^A-Za-z0-9_]', '', sub_cfg[unfold_key])
            create_job(sub_cfg, name + '-' + key, work_dir, train_file,
                       model_file, test_file, classif_file, memory)
        return
    # training
    if cfg.get('divide_func'):
        model = SplitModel(cfg)
        model.train(train_file, work_dir, memory)
    else:
        model = Model(cfg)
        model.train(train_file)
    # evaluation
    if test_file is not None and classif_file is not None:
        if not in_work_dir:
            classif_file = os.path.join(work_dir, classif_file)
        log_info('Evaluation on file: ' + test_file)
        score = model.evaluate(test_file, classif_file=classif_file)
        log_info('Score: ' + str(score))
    # save the model
    if not in_work_dir:  # we need to make the path relative to work_dir
        model_file = os.path.join(work_dir, model_file)
    model.save_to_file(model_file)
Ejemplo n.º 10
0
def get_stats(data_file, train_file, source_attr, target_attr):
    """\
    Load the ARFF file `data_file` and call `print_feat` on it with
    several instance filters: all instances, instances whose
    `source_attr` does not start with punctuation, instances whose
    `target_attr` differs from `source_attr` (ignoring case) and, if
    `train_file` is given, instances whose lowercased `target_attr`
    does not occur in that file.
    """
    data = DataSet()
    log_info('Loading data from %s...' % data_file)
    data.load_from_arff(data_file)
    print_feat(data, lambda _idx, _inst: True, 'total')
    print_feat(
        data,
        lambda _idx, inst: not regex.match(r'^\p{P}', inst[source_attr]),
        'excluding punctuation',
    )
    print_feat(
        data,
        lambda _idx, inst: (inst[source_attr].lower() !=
                            inst[target_attr].lower()),
        'inflected forms',
    )
    # without known data there is nothing more to report
    if train_file is None:
        return
    log_info('Loading known data from %s...' % train_file)
    known_data = DataSet()
    known_data.load_from_arff(train_file)
    known_forms = {inst[target_attr].lower() for inst in known_data}
    print_feat(
        data,
        lambda _idx, inst: inst[target_attr].lower() not in known_forms,
        'unknown',
    )
Ejemplo n.º 11
0
def pairwise_bootstrap(file1, file2, gold_attr, pred_attr, cmp_func, iters):
    """\
    Compare the predictions in two ARFF files by bootstrap resampling.

    The gold values (`gold_attr`) are taken from `file1` only; the
    predictions (`pred_attr`) from both files. In each of the `iters`
    rounds, as many instance indexes as there are gold values are drawn
    via `rnd.randint` (presumably numpy.random, i.e. with replacement --
    verify against the file's imports) and the number of sampled
    instances for which `cmp_func(gold, predicted)` is true is compared
    between the files. Per-round percentages are logged; the number of
    rounds won by each file and the number of ties are printed at the
    end.
    """
    d1, d2 = DataSet(), DataSet()
    log_info('Loading File1: %s' % file1)
    d1.load_from_arff(file1)
    log_info('Loading File2: %s' % file2)
    d2.load_from_arff(file2)
    gold = d1.attrib_as_vect(gold_attr)
    p1 = d1.attrib_as_vect(pred_attr)
    p2 = d2.attrib_as_vect(pred_attr)
    size = len(gold)
    p1_better, p2_better, ties = 0, 0, 0
    # range() works on both Python 2 and 3 (xrange is Python 2 only)
    for round_no in range(iters):
        sample = rnd.randint(0, size, size)
        # use a separate index name so the round counter is not shadowed
        s_p1_good = sum(1 if cmp_func(gold[j], p1[j]) else 0 for j in sample)
        s_p2_good = sum(1 if cmp_func(gold[j], p2[j]) else 0 for j in sample)
        log_info('Round %d: File1 - %2.2f vs. File2 - %2.2f' %
                 (round_no, float(s_p1_good) / size * 100,
                  float(s_p2_good) / size * 100))
        if s_p1_good > s_p2_good:
            p1_better += 1
        elif s_p2_good > s_p1_good:
            p2_better += 1
        else:
            ties += 1
    # format first, then print: a single parenthesized argument behaves
    # identically as a Python 2 print statement and a Python 3 print() call
    print(('File1 better: %d (%2.2f) | File2 better: %d (%2.2f) |' +
           ' ties: %d (%2.2f)') % (p1_better, float(p1_better) / iters * 100,
                                   p2_better, float(p2_better) / iters * 100,
                                   ties, float(ties) / iters * 100,))
Ejemplo n.º 12
0
def test_models(file_in, file_out, model_files, source_attr, target_attr,
                oov_test_file, oov_part, pos_attr, test_indiv):
    """\
    Test all the given models on the selected file and save the target.

    The models are applied in the given order: the rules predicted by
    each model are applied (via `inflect`) to the forms produced by the
    previous one, starting from `source_attr`. The output of the last
    model is evaluated against `target_attr`.

    If test_indiv is true, logs also the accuracy of each model.
    If oov_test_file is set, performs also OOV evaluation.
    If pos_attr is set, runs also the evaluation for various POSs.

    Raises ValueError if `model_files` is empty.
    """
    # load testing data
    log_info('Loading data: ' + file_in)
    data = DataSet()
    data.load_from_arff(file_in)
    forms = data[source_attr]
    forms_attr = None
    # apply all models
    for model_num, model_file in enumerate(model_files, start=1):
        model = Model.load_from_file(model_file)
        log_info('Applying model: ' + model_file)
        rules = model.classify(data)
        output_attr = 'OUTPUT_M' + str(model_num)
        data.add_attrib(Attribute(output_attr, 'string'), rules)
        if test_indiv:
            good = count_correct(data, model.class_attr, output_attr)
            print_score(good, len(data), 'Model accuracy')
        forms = [inflect(form, rule) for form, rule in zip(forms, rules)]
        forms_attr = 'FORMS_M' + str(model_num)
        data.add_attrib(Attribute(forms_attr, 'string'), forms)
    # without any model there is no output attribute to evaluate; fail with
    # a clear message instead of an unbound-variable error further down
    if forms_attr is None:
        raise ValueError('No models to test were given')
    # test the final performance
    log_info('Evaluating...')
    good = count_correct(data, target_attr, forms_attr)
    print_score(good, len(data), 'ALL')
    # evaluate without punctuation
    evaluate_nopunct(data, source_attr, target_attr, forms_attr)
    # evaluate forms different from lemma
    evaluate_nolemma(data, source_attr, target_attr, forms_attr)
    # load training data for OOV tests, evaluate on OOV
    if oov_test_file:
        evaluate_oov(data, source_attr, target_attr, forms_attr,
                     oov_test_file, oov_part)
    # test on different POSes
    if pos_attr:
        evaluate_poses(data, target_attr, forms_attr, pos_attr)
    # save the classification results
    log_info('Saving data: ' + file_out)
    data.save_to_arff(file_out)
Ejemplo n.º 13
0
def test_models(file_in, file_out, model_files, source_attr, target_attr,
                oov_test_file, oov_part, pos_attr, test_indiv):
    """\
    Test all the given models on the selected file and save the target.

    The models are applied in the given order: the rules predicted by
    each model are applied (via `inflect`) to the forms produced by the
    previous one, starting from `source_attr`. The output of the last
    model is evaluated against `target_attr`.

    If test_indiv is true, logs also the accuracy of each model.
    If oov_test_file is set, performs also OOV evaluation.
    If pos_attr is set, runs also the evaluation for various POSs.

    Raises ValueError if `model_files` is empty.
    """
    # load testing data
    log_info('Loading data: ' + file_in)
    data = DataSet()
    data.load_from_arff(file_in)
    forms = data[source_attr]
    forms_attr = None
    # apply all models
    for model_num, model_file in enumerate(model_files, start=1):
        model = Model.load_from_file(model_file)
        log_info('Applying model: ' + model_file)
        rules = model.classify(data)
        output_attr = 'OUTPUT_M' + str(model_num)
        data.add_attrib(Attribute(output_attr, 'string'), rules)
        if test_indiv:
            good = count_correct(data, model.class_attr, output_attr)
            print_score(good, len(data), 'Model accuracy')
        forms = [inflect(form, rule) for form, rule in zip(forms, rules)]
        forms_attr = 'FORMS_M' + str(model_num)
        data.add_attrib(Attribute(forms_attr, 'string'), forms)
    # without any model there is no output attribute to evaluate; fail with
    # a clear message instead of an unbound-variable error further down
    if forms_attr is None:
        raise ValueError('No models to test were given')
    # test the final performance
    log_info('Evaluating...')
    good = count_correct(data, target_attr, forms_attr)
    print_score(good, len(data), 'ALL')
    # evaluate without punctuation
    evaluate_nopunct(data, source_attr, target_attr, forms_attr)
    # evaluate forms different from lemma
    evaluate_nolemma(data, source_attr, target_attr, forms_attr)
    # load training data for OOV tests, evaluate on OOV
    if oov_test_file:
        evaluate_oov(data, source_attr, target_attr, forms_attr, oov_test_file,
                     oov_part)
    # test on different POSes
    if pos_attr:
        evaluate_poses(data, target_attr, forms_attr, pos_attr)
    # save the classification results
    log_info('Saving data: ' + file_out)
    data.save_to_arff(file_out)
Ejemplo n.º 14
0
def main():
    """\
    Main application entry: parse command line, then mark or select the
    instances where the predicted value differs from the gold one.

    Options: -g ATTR  gold attribute name (required),
             -p ATTR  predicted attribute name (default: 'PREDICTED'),
             -a       keep all instances and add an 'ERROR_IND' attribute
                      (otherwise only the differing instances are kept),
             -i       ignore case when comparing.
    Two positional arguments are expected: input and output ARFF file.
    On any command line error, the usage is displayed and the program
    exits with status 1.
    """
    try:
        opts, filenames = getopt.getopt(sys.argv[1:], 'g:p:ai')
    except getopt.GetoptError:
        # unknown option or missing option argument
        display_usage()
        sys.exit(1)
    annot_errors = False
    gold = None
    predicted = 'PREDICTED'
    ignore_case = False
    for opt, arg in opts:
        if opt == '-g':
            gold = arg
        elif opt == '-p':
            predicted = arg
        elif opt == '-a':
            annot_errors = True
        elif opt == '-i':
            ignore_case = True
    # display help and exit
    if len(filenames) != 2 or not gold:
        display_usage()
        sys.exit(1)
    # load the data and find the errors
    filename_in, filename_out = filenames
    data = DataSet()
    log_info('Loading data: ' + filename_in)
    data.load_from_arff(filename_in)
    # cmp_func returns True if the two values differ, i.e. on an error
    if ignore_case:
        cmp_func = lambda a, b: a.lower() != b.lower()
    else:
        cmp_func = lambda a, b: a != b
    if annot_errors:
        log_info('Annotating errors...')
        err_ind = [
            'ERR' if cmp_func(i[gold], i[predicted]) else '' for i in data
        ]
        data.add_attrib(Attribute('ERROR_IND', 'string'), err_ind)
    else:
        log_info('Selecting errors...')
        # indexing by a function presumably filters the instances
        # (verify against DataSet.__getitem__)
        data = data[lambda _, i: cmp_func(i[gold], i[predicted])]
    log_info('Saving data: ' + filename_out)
    data.save_to_arff(filename_out)
Ejemplo n.º 15
0
def main():
    """\
    Main application entry: parse command line, then mark or select the
    instances where the predicted value differs from the gold one.

    Options: -g ATTR  gold attribute name (required),
             -p ATTR  predicted attribute name (default: "PREDICTED"),
             -a       keep all instances and add an "ERROR_IND" attribute
                      (otherwise only the differing instances are kept),
             -i       ignore case when comparing.
    Two positional arguments are expected: input and output ARFF file.
    On any command line error, the usage is displayed and the program
    exits with status 1.
    """
    try:
        opts, filenames = getopt.getopt(sys.argv[1:], "g:p:ai")
    except getopt.GetoptError:
        # unknown option or missing option argument
        display_usage()
        sys.exit(1)
    annot_errors = False
    gold = None
    predicted = "PREDICTED"
    ignore_case = False
    for opt, arg in opts:
        if opt == "-g":
            gold = arg
        elif opt == "-p":
            predicted = arg
        elif opt == "-a":
            annot_errors = True
        elif opt == "-i":
            ignore_case = True
    # display help and exit
    if len(filenames) != 2 or not gold:
        display_usage()
        sys.exit(1)
    # load the data and find the errors
    filename_in, filename_out = filenames
    data = DataSet()
    log_info("Loading data: " + filename_in)
    data.load_from_arff(filename_in)
    # cmp_func returns True if the two values differ, i.e. on an error
    if ignore_case:
        cmp_func = lambda a, b: a.lower() != b.lower()
    else:
        cmp_func = lambda a, b: a != b
    if annot_errors:
        log_info("Annotating errors...")
        err_ind = ["ERR" if cmp_func(i[gold], i[predicted]) else "" for i in data]
        data.add_attrib(Attribute("ERROR_IND", "string"), err_ind)
    else:
        log_info("Selecting errors...")
        # indexing by a function presumably filters the instances
        # (verify against DataSet.__getitem__)
        data = data[lambda _, i: cmp_func(i[gold], i[predicted])]
    log_info("Saving data: " + filename_out)
    data.save_to_arff(filename_out)
Ejemplo n.º 16
0
def main():
    """\
    Main application entry: parse command line and add the requested
    attributes to the data.

    Options: -c             combine case, number and gender,
             -s LEN:ATTR    add substrings of ATTR up to abs(LEN)
                            characters (from the beginning if LEN > 0,
                            from the end otherwise),
             -a SIZE:ATTRS  combine subsets of ATTRS up to SIZE members,
             -n SHIFT:ATTRS add ATTRS of the neighbor at SHIFT.
    ATTRS are separated by commas and/or spaces; -s, -a and -n may be
    repeated. Two positional arguments are expected: input and output
    ARFF file. On any command line error (including a malformed option
    argument), the usage is displayed and the program exits with
    status 1.
    """
    combine_cng = False
    subsets = []
    neighbors = []
    substrs = []
    try:
        opts, filenames = getopt.getopt(sys.argv[1:], 'ca:s:n:')
        for opt, arg in opts:
            if opt == '-c':
                combine_cng = True
            elif opt == '-s':
                sub_len, attr = arg.split(':', 1)
                substrs.append((int(sub_len), attr))
            elif opt == '-a':
                size, attrs = arg.split(':', 1)
                subsets.append((int(size), re.split(r'[, ]+', attrs)))
            elif opt == '-n':
                shift, attrs = arg.split(':', 1)
                neighbors.append((int(shift), re.split(r'[, ]+', attrs)))
    except (getopt.GetoptError, ValueError):
        # unknown option, or an argument not in the NUMBER:ATTRS format
        # (missing colon or non-integer number)
        display_usage()
        sys.exit(1)
    # display help and exit
    if len(filenames) != 2 or not (combine_cng or substrs or
                                   subsets or neighbors):
        display_usage()
        sys.exit(1)
    # load the data and run the requested conversions
    filename_in, filename_out = filenames
    data = DataSet()
    log_info('Loading data: ' + filename_in)
    data.load_from_arff(filename_in)
    for (sub_len, attr) in substrs:
        log_info(('Adding substrings from the %s of %s ' +
                  'up to %d characters long ...') %
                 (('beginning' if sub_len > 0 else 'end'),
                  attr, abs(sub_len)))
        add_substr_attributes(data, sub_len, attr)
    if combine_cng:
        log_info('Combining case, number, gender ...')
        combine_tag_num_gen_cas(data)
    for (set_size, set_attrs) in subsets:
        log_info('Combining up to %d attributes from [%s] ...' %
                 (set_size, ','.join(set_attrs)))
        combine_subsets(data, set_attrs, set_size)
    for (shift, attrs) in neighbors:
        log_info('Adding neighbor %d\'s attributes [%s] ...' %
                 (shift, ','.join(attrs)))
        add_neighbor_attributes(data, shift, attrs)
    log_info('Saving data: ' + filename_out)
    data.save_to_arff(filename_out)
Ejemplo n.º 17
0
def main():
    """\
    Main application entry: parse command line and add the requested
    attributes to the data.

    Options: -c             combine case, number and gender,
             -s LEN:ATTR    add substrings of ATTR up to abs(LEN)
                            characters (from the beginning if LEN > 0,
                            from the end otherwise),
             -a SIZE:ATTRS  combine subsets of ATTRS up to SIZE members,
             -n SHIFT:ATTRS add ATTRS of the neighbor at SHIFT.
    ATTRS are separated by commas and/or spaces; -s, -a and -n may be
    repeated. Two positional arguments are expected: input and output
    ARFF file. On any command line error (including a malformed option
    argument), the usage is displayed and the program exits with
    status 1.
    """
    combine_cng = False
    subsets = []
    neighbors = []
    substrs = []
    try:
        opts, filenames = getopt.getopt(sys.argv[1:], 'ca:s:n:')
        for opt, arg in opts:
            if opt == '-c':
                combine_cng = True
            elif opt == '-s':
                sub_len, attr = arg.split(':', 1)
                substrs.append((int(sub_len), attr))
            elif opt == '-a':
                size, attrs = arg.split(':', 1)
                subsets.append((int(size), re.split(r'[, ]+', attrs)))
            elif opt == '-n':
                shift, attrs = arg.split(':', 1)
                neighbors.append((int(shift), re.split(r'[, ]+', attrs)))
    except (getopt.GetoptError, ValueError):
        # unknown option, or an argument not in the NUMBER:ATTRS format
        # (missing colon or non-integer number)
        display_usage()
        sys.exit(1)
    # display help and exit
    if len(filenames) != 2 or not (combine_cng or substrs or
                                   subsets or neighbors):
        display_usage()
        sys.exit(1)
    # load the data and run the requested conversions
    filename_in, filename_out = filenames
    data = DataSet()
    log_info('Loading data: ' + filename_in)
    data.load_from_arff(filename_in)
    for (sub_len, attr) in substrs:
        log_info(('Adding substrings from the %s of %s ' +
                  'up to %d characters long ...') %
                 (('beginning' if sub_len > 0 else 'end'),
                  attr, abs(sub_len)))
        add_substr_attributes(data, sub_len, attr)
    if combine_cng:
        log_info('Combining case, number, gender ...')
        combine_tag_num_gen_cas(data)
    for (set_size, set_attrs) in subsets:
        log_info('Combining up to %d attributes from [%s] ...' %
                 (set_size, ','.join(set_attrs)))
        combine_subsets(data, set_attrs, set_size)
    for (shift, attrs) in neighbors:
        log_info('Adding neighbor %d\'s attributes [%s] ...' %
                 (shift, ','.join(attrs)))
        add_neighbor_attributes(data, shift, attrs)
    log_info('Saving data: ' + filename_out)
    data.save_to_arff(filename_out)