Example #1
0
def get_features(model_file, output_file):
    """\
    Dump all non-zero features of the given model, grouped by class label
    and sorted by descending weight, into a UTF-8 encoded output file.

    @param model_file: path to the stored model to inspect
    @param output_file: path of the text file to write the features to
    """
    m = Model.load_from_file(model_file)
    labels = m.data_headers.get_attrib(m.class_attr).labels
    feats = m.vectorizer.get_feature_names()
    # context manager ensures the file is closed even if an error occurs
    with codecs.open(output_file, 'w', 'UTF-8') as fh:
        for i, label in enumerate(labels):
            log_info('Enumerating features for label %d (\'%s\')' % (i, label))
            # weight vector of the classifier for this label
            coefs = m.classifier.coef_[i]
            # keep only features that actually contribute to this label
            nonzero = [(f, c) for (f, c) in zip(feats, coefs) if c != 0]
            fh.write('LABEL == %s\n' % label)
            # highest-weighted features first
            for f, c in sorted(nonzero, key=itemgetter(1), reverse=True):
                fh.write('%s - %f\n' % (f, c))
            # blank lines separating the labels (same bytes as print "\n\n")
            fh.write('\n\n\n')
Example #2
0
def get_features(model_file, output_file):
    """\
    Dump all non-zero features of the given model, grouped by class label
    and sorted by descending weight, into a UTF-8 encoded output file.

    @param model_file: path to the stored model to inspect
    @param output_file: path of the text file to write the features to
    """
    m = Model.load_from_file(model_file)
    labels = m.data_headers.get_attrib(m.class_attr).labels
    feats = m.vectorizer.get_feature_names()
    # context manager ensures the file is closed even if an error occurs
    with codecs.open(output_file, 'w', 'UTF-8') as fh:
        for i, label in enumerate(labels):
            log_info('Enumerating features for label %d (\'%s\')' % (i, label))
            # weight vector of the classifier for this label
            coefs = m.classifier.coef_[i]
            # keep only features that actually contribute to this label
            nonzero = [(f, c) for (f, c) in zip(feats, coefs) if c != 0]
            fh.write('LABEL == %s\n' % label)
            # highest-weighted features first
            for f, c in sorted(nonzero, key=itemgetter(1), reverse=True):
                fh.write('%s - %f\n' % (f, c))
            # blank lines separating the labels (same bytes as print "\n\n")
            fh.write('\n\n\n')
Example #3
0
def run_training(work_dir, config_file, train_file, model_file,
                 test_file=None, classif_file=None, memory=MEMORY,
                 name='train'):
    """\
    Run the model training.

    Configuration is loaded either from a pickle (when the extension is
    '.pickle', i.e. we are already inside the working directory) or by
    executing a Python config file (paths are then made relative to
    work_dir). If the configuration contains an 'unfold_pattern', one
    training job per unfolded configuration is created instead of
    training directly.

    @param work_dir: the working directory for this training run
    @param config_file: path to a .pickle or Python configuration file
    @param train_file: path to the training data
    @param model_file: path where the trained model will be saved
    @param test_file: optional evaluation data file
    @param classif_file: optional output file for classification results
    @param memory: memory limit handed to any created jobs
    @param name: base name for any created jobs
    """
    # initialization from the configuration file
    _, ext = os.path.splitext(config_file)
    # load configuration from a pickle (we're already in the working directory)
    if ext == '.pickle':
        # 'with' guarantees the file is closed even if unpickling fails
        with open(config_file, mode='rb') as fh:
            cfg = pickle.load(fh)
        # restore lambdas that could not be pickled directly
        demarshal_lambda(cfg, 'filter_attr')
        demarshal_lambda(cfg, 'postprocess')
    # load by running Python code (make paths relative to working directory)
    else:
        config_file = os.path.join(work_dir, config_file)
        cfg = Config(config_file)
    # unfolding: spawn one job per unfolded configuration and stop here
    if cfg.get('unfold_pattern'):
        pattern = cfg['unfold_pattern']
        del cfg['unfold_pattern']
        unfold_key = cfg.get('unfold_key', 'unfold_key')
        cfgs = cfg.unfold_lists(pattern, unfold_key)
        for cfg in cfgs:
            # sanitize the key so it is safe inside a job name
            key = re.sub(r'[^A-Za-z0-9_]', '', cfg[unfold_key])
            create_job(cfg, name + '-' + key, work_dir, train_file, model_file,
                       test_file, classif_file, memory)
        return
    # training: split model when a dividing function is configured
    if cfg.get('divide_func'):
        model = SplitModel(cfg)
        model.train(train_file, work_dir, memory)
    else:
        model = Model(cfg)
        model.train(train_file)
    # evaluation (only when both a test set and an output file are given)
    if test_file is not None and classif_file is not None:
        if ext != '.pickle':  # this means we're not in the working directory
            classif_file = os.path.join(work_dir, classif_file)
        log_info('Evaluation on file: ' + test_file)
        score = model.evaluate(test_file, classif_file=classif_file)
        log_info('Score: ' + str(score))
    # save the model
    if ext != '.pickle':  # we need to make the path relative to work_dir
        model_file = os.path.join(work_dir, model_file)
    model.save_to_file(model_file)
Example #4
0
def test_models(file_in, file_out, model_files, source_attr, target_attr,
                oov_test_file, oov_part, pos_attr, test_indiv):
    """\
    Test all the given models on the selected file and save the target.

    Each model's classification output and the inflected forms it produces
    are added to the data set as new attributes; the forms of one model
    feed the next one.

    If oov_test_file is set, performs also OOV evaluation.
    If test_pos is True, prints detailed results for various POSs.

    @param file_in: input ARFF data file to evaluate on
    @param file_out: output ARFF file with added classification attributes
    @param model_files: list of model files, applied in order
    @param source_attr: attribute holding the source forms
    @param target_attr: attribute holding the gold target forms
    @param oov_test_file: training data file for OOV evaluation (optional)
    @param oov_part: portion of the OOV data to use
    @param pos_attr: POS attribute for per-POS evaluation (optional)
    @param test_indiv: if True, print each model's individual accuracy
    @raise ValueError: if no model files are given
    """
    # guard: with no models, forms_attr below would be undefined (NameError)
    if not model_files:
        raise ValueError('At least one model file must be given.')
    # load testing data
    log_info('Loading data: ' + file_in)
    data = DataSet()
    data.load_from_arff(file_in)
    forms = data[source_attr]
    # apply all models
    for model_num, model_file in enumerate(model_files, start=1):
        model = Model.load_from_file(model_file)
        log_info('Applying model: ' + model_file)
        rules = model.classify(data)
        output_attr = 'OUTPUT_M' + str(model_num)
        data.add_attrib(Attribute(output_attr, 'string'), rules)
        if test_indiv:
            # accuracy of this model alone, against its own class attribute
            good = count_correct(data, model.class_attr, output_attr)
            print_score(good, len(data), 'Model accuracy')
        # apply the predicted rules to obtain this model's forms
        forms = [inflect(form, rule) for form, rule in zip(forms, rules)]
        forms_attr = 'FORMS_M' + str(model_num)
        data.add_attrib(Attribute(forms_attr, 'string'), forms)
    # test the final performance (forms of the last model in the chain)
    log_info('Evaluating...')
    good = count_correct(data, target_attr, forms_attr)
    print_score(good, len(data), 'ALL')
    # evaluate without punctuation
    evaluate_nopunct(data, source_attr, target_attr, forms_attr)
    # evaluate forms different from lemma
    evaluate_nolemma(data, source_attr, target_attr, forms_attr)
    # load training data for OOV tests, evaluate on OOV
    if oov_test_file:
        evaluate_oov(data, source_attr, target_attr, forms_attr,
                     oov_test_file, oov_part)
    # test on different POSes
    if pos_attr:
        evaluate_poses(data, target_attr, forms_attr, pos_attr)
    # save the classification results
    log_info('Saving data: ' + file_out)
    data.save_to_arff(file_out)
Example #5
0
def test_models(file_in, file_out, model_files, source_attr, target_attr,
                oov_test_file, oov_part, pos_attr, test_indiv):
    """\
    Apply all the given models in sequence to the input file and store the
    results in the output file.

    Each model's classification output and the inflected forms it yields
    are stored as new attributes; the forms produced by one model are fed
    to the next one. Performs an OOV evaluation if oov_test_file is set
    and prints per-POS results if pos_attr is set.
    """
    # read the evaluation data
    log_info('Loading data: ' + file_in)
    data = DataSet()
    data.load_from_arff(file_in)
    forms = data[source_attr]
    # run each model in turn, chaining its forms into the next model
    model_no = 0
    for model_file in model_files:
        model_no += 1
        model = Model.load_from_file(model_file)
        log_info('Applying model: ' + model_file)
        rules = model.classify(data)
        output_attr = 'OUTPUT_M' + str(model_no)
        data.add_attrib(Attribute(output_attr, 'string'), rules)
        if test_indiv:
            # per-model accuracy against its own class attribute
            good = count_correct(data, model.class_attr, output_attr)
            print_score(good, len(data), 'Model accuracy')
        forms = [inflect(form, rule) for form, rule in zip(forms, rules)]
        forms_attr = 'FORMS_M' + str(model_no)
        data.add_attrib(Attribute(forms_attr, 'string'), forms)
    # overall accuracy of the whole model cascade
    log_info('Evaluating...')
    good = count_correct(data, target_attr, forms_attr)
    print_score(good, len(data), 'ALL')
    # accuracy with punctuation tokens excluded
    evaluate_nopunct(data, source_attr, target_attr, forms_attr)
    # accuracy restricted to forms that differ from their lemma
    evaluate_nolemma(data, source_attr, target_attr, forms_attr)
    # OOV evaluation (requires the training data file)
    if oov_test_file:
        evaluate_oov(data, source_attr, target_attr, forms_attr,
                     oov_test_file, oov_part)
    # detailed results for the individual POSes
    if pos_attr:
        evaluate_poses(data, target_attr, forms_attr, pos_attr)
    # write out the annotated data
    log_info('Saving data: ' + file_out)
    data.save_to_arff(file_out)