Exemple #1
0
def run(output_dir, feature_num, mode):
    """Grid-search, train or cross-validate the taxonomy classifiers.

    The relations used as input are loaded through ``TaxonomyFeatures`` from
    the module-level ``train_relations_fpath``; the resources come from the
    module-level ``freq_fpaths`` and ``isa_fpaths``.

    Args:
        output_dir: Base output directory. It is passed to ``ensure_dir`` and
            each classifier works in a sub-directory named after its method
            (``"SVC-grid-search"`` in grid-search mode).
        feature_num: How many entries to keep from the start of the built-in
            feature list. Converted with ``int()``, so a numeric string (for
            example from the command line) is accepted.
        mode: ``"gridsearch"`` runs an SVC grid search and returns;
            ``"train"`` trains every method in ``METHODS`` and prints the
            classifier info; ``"cv"`` cross-validates every method in
            ``METHODS``. Any other value prints an error message for each
            method.

    Returns:
        None.
    """
    feature_num = int(feature_num)
    taxo_res = TaxonomyResources(freq_fpaths, isa_fpaths)
    taxo_features = TaxonomyFeatures(taxo_res, relations_fpath=train_relations_fpath)

    ensure_dir(output_dir)
    features = [
        "hyper_in_hypo_i",
        "hypo2hyper_substract",
        "freq_substract",
        "in_weight_substract",
        "length_substract",
        "hypo2hyper_s_substract",
        "hypo2hyper_max2_substract",
    ]
    # Only the first `feature_num` features are used.
    features = features[:feature_num]

    if mode == "gridsearch":
        # Grid search is only supported for SVC.
        hc = SuperTaxi(join(output_dir, "SVC-grid-search"), method="SVC", features=features, overwrite=True)
        hc.grid_search_svc(taxo_features.relations, test=TEST_GRIDSEARCH)
        return

    for method in METHODS:
        try:
            classifier_dir = join(output_dir, method)
            print("\n", method.upper(), "\n", "="*50)
            hc = SuperTaxi(classifier_dir, method=method, features=features, overwrite=True)
            if mode == "train":
                hc.train(taxo_features.relations)
                hc._print_clf_info()
            elif mode == "cv":
                hc.crossval(taxo_features.relations)
            else:
                print("Error: unrecognised mode %s" % mode)
        except Exception:
            # Best effort: a failure in one method is reported with its
            # traceback and the remaining methods still run.  Catching
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit abort the whole run.
            print(format_exc())
Exemple #2
0
def evaluate_on_trial_taxo():
    """Predict, save and evaluate taxonomies for ``RES_DIR/relations.csv``.

    One taxonomy is produced with a global threshold on
    ``hypo2hyper_substract`` and saved next to the relations file with the
    suffix ``-taxo.csv``.  Then, for each ``max_knn`` in 1, 2, 3 and 5, a
    taxonomy is produced with a local threshold on the same field and saved
    with the suffix ``-taxo-knn<max_knn>.csv``.  Every variant is followed by
    a global-threshold pass on ``hyper_in_hypo_i`` with
    ``or_correct_predict=True`` and is evaluated on ``correct_predict``.
    """
    # assuming features "hyper_in_hypo_i" and "hypo2hyper_substract"
    relations_fpath = join(RES_DIR, "relations.csv")
    taxo_fpath = relations_fpath + "-taxo.csv"
    print("Relations:", relations_fpath)
    print("Unpruned taxonomy:", taxo_fpath)

    resources = TaxonomyResources()
    features = TaxonomyFeatures(resources, relations_fpath=relations_fpath, lang="en")
    predictor = TaxonomyPredictor(features)

    def add_substrings_save_and_evaluate(out_fpath):
        # Shared tail of every variant: substring-feature pass, save, evaluate.
        predictor.predict_by_global_threshold(
            threshold=0, field="hyper_in_hypo_i", or_correct_predict=True)
        predictor.save(out_fpath)
        predictor.evaluate(field="correct_predict")

    # Variant 1: global threshold.
    predictor.predict_by_global_threshold(
        threshold=0, field="hypo2hyper_substract", or_correct_predict=False)
    add_substrings_save_and_evaluate(taxo_fpath)

    # Variants 2..n: local threshold, one per neighbourhood size.
    for max_knn in (1, 2, 3, 5):
        predictor.predict_by_local_threshold(
            threshold=0,
            max_knn=max_knn,
            field="hypo2hyper_substract",
            or_correct_predict=False)
        add_substrings_save_and_evaluate(
            relations_fpath + "-taxo-knn" + str(max_knn) + ".csv")
Exemple #3
0
def extract_semeval_taxo(input_voc_pattern, language, mode, classifiers_pattern, test_en):
    #Laedt alle Datensaetze(auch alle Domaenen, aus vocabularies)
    taxo_res_common, taxo_res_domain = load_res(language, mode, test_en)

    for voc_fpath in sorted(glob(input_voc_pattern)):
        for space in [False, True]:
            s = "-space" if space else ""
            relations_fpath = voc_fpath + s + "-relations.csv"
            taxo_fpath = relations_fpath + "-taxo.csv"
            print "\n", voc_fpath, "\n", "="*50
            print "Relations:", relations_fpath
            print "Unpruned taxonomy:", taxo_fpath

            #Laedt domain-datenset und kombiniert sie mit dem allgemeinen Datenset
            taxo_res_domain_voc = get_taxo_res_domain_voc(taxo_res_domain, voc_fpath)
            taxo_res_voc = combine_taxo_res(taxo_res_common, taxo_res_domain_voc)
            taxo_features = TaxonomyFeatures(taxo_res_voc, voc_fpath, lang=language)

            if mode == "simple":
                taxo_features.fill_direct_isas()
                taxo_features.fill_substrings(must_have_space=space)
                taxo_features.hypo2hyper_ratio()
                taxo_predict = TaxonomyPredictor(taxo_features)
                taxo_predict.predict_by_global_threshold(threshold=0, field="hypo2hyper_substract", or_correct_predict=False)
                taxo_predict.predict_by_global_threshold(threshold=0, field="hyper_in_hypo_i", or_correct_predict=True)
                taxo_predict.save(taxo_fpath)

                for max_knn in [1, 2, 3, 5]:
                    #hypo2hyper fuer pattern
                    #hyperinhypoi feur substring
                    taxo_knn_fpath = relations_fpath + "-taxo-knn" + unicode(max_knn) + ".csv"
                    taxo_predict.predict_by_local_threshold(threshold=0, max_knn=max_knn, field="hypo2hyper_substract", or_correct_predict=False)
                    taxo_predict.predict_by_global_threshold(threshold=0, field="hyper_in_hypo_i", or_correct_predict=True)
                    taxo_predict.save(taxo_knn_fpath)

            elif mode == "super":
                taxo_features.fill_super_features()

                for classifier_dir in glob(classifiers_pattern):
                    try:
                        print "Predicting with:", classifier_dir
                        taxo_predict = TaxonomyPredictor(taxo_features)
                        method = taxo_predict.predict_by_classifier(classifier_dir)
                        taxo_predict.save(taxo_fpath + "-" + method + ".csv")
                        taxo_predict.save(taxo_fpath + "-" + method + "-conf.csv", conf=True)
                    except:
                        print format_exc()
Exemple #4
0
def extract_semeval_taxo(input_voc_pattern, language, mode, classifiers_pattern):
    taxo_res_common, taxo_res_domain = load_res(language, mode) 
        
    for voc_fpath in sorted(glob(input_voc_pattern)):
        for space in [False]: #, True]:
            s = "-space" if space else ""
            relations_fpath = voc_fpath + s + "-relations.csv"
            taxo_fpath = relations_fpath + "-taxo.csv"
            print "\n", voc_fpath, "\n", "="*50
            print "Relations:", relations_fpath
            print "Unpruned taxonomy:", taxo_fpath
            
            taxo_res_domain_voc = get_taxo_res_domain_voc(taxo_res_domain, voc_fpath)
            taxo_res_voc = combine_taxo_res(taxo_res_common, taxo_res_domain_voc)
            taxo_features = TaxonomyFeatures(taxo_res_voc, voc_fpath, lang=language)       
            
            if mode == "simple":
                taxo_features.fill_direct_isas()
                taxo_features.fill_substrings(must_have_space=space)
                taxo_features.hypo2hyper_ratio()
                taxo_predict = TaxonomyPredictor(taxo_features)
                taxo_predict.predict_by_global_threshold(threshold=0, field="hypo2hyper_substract", or_correct_predict=False)
                taxo_predict.predict_by_global_threshold(threshold=0, field="hyper_in_hypo_i", or_correct_predict=True)
                taxo_predict.save(taxo_fpath)
            
                for max_knn in [1, 2, 3, 5]:
                    taxo_knn_fpath = relations_fpath + "-taxo-knn" + unicode(max_knn) + ".csv"
                    taxo_predict.predict_by_local_threshold(threshold=0, max_knn=max_knn, field="hypo2hyper_substract", or_correct_predict=False)
                    taxo_predict.predict_by_global_threshold(threshold=0, field="hyper_in_hypo_i", or_correct_predict=True)
                    taxo_predict.save(taxo_knn_fpath)
                    
            elif mode == "super":
                taxo_features.fill_super_features()

                for classifier_dir in glob(classifiers_pattern):
                    try:
                        print "Predicting with:", classifier_dir
                        taxo_predict = TaxonomyPredictor(taxo_features)
                        method = taxo_predict.predict_by_classifier(classifier_dir)
                        taxo_predict.save(taxo_fpath + "-" + method + ".csv")
                        taxo_predict.save(taxo_fpath + "-" + method + "-conf.csv", conf=True)
                    except:
                        print format_exc()
Exemple #5
0
from jnt.isas.taxo import TaxonomyResources, TaxonomyFeatures

# Ad-hoc script: build taxonomy features for one vocabulary file and fill in
# the direct is-a features.  All paths are hard-coded to a local directory.

# NOTE(review): relations_fpath is not used in the visible part of this script.
relations_fpath = "/Users/alex/tmp/semeval/new/relations.csv"
voc_fpath = "/Users/alex/tmp/semeval/new/voc.csv"
isa_fpath = "/Users/alex/tmp/semeval/new/en_dt.csv-isas.csv"

# NOTE(review): freq_fpaths holds a single empty path -- presumably meaning
# "no frequency file"; confirm how TaxonomyResources treats it.
taxo_res = TaxonomyResources(freq_fpaths=[""], isa_fpaths=[isa_fpath])
taxo_features = TaxonomyFeatures(taxo_res, voc_fpath=voc_fpath, lang='en')
taxo_features.fill_direct_isas()