Example #1
def run_tests(include_ner_subcat=True):
    # Load utterances and their annotations as parallel lists keyed by
    # utterance id; all three files are expected to share the same ids.
    ids, utters = get_data("./TestingData/Input1.txt")
    ids, labels = get_data("./TestingData/Annotation1.txt")
    if include_ner_subcat:
        ids, labels_ner = get_data("./TestingData/Ann_NERSub.txt")
    for idx in range(len(ids)):
        utter_idx = utters[idx].split()
        labels_idx = labels[idx].split()
        if include_ner_subcat:
            labels_ner_idx = labels_ner[idx].split()
        # Each utterance must have exactly one label per token.
        assert len(utter_idx) == len(labels_idx), (
            "mismatch in length of label_length/utter_length in "
            "Annotation1.txt at utter %d" % ids[idx])
        if include_ner_subcat:
            assert len(utter_idx) == len(labels_ner_idx), (
                "mismatch in length of label_length/utter_length in "
                "Ann_NERSub.txt at utter %d" % ids[idx])
        # Every label must come from the allowed tag set.
        for label in labels_idx:
            assert label in ALLOWED_LABELS_WITHOUT_NERSUB, (
                "some invalid label %s found in Annotation1.txt at utter %d"
                % (label, ids[idx]))
        if include_ner_subcat:
            for label in labels_ner_idx:
                assert label in ALLOWED_LABELS_WITH_NERSUB, (
                    "some invalid label %s found in Ann_NERSub.txt at utter %d"
                    % (label, ids[idx]))
            # A token tagged "NE" in the coarse file must carry an NE
            # subcategory (or the fallback "en") in the fine-grained file.
            for sub_idx in range(len(labels_idx)):
                if labels_idx[sub_idx] == "NE":
                    assert (labels_ner_idx[sub_idx].startswith("NE")
                            or labels_ner_idx[sub_idx] == "en"), (
                        "some invalid label %s found in Ann_NERSub.txt at "
                        "utter %d" % (labels_ner_idx[sub_idx], ids[idx]))
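
The example relies on helpers that are not shown in the excerpt. Below is a minimal sketch of what they could look like, assuming get_data reads tab-separated "<id>\t<text>" lines into parallel lists; the two label sets are illustrative placeholders, not the project's actual tag inventory:

def get_data(path):
    # Read "<id>\t<text>" lines into two parallel lists (assumed format).
    ids, texts = [], []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.rstrip("\n")
            if not line:
                continue
            utt_id, text = line.split("\t", 1)
            ids.append(int(utt_id))
            texts.append(text)
    return ids, texts

# Illustrative tag sets (assumptions): the coarse inventory marks named
# entities with a single "NE" tag; the fine-grained inventory replaces it
# with NE subcategories, as the startswith("NE") check above suggests.
ALLOWED_LABELS_WITHOUT_NERSUB = {"en", "hi", "NE", "O"}
ALLOWED_LABELS_WITH_NERSUB = {"en", "hi", "O", "NE_PER", "NE_LOC", "NE_ORG"}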
Example #2
            print "confidence here"
            if diff >= confidence_threshold:
                final_class = '/'.join([key for key in class_points.keys() if class_points[key] == first_max])
                if len(final_class.split('/'))>1:
                    final_class = final_class.split('/')[0]
            elif self.confidence_flag:
                final_class = 'en'
        else:
            print "no confidence here"
            self.confidence_flag = 1
            final_class = '/'.join([key for key in class_points.keys() if class_points[key] == first_max])
            if len(final_class.split('/'))>1:
                final_class = final_class.split('/')[0]
        class_verification_files(QUERY, final_class)
        print "Final class selected for this token is "+final_class
        return final_class
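
Both branches above pick the winner the same way: join every class whose vote count equals the maximum with '/' and, on a tie, keep the first candidate. A standalone sketch of that tie-break, with illustrative names:

def pick_final_class(class_points, first_max):
    # Join all classes that reached the top vote count with '/' ...
    final_class = '/'.join(key for key in class_points
                           if class_points[key] == first_max)
    # ... then break ties by keeping the first candidate.
    if len(final_class.split('/')) > 1:
        final_class = final_class.split('/')[0]
    return final_class

# Example: pick_final_class({'NE_PER': 3, 'NE_LOC': 3, 'NE_ORG': 1}, 3)
# returns 'NE_PER' (first of the two tied candidates in insertion order).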


# Driver: classify tokens, write the submission file, then validate it
# with the tests above.
clf = UnsupervisedWikipediaClassifier()
id_list, annotation_list = get_data("%s/Annotation1.txt" % dir_path)
id_list, utterance_list = get_data("%s/Input1.txt" % dir_path)
final_annotation_list = ner_sub_category(id_list, annotation_list, utterance_list)
print("Storing parsed wiki content into .temp file")
clf.store_wiki_summary()
print("Storing done")
print("Preparing submission format")
prepare_submission_nersub(id_list, final_annotation_list)
print("Submission format prepared and file saved in specified directory")
print("Running tests")
print(run_tests(include_ner_subcat=True))
print(annotation_count_test("./TestingData/Ann_NERSub.txt"))
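
run_tests reads the submission back from ./TestingData/Ann_NERSub.txt, so prepare_submission_nersub presumably writes one annotation line per utterance id there. A hypothetical sketch of such a writer; the actual output format is not shown in the excerpt:

def prepare_submission_nersub(id_list, annotation_list,
                              out_path="./TestingData/Ann_NERSub.txt"):
    # Write "<id>\t<space-separated labels>" lines (assumed format, chosen
    # to mirror what the get_data sketch above would read back).
    with open(out_path, "w", encoding="utf-8") as f:
        for utt_id, labels in zip(id_list, annotation_list):
            f.write("%d\t%s\n" % (utt_id, labels))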