Example 1
def test_decompose(self) -> None:
    text = "생각해요"  # renamed from `input` to avoid shadowing the built-in
    composer = input_parsing.HangeulComposer()
    output = composer.decompose(text)
    self.assertEqual("생각해요", output)  # assertEquals is a deprecated alias
Example 2
def test_process_output(self) -> None:
    text = "생각해요"  # renamed from `input` to avoid shadowing the built-in
    composer = input_parsing.HangeulComposer()
    output = composer.process_output(text)
    self.assertEqual("생각해요", output)  # assertEquals is a deprecated alias
Example 3
import codecs
import getopt
import pickle
import sys

from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier

import input_parsing as par  # inferred from the par.* calls below

# exit_with_usage, TrainingSetElement, ClusterSet and visualize_tree are
# assumed to be defined elsewhere in the project.


def main(argv):
    try:
        opts, args = getopt.getopt(argv, "hvo:", ["outfile=", "visualize", "no_saveout"])
    except getopt.GetoptError:
        exit_with_usage()

    save_classifier = True
    create_visualization = False
    output_name = "classifier"
    for opt, arg in opts:
        if opt == "--no_saveout":
            save_classifier = False
        elif opt in ("-v", "--visualize"):
            create_visualization = True
        elif opt in ("-o", "--outfile"):
            output_name = arg
        elif opt == "-h":
            exit_with_usage()

    if len(args) == 0:
        exit_with_usage()

    input_name = args[0]

    print("Loading training word pairs...")
    input_processor = par.CombinedProcessor([par.StripProcessor(), par.HangeulComposer()])
    word_pairs = []
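    # One comma-separated word pair per line; each part is run through the
    # processor chain above (whitespace stripping, then Hangeul decomposition).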
    with codecs.open(input_name, 'r', encoding='utf-8') as f:
        for line in f:
            word_pairs.append(tuple(input_processor.process_input(part) for part in line.split(",")))
    print("... read {} word pairs".format(len(word_pairs)))

    print("Analyzing training word pairs...")
    training_set = [TrainingSetElement(pair[0], pair[1]) for pair in word_pairs]

    print("Clustering training word pairs by local transformations...")
    clusters = ClusterSet()
    for training_instance in training_set:
        clusters.add(training_instance)
    clusters = clusters.get_clusters()
    print("... split word pairs into {} clusters of similar transformations".format(len(clusters)))

    print("Extracting features for training...")
    x_data = []
    c_data = []
    for c, cluster in enumerate(clusters):
        for training_instance in cluster.items:
            features = dict()
            word = training_instance.word_a
            length = len(word)
            features["length"] = length
            for i in range(length):
                features[i] = word[i]
                features[i - length] = word[i]
            x_data.append(features)
            c_data.append(c)

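    # DictVectorizer one-hot encodes the string-valued character features and
    # keeps the numeric "length" feature as one column, yielding a sparse matrix.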
    vectorizer = DictVectorizer()
    x_data = vectorizer.fit_transform(x_data)
    print("... extracted {} features for training the classifier".format(len(vectorizer.get_feature_names())))

    print("Training classifier....")
    classifier = DecisionTreeClassifier(criterion="entropy")
    classifier.fit(x_data, c_data)

    if save_classifier:
        # sklearn suggests pickle for persisting classifiers:
        # http://scikit-learn.org/stable/modules/model_persistence.html
        print("Storing classifier...")
        with open(output_name + ".clf", "wb") as output_file:
            pickle.dump(classifier, output_file)

    if create_visualization:
        print("Creating tree visualization...")
        visualize_tree(classifier, input_processor, vectorizer, clusters, output_name)

    print("done!")