예제 #1
0
def infer_prepare_params(basic_or_complex, fileToInfer):
    """Run Viterbi inference on *fileToInfer* with every saved weight vector.

    Scans the working directory for weight files whose name starts with the
    prefix matching *basic_or_complex* ('basic' or 'complex'), runs inference
    with each, writes per-file result/expected tag files, and finally passes
    the collected accuracies to ``infer_aux_results``.

    :param basic_or_complex: 'basic' or 'complex' — selects the feature builder.
    :param fileToInfer: path of the tagged file to run inference on.
    :raises ValueError: if *basic_or_complex* is neither 'basic' nor 'complex'.
    """
    train_parser = MyParser("../train.wtag")
    seenWordsToTagsDict = train_parser.getSeenWordsToTagsDict()
    if basic_or_complex == 'basic':
        fb = BasicFeatureVectorBuilder(train_parser, 0)
        filePrefix = 'finish_basic_opt_v_'
    elif basic_or_complex == 'complex':
        fb = ComplexFeatureVectorBuilder(train_parser, False)
        filePrefix = 'finish_complex_opt_v_'
    else:
        # raise instead of assert(False): asserts are stripped under -O
        raise ValueError(
            "basic_or_complex must be 'basic' or 'complex', got: "
            + str(basic_or_complex))
    # Derive a filesystem-safe prefix from the inferred file's path.
    fn = str(fileToInfer).replace('.', '').replace('/', '')
    parser = MyParser(fileToInfer)
    splitted = parser.splitted
    mle = MLE(train_parser.getUniqueTags(), splitted, fb)

    prefixed = [
        filename for filename in os.listdir('.')
        if filename.startswith(filePrefix)
    ]
    prefixed.sort()
    print(prefixed)
    results = []

    for v_file in prefixed:
        v = np.loadtxt(v_file)
        vit = Viterbi(mle, mle.allTags, v, seenWordsToTagsDict)
        # 'with' guarantees both files are closed even if infer_aux raises.
        with open(fn + "_results_" + v_file, 'w') as res_file, \
                open(fn + "_expected_" + v_file, 'w') as exp_file:
            accuracy = infer_aux(exp_file, res_file, v_file, splitted, vit)
        results.append(accuracy)
    infer_aux_results(prefixed, results, fileToInfer, fn)
예제 #2
0
def realDataTest():
    """Smoke-test MLE on the full training corpus.

    Builds the basic feature model over ../train.wtag, times the likelihood
    and gradient computations with an all-ones weight vector, compares the
    fresh gradient against a previously saved one, then runs optimization.
    """
    corpus = MyParser("../train.wtag")
    sentences = corpus.splitted
    features = BasicFeatureVectorBuilder(corpus)
    tag_set = corpus.getUniqueTags()

    t0 = time.time()
    model = MLE(tag_set, sentences, features)
    print("End of preprocessing, took: ", time.time() - t0)

    weights = np.ones(features.size)

    t0 = time.time()
    print(model.calculate(weights))
    print("calcV took: " + str((time.time() - t0) / 60))

    t0 = time.time()
    gradient = model.calculateGradient(weights)
    np.savetxt('train_gradient2.txt', gradient)
    print("calcGrad took: " + str((time.time() - t0) / 60))

    # Sanity check: distance between the stored and freshly computed gradient.
    reference = np.loadtxt("train_gradient.txt")
    recomputed = np.loadtxt("train_gradient2.txt")
    print(np.linalg.norm(reference - recomputed))

    print(model.findBestV())
예제 #3
0
def TRAIN():
    """Resume optimization of the basic model from weights saved in opt_v.txt."""
    print("Training: ")
    corpus = MyParser("../train.wtag")
    builder = BasicFeatureVectorBuilder(corpus)
    model = MLE(corpus.getUniqueTags(), corpus.splitted, builder)
    # Warm-start the optimizer from the previously saved weight vector.
    print(model.findBestV(np.loadtxt("opt_v.txt")))
예제 #4
0
def fit_complex_model(continueTraining):
    """Fit the complex feature model on ../train.wtag.

    When *continueTraining* is truthy, optimization is warm-started from the
    previously saved weight file; otherwise it starts from scratch.
    """
    initial_v = (np.loadtxt("finish_complex_opt_v_lambda_0_007.txt")
                 if continueTraining else None)
    corpus = MyParser("../train.wtag")
    builder = ComplexFeatureVectorBuilder(corpus, False)
    model = MLE(corpus.getUniqueTags(), corpus.splitted, builder)
    fit_model_aux(model, "complex", [0.007], 300, initial_v)
예제 #5
0
def fit_basic_model(continueTraining):
    """Fit the basic feature model on ../train.wtag.

    When *continueTraining* is truthy, optimization is warm-started from the
    previously saved weight file; otherwise it starts from scratch.
    """
    initial_v = (np.loadtxt("finish_basic_opt_v_lambda_0_007.txt")
                 if continueTraining else None)
    corpus = MyParser("../train.wtag")
    builder = BasicFeatureVectorBuilder(corpus, 0)
    model = MLE(corpus.getUniqueTags(), corpus.splitted, builder)
    fit_model_aux(model, "basic", [0.007], 550, initial_v)
    def __init__(self, train_parser: MyParser, isTraining) -> None:
        """Assemble every complex feature sub-builder over *train_parser*.

        Each sub-builder is handed the running offset so that its feature
        indices land directly after the previous builder's range; the final
        offset is the total feature-vector size passed to the base class.
        """
        self.parser = train_parser
        self.isTraining = isTraining

        # f100: word/tag pairs.
        self.f100 = F100Builder(train_parser.getWordsWithTag(), 0)
        offset = self.f100.size
        print("F100 size", self.f100.size)

        # f103: tag trigrams.
        self.f103 = F103Builder(train_parser.getAllThreeTagsCombinations(),
                                offset)
        offset += self.f103.size
        print("F103 size", self.f103.size)

        # f104: tag bigrams.
        self.f104 = F104Builder(train_parser.getAllPairTagsCombinations(),
                                offset)
        offset += self.f104.size
        print("F104 size", self.f104.size)

        # f106: unigram tags.
        self.f106 = F106Builder(train_parser.getUniqueTags(), offset)
        offset += self.f106.size
        print("F106 size", self.f106.size)

        self.fSuf = SuffixFeatureBuilder(train_parser, offset)
        offset += self.fSuf.size
        print("Suffix size", self.fSuf.size)

        self.fPref = PrefixFeatureBuilder(train_parser, offset)
        offset += self.fPref.size
        print("Prefix size", self.fPref.size)

        self.fDigNum = DigitNumberFeatureBuilder(train_parser, offset)
        offset += self.fDigNum.size
        print("DigitNum size", self.fDigNum.size)

        self.fLetNum = DigitWordFeatureBuilder(train_parser, offset)
        offset += self.fLetNum.size
        print("DigitLetter size", self.fLetNum.size)

        self.fCaps = CapsFeatureBuilder(train_parser, offset)
        offset += self.fCaps.size
        print("Caps size", self.fCaps.size)

        self.fPrevNext = PrevNextWordFeatureBuilder(
            train_parser.getAllPrevWordTagCombinations(),
            train_parser.getAllNextWordTagCombinations(), offset)
        offset += self.fPrevNext.size
        print("PrevNext size", self.fPrevNext.size)

        # Total vector size is the accumulated offset.
        super().__init__(offset, 0)
예제 #7
0
def basicConfusion():
    """Write a confusion matrix for the 10 worst tags of the basic model.

    Reads the expected/actual tag files produced by a previous inference run,
    computes the confusion matrix restricted to the lowest-scoring tags, and
    appends it (header row of all tags, then one row per selected tag) to
    basicConfusionMatrix_141217.txt.
    """
    mp = MyParser("../train.wtag")
    tags = mp.getUniqueTags()
    cm = ConfusionMatrix(tags)
    # 'with' closes the input files even if matrix computation raises.
    with open('testwtag_expected_finish_basic_opt_v_lambda_0_007.txt') as expected, \
            open('testwtag_results_finish_basic_opt_v_lambda_0_007.txt') as actual:
        mat, res = cm.calculateMatrixForLowestNTags(expected, actual, 10)
    # The original never closed this handle, so output could stay unflushed.
    with open('basicConfusionMatrix_141217.txt', 'a') as output:
        for tag in tags:
            output.write(" {}".format(tag))
        output.write('\n')
        for idx, tag in enumerate(res):
            output.write("{} ".format(tag))
            for j in range(0, mat[idx].size):
                output.write("{} ".format(mat[idx][j]))
            output.write('\n')
예제 #8
0
def train():
    """Tag ../comp748.wtag with saved weights and report running accuracy.

    Loads the weight vector from opt_v_3.txt, runs Viterbi inference sentence
    by sentence, appends predicted and expected tags to the result files, and
    prints per-sentence and cumulative accuracy plus timing.
    """
    # NOTE(review): the original also parsed ../train.wtag and built a
    # seen-words dict from it, then immediately overwrote that dict with the
    # one below — that dead computation has been removed.
    parser = MyParser("../comp748.wtag")
    splitted = parser.splitted
    fb = BasicFeatureVectorBuilder(parser, 0)
    mle = MLE(parser.getUniqueTags(), splitted, fb)
    v = np.loadtxt("opt_v_3.txt")
    sentences = list(map(lambda tuples: [t[0] for t in tuples], splitted))
    expected_tags = list(map(lambda tuples: [t[1] for t in tuples], splitted))
    seenSentencesToTagsDict = parser.getSeenWordsToTagsDict()
    vit = Viterbi(mle, mle.allTags, v, seenSentencesToTagsDict)
    total_res = 0
    words_count = 0
    total_time = 0
    for idx, (s, expected) in enumerate(zip(sentences, expected_tags)):
        curr_word_len = len(s)
        words_count = words_count + curr_word_len
        start = time.time()
        tags = vit.inference(s)

        # 'with' guarantees the append handles are flushed and closed.
        with open("test_wtag748_results.txt", 'a') as res_file:
            for item in tags:
                res_file.write("%s " % item)
            res_file.write("\n")

        with open("test_wtag748_expected.txt", 'a') as exp_file:
            for item in expected:
                exp_file.write("%s " % item)
            exp_file.write("\n")

        stop = time.time()
        # Compare the tag strings directly: the original compared hash()
        # values, which could over-count on hash collisions.
        current_correct = np.sum(np.array(expected) == np.array(tags))
        print("---------------------")
        print("Inference for sentence# ", idx, " took: ", stop - start, " seconds")
        total_time = total_time + (stop - start)
        print("Current sentence accuracy: ", current_correct, " of: ", curr_word_len)
        total_res = total_res + current_correct
        print("Total sentence accuracy: ", total_res, " of: ", words_count, "=", (100 * total_res) / words_count, "%")
        print("Total time for ", idx, " sentences: ", (total_time / 60), " minutes")
예제 #9
0
def calcTupleTestRealData():
    """Time MLE.calcTuple (likelihood + gradient) on the training corpus.

    Builds the complex feature model over ../train.wtag, evaluates the
    likelihood/gradient pair at an all-ones weight vector, prints both, and
    saves the gradient to train_gradientTuple.txt.
    """
    parser = MyParser("../train.wtag")
    splitted = parser.splitted
    fb = ComplexFeatureVectorBuilder(parser)
    tags = parser.getUniqueTags()
    start = time.time()
    mle = MLE(tags, splitted, fb, 0, "tmp1234.txt")
    end = time.time()
    print("End of preprocessing, took: ", end - start)
    v = np.ones(fb.size)
    start = time.time()
    lv, grad = mle.calcTuple(v)
    print("L(V) = ", lv)
    print(grad)
    # np.savetxt opens and closes the file itself; the original also opened
    # a file handle here that was never used or closed (resource leak).
    np.savetxt('train_gradientTuple.txt', grad)
    end = time.time()
    print("calcTuple took: ", end - start, " seconds")
예제 #10
0
from MyParser import MyParser

# Quick sanity check of MyParser's tag/word extraction on the training corpus.
p = MyParser("../train.wtag")
words = p.getWordsWithTag()        # word/tag pairs
tag3 = p.getAllThreeTagsCombinations()  # tag trigrams
tag2 = p.getAllPairTagsCombinations()   # tag bigrams
tag = p.getUniqueTags()
print(tag)