def infer_prepare_params(basic_or_complex, fileToInfer):
    """Run Viterbi inference on *fileToInfer* with every saved weight vector.

    basic_or_complex: 'basic' or 'complex' — selects the feature builder and
        the filename prefix of the optimized weight files found in the CWD.
    fileToInfer: path to the .wtag file to tag.
    Raises ValueError for any other model name.
    """
    train_parser = MyParser("../train.wtag")
    seenWordsToTagsDict = train_parser.getSeenWordsToTagsDict()
    if basic_or_complex == 'basic':
        fb = BasicFeatureVectorBuilder(train_parser, 0)
        filePrefix = 'finish_basic_opt_v_'
    elif basic_or_complex == 'complex':
        fb = ComplexFeatureVectorBuilder(train_parser, False)
        filePrefix = 'finish_complex_opt_v_'
    else:
        # Was `assert False` — asserts vanish under `python -O`; raise instead.
        raise ValueError(
            "basic_or_complex must be 'basic' or 'complex', got %r"
            % (basic_or_complex,))
    # Sanitized input name, used as a prefix for the result/expected files.
    fn = str(fileToInfer).replace('.', '').replace('/', '')
    parser = MyParser(fileToInfer)
    splitted = parser.splitted
    mle = MLE(train_parser.getUniqueTags(), splitted, fb)
    # All optimized weight files for this model flavor, in sorted order.
    prefixed = sorted(
        filename for filename in os.listdir('.')
        if filename.startswith(filePrefix))
    print(prefixed)
    results = []
    for v_file in prefixed:
        v = np.loadtxt(v_file)
        vit = Viterbi(mle, mle.allTags, v, seenWordsToTagsDict)
        # `with` guarantees both files are closed even if inference fails.
        with open(fn + "_results_" + v_file, 'w') as res_file, \
                open(fn + "_expected_" + v_file, 'w') as exp_file:
            accuracy = infer_aux(exp_file, res_file, v_file, splitted, vit)
        results.append(accuracy)
    infer_aux_results(prefixed, results, fileToInfer, fn)
def realDataTest():
    """Smoke-test MLE on the real training corpus: time calculate() and the
    gradient, compare the gradient against a saved reference vector, then
    run the optimizer and print the resulting weights."""
    parser = MyParser("../train.wtag")
    splitted = parser.splitted
    fb = BasicFeatureVectorBuilder(parser)
    tags = parser.getUniqueTags()

    t0 = time.time()
    mle = MLE(tags, splitted, fb)
    print("End of preprocessing, took: ", time.time() - t0)

    v = np.ones(fb.size)
    t0 = time.time()
    print(mle.calculate(v))
    print("calcV took: " + str((time.time() - t0) / 60))

    t0 = time.time()
    gradient = mle.calculateGradient(v)
    np.savetxt('train_gradient2.txt', gradient)
    print("calcGrad took: " + str((time.time() - t0) / 60))

    # Distance between the freshly computed gradient and the stored one.
    truth = np.loadtxt("train_gradient.txt")
    current = np.loadtxt("train_gradient2.txt")
    print(np.linalg.norm(truth - current))

    print(mle.findBestV())
def TRAIN():
    """Warm-start training of the basic model from the weights in opt_v.txt."""
    print("Training: ")
    corpus = MyParser("../train.wtag")
    mle = MLE(corpus.getUniqueTags(), corpus.splitted,
              BasicFeatureVectorBuilder(corpus))
    print(mle.findBestV(np.loadtxt("opt_v.txt")))
def fit_complex_model(continueTraining):
    """Fit the complex model; optionally warm-start from the saved weights."""
    initial_v = (np.loadtxt("finish_complex_opt_v_lambda_0_007.txt")
                 if continueTraining else None)
    corpus = MyParser("../train.wtag")
    builder = ComplexFeatureVectorBuilder(corpus, False)
    mle = MLE(corpus.getUniqueTags(), corpus.splitted, builder)
    fit_model_aux(mle, "complex", [0.007], 300, initial_v)
def fit_basic_model(continueTraining):
    """Fit the basic model; optionally warm-start from the saved weights."""
    initial_v = (np.loadtxt("finish_basic_opt_v_lambda_0_007.txt")
                 if continueTraining else None)
    corpus = MyParser("../train.wtag")
    builder = BasicFeatureVectorBuilder(corpus, 0)
    mle = MLE(corpus.getUniqueTags(), corpus.splitted, builder)
    fit_model_aux(mle, "basic", [0.007], 550, initial_v)
def __init__(self, train_parser: MyParser, isTraining) -> None:
    """Assemble the complex feature vector by chaining the individual
    feature builders, each constructed at the offset reached so far.

    train_parser: parsed training corpus supplying word/tag statistics.
    isTraining: flag stored on the instance; not used during construction.
    """
    self.parser = train_parser
    self.isTraining = isTraining
    vecSize = 0

    def chain(builder, label):
        # Accumulate the running offset and log each sub-vector's size —
        # replaces ten hand-unrolled copies of the same three lines.
        nonlocal vecSize
        vecSize += builder.size
        print(label + " size", builder.size)
        return builder

    # Each builder receives the offset *before* its own size is added,
    # exactly as the original unrolled sequence did.
    self.f100 = chain(F100Builder(train_parser.getWordsWithTag(), vecSize), "F100")
    self.f103 = chain(F103Builder(train_parser.getAllThreeTagsCombinations(), vecSize), "F103")
    self.f104 = chain(F104Builder(train_parser.getAllPairTagsCombinations(), vecSize), "F104")
    self.f106 = chain(F106Builder(train_parser.getUniqueTags(), vecSize), "F106")
    self.fSuf = chain(SuffixFeatureBuilder(train_parser, vecSize), "Suffix")
    self.fPref = chain(PrefixFeatureBuilder(train_parser, vecSize), "Prefix")
    self.fDigNum = chain(DigitNumberFeatureBuilder(train_parser, vecSize), "DigitNum")
    self.fLetNum = chain(DigitWordFeatureBuilder(train_parser, vecSize), "DigitLetter")
    self.fCaps = chain(CapsFeatureBuilder(train_parser, vecSize), "Caps")
    self.fPrevNext = chain(
        PrevNextWordFeatureBuilder(
            train_parser.getAllPrevWordTagCombinations(),
            train_parser.getAllNextWordTagCombinations(),
            vecSize),
        "PrevNext")
    super().__init__(vecSize, 0)
def basicConfusion():
    """Append a confusion matrix for the 10 lowest-scoring tags of the
    basic model to basicConfusionMatrix_141217.txt.

    Reads the expected/actual tag files produced by inference; writes a
    header row of all corpus tags, then one row per low-scoring tag with
    its matrix counts.
    """
    mp = MyParser("../train.wtag")
    tags = mp.getUniqueTags()
    cm = ConfusionMatrix(tags)
    # `with` replaces the manual open/close pairs; the output file was
    # previously never closed at all (resource leak / possible lost writes).
    with open('testwtag_expected_finish_basic_opt_v_lambda_0_007.txt') as expected, \
            open('testwtag_results_finish_basic_opt_v_lambda_0_007.txt') as actual:
        mat, res = cm.calculateMatrixForLowestNTags(expected, actual, 10)
    with open('basicConfusionMatrix_141217.txt', 'a') as output:
        # Header row: every tag in the corpus.
        for tag in tags:
            output.write(" {}".format(tag))
        output.write('\n')
        # One row per reported tag, followed by its matrix counts.
        for idx, tag in enumerate(res):
            output.write("{} ".format(tag))
            for j in range(0, mat[idx].size):
                output.write("{} ".format(mat[idx][j]))
            output.write('\n')
def train():
    """Tag ../comp748.wtag with the weights from opt_v_3.txt, reporting
    per-sentence and cumulative accuracy plus timing.

    Appends predicted tags to test_wtag748_results.txt and gold tags to
    test_wtag748_expected.txt, one space-separated sentence per line.
    """
    parser = MyParser("../comp748.wtag")
    splitted = parser.splitted
    fb = BasicFeatureVectorBuilder(parser, 0)
    mle = MLE(parser.getUniqueTags(), splitted, fb)
    v = np.loadtxt("opt_v_3.txt")
    sentences = [[t[0] for t in tuples] for tuples in splitted]
    expected_tags = [[t[1] for t in tuples] for tuples in splitted]
    # NOTE(review): the original also built this dict from ../train.wtag and
    # immediately overwrote it with the test parser's dict; that dead
    # computation (and the unused train parser) has been dropped.
    seenSentencesToTagsDict = parser.getSeenWordsToTagsDict()
    vit = Viterbi(mle, mle.allTags, v, seenSentencesToTagsDict)
    total_res = 0
    words_count = 0
    total_time = 0
    # Open the output files once (append mode, as before) instead of
    # re-opening them for every sentence; `with` guarantees closure.
    with open("test_wtag748_results.txt", 'a') as res_file, \
            open("test_wtag748_expected.txt", 'a') as exp_file:
        for idx, (s, expected) in enumerate(zip(sentences, expected_tags)):
            curr_word_len = len(s)
            words_count = words_count + curr_word_len
            start = time.time()
            tags = vit.inference(s)
            for item in tags:
                res_file.write("%s " % item)
            res_file.write("\n")
            for item in expected:
                exp_file.write("%s " % item)
            exp_file.write("\n")
            stop = time.time()
            # Compare via hashes so numpy can vectorize the equality count.
            e = np.array([hash(x) for x in expected])
            t = np.array([hash(x) for x in tags])
            current_correct = np.sum(e == t)
            print("---------------------")
            print("Inference for sentence# ", idx, " took: ", stop - start, " seconds")
            total_time = total_time + (stop - start)
            print("Current sentence accuracy: ", current_correct, " of: ", curr_word_len)
            total_res = total_res + current_correct
            print("Total sentence accuracy: ", total_res, " of: ", words_count, "=",
                  (100 * total_res) / words_count, "%")
            print("Total time for ", idx, " sentences: ", (total_time / 60), " minutes")
def calcTupleTestRealData():
    """Time MLE.calcTuple on the real training corpus and save the gradient
    to train_gradientTuple.txt."""
    parser = MyParser("../train.wtag")
    splitted = parser.splitted
    # ComplexFeatureVectorBuilder requires the isTraining flag (see its
    # __init__ and every other call site); the original call omitted it
    # and would raise TypeError. False matches fit_complex_model's usage.
    fb = ComplexFeatureVectorBuilder(parser, False)
    tags = parser.getUniqueTags()
    start = time.time()
    mle = MLE(tags, splitted, fb, 0, "tmp1234.txt")
    end = time.time()
    print("End of preprocessing, took: ", end - start)
    v = np.ones(fb.size)
    start = time.time()
    # The original also did a bare open('train_gradientTuple.txt', 'w') here
    # that was never closed; np.savetxt opens (and truncates) the file
    # itself, so that leaked handle has been removed.
    lv, grad = mle.calcTuple(v)
    print("L(V) = ", lv)
    print(grad)
    np.savetxt('train_gradientTuple.txt', grad)
    end = time.time()
    print("calcTuple took: ", end - start, " seconds")
from MyParser import MyParser

# Quick manual smoke-check of the parser's extraction helpers on the
# training corpus; only the unique-tag set is printed.
p = MyParser("../train.wtag")
words = p.getWordsWithTag()
tag3 = p.getAllThreeTagsCombinations()
tag2 = p.getAllPairTagsCombinations()
tag = p.getUniqueTags()
# print(tag3)
# print(tag2)
print(tag)