def vn_fix():
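    # Normalize raw product questions with cascaded regex fixers and write
    # "fixed | original" pairs to the q_fixing file. These excerpts are
    # Python 2; they assume module-level imports not shown here (os, utils,
    # `from io import open` for the encoding keyword, the regex fixer
    # classes, split_sentece, etc.).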
    vn_vocab = vnnorm_stats_unibi.load_vn_vocab()
    #fixing_map = vnnorm_stats.load_hard_fixing()
    hard_regex = HardRegex()
    hard_regex.load_from_file()
    one_fix = OneFixRegex()
    one_fix.load_from_file()
    multi_fix = MultiFixRegex()
    multi_fix.load_from_file()

    common_fixing = CommonRegex()
    common_fixing.load_from_file()


    from load_products import load_questions
    questions = load_questions()
    f_fix = open("q_fixing", "w", encoding="utf-8")
    question_norm1 = []
    for qs in questions:
        s = qs
        qs = unicode(qs)
        qs = qs.lower()
        qs = hard_regex.replace(qs)

        qs = common_fixing.replace(qs)
        qs = one_fix.replace(qs)
        qs = multi_fix.replace(qs)
        question_norm1.append(qs)
        f_fix.write(u"%s | %s\n" % (qs,s))
        #print s
        #print qs
        #exit(-1)
    f_fix.close()
def stats(data="", path=""):
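    # Count accent-stripped tokens missing from the bare vocabulary and all
    # adjacent-token bigrams over the question corpus, then dump both
    # counters, sorted, under models/data/out/. `data` takes precedence
    # over `path`; with neither, questions come from load_products.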
    vn_vocab, vn_bare_vocab, vn_long_vocab, vn_long_bare_vocab \
        = load_vn_vocab()
    fixing_map = load_hard_fixing()
    wrong_words_counters = dict()
    bigram_counters = dict()
    if data != "":
        questions = data
    elif path != "":
        from load_products import load_question_from_file
        questions = load_question_from_file(path)
    else:
        from load_products import load_questions
        questions = load_questions()
    cdir = os.path.abspath(os.path.dirname(__file__))

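    # One pass over the corpus: normalize, tokenize, strip accents, then
    # count unknown bare tokens and every adjacent bigram.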
    for qs in questions:
        #print qs
        qs = unicode(qs)
        qs = qs.lower()
        qs = norm_fix_common(qs, fixing_map)
        _tokens = split_sentece(qs)

        tokens = []
        for token in _tokens:
            token = utils.accent2bare(token)
            tokens.append(token)

        for i in xrange(len(tokens)):
            if tokens[i] not in vn_bare_vocab:
                utils.add_dict_counter(wrong_words_counters, tokens[i])
            if i < len(tokens) - 1:
                utils.add_dict_counter(bigram_counters,
                                       u"%s %s" % (tokens[i], tokens[i + 1]))

    sorted_wrong_tokens = utils.sort_dict(wrong_words_counters)
    sorted_bigram_counter = utils.sort_dict(bigram_counters)
    f_wrong = open("%s/models/data/out/wrong_tokens.dat" % cdir,
                   "w",
                   encoding="utf-8")
    f_bigram_stats = open("%s/models/data/out/bigram_tokens.dat" % cdir,
                          "w",
                          encoding="utf-8")

    for kv in sorted_wrong_tokens:
        # skip tokens that contain digits
        if DIGIT.search(kv[0]) is not None:
            continue
        f_wrong.write(u"%s : %s\n" % (kv[0], kv[1]))
    f_wrong.close()
    for kv in sorted_bigram_counter:
        f_bigram_stats.write(u"%s : %s\n" % (kv[0], kv[1]))
    f_bigram_stats.close()
def fix_question():
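    # Run the bigram-based fixer over every question and log
    # "fixed_bare | original | accented_fix" triples; questions the fixer
    # cannot handle are skipped.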
    bi_gram_fixing = BigramFixing()
    from load_products import load_questions
    questions = load_questions()

    f = open("stats/bare_question_fixing.dat","w",encoding="utf-8")
    cc = 0
    for qs in questions:
        # print qs
        cc += 1
        print "\r%s"%cc,
        if qs is None or qs == "":
            continue
        try:
            fixed_bare, fix_accent = bi_gram_fixing.fix(qs)
            f.write(u"%s | %s | %s\n" % (fixed_bare, qs, fix_accent))
        except Exception:
            # skip questions the fixer cannot handle
            continue
    f.write(u"\nDone\n")
    f.close()
def extract_wrong_words():
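    # Count tokens that are neither in the Vietnamese vocabulary nor in the
    # hard fixing map, and write the most frequent ones to
    # models/data/out/popular_wrong_words.dat.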
    vn_vocab = load_vn_vocab()
    fixing_map = load_hard_fixing()
    # optional spot-check: populate cc with tokens to verify against the vocabulary
    cc = []
    for c in cc:
        if c not in vn_vocab:
            print "Wrong", c
    wrong_words_counters = dict()
    from load_products import load_questions
    questions = load_questions()
    for q in questions:
        q = unicode(q).lower()
        tokens = split_sentece(q)
        for token in tokens:
            token = norm_token(token)
            if token in fixing_map:
                continue
            if is_skip_token(token):
                continue
            if token not in vn_vocab:
                wrong_words_counters[token] = wrong_words_counters.get(token, 0) + 1

    kvs = []
    for key, value in sorted(wrong_words_counters.iteritems(), key=lambda (k, v): (v, k)):
        kvs.append([key, value])

    TOP = min(1000, len(kvs))
    f = open("models/data/out/popular_wrong_words.dat", "w", encoding="utf-8")
    for i in xrange(1, TOP + 1):
        f.write(u"%s\n" % kvs[-i][0])
        print kvs[-i][0], kvs[-i][1]
    f.close()
def export_bare_questions():
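    # Export every question as a lower-cased, accent-stripped ("bare")
    # token sequence, one question per line.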
    from load_products import load_questions
    questions = load_questions()
    fixing_map = load_hard_fixing()

    f = open("out/bare_questions.dat", "w")

    for qs in questions:
        # print qs
        qs = unicode(qs)
        qs = qs.lower()
        qs = norm_fix_common(qs, fixing_map)
        _tokens = split_sentece(qs)

        tokens = []
        for token in _tokens:
            token = utils.accent2bare(token)
            tokens.append(token)
        sentence = " ".join(tokens)
        f.write(u"%s\n" % sentence)
    f.close()
def first_stats():
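    # Tokenize all questions, map them to vocabulary token ids, and pickle
    # the id sequences. Tokenizer, Vocabulary and Q_VOCAB_NAME are assumed
    # to be defined at module level in the source file.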
    tokenizer = Tokenizer()
    tokenizer.run()
    question_vocabulary = Vocabulary()

    questions = load_questions()
    question_list = []
    cc = 0
    for question in questions:
        #print question
        if cc % 10 == 0:
            print "\r%s" % cc,
        cc += 1
        sen = tokenizer.predict(question)
        sen = sen.lower()
        tokens = question_vocabulary.get_sentence_token_ids(sen)
        question_list.append(tokens)
    print "\n Saving..."
    question_vocabulary.save(Q_VOCAB_NAME)
    utils.pickle_save(question_list, "question_tokens.dat")

    print "Done"
def fix_wrong_words_heuristic(data="",path=""):
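    # Heuristically propose fixes for out-of-vocabulary words from their
    # bigram contexts: (1) collect the left/right neighbours of every
    # unknown token; (2) invert the strongest contexts into revert_b /
    # revert_f; (3) score in-vocabulary words sharing those contexts as
    # replacement candidates; (4) emit unambiguous replacements to
    # one_fix.dat and context-dependent ones to multi_fix.dat.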
    vn_vocab = load_vn_vocab()
    fixing_map = load_hard_fixing()
    hard_regex = HardRegex()
    hard_regex.load_from_file()
    # optional spot-check: populate cc with tokens to verify against the vocabulary
    cc = []
    for c in cc:
        if c not in vn_vocab:
            print "Wrong", c
    wrong_words_counters = dict()
    if data != "":
        questions = data
    elif path != "":
        from load_products import load_question_from_file
        questions = load_question_from_file(path)
    else:
        from load_products import load_questions
        questions = load_questions()
    cdir = os.path.abspath(os.path.dirname(__file__))
    f_fix = open("%s/models/data/out/fixing" % cdir, "w", encoding="utf-8")
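    # bi_forward[w][next_word] counts words seen right after unknown token w;
    # bi_backward[w][prev_word] counts words seen right before it.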
    bi_forward = dict()
    bi_backward = dict()
    question_norm1 = []
    for qs in questions:
        qs = unicode(qs)
        qs = qs.lower()
        qs = hard_regex.replace(qs)
        tokens = split_sentece(qs)
        qq = []
        for ii, token in enumerate(tokens):
            token = norm_token(token)
            if token in fixing_map:
                qq.append(fixing_map[token])
                continue
            if is_skip_token(token):
                continue
            if token not in vn_vocab:
                # record the immediate left/right neighbours of the unknown token
                if ii > 0:
                    mm = bi_backward.setdefault(token, dict())
                    mm[tokens[ii - 1]] = mm.get(tokens[ii - 1], 0) + 1
                if ii < len(tokens) - 1:
                    mm = bi_forward.setdefault(token, dict())
                    mm[tokens[ii + 1]] = mm.get(tokens[ii + 1], 0) + 1
                wrong_words_counters[token] = wrong_words_counters.get(token, 0) + 1
            qq.append(token)


        ss = " ".join(qq)

        question_norm1.append(qq)
        f_fix.write(u"%s\n"%ss)

    f_fix.close()
    kvs = []

    for key, value in sorted(wrong_words_counters.iteritems(), key=lambda (k, v): (v, k)):
        kvs.append([key, value])

    TOP = min(400, len(kvs))
    f = open("%s/models/data/out/popular_wrong_words.dat" % cdir, "w", encoding="utf-8")
    for i in xrange(1, TOP + 1):
        f.write(u"%s\n" % kvs[-i][0])
        #print kvs[-i][0], kvs[-i][1]
    f.close()
    #TOP = 300
    candidates_f = dict()
    candidates_b = dict()


    revert_f = dict()
    revert_b = dict()
    T_TOP = 2
    T_MIN = 8
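    # For each popular wrong word keep its T_TOP strongest neighbours (those
    # co-occurring more than T_MIN times) and invert the maps: revert_f[w] /
    # revert_b[w] hold the wrong words that context word w follows / precedes.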

    f_forward_exist = dict()
    f_backward_exist = dict()
    for i in xrange(1, TOP + 1):
        k = kvs[-i][0]
        #print kvs[-i][0],kvs[-i][1]

        forward_exist = k in bi_forward
        if forward_exist:
            f_forward = utils.sort_dict(bi_forward[k])
        backward_exist = k in bi_backward
        if backward_exist:
            f_backward = utils.sort_dict(bi_backward[k])

        f_forward_exist[k] = forward_exist
        f_backward_exist[k] = backward_exist

        if forward_exist:
            sz = min(T_TOP, len(f_forward))
            for j in xrange(sz):
                if f_forward[j][1] > T_MIN:
                    revert_f.setdefault(f_forward[j][0], set()).add(k)
        if backward_exist:
            sz = min(T_TOP, len(f_backward))
            for j in xrange(sz):
                if f_backward[j][1] > T_MIN:
                    revert_b.setdefault(f_backward[j][0], set()).add(k)

    #print revert_b
    #print revert_f

    b_stores = dict()
    f_stores = dict()


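    # Second pass: wherever a context word sits next to an in-vocabulary
    # word, record that word both as observed context (b_stores / f_stores)
    # and as a replacement candidate for every wrong word sharing the same
    # context (candidates_b / candidates_f).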
    for q in question_norm1:
        for i, token in enumerate(q):
            if i < len(q) - 1:
                w_next = q[i + 1]
                if w_next in vn_vocab and token in revert_b:
                    # Saving backward word context
                    bb = b_stores.setdefault(w_next, dict())
                    bb[token] = bb.get(token, 0) + 1
                    # Adding to the backward candidates
                    for w in revert_b[token]:
                        d_cand = candidates_b.setdefault(w, dict())
                        d_cand[w_next] = d_cand.get(w_next, 0) + 1
            if i > 0:
                w_before = q[i - 1]
                if w_before in vn_vocab and token in revert_f:
                    # Saving forward word context
                    ff = f_stores.setdefault(w_before, dict())
                    ff[token] = ff.get(token, 0) + 1
                    # Adding to the forward candidates
                    for w in revert_f[token]:
                        d_cand = candidates_f.setdefault(w, dict())
                        d_cand[w_before] = d_cand.get(w_before, 0) + 1

    f = open("%s/models/data/out/fix_candidates"%cdir,"w",encoding="utf-8")
    one_fix = dict()
    f_one_fix = open("%s/models/data/out/one_fix.dat"%cdir,"w",encoding="utf-8")
    f_multi_fix = open("%s/models/data/out/multi_fix.dat"%cdir,"w",encoding="utf-8")
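    # Tuning knobs from the source: N_CONTEXT context words are written with
    # every multi-fix line and THRES_2 is the minimum runner-up candidate
    # score for the multi-fix path; N_MULTI appears unused in this excerpt.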
    N_MULTI = 2
    N_CONTEXT = 3
    THRES_2 = 0.7

    for k,v in b_stores.iteritems():
        v = utils.sort_dict(v)
        b_stores[k] = v
    for k,v in f_stores.iteritems():
        v = utils.sort_dict(v)
        f_stores[k] = v

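    # Rank each wrong word's replacement candidates with get_candidate, then
    # split them into unambiguous single fixes (one_fix.dat) and
    # context-dependent multi fixes (multi_fix.dat).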
    for k,v in candidates_b.iteritems():
        if f_backward_exist[k]:
            #print "Error_b: ",k

            ss = utils.sort_dict(v)
            #print "\t",ss

            ll = []
            l_candidates = []
            l_ref_scores = []
            for s in ss:
                ll.append(u"%s:%s " % (s[0], s[1]))
                l_candidates.append(s[0])
                l_ref_scores.append(s[1])

            ll = " ".join(ll)
            f.write(u"%s:\n" % k)
            f.write(u"\t%s\n" % ll)

            true_candidates, sorted_score = get_candidate(k, l_candidates, l_ref_scores)

            ll2 = []
            for i in xrange(len(true_candidates)):
                ll2.append(u"%s:%s " % (true_candidates[i], sorted_score[i]))
            f.write(u"\t%s\n" % " ".join(ll2))

            # Write one fix:
            if len(sorted_score) > 1:
                if sorted_score[1] < 1 and sorted_score[0] > 1:
                    one_fix[k] = true_candidates[0]
                elif sorted_score[1] > THRES_2:
                    for i in reversed(xrange(2)):
                        fix = true_candidates[i]
                        try:
                            ll_context = []
                            back_context = b_stores[fix]
                            for j in xrange(N_CONTEXT):
                                ll_context.append(back_context[j][0])
                            f_multi_fix.write(u"B\t%s\t%s\t%s\n" % (k, fix, " ".join(ll_context)))
                        except (KeyError, IndexError):
                            pass

    f.write(u"\n\n\n")
    for k,v in candidates_f.iteritems():
        if f_forward_exist[k]:
            #print "Error_f: ",k
            ss = utils.sort_dict(v)
            #print "\t",ss

            ll = []
            l_candidates = []
            l_ref_scores = []
            for s in ss:
                ll.append(u"%s:%s " % (s[0], s[1]))
                l_candidates.append(s[0])
                l_ref_scores.append(s[1])
            ll = " ".join(ll)
            f.write(u"%s:\n" % k)
            f.write(u"\t%s\n" % ll)

            true_candidates, sorted_score = get_candidate(k, l_candidates, l_ref_scores)
            ll2 = []
            for i in xrange(len(true_candidates)):
                ll2.append(u"%s:%s " % (true_candidates[i], sorted_score[i]))
            f.write(u"\t%s\n" % " ".join(ll2))
            # one fix:
            if len(sorted_score) > 1:
                if sorted_score[1] < 1 and sorted_score[0] > 1:
                    one_fix[k] = true_candidates[0]
                elif sorted_score[1] > THRES_2:
                    for i in reversed(xrange(2)):
                        fix = true_candidates[i]
                        try:
                            ll_context = []
                            forward_context = f_stores[fix]
                            for j in xrange(N_CONTEXT):
                                ll_context.append(forward_context[j][0])
                            f_multi_fix.write(u"F\t%s\t%s\t%s\n" % (k, fix, " ".join(ll_context)))
                        except (KeyError, IndexError):
                            pass

    f.close()
    for k,v in one_fix.iteritems():
        f_one_fix.write(u"%s\t%s\n" % (k, v))
    f_one_fix.close()
    f_multi_fix.close()
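
# Hypothetical entry point (not in the original source): a minimal sketch that
# runs the corpus statistics and the context-based fixing heuristic end to end,
# assuming the models/data/out directory tree exists next to this module.
if __name__ == "__main__":
    stats()
    fix_wrong_words_heuristic()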