# Example 1
def get_word_count_train_validation():
    """Merge per-word counts from the train/validation question and choice sets.

    Pulls the four word-count dicts exposed by ``util`` and sums them into a
    single mapping.

    Returns:
        dict: word -> total count across all four sources.
    """
    # All four sources map word -> count; merge order is irrelevant because
    # addition is commutative.
    sources = (
        util.get_d_word_count_train_question(),
        util.get_d_word_count_train_choice(),
        util.get_d_word_count_validation_question(),
        util.get_d_word_count_validation_choice(),
    )
    d_word_count = {}
    for source in sources:
        for word, count in source.items():
            d_word_count[word] = d_word_count.get(word, 0) + count
    return d_word_count
def statis_word2vec_coverage():
    """Report how many vocabulary words are covered by the word2vec model.

    Loads the trained model from ``model/word2vec_4.model``, then for every
    word in the train-choice count dict prints whether the model has a vector
    for it, together with the word's count.  Ends with a Found/Miss summary.
    This can guide us to use more and more data.
    """
    path_model = 'model/word2vec_4.model'
    model = gensim.models.Word2Vec.load(path_model)
    d_word_count = util.get_d_word_count_train_choice()
    n_found = 0
    n_miss = 0
    for word in d_word_count.keys():
        try:
            # Indexing the model raises KeyError when the word is absent
            # from the model's vocabulary; the vector itself is unused.
            model[word]
            n_found += 1
            print("%s\t%d\tFound" % (word, d_word_count[word]))
        except KeyError:
            n_miss += 1
            print("%s\t%d\tMiss" % (word, d_word_count[word]))
    print("Found\t%d\tMiss\t%d" % (n_found, n_miss))
# NOTE(review): this definition is a verbatim duplicate of an earlier
# get_word_count_train_validation in this file (a scrape artifact); the later
# definition shadows the earlier one.  Consider keeping only one copy.
def get_word_count_train_validation():
    """Merge per-word counts from the train/validation question and choice sets.

    Pulls the four word-count dicts exposed by ``util`` and sums them into a
    single mapping.

    Returns:
        dict: word -> total count across all four sources.
    """
    # All four sources map word -> count; merge order is irrelevant because
    # addition is commutative.
    sources = (
        util.get_d_word_count_train_question(),
        util.get_d_word_count_train_choice(),
        util.get_d_word_count_validation_question(),
        util.get_d_word_count_validation_choice(),
    )
    d_word_count = {}
    for source in sources:
        for word, count in source.items():
            d_word_count[word] = d_word_count.get(word, 0) + count
    return d_word_count
# NOTE(review): this definition is a verbatim duplicate of an earlier
# statis_word2vec_coverage in this file (a scrape artifact); the later
# definition shadows the earlier one.  Consider keeping only one copy.
def statis_word2vec_coverage():
    """Report how many vocabulary words are covered by the word2vec model.

    Loads the trained model from ``model/word2vec_4.model``, then for every
    word in the train-choice count dict prints whether the model has a vector
    for it, together with the word's count.  Ends with a Found/Miss summary.
    This can guide us to use more and more data.
    """
    path_model = 'model/word2vec_4.model'
    model = gensim.models.Word2Vec.load(path_model)
    d_word_count = util.get_d_word_count_train_choice()
    n_found = 0
    n_miss = 0
    for word in d_word_count.keys():
        try:
            # Indexing the model raises KeyError when the word is absent
            # from the model's vocabulary; the vector itself is unused.
            model[word]
            n_found += 1
            print("%s\t%d\tFound" % (word, d_word_count[word]))
        except KeyError:
            n_miss += 1
            print("%s\t%d\tMiss" % (word, d_word_count[word]))
    print("Found\t%d\tMiss\t%d" % (n_found, n_miss))