Ejemplo n.º 1
0
def compute_features(d_dict, q_dict, c_dict):
    """Compute per-token features for a document against a question and a choice.

    Every list in the result has one entry per token of the document; the
    asserts at the end enforce that alignment.

    Args:
        d_dict: document; must provide ``'words'`` and ``'lemma'`` token lists.
        q_dict: question; must provide ``'words'`` and ``'lemma'`` token lists.
        c_dict: answer choice; must provide ``'words'`` and ``'lemma'`` token
            lists.

    Returns:
        dict with the keys:
            ``in_q`` / ``in_c``: 1 where the lower-cased document word occurs
                among the lower-cased question / choice words and is rejected
                by neither ``is_stopword`` nor ``is_punc``, else 0.
            ``lemma_in_q`` / ``lemma_in_c``: the same test on the lemma lists.
            ``tf``: ``0.1 * log(wikiwords.N * wikiwords.freq(word) + 10)``
                rounded to two decimals.
            ``p_q_relation`` / ``p_c_relation``: output of
                ``concept_net.p_q_relation`` for document-vs-question and
                document-vs-choice words.

    Raises:
        AssertionError: if the feature lists do not all have the same length.
    """

    def overlap_flags(tokens, vocabulary):
        # 1 for tokens found (case-insensitively) in `vocabulary` that are
        # neither stopwords nor punctuation, 0 otherwise.
        return [
            int(w.lower() in vocabulary and not is_stopword(w) and not is_punc(w))
            for w in tokens
        ]

    d_words = d_dict['words']
    d_lemmas = d_dict['lemma']

    # Surface-form overlap.
    in_q = overlap_flags(d_words, set(w.lower() for w in q_dict['words']))
    in_c = overlap_flags(d_words, set(w.lower() for w in c_dict['words']))

    # Lemma overlap.
    lemma_in_q = overlap_flags(d_lemmas, set(w.lower() for w in q_dict['lemma']))
    lemma_in_c = overlap_flags(d_lemmas, set(w.lower() for w in c_dict['lemma']))

    # Log-scaled frequency; formatting through '%.2f' rounds to two decimals.
    tf = [
        float('%.2f' % (0.1 * math.log(wikiwords.N * wikiwords.freq(w.lower()) + 10)))
        for w in d_words
    ]

    # Function-local import, as in the original.
    from conceptnet import concept_net
    p_q_relation = concept_net.p_q_relation(d_words, q_dict['words'])
    p_c_relation = concept_net.p_q_relation(d_words, c_dict['words'])

    # All features must stay aligned with the document tokens.
    assert len(in_q) == len(in_c) and len(lemma_in_q) == len(in_q) and len(lemma_in_c) == len(in_q) and len(tf) == len(in_q)
    assert len(tf) == len(p_q_relation) and len(tf) == len(p_c_relation)
    return {
        'in_q': in_q,
        'in_c': in_c,
        'lemma_in_q': lemma_in_q,
        'lemma_in_c': lemma_in_c,
        'tf': tf,
        'p_q_relation': p_q_relation,
        'p_c_relation': p_c_relation
    }
Ejemplo n.º 2
0
 def get_relation(self, w1, w2):
     """Look up the relation stored for the pair (w1, w2) in ``self.data``.

     Both terms are lower-cased and their whitespace-separated parts are
     joined with underscores before the lookup. ``'<NULL>'`` is returned when
     either term is a stopword (per ``is_stopword``), when ``w1`` has no entry
     in ``self.data``, or when that entry has nothing for ``w2``.
     """
     missing = '<NULL>'
     if is_stopword(w1) or is_stopword(w2):
         return missing

     # "New York" -> "new_york"
     source_key = '_'.join(w1.lower().split())
     target_key = '_'.join(w2.lower().split())

     if source_key in self.data:
         return self.data[source_key].get(target_key, missing)
     return missing
Ejemplo n.º 3
0
def compute_features(d_dicts, q_dict, c_dicts, q_terms):
    """Compute per-token features for each (document, choice) pair of a question.

    ``d_dicts`` and ``c_dicts`` are walked in lockstep with ``zip``, so
    iteration stops at the shorter of the two.

    Args:
        d_dicts: sequence of documents, each providing a ``'words'`` list.
        q_dict: the question; must provide a ``'words'`` list.
        c_dicts: sequence of choices, each providing a ``'words'`` list.
        q_terms: container tested with ``in`` for each question word
            (case-sensitive, the words are not lower-cased for this test).

    Returns:
        dict with the keys:
            ``in_qs`` / ``in_cs``: per pair, 1 where the lower-cased document
                word occurs among the lower-cased question / choice words and
                is not a stopword (per ``is_stopword``), else 0.
            ``tfs``: per pair,
                ``0.1 * log(word_count * word_frequency(word, 'zh') + 5)``
                rounded to two decimals for every document word.
            ``p_q_relations`` / ``p_c_relations``: per pair, the output of
                ``concept_net.p_q_relation`` for document-vs-question and
                document-vs-choice words.
            ``q_es``: one bool per question word, True when it is in
                ``q_terms``.

    Raises:
        AssertionError: if the features of a pair do not all have the same
            length.
    """
    in_qs, in_cs, tfs = [], [], []
    p_q_relations, p_c_relations = [], []

    q_words = q_dict['words']
    # The question is the same for every pair, so its vocabulary is built once
    # rather than once per loop iteration.
    q_words_set = set(w.lower() for w in q_words)

    for d_dict, c_dict in zip(d_dicts, c_dicts):
        d_words = d_dict['words']
        c_words = c_dict['words']

        in_q = [
            int(w.lower() in q_words_set and not is_stopword(w))
            for w in d_words
        ]
        in_qs.append(in_q)

        c_words_set = set(w.lower() for w in c_words)
        in_c = [
            int(w.lower() in c_words_set and not is_stopword(w))
            for w in d_words
        ]
        in_cs.append(in_c)

        # Formatting through '%.2f' rounds the value to two decimals.
        tf = [
            float('%.2f' % (0.1 * math.log(word_count * word_frequency(w.lower(), 'zh') + 5)))
            for w in d_words
        ]
        tfs.append(tf)

        # Left inside the loop on purpose: with no pairs to process nothing is
        # imported, exactly as before. Repeated imports are served from
        # sys.modules and are cheap.
        from conceptnet import concept_net
        p_q_relation = concept_net.p_q_relation(d_words, q_words)
        p_q_relations.append(p_q_relation)
        p_c_relation = concept_net.p_q_relation(d_words, c_words)
        p_c_relations.append(p_c_relation)

        # All features of a pair must stay aligned with the document tokens.
        assert len(in_q) == len(in_c) and len(tf) == len(in_q)
        assert len(tf) == len(p_q_relation) and len(tf) == len(p_c_relation)

    q_es = [w in q_terms for w in q_words]

    return {
        'in_qs': in_qs,
        'in_cs': in_cs,
        'tfs': tfs,
        'p_q_relations': p_q_relations,
        'p_c_relations': p_c_relations,
        'q_es': q_es
    }
Ejemplo n.º 4
0
def compute_features(q_dict, c_dict):
    """Compute per-token features for a question against one answer choice.

    Every list in the result has one entry per question token.

    Args:
        q_dict: question; must provide ``'words'`` and ``'lemma'`` token lists.
        c_dict: answer choice; must provide ``'words'`` and ``'lemma'`` token
            lists.

    Returns:
        dict with the keys:
            ``in_c``: 1 where the lower-cased question word occurs among the
                lower-cased choice words and is rejected by neither
                ``is_stopword`` nor ``is_punc``, else 0.
            ``lemma_in_c``: the same test on the lemma lists.
            ``tf``: ``wikiwords.freq`` of each lower-cased question word,
                unscaled and unrounded.
            ``q_c_relation``: output of ``concept_net.p_q_relation`` for
                question-vs-choice words.
            ``q_is_science_term``: ``is_science_term`` applied to each
                question word.
            ``q_is_cand``: 1 for question words that are neither punctuation
                nor stopwords, else 0.

    Raises:
        AssertionError: if ``in_c``, ``lemma_in_c``, ``tf`` and
            ``q_c_relation`` do not all have the same length.
    """

    def overlap_flags(tokens, vocabulary):
        # 1 for tokens found (case-insensitively) in `vocabulary` that are
        # neither stopwords nor punctuation, 0 otherwise.
        return [
            int(w.lower() in vocabulary and not is_stopword(w) and not is_punc(w))
            for w in tokens
        ]

    q_tokens = q_dict['words']

    in_c = overlap_flags(q_tokens, set(w.lower() for w in c_dict['words']))
    lemma_in_c = overlap_flags(q_dict['lemma'],
                               set(w.lower() for w in c_dict['lemma']))

    # Raw frequency: no log scaling or rounding is applied here.
    tf = [wikiwords.freq(w.lower()) for w in q_tokens]

    # Function-local import, as in the original.
    from conceptnet import concept_net
    q_c_relation = concept_net.p_q_relation(q_tokens, c_dict['words'])

    # All features must stay aligned with the question tokens.
    assert len(lemma_in_c) == len(in_c) and len(tf) == len(in_c)
    assert len(tf) == len(q_c_relation)

    q_is_science_term = [is_science_term(w) for w in q_tokens]
    q_is_cand = [
        int(not is_punc(w) and not is_stopword(w))
        for w in q_tokens
    ]

    return {
        'in_c': in_c,
        'lemma_in_c': lemma_in_c,
        'tf': tf,
        'q_c_relation': q_c_relation,
        'q_is_science_term': q_is_science_term,
        'q_is_cand': q_is_cand
    }
Ejemplo n.º 5
0
 def p_q_relation(self, passage, query):
     """Label each passage token with a relation linking it to the query.

     For every passage position, the token and the two-token span starting
     there are looked up with ``self.get_relation`` against each query
     candidate; the first result other than ``'<NULL>'`` is kept.

     Query candidates are the lower-cased query tokens followed by the
     lower-cased spans of two consecutive tokens, minus anything
     ``is_stopword`` rejects. They are tried in that first-seen order. The
     previous implementation iterated a ``set`` of strings, so when several
     candidates matched, the relation reported could change from one
     interpreter run to the next.

     Args:
         passage: sequence of passage tokens (strings).
         query: sequence of query tokens (strings).

     Returns:
         list with one entry per passage token: the relation found, or
         ``'<NULL>'`` when no candidate yields one.
     """
     passage = [w.lower() for w in passage]
     query = [w.lower() for w in query]

     # The slice at the last index holds a single token, so that "bigram"
     # repeats the last unigram and is dropped by the de-duplication below.
     bigrams = [' '.join(query[i:i + 2]) for i in range(len(query))]

     # Ordered de-duplication keeps the candidate order reproducible.
     seen = set()
     candidates = []
     for term in query + bigrams:
         if term in seen:
             continue
         seen.add(term)
         if not is_stopword(term):
             candidates.append(term)

     ret = []
     for i, word in enumerate(passage):
         # Built once per position instead of once per candidate.
         pair = ' '.join(passage[i:i + 2])
         relation = '<NULL>'
         for term in candidates:
             relation = self.get_relation(word, term)
             if relation == '<NULL>':
                 relation = self.get_relation(pair, term)
             if relation != '<NULL>':
                 break
         ret.append(relation)
     return ret
Ejemplo n.º 6
0
def compute_features(p_dict, q_dict, c_dict):
    """Compute pairwise per-token features among passage, question and choice.

    Each of the three inputs is compared with the other two. Keys in the
    result are prefixed with the side whose tokens the list is aligned with
    (``p_``, ``q_`` or ``c_``).

    Args:
        p_dict: passage; must provide ``'words'`` and ``'lemma'`` token lists.
        q_dict: question; must provide ``'words'`` and ``'lemma'`` token lists.
        c_dict: answer choice; must provide ``'words'`` and ``'lemma'`` token
            lists.

    Returns:
        dict with, for each side ``x`` and each other side ``y``:
            ``x_in_y``: 1 where the lower-cased word of ``x`` occurs among the
                lower-cased words of ``y`` and is rejected by neither
                ``is_stopword`` nor ``is_punc``, else 0.
            ``x_lemma_in_y``: the same test on the lemma lists.
            ``x_tf``: ``0.1 * log(wikiwords.N * wikiwords.freq(word) + 10)``
                rounded to two decimals, for every word of ``x``.
            ``x_y_relation``: ``concept_net.p_q_relation(words of x,
                words of y)``.

    Raises:
        AssertionError: if a side's tf list and its two relation lists differ
            in length.
    """

    def lowered(tokens):
        # Case-insensitive vocabulary of a token list.
        return set(w.lower() for w in tokens)

    def overlap_flags(tokens, vocabulary):
        # 1 for tokens found in `vocabulary` that are neither stopwords nor
        # punctuation, 0 otherwise.
        return [
            int(w.lower() in vocabulary and not is_stopword(w) and not is_punc(w))
            for w in tokens
        ]

    def scaled_tf(tokens):
        # Log-scaled frequency; formatting through '%.2f' rounds to two
        # decimals.
        return [
            float('%.2f' % (0.1 * math.log(wikiwords.N * wikiwords.freq(w.lower()) + 10)))
            for w in tokens
        ]

    p_words, q_words, c_words = p_dict['words'], q_dict['words'], c_dict['words']
    p_vocab, q_vocab, c_vocab = lowered(p_words), lowered(q_words), lowered(c_words)

    # Surface-form overlap.
    p_in_q = overlap_flags(p_words, q_vocab)
    p_in_c = overlap_flags(p_words, c_vocab)
    q_in_p = overlap_flags(q_words, p_vocab)
    q_in_c = overlap_flags(q_words, c_vocab)
    c_in_p = overlap_flags(c_words, p_vocab)
    c_in_q = overlap_flags(c_words, q_vocab)

    p_lemmas, q_lemmas, c_lemmas = p_dict['lemma'], q_dict['lemma'], c_dict['lemma']
    p_lemma_vocab = lowered(p_lemmas)
    q_lemma_vocab = lowered(q_lemmas)
    c_lemma_vocab = lowered(c_lemmas)

    # Lemma overlap.
    p_lemma_in_q = overlap_flags(p_lemmas, q_lemma_vocab)
    p_lemma_in_c = overlap_flags(p_lemmas, c_lemma_vocab)
    q_lemma_in_p = overlap_flags(q_lemmas, p_lemma_vocab)
    q_lemma_in_c = overlap_flags(q_lemmas, c_lemma_vocab)
    c_lemma_in_p = overlap_flags(c_lemmas, p_lemma_vocab)
    c_lemma_in_q = overlap_flags(c_lemmas, q_lemma_vocab)

    p_tf = scaled_tf(p_words)
    q_tf = scaled_tf(q_words)
    c_tf = scaled_tf(c_words)

    # Function-local import, as in the original.
    from conceptnet import concept_net
    p_q_relation = concept_net.p_q_relation(p_words, q_words)
    p_c_relation = concept_net.p_q_relation(p_words, c_words)

    q_p_relation = concept_net.p_q_relation(q_words, p_words)
    q_c_relation = concept_net.p_q_relation(q_words, c_words)

    c_p_relation = concept_net.p_q_relation(c_words, p_words)
    c_q_relation = concept_net.p_q_relation(c_words, q_words)

    # Relation lists must stay aligned with the tokens of their own side.
    assert len(p_tf) == len(p_q_relation) and len(p_tf) == len(p_c_relation)
    assert len(q_tf) == len(q_p_relation) and len(q_tf) == len(q_c_relation)
    assert len(c_tf) == len(c_p_relation) and len(c_tf) == len(c_q_relation)

    return {
        'p_in_q': p_in_q,
        'p_in_c': p_in_c,
        'p_lemma_in_q': p_lemma_in_q,
        'p_lemma_in_c': p_lemma_in_c,
        'p_tf': p_tf,
        'p_q_relation': p_q_relation,
        'p_c_relation': p_c_relation,

        'q_in_p': q_in_p,
        'q_in_c': q_in_c,
        'q_lemma_in_p': q_lemma_in_p,
        'q_lemma_in_c': q_lemma_in_c,
        'q_tf': q_tf,
        'q_p_relation': q_p_relation,
        'q_c_relation': q_c_relation,

        'c_in_p': c_in_p,
        'c_in_q': c_in_q,
        'c_lemma_in_p': c_lemma_in_p,
        'c_lemma_in_q': c_lemma_in_q,
        'c_tf': c_tf,

        'c_p_relation': c_p_relation,
        'c_q_relation': c_q_relation,

    }
Ejemplo n.º 7
0
def compute_features(d_dict, q_dict, c_dict, d_id, q_id, c_id, graphs,
                     sentence_graphs):
    """Compute overlap, frequency, ConceptNet and 4lang features for a document.

    The overlap, tf and ConceptNet lists have one entry per document token;
    the asserts at the end enforce that. The shape of the 4lang results is
    whatever ``compute_4lang_relation`` and ``compute_4lang_sentence_relation``
    return; it is not checked here.

    Args:
        d_dict: document; must provide ``'words'`` and ``'lemma'`` token lists.
        q_dict: question; must provide ``'words'`` and ``'lemma'`` token lists.
        c_dict: answer choice; must provide ``'words'`` and ``'lemma'`` token
            lists.
        d_id: key of the document in ``sentence_graphs``.
        q_id: key of the question under
            ``sentence_graphs[d_id]["questions"]``.
        c_id: key of the choice under
            ``sentence_graphs[d_id]["questions"][q_id]["choice"]``.
        graphs: passed through unchanged to ``compute_4lang_relation``.
        sentence_graphs: nested mapping indexed as described for the ids
            above.

    Returns:
        dict with the keys ``in_q``, ``in_c``, ``lemma_in_q``, ``lemma_in_c``,
        ``tf``, ``p_q_relation``, ``p_c_relation`` and, for each of the pairs
        ``p_q``, ``p_c`` and ``q_c``, ``<pair>_four_lang_relation`` and
        ``<pair>_four_lang_sentence_relation``.

    Raises:
        AssertionError: if the overlap, tf and ConceptNet lists do not all
            have the same length.
    """

    def overlap_flags(tokens, vocabulary):
        # 1 for tokens found (case-insensitively) in `vocabulary` that are
        # neither stopwords nor punctuation, 0 otherwise.
        return [
            int(w.lower() in vocabulary and not is_stopword(w) and not is_punc(w))
            for w in tokens
        ]

    d_words = d_dict['words']
    d_lemmas = d_dict['lemma']

    # Surface-form overlap.
    in_q = overlap_flags(d_words, set(w.lower() for w in q_dict['words']))
    in_c = overlap_flags(d_words, set(w.lower() for w in c_dict['words']))

    # Lemma overlap.
    lemma_in_q = overlap_flags(d_lemmas, set(w.lower() for w in q_dict['lemma']))
    lemma_in_c = overlap_flags(d_lemmas, set(w.lower() for w in c_dict['lemma']))

    # Log-scaled frequency; formatting through '%.2f' rounds to two decimals.
    tf = [
        float('%.2f' % (0.1 * math.log(wikiwords.N * wikiwords.freq(w.lower()) + 10)))
        for w in d_words
    ]

    # Token-level 4lang relations for each pair of inputs.
    four_lang_utils = Utils()
    p_q_four_lang_relation = compute_4lang_relation(graphs, four_lang_utils,
                                                    d_dict, q_dict)
    p_c_four_lang_relation = compute_4lang_relation(graphs, four_lang_utils,
                                                    d_dict, c_dict)
    q_c_four_lang_relation = compute_4lang_relation(graphs, four_lang_utils,
                                                    q_dict, c_dict)

    # Sentence-level 4lang relations: resolve the three graph nodes once.
    document_graph = sentence_graphs[d_id]
    question_graph = document_graph["questions"][q_id]
    choice_graph = question_graph["choice"][c_id]
    p_q_four_lang_sentence_relation = compute_4lang_sentence_relation(
        document_graph, question_graph, four_lang_utils)
    p_c_four_lang_sentence_relation = compute_4lang_sentence_relation(
        document_graph, choice_graph, four_lang_utils)
    q_c_four_lang_sentence_relation = compute_4lang_sentence_relation(
        question_graph, choice_graph, four_lang_utils)

    # Function-local import, as in the original.
    from conceptnet import concept_net
    p_q_relation = concept_net.p_q_relation(d_words, q_dict['words'])
    p_c_relation = concept_net.p_q_relation(d_words, c_dict['words'])

    # Token-level features must stay aligned with the document tokens.
    assert len(in_q) == len(in_c) and len(lemma_in_q) == len(in_q) and len(
        lemma_in_c) == len(in_q) and len(tf) == len(in_q)
    assert len(tf) == len(p_q_relation) and len(tf) == len(p_c_relation)
    return {
        'in_q': in_q,
        'in_c': in_c,
        'lemma_in_q': lemma_in_q,
        'lemma_in_c': lemma_in_c,
        'tf': tf,
        'p_q_relation': p_q_relation,
        'p_c_relation': p_c_relation,
        'p_q_four_lang_relation': p_q_four_lang_relation,
        'p_c_four_lang_relation': p_c_four_lang_relation,
        'q_c_four_lang_relation': q_c_four_lang_relation,
        'p_q_four_lang_sentence_relation': p_q_four_lang_sentence_relation,
        'p_c_four_lang_sentence_relation': p_c_four_lang_sentence_relation,
        'q_c_four_lang_sentence_relation': q_c_four_lang_sentence_relation
    }