def compute_features(d_dict, q_dict, c_dict):
    """Compute token-level features for a document vs. a question and a choice.

    All returned lists are aligned with d_dict['words'] / d_dict['lemma'].

    Returns a dict with:
        in_q, in_c             -- 1 where the lower-cased document word occurs in
                                  the question/choice words (stopwords and
                                  punctuation always get 0)
        lemma_in_q, lemma_in_c -- the same membership test on lemmas
        tf                     -- smoothed log word frequency, 2-decimal floats
        p_q_relation, p_c_relation -- ConceptNet relation tag per document word

    Assumes each *_dict provides parallel 'words' and 'lemma' token lists
    (TODO confirm with the tokenizer that produces them).
    """
    q_words_set = {w.lower() for w in q_dict['words']}
    in_q = [int(w.lower() in q_words_set and not is_stopword(w) and not is_punc(w))
            for w in d_dict['words']]
    c_words_set = {w.lower() for w in c_dict['words']}
    in_c = [int(w.lower() in c_words_set and not is_stopword(w) and not is_punc(w))
            for w in d_dict['words']]

    q_lemma_set = {w.lower() for w in q_dict['lemma']}
    lemma_in_q = [int(w.lower() in q_lemma_set and not is_stopword(w) and not is_punc(w))
                  for w in d_dict['lemma']]
    c_lemma_set = {w.lower() for w in c_dict['lemma']}
    lemma_in_c = [int(w.lower() in c_lemma_set and not is_stopword(w) and not is_punc(w))
                  for w in d_dict['lemma']]

    # Smoothed log frequency; '+ 10' keeps the log argument positive for
    # unseen words.  Rounded to 2 decimals via %-formatting.
    tf = [0.1 * math.log(wikiwords.N * wikiwords.freq(w.lower()) + 10)
          for w in d_dict['words']]
    tf = [float('%.2f' % v) for v in tf]

    # Deferred import, presumably to avoid a heavy/circular dependency at
    # module load time -- TODO confirm before moving to the top of the file.
    from conceptnet import concept_net
    p_q_relation = concept_net.p_q_relation(d_dict['words'], q_dict['words'])
    p_c_relation = concept_net.p_q_relation(d_dict['words'], c_dict['words'])

    assert len(in_q) == len(in_c) and len(lemma_in_q) == len(in_q) \
        and len(lemma_in_c) == len(in_q) and len(tf) == len(in_q)
    assert len(tf) == len(p_q_relation) and len(tf) == len(p_c_relation)
    return {
        'in_q': in_q,
        'in_c': in_c,
        'lemma_in_q': lemma_in_q,
        'lemma_in_c': lemma_in_c,
        'tf': tf,
        'p_q_relation': p_q_relation,
        'p_c_relation': p_c_relation
    }
def compute_features(q_dict, c_dict):
    """Compute token-level features for a question vs. a single choice.

    All returned lists are aligned with q_dict['words'] / q_dict['lemma'].

    Returns a dict with:
        in_c, lemma_in_c  -- 1 where the lower-cased question word/lemma occurs
                             in the choice (stopwords and punctuation get 0)
        tf                -- raw corpus frequency from wikiwords (NOTE: unlike
                             sibling feature functions, this variant does not
                             log-scale or round)
        q_c_relation      -- ConceptNet relation tag per question word
        q_is_science_term -- is_science_term flag per question word
        q_is_cand         -- 1 for content words (not punctuation/stopword)
    """
    c_words_set = {w.lower() for w in c_dict['words']}
    in_c = [int(w.lower() in c_words_set and not is_stopword(w) and not is_punc(w))
            for w in q_dict['words']]
    c_lemma_set = {w.lower() for w in c_dict['lemma']}
    lemma_in_c = [int(w.lower() in c_lemma_set and not is_stopword(w) and not is_punc(w))
                  for w in q_dict['lemma']]

    tf = [wikiwords.freq(w.lower()) for w in q_dict['words']]

    # Deferred import, presumably to avoid a heavy/circular dependency at
    # module load time -- TODO confirm before moving to the top of the file.
    from conceptnet import concept_net
    q_c_relation = concept_net.p_q_relation(q_dict['words'], c_dict['words'])

    assert len(lemma_in_c) == len(in_c) and len(tf) == len(in_c)
    assert len(tf) == len(q_c_relation)

    q_is_science_term = [is_science_term(w) for w in q_dict['words']]
    # Candidate mask: content words only.
    q_is_cand = [int(not is_punc(w) and not is_stopword(w))
                 for w in q_dict['words']]

    return {
        'in_c': in_c,
        'lemma_in_c': lemma_in_c,
        'tf': tf,
        'q_c_relation': q_c_relation,
        'q_is_science_term': q_is_science_term,
        'q_is_cand': q_is_cand
    }
def compute_features(p_dict, q_dict, c_dict):
    """Pairwise features among passage (p), question (q) and choice (c).

    For every ordered pair (x, y) drawn from {p, q, c}, computes:
        x_in_y        -- word-level overlap indicators, aligned with x's words
        x_lemma_in_y  -- the same indicators on lemmas
        x_y_relation  -- ConceptNet relation tags from x's words to y's words
    and for each text x:
        x_tf          -- smoothed log word frequency, 2-decimal floats

    Assumes each *_dict provides parallel 'words' and 'lemma' token lists.
    """

    def _overlap(tokens, vocab_tokens):
        # 1 for each non-stopword, non-punctuation token occurring
        # (case-insensitively) in vocab_tokens.
        vocab = {w.lower() for w in vocab_tokens}
        return [int(w.lower() in vocab and not is_stopword(w) and not is_punc(w))
                for w in tokens]

    def _tf(words):
        # Smoothed log word frequency ('+ 10' keeps the log argument
        # positive for unseen words), rounded to 2 decimals.
        return [float('%.2f' % (0.1 * math.log(wikiwords.N * wikiwords.freq(w.lower()) + 10)))
                for w in words]

    p_in_q = _overlap(p_dict['words'], q_dict['words'])
    p_in_c = _overlap(p_dict['words'], c_dict['words'])
    q_in_p = _overlap(q_dict['words'], p_dict['words'])
    q_in_c = _overlap(q_dict['words'], c_dict['words'])
    c_in_p = _overlap(c_dict['words'], p_dict['words'])
    c_in_q = _overlap(c_dict['words'], q_dict['words'])

    p_lemma_in_q = _overlap(p_dict['lemma'], q_dict['lemma'])
    p_lemma_in_c = _overlap(p_dict['lemma'], c_dict['lemma'])
    q_lemma_in_p = _overlap(q_dict['lemma'], p_dict['lemma'])
    q_lemma_in_c = _overlap(q_dict['lemma'], c_dict['lemma'])
    c_lemma_in_p = _overlap(c_dict['lemma'], p_dict['lemma'])
    c_lemma_in_q = _overlap(c_dict['lemma'], q_dict['lemma'])

    p_tf = _tf(p_dict['words'])
    q_tf = _tf(q_dict['words'])
    c_tf = _tf(c_dict['words'])

    # Deferred import, presumably to avoid a heavy/circular dependency at
    # module load time -- TODO confirm before moving to the top of the file.
    from conceptnet import concept_net
    p_q_relation = concept_net.p_q_relation(p_dict['words'], q_dict['words'])
    p_c_relation = concept_net.p_q_relation(p_dict['words'], c_dict['words'])
    q_p_relation = concept_net.p_q_relation(q_dict['words'], p_dict['words'])
    q_c_relation = concept_net.p_q_relation(q_dict['words'], c_dict['words'])
    c_p_relation = concept_net.p_q_relation(c_dict['words'], p_dict['words'])
    c_q_relation = concept_net.p_q_relation(c_dict['words'], q_dict['words'])

    assert len(p_tf) == len(p_q_relation) and len(p_tf) == len(p_c_relation)
    assert len(q_tf) == len(q_p_relation) and len(q_tf) == len(q_c_relation)
    assert len(c_tf) == len(c_p_relation) and len(c_tf) == len(c_q_relation)
    return {
        'p_in_q': p_in_q,
        'p_in_c': p_in_c,
        'p_lemma_in_q': p_lemma_in_q,
        'p_lemma_in_c': p_lemma_in_c,
        'p_tf': p_tf,
        'p_q_relation': p_q_relation,
        'p_c_relation': p_c_relation,
        'q_in_p': q_in_p,
        'q_in_c': q_in_c,
        'q_lemma_in_p': q_lemma_in_p,
        'q_lemma_in_c': q_lemma_in_c,
        'q_tf': q_tf,
        'q_p_relation': q_p_relation,
        'q_c_relation': q_c_relation,
        'c_in_p': c_in_p,
        'c_in_q': c_in_q,
        'c_lemma_in_p': c_lemma_in_p,
        'c_lemma_in_q': c_lemma_in_q,
        'c_tf': c_tf,
        'c_p_relation': c_p_relation,
        'c_q_relation': c_q_relation,
    }
def compute_features(d_dicts, q_dict, c_dicts, q_terms):
    """Per-passage features for one question against several (passage, choice)
    pairs.

    d_dicts and c_dicts are iterated in lockstep; each pair contributes one
    entry to every output list.

    Returns a dict with (one sub-list per passage/choice pair):
        in_qs, lemma_in_qs -- word/lemma overlap of passage tokens with the
                              question (stopwords/punctuation get 0)
        in_cs, lemma_in_cs -- overlap of passage tokens with the paired choice
        tfs                -- smoothed log word frequency per passage word
        p_q_relations, p_c_relations -- ConceptNet relation tags
        q_es               -- bool per question word: occurs in q_terms
                              (None when q_terms is None)
    """
    # Deferred import, presumably to avoid a heavy/circular dependency at
    # module load time -- hoisted out of the loop so it is resolved once.
    from conceptnet import concept_net

    # The question sets are invariant across passages -- build them once
    # instead of once per iteration.
    q_words_set = {w.lower() for w in q_dict['words']}
    q_lemma_set = {w.lower() for w in q_dict['lemma']}

    in_qs, in_cs, lemma_in_qs, lemma_in_cs = [], [], [], []
    p_q_relations, p_c_relations = [], []
    tfs = []
    for d_dict, c_dict in zip(d_dicts, c_dicts):
        in_q = [int(w.lower() in q_words_set and not is_stopword(w) and not is_punc(w))
                for w in d_dict['words']]
        in_qs.append(in_q)

        lemma_in_q = [int(w.lower() in q_lemma_set and not is_stopword(w) and not is_punc(w))
                      for w in d_dict['lemma']]
        lemma_in_qs.append(lemma_in_q)

        c_words_set = {w.lower() for w in c_dict['words']}
        in_c = [int(w.lower() in c_words_set and not is_stopword(w) and not is_punc(w))
                for w in d_dict['words']]
        in_cs.append(in_c)

        c_lemma_set = {w.lower() for w in c_dict['lemma']}
        lemma_in_c = [int(w.lower() in c_lemma_set and not is_stopword(w) and not is_punc(w))
                      for w in d_dict['lemma']]
        lemma_in_cs.append(lemma_in_c)

        # Smoothed log frequency ('+ 10' keeps the log argument positive
        # for unseen words), rounded to 2 decimals.
        tf = [float('%.2f' % (0.1 * math.log(wikiwords.N * wikiwords.freq(w.lower()) + 10)))
              for w in d_dict['words']]
        tfs.append(tf)

        p_q_relation = concept_net.p_q_relation(d_dict['words'], q_dict['words'])
        p_q_relations.append(p_q_relation)
        p_c_relation = concept_net.p_q_relation(d_dict['words'], c_dict['words'])
        p_c_relations.append(p_c_relation)

        assert len(in_q) == len(in_c) and len(lemma_in_q) == len(in_q) \
            and len(lemma_in_c) == len(in_q) and len(tf) == len(in_q)
        assert len(tf) == len(p_q_relation) and len(tf) == len(p_c_relation)

    q_es = [w in q_terms for w in q_dict['words']] if q_terms is not None else None

    return {
        'in_qs': in_qs,
        'in_cs': in_cs,
        'lemma_in_qs': lemma_in_qs,
        'lemma_in_cs': lemma_in_cs,
        'tfs': tfs,
        'p_q_relations': p_q_relations,
        'p_c_relations': p_c_relations,
        'q_es': q_es
    }
def compute_features(d_dict, q_dict, c_dict, d_id, q_id, c_id, graphs,
                     sentence_graphs):
    """Token features plus 4lang graph relations for (document, question, choice).

    d_id/q_id/c_id index into sentence_graphs as
    sentence_graphs[d_id]["questions"][q_id]["choice"][c_id]; graphs holds the
    pre-built 4lang graphs consumed by compute_4lang_relation.

    Returns a dict with:
        in_q, in_c, lemma_in_q, lemma_in_c -- overlap indicators aligned with
            d_dict['words'] / d_dict['lemma'] (stopwords/punctuation get 0)
        tf -- smoothed log word frequency, 2-decimal floats
        p_q_relation, p_c_relation -- ConceptNet relation tags
        *_four_lang_relation, *_four_lang_sentence_relation -- 4lang features
            for the p-q, p-c and q-c pairs
    """
    q_words_set = {w.lower() for w in q_dict['words']}
    in_q = [int(w.lower() in q_words_set and not is_stopword(w) and not is_punc(w))
            for w in d_dict['words']]
    c_words_set = {w.lower() for w in c_dict['words']}
    in_c = [int(w.lower() in c_words_set and not is_stopword(w) and not is_punc(w))
            for w in d_dict['words']]

    q_lemma_set = {w.lower() for w in q_dict['lemma']}
    lemma_in_q = [int(w.lower() in q_lemma_set and not is_stopword(w) and not is_punc(w))
                  for w in d_dict['lemma']]
    c_lemma_set = {w.lower() for w in c_dict['lemma']}
    lemma_in_c = [int(w.lower() in c_lemma_set and not is_stopword(w) and not is_punc(w))
                  for w in d_dict['lemma']]

    # Smoothed log frequency ('+ 10' keeps the log argument positive for
    # unseen words), rounded to 2 decimals.
    tf = [0.1 * math.log(wikiwords.N * wikiwords.freq(w.lower()) + 10)
          for w in d_dict['words']]
    tf = [float('%.2f' % v) for v in tf]

    four_lang_utils = Utils()
    p_q_four_lang_relation = compute_4lang_relation(graphs, four_lang_utils,
                                                    d_dict, q_dict)
    p_c_four_lang_relation = compute_4lang_relation(graphs, four_lang_utils,
                                                    d_dict, c_dict)
    q_c_four_lang_relation = compute_4lang_relation(graphs, four_lang_utils,
                                                    q_dict, c_dict)

    # Hoist the deep subscripting into named locals so each graph is looked
    # up once and the pairing is readable.
    doc_graph = sentence_graphs[d_id]
    question_graph = doc_graph["questions"][q_id]
    choice_graph = question_graph["choice"][c_id]
    p_q_four_lang_sentence_relation = compute_4lang_sentence_relation(
        doc_graph, question_graph, four_lang_utils)
    p_c_four_lang_sentence_relation = compute_4lang_sentence_relation(
        doc_graph, choice_graph, four_lang_utils)
    q_c_four_lang_sentence_relation = compute_4lang_sentence_relation(
        question_graph, choice_graph, four_lang_utils)

    # Deferred import, presumably to avoid a heavy/circular dependency at
    # module load time -- TODO confirm before moving to the top of the file.
    from conceptnet import concept_net
    p_q_relation = concept_net.p_q_relation(d_dict['words'], q_dict['words'])
    p_c_relation = concept_net.p_q_relation(d_dict['words'], c_dict['words'])

    assert len(in_q) == len(in_c) and len(lemma_in_q) == len(in_q) \
        and len(lemma_in_c) == len(in_q) and len(tf) == len(in_q)
    assert len(tf) == len(p_q_relation) and len(tf) == len(p_c_relation)
    return {
        'in_q': in_q,
        'in_c': in_c,
        'lemma_in_q': lemma_in_q,
        'lemma_in_c': lemma_in_c,
        'tf': tf,
        'p_q_relation': p_q_relation,
        'p_c_relation': p_c_relation,
        'p_q_four_lang_relation': p_q_four_lang_relation,
        'p_c_four_lang_relation': p_c_four_lang_relation,
        'q_c_four_lang_relation': q_c_four_lang_relation,
        'p_q_four_lang_sentence_relation': p_q_four_lang_sentence_relation,
        'p_c_four_lang_sentence_relation': p_c_four_lang_sentence_relation,
        'q_c_four_lang_sentence_relation': q_c_four_lang_sentence_relation
    }