import json
import logging

import numpy as np

# Project-local helpers (load_topic_info, data_utils, Feature, LogUtil,
# length_analysis, parse_feature_vec, save_word_share_features) are assumed
# to come from the surrounding repository and are not redefined here.


def generate(config, argv):
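    """Build per-topic length features and dump them as an .smat file.

    For every topic the feature vector holds the number of parent topics and
    the char/word lengths of its title and description, ordered by label id.
    """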
    # load topic info
    topic_info_fp = config.get('DIRECTORY', 'source_pt') + '/topic_info.txt'
    tid_list, father_list, tc_list, tw_list, dc_list, dw_list = load_topic_info(
        topic_info_fp)

    # load hash table of label
    label2id_fp = '%s/%s' % (config.get('DIRECTORY', 'hash_pt'),
                             config.get('TITLE_CONTENT_CNN', 'label2id_fn'))
    with open(label2id_fp, 'r') as label2id_f:
        label2id = json.load(label2id_f)

    feature_file_path = '%s/topic_fs_length.%s.smat' % (config.get(
        'DIRECTORY', 'dataset_pt'), 'all')
    feature_file = open(feature_file_path, 'w')
    features = [0] * len(tid_list)

    for line_id in range(len(tid_list)):
        feature = list()
        feature.append(len(father_list[line_id]))
        feature.append(len(tc_list[line_id]))
        feature.append(len(tw_list[line_id]))
        feature.append(len(dc_list[line_id]))
        feature.append(len(dw_list[line_id]))

        label_id = int(label2id[tid_list[line_id]])
        features[label_id] = feature

    feature_file.write('%d %d\n' % (len(features), len(features[0])))

    for feature in features:
        Feature.save_feature(feature, feature_file)

    feature_file.close()

# Code example #2

def save_question_topic_info(cf):
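    """Concatenate the title and description words of the train, eval and
    topic sets into one document-per-line corpus (btm_qt_info.txt).

    Empty lines are logged and replaced with the placeholder 'empty' so the
    line numbering stays aligned with the source files.
    """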
    q_train_set = cf.get('DEFAULT', 'source_pt') + '/question_train_set.txt'
    (qid_train_list, tc_train_list, tw_train_list, dc_train_list,
     dw_train_list) = data_utils.load_question_set(q_train_set)

    q_eval_set = cf.get('DEFAULT', 'source_pt') + '/question_eval_set.txt'
    (qid_eval_list, tc_eval_list, tw_eval_list, dc_eval_list,
     dw_eval_list) = data_utils.load_question_set(q_eval_set)

    q_topic_set = cf.get('DEFAULT', 'source_pt') + '/topic_info.txt'
    (tid_topic_list, father_topic_list, tc_topic_list, tw_topic_list,
     dc_topic_list, dw_topic_list) = data_utils.load_topic_info(q_topic_set)

    btm_qt_info_fp = cf.get('DEFAULT', 'devel_pt') + '/btm_qt_info.txt'
    f = open(btm_qt_info_fp, 'w')
    for i in range(len(qid_train_list)):
        s = ' '.join((tw_train_list[i] + dw_train_list[i])) + '\n'
        if not s.strip():
            logging.warning(
                'question_train_set.txt has no content at line#%d' % i)
            s = 'empty\n'
        f.write(s)
    for i in range(len(qid_eval_list)):
        s = ' '.join((tw_eval_list[i] + dw_eval_list[i])) + '\n'
        if not s.strip():
            logging.warning('question_eval_set.txt has no content at line#%d' % i)
            s = 'empty\n'
        f.write(s)
    for i in range(len(tid_topic_list)):
        s = ' '.join((tw_topic_list[i] + dw_topic_list[i])) + '\n'
        if not s.strip():
            logging.warning('topic_info.txt has no content at line#%d' % i)
            s = 'empty\n'
        f.write(s)
    f.close()


def generate_word_share_features(config, argv):
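    """Build word-share features from the topic title words for both the
    offline and the online dataset."""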
    topic_info_fp = config.get('DIRECTORY', 'source_pt') + '/topic_info.txt'
    tid_list, father_list, tc_list, tw_list, dc_list, dw_list = load_topic_info(
        topic_info_fp)

    save_word_share_features(config, 'offline', tw_list)
    save_word_share_features(config, 'online', tw_list)

# Code example #4

def all_length_analysis(config):
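    """Run a length-distribution analysis over the title/description chars
    and words of all topics."""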
    topic_info_fp = config.get('DIRECTORY', 'source_pt') + '/topic_info.txt'
    tid_list, father_list, tc_list, tw_list, dc_list, dw_list = load_topic_info(
        topic_info_fp)

    LogUtil.log('INFO', 'length analysis of title chars:')
    length_analysis(tc_list)
    LogUtil.log('INFO', 'length analysis of title words:')
    length_analysis(tw_list)
    LogUtil.log('INFO', 'length analysis of document chars:')
    length_analysis(dc_list)
    LogUtil.log('INFO', 'length analysis of document words:')
    length_analysis(dw_list)


def load_topic_btm_vec(config):
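    """Load the per-topic BTM feature vectors, reordered by label id.

    Returns a list of 1999 vectors (one per label); NaNs in the raw feature
    file are replaced with zeros.
    """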
    topic_info_fp = config.get('DIRECTORY', 'source_pt') + '/topic_info.txt'
    tid_list, father_list, tc_list, tw_list, dc_list, dw_list = load_topic_info(topic_info_fp)

    # load hash table of label
    label2id_fp = '%s/%s' % (config.get('DIRECTORY', 'hash_pt'), config.get('TITLE_CONTENT_CNN', 'label2id_fn'))
    with open(label2id_fp, 'r') as label2id_f:
        label2id = json.load(label2id_f)

    btm_topic_vec_fp = '%s/fs_btm_tw_cw.%s.csv' % (config.get('DIRECTORY', 'dataset_pt'), 'topic')
    btm_topic_vec_f = open(btm_topic_vec_fp, 'r')

    # one slot per topic label; this repo hard-codes 1999 labels
    topic_btm_vecs = [0.] * 1999

    line_id = 0
    for line in btm_topic_vec_f:
        vec = np.nan_to_num(parse_feature_vec(line))
        topic_btm_vecs[int(label2id[tid_list[line_id]])] = vec
        line_id += 1

    return topic_btm_vecs


def load_topic_info_sort(config):
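    """Return the topic title/description char and word lists, reordered so
    that the list index equals the label id (1999 labels)."""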
    topic_info_fp = config.get('DIRECTORY', 'source_pt') + '/topic_info.txt'
    tid_list, father_list, tc_list, tw_list, dc_list, dw_list = load_topic_info(
        topic_info_fp)

    # load hash table of label
    label2id_fp = '%s/%s' % (config.get('DIRECTORY', 'hash_pt'),
                             config.get('TITLE_CONTENT_CNN', 'label2id_fn'))
    with open(label2id_fp, 'r') as label2id_f:
        label2id = json.load(label2id_f)

    # one independent list per label id (1999 labels); filled in below
    tc_sort = [[] for _ in range(1999)]
    tw_sort = [[] for _ in range(1999)]
    dc_sort = [[] for _ in range(1999)]
    dw_sort = [[] for _ in range(1999)]

    for line_id in range(1999):
        # the hashed value is the label id, not the raw topic id
        label_id = int(label2id[tid_list[line_id]])
        tc_sort[label_id] = tc_list[line_id]
        tw_sort[label_id] = tw_list[line_id]
        dc_sort[label_id] = dc_list[line_id]
        dw_sort[label_id] = dw_list[line_id]

    return tc_sort, tw_sort, dc_sort, dw_sort


def generate(config, argv):
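    """Build per-topic IDF-sum features and dump them as an .smat file.

    For every topic the feature vector holds the summed char IDF and word
    IDF of its title and of its description, ordered by label id.
    """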
    word_idf_fp = '%s/words.idf' % config.get('DIRECTORY', 'devel_pt')
    with open(word_idf_fp, 'r') as word_idf_f:
        word_idf = json.load(word_idf_f)
    LogUtil.log("INFO", "load word_idf done, len(word_idf)=%d" % len(word_idf))

    char_idf_fp = '%s/chars.idf' % config.get('DIRECTORY', 'devel_pt')
    with open(char_idf_fp, 'r') as char_idf_f:
        char_idf = json.load(char_idf_f)
    LogUtil.log("INFO", "load char_idf done, len(char_idf)=%d" % len(char_idf))

    # load topic info
    topic_info_fp = config.get('DIRECTORY', 'source_pt') + '/topic_info.txt'
    tid_list, father_list, tc_list, tw_list, dc_list, dw_list = load_topic_info(
        topic_info_fp)

    # load hash table of label
    label2id_fp = '%s/%s' % (config.get('DIRECTORY', 'hash_pt'),
                             config.get('TITLE_CONTENT_CNN', 'label2id_fn'))
    with open(label2id_fp, 'r') as label2id_f:
        label2id = json.load(label2id_f)

    feature_file_path = '%s/topic_fs_idf_sum.%s.smat' % (config.get(
        'DIRECTORY', 'dataset_pt'), 'all')
    feature_file = open(feature_file_path, 'w')
    features = [0] * len(tid_list)

    for line_id in range(len(tid_list)):
        feature = list()

        tc = tc_list[line_id]
        tw = tw_list[line_id]
        dc = dc_list[line_id]
        dw = dw_list[line_id]

        feature.append(
            sum([
                char_idf[char] for char in tc
                if len(char) > 0 and char in char_idf
            ]))
        feature.append(
            sum([
                word_idf[word] for word in tw
                if len(word) > 0 and word in word_idf
            ]))

        feature.append(
            sum([
                char_idf[char] for char in dc
                if len(char) > 0 and char in char_idf
            ]))
        feature.append(
            sum([
                word_idf[word] for word in dw
                if len(word) > 0 and word in word_idf
            ]))

        label_id = int(label2id[tid_list[line_id]])
        features[label_id] = feature

    feature_file.write('%d %d\n' % (len(features), len(features[0])))

    for feature in features:
        Feature.save_feature(feature, feature_file)

    feature_file.close()
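

# Minimal usage sketch (hypothetical): the `config` argument above is assumed
# to be a standard ConfigParser loaded from a repo config file; the section
# and option names are taken from the get() calls in the functions themselves.
if __name__ == '__main__':
    import sys
    from configparser import ConfigParser

    conf = ConfigParser()
    conf.read(sys.argv[1])        # path to the .conf file (assumed CLI form)
    generate(conf, sys.argv[2:])  # writes topic_fs_idf_sum.all.smat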