Example #1
0
def init_text_cnn(config):
    """Configure the Keras backend session and build a TitleContentCNN.

    Reads the execution mode from ``[ENVIRONMENT]``, the embedding file
    locations from ``[DIRECTORY]`` / ``[TITLE_CONTENT_CNN]`` and the model
    hyper-parameters from ``[TITLE_CONTENT_CNN]``.

    Args:
        config: ConfigParser-style object exposing ``get`` and ``getint``.

    Returns:
        Tuple ``(model, word_embedding_index, char_embedding_index)`` where
        the two indexes are the first values returned by ``load_embedding``.
    """
    section = 'TITLE_CONTENT_CNN'

    def _embedding_path(option):
        # <embedding_pt>/<file name stored under `option`>
        return '%s/%s' % (config.get('DIRECTORY', 'embedding_pt'),
                          config.get(section, option))

    # Pick the TensorFlow session configuration for the requested mode.
    mode = config.get('ENVIRONMENT', 'mode')
    LogUtil.log('INFO', 'mode=%s' % mode)
    session_config = None
    if mode == 'cpu':
        core_count = config.getint('ENVIRONMENT', 'num_cores')
        session_config = tf.ConfigProto(
            intra_op_parallelism_threads=core_count,
            inter_op_parallelism_threads=core_count,
            allow_soft_placement=True,
            device_count={'CPU': core_count})
    elif mode == 'gpu':
        session_config = tf.ConfigProto()
        session_config.gpu_options.allow_growth = True
    # Any other mode leaves the backend session untouched.
    if session_config is not None:
        K.set_session(tf.Session(config=session_config))

    # Word-level embeddings first, then char-level embeddings.
    word_index, word_matrix = load_embedding(
        _embedding_path('word_embedding_fn'))
    char_index, char_matrix = load_embedding(
        _embedding_path('char_embedding_fn'))

    # Integer hyper-parameters share their name with the constructor keyword.
    int_options = (
        'title_word_length',
        'content_word_length',
        'title_char_length',
        'content_char_length',
        'fs_btm_tw_cw_length',
        'fs_btm_tc_length',
        'class_num',
    )
    model_kwargs = {}
    for option in int_options:
        model_kwargs[option] = config.getint(section, option)

    model_kwargs['optimizer_name'] = config.get(section, 'optimizer_name')
    model_kwargs['lr'] = float(config.get(section, 'lr'))
    # Metric names are whitespace-separated in the config value.
    model_kwargs['metrics'] = config.get(section, 'metrics').split()

    text_cnn = TitleContentCNN(word_embedding_matrix=word_matrix,
                               char_embedding_matrix=char_matrix,
                               **model_kwargs)

    return text_cnn, word_index, char_index
Example #2
0
def predict_val(config, part_id):
    """Score saved text-CNN models on the offline validation split.

    For every part id produced by ``generate_part_ids(config, part_id)`` the
    function loads ``<model_pt>text_cnn_<part_id>`` (``.json`` architecture
    plus ``.h5`` weights), predicts over the offline dataset batch by batch,
    writes one comma-separated line of raw scores per sample to
    ``<pred_pt>/vote_fs_text_cnn_<version>_<part_id>.offline`` and logs three
    numbers computed from the top-5 predictions:

    * precision: sum over positions ``pos`` of
      ``(hits at pos / sample count) / ln(2 + pos)``
    * recall: ``top-5 hits / total marked labels``
    * f: ``precision * recall / (precision + recall)``

    Each metric is reported as 0.0 when its denominator is zero (for example
    when the validation set yields no samples) instead of raising
    ``ZeroDivisionError``.

    Args:
        config: ConfigParser-style object exposing ``get(section, option)``.
        part_id: Forwarded to ``generate_part_ids``; its exact meaning is
            defined by that helper.

    Returns:
        None. Results are written to the prediction file and the log.
    """
    version = config.get('TITLE_CONTENT_CNN', 'version')
    LogUtil.log('INFO', 'version=%s' % version)

    # load word embedding file (only the index is needed for prediction)
    word_embedding_fp = '%s/%s' % (config.get(
        'DIRECTORY',
        'embedding_pt'), config.get('TITLE_CONTENT_CNN', 'word_embedding_fn'))
    word_embedding_index, _ = load_embedding(word_embedding_fp)
    # load char embedding file (only the index is needed for prediction)
    char_embedding_fp = '%s/%s' % (config.get(
        'DIRECTORY',
        'embedding_pt'), config.get('TITLE_CONTENT_CNN', 'char_embedding_fn'))
    char_embedding_index, _ = load_embedding(char_embedding_fp)

    # init part_ids
    part_ids = generate_part_ids(config, part_id)

    # load offline valid dataset index
    valid_index_off_fp = '%s/%s.offline.index' % (config.get(
        'DIRECTORY',
        'index_pt'), config.get('TITLE_CONTENT_CNN', 'valid_index_offline_fn'))
    valid_index_off = DataUtil.load_vector(valid_index_off_fp, 'int')
    # Stored indexes are shifted down by one (presumably 1-based on disk --
    # TODO confirm against the index writer).
    valid_index_off = [num - 1 for num in valid_index_off]

    # Number of highest-scoring classes evaluated per sample.
    top_k = 5

    for part_id in part_ids:
        LogUtil.log('INFO', 'part_id=%d' % part_id)

        # NOTE(review): 'model_pt' is concatenated without a separator, so it
        # presumably ends with one -- kept unchanged to match existing files.
        model_fp = config.get('DIRECTORY',
                              'model_pt') + 'text_cnn_%03d' % part_id
        # load json and create model; `with` closes the file even on error
        with open('%s.json' % model_fp, 'r') as json_file:
            model_json = json_file.read()
        model = model_from_json(model_json, {'Scale': Scale})
        # load weights into new model
        model.load_weights('%s.h5' % model_fp)
        LogUtil.log('INFO', 'load model (%s) from disk done' % model_fp)

        # evaluation counters
        right_label_num = 0
        right_label_at_pos_num = [0] * top_k
        sample_num = 0
        all_marked_label_num = 0

        # save prediction; the context manager guarantees the file is flushed
        # and closed even if prediction or dataset loading raises
        pred_fp = '%s/vote_fs_text_cnn_%s_%d.offline' % (config.get(
            'DIRECTORY', 'pred_pt'), version, part_id)
        with open(pred_fp, 'w') as pred_all_f:
            for sub_valid_dataset in data_helpers.load_dataset_from_file_loop(
                    config, 'offline', word_embedding_index,
                    char_embedding_index, valid_index_off, False):
                # Every element but the last is fed to the model; the last
                # element is read below as the per-sample label rows.
                sub_valid_preds = model.predict(sub_valid_dataset[:-1],
                                                batch_size=32,
                                                verbose=True)
                sub_valid_labels = sub_valid_dataset[-1]

                for i, ps in enumerate(sub_valid_preds):
                    pred_all_f.write(
                        '%s\n' % ','.join([str(num) for num in ps]))
                    sample_num += 1

                    # class ids of the top_k scores, best first
                    top_ids = [
                        x[0] for x in heapq.nlargest(
                            top_k, enumerate(ps), key=lambda p: p[1])
                    ]

                    # positions whose label value equals 1 are marked labels
                    marked_label_set = set(
                        idx for idx, flag in enumerate(sub_valid_labels[i])
                        if 1 == flag)
                    all_marked_label_num += len(marked_label_set)

                    for pos, label in enumerate(top_ids):
                        if label in marked_label_set:
                            right_label_num += 1
                            right_label_at_pos_num[pos] += 1

        # Position-weighted precision; left at 0.0 when nothing was predicted.
        precision = 0.0
        if sample_num > 0:
            for pos, right_num in enumerate(right_label_at_pos_num):
                precision += (right_num /
                              float(sample_num)) / math.log(2.0 + pos)

        if all_marked_label_num > 0:
            recall = float(right_label_num) / all_marked_label_num
        else:
            recall = 0.0

        # Guard the combined score against a zero denominator as well.
        score_sum = precision + recall
        if score_sum > 0:
            f_score = (precision * recall) / score_sum
        else:
            f_score = 0.0

        LogUtil.log(
            'INFO', 'precision=%s, recall=%s, f=%s' %
            (str(precision), str(recall), str(f_score)))