Example #1
def load_dataset_from_file_loop(config, data_name, word_emb_index, char_emb_index, inds, loop=True):
    version = config.get('TITLE_CONTENT_CNN', 'version')
    LogUtil.log('INFO', 'version=%s' % version)
    data_loader = __import__('bin.text_cnn.%s.data_loader' % version, fromlist=["*"])
    part_size = config.getint('TITLE_CONTENT_CNN', 'part_size')

    inds_len = len(inds)
    inds_index = 0

    sub_inds = list()

    while True:

        if inds_len <= inds_index:
            if loop:
                inds_index = 0
                random.shuffle(inds)
            else:
                break

        sub_inds.append(inds[inds_index])
        inds_index += 1

        if (part_size == len(sub_inds)) or (inds_len <= inds_index):
            # a shuffle wrap-around can repeat an index within one batch;
            # drop duplicates while preserving order (reduce is the Python 2 builtin)
            sub_inds = reduce(lambda x, y: x if y in x else x + [y], [[], ] + sub_inds)
            yield data_loader.load_dataset_from_file(config, data_name, word_emb_index, char_emb_index, sub_inds)
            sub_inds = list()
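A minimal consumption sketch, not from the source: config and the two embedding indices are assumed to come from init_text_cnn below, and the data_name string and index range are placeholders.

train_inds = list(range(100000))   # placeholder row indices into the dataset
batches = load_dataset_from_file_loop(config, 'offline', word_emb_index,
                                      char_emb_index, train_inds, loop=True)
part = next(batches)               # one batch of up to part_size unique rows;
                                   # its structure depends on the version-specific data_loader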
Example #2
def load_feature_vec_part(file_path, inds_copy, inds_map):
    vecs = [0] * len(inds_copy)

    index_f = 0
    index_inds = 0

    is_smat = isfile('%s.smat' % file_path)

    if is_smat:
        LogUtil.log('INFO', 'load sparse feature file %s' % file_path)
        f = open('%s.smat' % file_path, 'r')
        row_num, col_num = re.split(' |,', f.readline().strip('\n'))
        row_num = int(row_num)
        col_num = int(col_num)
    else:
        LogUtil.log('INFO', 'load dense feature file %s' % file_path)
        f = open(file_path, 'r')
        row_num = col_num = -1

    # inds_copy must be sorted ascending so the file is scanned once;
    # inds_map puts each parsed vector back at its caller-requested position
    for line in f:
        if len(inds_copy) <= index_inds:
            break
        if index_f == inds_copy[index_inds]:
            vecs[inds_map[index_inds]] = parse_feature_sparse_vec(line, col_num) if is_smat \
                else parse_feature_vec(line)
            index_inds += 1
        index_f += 1
    f.close()

    return vecs
Example #3
def F(preds, labels):
    topk = 5

    right_label_num = 0
    right_label_at_pos_num = [0] * topk
    sample_num = 0
    all_marked_label_num = 0

    for i, ps in enumerate(preds):
        sample_num += 1
        top5_ids = [x[0] for x in heapq.nlargest(topk, enumerate(ps), key=lambda p: p[1])]

        label_ids = list()
        for kv in enumerate(labels[i]):
            if 1 == kv[1]:
                label_ids.append(kv[0])

        marked_label_set = set(label_ids)
        all_marked_label_num += len(marked_label_set)

        for pos, label in enumerate(top5_ids):
            if label in marked_label_set:
                right_label_num += 1
                right_label_at_pos_num[pos] += 1

    precision = 0.0

    for pos, right_num in zip(range(0, topk), right_label_at_pos_num):
        precision += (right_num / float(sample_num)) / math.log(2.0 + pos)
    recall = float(right_label_num) / all_marked_label_num

    LogUtil.log('INFO', 'precision=%s, recall=%s, f=%s' % (str(precision),
                                                           str(recall),
                                                           str((precision * recall) / (precision + recall))))
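A worked example of the metric on illustrative data, not from the source: one sample whose marked labels are {3, 7} and whose top-5 predicted ids are [7, 1, 3, 0, 2], so hits fall at positions 0 and 2.

import math

# sample_num = 1, hits at positions 0 and 2, both marked labels retrieved
precision = 1 / math.log(2.0 + 0) + 1 / math.log(2.0 + 2)   # ~1.4427 + 0.7213
recall = 2 / 2.0                                            # = 1.0
f = (precision * recall) / (precision + recall)             # ~0.6839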
Example #4
def init_text_cnn(config):
    # configure the TensorFlow 1.x session for the requested device mode
    mode = config.get('ENVIRONMENT', 'mode')
    LogUtil.log('INFO', 'mode=%s' % mode)
    if 'cpu' == mode:
        num_cores = config.getint('ENVIRONMENT', 'num_cores')
        tf_config = tf.ConfigProto(intra_op_parallelism_threads=num_cores,
                                   inter_op_parallelism_threads=num_cores,
                                   allow_soft_placement=True,
                                   device_count={'CPU': num_cores})
        session = tf.Session(config=tf_config)
        K.set_session(session)
    elif 'gpu' == mode:
        tf_config = tf.ConfigProto()
        tf_config.gpu_options.allow_growth = True
        sess = tf.Session(config=tf_config)
        K.set_session(sess)

    # load word embedding file
    word_embedding_fp = '%s/%s' % (config.get(
        'DIRECTORY',
        'embedding_pt'), config.get('TITLE_CONTENT_CNN', 'word_embedding_fn'))
    word_embedding_index, word_embedding_matrix = load_embedding(
        word_embedding_fp)
    # load char embedding file
    char_embedding_fp = '%s/%s' % (config.get(
        'DIRECTORY',
        'embedding_pt'), config.get('TITLE_CONTENT_CNN', 'char_embedding_fn'))
    char_embedding_index, char_embedding_matrix = load_embedding(
        char_embedding_fp)
    # init model
    title_word_length = config.getint('TITLE_CONTENT_CNN', 'title_word_length')
    content_word_length = config.getint('TITLE_CONTENT_CNN',
                                        'content_word_length')
    title_char_length = config.getint('TITLE_CONTENT_CNN', 'title_char_length')
    content_char_length = config.getint('TITLE_CONTENT_CNN',
                                        'content_char_length')
    fs_btm_tw_cw_length = config.getint('TITLE_CONTENT_CNN',
                                        'fs_btm_tw_cw_length')
    fs_btm_tc_length = config.getint('TITLE_CONTENT_CNN', 'fs_btm_tc_length')
    class_num = config.getint('TITLE_CONTENT_CNN', 'class_num')
    optimizer_name = config.get('TITLE_CONTENT_CNN', 'optimizer_name')
    lr = float(config.get('TITLE_CONTENT_CNN', 'lr'))
    metrics = config.get('TITLE_CONTENT_CNN', 'metrics').split()
    model = TitleContentCNN(title_word_length=title_word_length,
                            content_word_length=content_word_length,
                            title_char_length=title_char_length,
                            content_char_length=content_char_length,
                            fs_btm_tw_cw_length=fs_btm_tw_cw_length,
                            fs_btm_tc_length=fs_btm_tc_length,
                            class_num=class_num,
                            word_embedding_matrix=word_embedding_matrix,
                            char_embedding_matrix=char_embedding_matrix,
                            optimizer_name=optimizer_name,
                            lr=lr,
                            metrics=metrics)

    return model, word_embedding_index, char_embedding_index
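A sketch of the config this function expects; the section and option names are exactly the ones read above (plus 'version' and 'part_size' from Example #1), while every value is a placeholder.

from ConfigParser import ConfigParser   # stdlib; named configparser on Python 3

config = ConfigParser()
config.add_section('ENVIRONMENT')
config.set('ENVIRONMENT', 'mode', 'gpu')          # 'cpu' additionally needs num_cores
config.add_section('DIRECTORY')
config.set('DIRECTORY', 'embedding_pt', '/path/to/embeddings')
config.set('DIRECTORY', 'dataset_pt', '/path/to/dataset')
config.add_section('TITLE_CONTENT_CNN')
for key, val in [('version', 'v1'), ('part_size', '1000'),
                 ('word_embedding_fn', 'word_embedding.txt'),
                 ('char_embedding_fn', 'char_embedding.txt'),
                 ('title_word_length', '50'), ('content_word_length', '150'),
                 ('title_char_length', '50'), ('content_char_length', '150'),
                 ('fs_btm_tw_cw_length', '100'), ('fs_btm_tc_length', '100'),
                 ('class_num', '1999'), ('optimizer_name', 'adam'),
                 ('lr', '0.001'), ('metrics', 'accuracy')]:
    config.set('TITLE_CONTENT_CNN', key, val)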
Example #5
    def load(self, model_fp):
        # load json and create model
        json_file = open('%s.json' % model_fp, 'r')
        model_json = json_file.read()
        json_file.close()
        self._model = model_from_json(model_json)
        # load weights into new model
        self._model.load_weights('%s.h5' % model_fp)
        LogUtil.log('INFO', 'load model (%s) from disk done' % model_fp)
Example #6
def load_feature_vec(file_path):
    if isfile(file_path + '.smat'):
        LogUtil.log('INFO', 'load sparse feature file %s' % file_path)
        with open(file_path + '.smat', 'r') as f:
            row_num, col_num = re.split(' |,', f.readline().strip('\n'))
            return [parse_feature_sparse_vec(line, int(col_num)) for line in f]
    else:
        LogUtil.log('INFO', 'load dense feature file %s' % file_path)
        with open(file_path, 'r') as f:
            return [parse_feature_vec(line) for line in f]
Example #7
def load_raw_line_from_file(config, file_path, inds):
    # sort the requested indices so the file can be scanned once, sequentially;
    # inds_map remembers each index's original position in `inds`
    inds_sorted = sorted(enumerate(inds), key=lambda kv: kv[1])
    inds_copy = [kv[1] for kv in inds_sorted]
    inds_map = [kv[0] for kv in inds_sorted]

    sub_lines = load_raw_line_part(file_path, inds_copy, inds_map)

    LogUtil.log('INFO', 'load raw line done')

    return sub_lines
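The same three-line index trick appears in most loaders in this file; a toy trace with illustrative values makes the mechanics explicit.

inds = [7, 2, 9]                                  # rows requested, in caller order
inds_sorted = sorted(enumerate(inds), key=lambda kv: kv[1])
inds_copy = [kv[1] for kv in inds_sorted]         # [2, 7, 9]: one sequential file pass
inds_map = [kv[0] for kv in inds_sorted]          # [1, 0, 2]: original slot of each
# while scanning, row 2 lands in result[1], row 7 in result[0], row 9 in result[2],
# so the caller receives rows in exactly the order it asked for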
Example #8
def load_features_from_file(config, feature_name, data_name, inds):
    # sort indices for a single sequential pass; inds_map restores original order
    inds_sorted = sorted(enumerate(inds), key=lambda kv: kv[1])
    inds_copy = [kv[1] for kv in inds_sorted]
    inds_map = [kv[0] for kv in inds_sorted]

    # load features
    feature_fp = '%s/%s.%s.csv' % (config.get('DIRECTORY', 'dataset_pt'), feature_name, data_name)

    sub_features = load_feature_vec_part(feature_fp, inds_copy, inds_map)
    LogUtil.log('INFO', 'len(sub_features)=%d' % len(sub_features))
    sub_features = np.asarray(sub_features, dtype='float32')
    LogUtil.log('INFO', 'load features done')

    return sub_features
Example #9
def load_labels_from_file(config, data_name, inds):
    # sort indices for a single sequential pass; inds_map restores original order
    inds_sorted = sorted(enumerate(inds), key=lambda kv: kv[1])
    inds_copy = [kv[1] for kv in inds_sorted]
    inds_map = [kv[0] for kv in inds_sorted]

    # load label id vectors
    lid_fp = None if 'online' == data_name \
        else '%s/%s.%s.csv' % (config.get('DIRECTORY', 'dataset_pt'), 'label_id', data_name)

    class_num = config.getint('TITLE_CONTENT_CNN', 'class_num')

    sub_lid_vecs = None if lid_fp is None \
        else np.asarray(load_lid_part(lid_fp, class_num, inds_copy, inds_map), dtype='int32')
    LogUtil.log('INFO', 'load label id vector done')

    return sub_lid_vecs
Example #10
def load_embedding_with_idx(file_path, emb_index):
    with open(file_path, 'r') as emb_f:
        shape = emb_f.readline().strip()
        emb_num, emb_size = [int(x) for x in shape.split()]
        LogUtil.log('INFO', 'embedding_shape=(%d, %d)' % (emb_num, emb_size))

        # the first two rows stay zero to match the index built by load_embedding,
        # which assigns word indices starting at 2, hence emb_num + 2
        emb_matrix = np.zeros([emb_num + 2, emb_size])

        for line in emb_f:
            subs = line.strip().split()
            word = subs[0]
            vec = subs[1:]
            if word in emb_index:
                emb_matrix[emb_index[word]] = np.asarray(vec)

    return emb_matrix
Example #11
def load_embedding(file_path):
    with open(file_path, 'r') as emb_f:
        shape = emb_f.readline().strip()
        emb_num, emb_size = [int(x) for x in shape.split()]
        LogUtil.log('INFO', 'embedding_shape=(%d, %d)' % (emb_num, emb_size))

        emb_index = {}
        # reserve rows 0 and 1 as all-zero vectors (e.g. padding / unknown tokens),
        # so real words receive indices starting at 2
        emb_matrix = [['0.'] * emb_size, ['0.'] * emb_size]

        for line in emb_f:
            subs = line.strip().split()
            word = subs[0]
            vec = subs[1:]
            emb_index[word] = len(emb_matrix)
            emb_matrix.append(vec)
    emb_matrix = np.asarray(emb_matrix, dtype='float32')

    return emb_index, emb_matrix
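A toy round trip, assuming the word2vec-style text format both embedding loaders parse (a "num size" header, then one token and its vector per line); the file path and values are hypothetical.

with open('/tmp/toy_emb.txt', 'w') as f:
    f.write('2 3\n')                     # header: 2 vectors of size 3
    f.write('the 0.12 -0.30 0.55\n')
    f.write('cat 0.80 0.11 0.02\n')

emb_index, emb_matrix = load_embedding('/tmp/toy_emb.txt')
# emb_index == {'the': 2, 'cat': 3}; emb_matrix.shape == (4, 3),
# rows 0 and 1 being the reserved all-zero vectors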
Example #12
def F_by_fuck_zhihu_eng(preds, labels):
    topk = 5
    top5_labels = list()

    for i, ps in enumerate(preds):
        scored = sorted(enumerate(ps), key=lambda s: s[1], reverse=True)
        top5_ids = [x[0] for x in scored[:topk]]

        label_ids = list()
        for kv in enumerate(labels[i]):
            if 1 == kv[1]:
                label_ids.append(kv[0])

        top5_labels.append([top5_ids, label_ids])

    right_label_num = 0
    right_label_at_pos_num = [0] * topk  # only the first topk positions are ever written
    sample_num = 0
    all_marked_label_num = 0

    for predict_labels, marked_labels in top5_labels:
        sample_num += 1
        marked_label_set = set(marked_labels)
        all_marked_label_num += len(marked_label_set)

        for pos, label in zip(range(0, min(len(predict_labels), topk)), predict_labels):
            if label in marked_label_set:
                right_label_num += 1
                right_label_at_pos_num[pos] += 1

    precision = 0.0

    for pos, right_num in zip(range(0, topk), right_label_at_pos_num):
        precision += (right_num / float(sample_num)) / math.log(2.0 + pos)
    recall = float(right_label_num) / all_marked_label_num

    LogUtil.log('INFO', 'precision=%s, recall=%s, f=%s' % (str(precision),
                                                           str(recall),
                                                           str((precision * recall) / (precision + recall))))
Example #13
def F_by_ids(ids, labels):
    topk = 5

    right_label_num = 0
    right_label_at_pos_num = [0] * topk
    sample_num = 0
    all_marked_label_num = 0

    for i, top5_ids in enumerate(ids):
        top5_ids = top5_ids[:topk]
        sample_num += 1

        label_ids = list()
        for kv in enumerate(labels[i]):
            if 1 == kv[1]:
                label_ids.append(kv[0])

        marked_label_set = set(label_ids)
        all_marked_label_num += len(marked_label_set)

        for pos, label in enumerate(top5_ids):
            if label in marked_label_set:
                right_label_num += 1
                right_label_at_pos_num[pos] += 1

    precision = 0.0

    for pos, right_num in zip(range(0, topk), right_label_at_pos_num):
        precision += (right_num / float(sample_num)) / math.log(2.0 + pos)
    recall = float(right_label_num) / all_marked_label_num

    f = (precision * recall) / (precision + recall)

    LogUtil.log('INFO', 'precision=%s, recall=%s, f=%s' % (str(precision),
                                                           str(recall),
                                                           str(f)))
    return f
Example #14
    def save(self, model_fp):
        model_json = self._model.to_json()
        with open('%s.json' % model_fp, 'w') as json_file:
            json_file.write(model_json)
        self._model.save_weights('%s.h5' % model_fp)
        LogUtil.log('INFO', 'save model (%s) to disk done' % model_fp)
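A round-trip sketch, assuming save and load are methods of TitleContentCNN (as the shared self._model attribute suggests); the path is a placeholder.

model, word_emb_index, char_emb_index = init_text_cnn(config)
model.save('/tmp/title_content_cnn')   # writes .json (architecture) + .h5 (weights)
model.load('/tmp/title_content_cnn')   # restores the same Keras model later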