Example 1
def wordCount():
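    # Segment the training texts with jieba, count word frequencies, and save them to vocab_file.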
    data = cjdpy.load_csv(train_file)
    X = [list(jieba.cut(item[0])) for item in data]
    vocab_list = [word for text in X for word in text]
    counter = Counter(vocab_list)
    counter = cjdpy.sort_list_by_freq(counter)
    cjdpy.save_csv(counter, vocab_file)
Example 2
def conCount():
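    # Count the '|||'-separated concepts in column 3 of the training data and save their frequencies to con_file.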
    data = cjdpy.load_csv(train_file)
    vocab_list = [item[3].split('|||') for item in data if len(item) == 4]
    vocab_list = sum(vocab_list, [])
    counter = Counter(vocab_list)
    counter = cjdpy.sort_list_by_freq(counter)
    cjdpy.save_csv(counter, con_file)
Example 3
def get_vid_embedding():
    # Offline: compute embeddings for the videos in kandian_today
    print("get vid embedding begin")
    kandian_today = video_meta_to_id("../data/kandian_today.json")
    pred_data = cjdpy.load_csv("../data/pred.txt")

    input_x, _ = get_predict_data_serving(
        pred_data, 1, kandian_today)  # use behavior seq; any index works here

    kandian = cjdpy.load_list("../data/kandian_today.json")
    kandian_str = []
    for line in kandian:
        line = json.loads(line)
        kandian_str.append(" ".join([
            line["first_category"], line["second_category"], line["tags"],
            line["media_name"]
        ]))

    idx_rate = []
    cmsid_embed = []
    cmsid_set = set()
    cmsid_list = []
    bad_case = 0
    for i in range(len(input_x["first_category"])):
        dict_data = {
            "instances": [{
                "first_category": input_x["first_category"][i],
                "second_category": input_x["second_category"][i],
                "tag": input_x["tag"][i],
                "media": input_x["media"][i],
                "rate_discretize": input_x["rate_discretize"][i],
                "position": input_x["position"][i]
            }]
        }

        try:
            resp = requests.post('http://localhost:8515/v1/models/ttm:predict',
                                 json=dict_data)
            res = json.loads(resp.text)
            idx_rate.append([i] + res["predictions"][0]["ie"])
            cmsid_val = json.loads(kandian[i])["cmsid"]
            if cmsid_val in cmsid_set: continue
            cmsid_embed.append(res["predictions"][0]["ie"])
            cmsid_set.add(cmsid_val)
            cmsid_list.append(cmsid_val)
        except Exception:
            bad_case += 1
        # if "predictions" not in res:
        #     bad_case += 1
        #     continue

        if i % 5000 == 0:
            print("process", i)

    print("#fail to request tf serving", bad_case)

    cjdpy.save_csv(idx_rate, "ie.txt")
    cjdpy.save_csv(cmsid_embed, "cmsid_embedding.txt", " ")
    cjdpy.save_lst(cmsid_list, "cmsid.txt")
    print("get vid embedding done")
Example 4
def make_label(file):
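    # Build the global label <-> id mappings from the label column of the file.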
    global id2label, label2id
    data = cjdpy.load_csv(file)
    label = [item[1] for item in data if len(item) == 4]
    id2label = list(set(label))
    label2id = {y: x for x, y in enumerate(id2label)}
    return id2label, label2id
Example 5
def visualize_dataset(file_name):
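    # Map the id-encoded feature sequences of the first 5000 lines back to readable strings, one tab-separated row per position.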
    _, id2first_category = load_vocab("../vocab/first_category_vocab.txt")
    _, id2second_category = load_vocab("../vocab/second_category_vocab.txt")
    _, id2media = load_vocab("../vocab/media_vocab.txt")
    _, id2tag = load_vocab("../vocab/tag_vocab.txt")

    data = cjdpy.load_csv(file_name)
    res = []
    for line in data[:5000]:
        first_category = [
            id2first_category[int(id)] for id in line[0].split(" ")
        ]
        second_category = [
            id2second_category[int(id)] for id in line[1].split(" ")
        ]
        media = [id2media[int(id)] for id in line[2].split(" ")]
        # tag_list = [tags for tags in line[3].split(" ")]
        tag = []
        for tag_list in line[3].split(" "):
            tag_list = [id2tag[int(id)] for id in tag_list.split("#")]
            tag.append("#".join(tag_list))
        res_one = []
        rate_discretize = line[4].split(" ")
        for i in range(len(first_category)):
            res_one.append("\t".join([
                first_category[i], second_category[i], media[i], tag[i],
                str(rate_discretize[i])
            ]))
        res.append("\n".join(res_one))
    return res
Example 6
def make_con_vocab():
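    # Build the concept vocabulary: the top con_size concepts prefixed with <PAD> and <UNK>.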
    global id2con, con2id
    vocab_list = cjdpy.load_csv(con_file)
    cons = [vocab_list[i][0] for i in range(con_size)]
    cons.remove('PAD')
    id2con = ['<PAD>', '<UNK>'] + cons
    con2id = {y: x for x, y in enumerate(id2con)}
    return id2con, con2id
Example 7
def train_test_data():
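    # Shuffle the concept-augmented data and split off the first 10000 lines as the test set.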
    data = cjdpy.load_csv('data/topic_classification/text.con.txt')
    train_file = 'data/topic_classification/train.txt'
    test_file = 'data/topic_classification/test.txt'
    random.shuffle(data)
    line = 10000
    cjdpy.save_csv(data[:line], test_file)
    cjdpy.save_csv(data[line:], train_file)
Example 8
def load_data(file):
    data = cjdpy.load_csv(file)
    X, y = [], []
    for item in data:
        X.append(list(jieba.cut(item[0])))
        y.append(item[1])
    X = bag_of_words(X)
    return X, y
Example 9
def make_vocab():
    global id2w, w2id
    vocab_list = cjdpy.load_csv(vocab_file)
    # Tip: set the vocabulary size according to the dataset.
    id2w = ['<PAD>', '<UNK>'] + [x[0] for x in vocab_list[:vocab_size]]
    w2id = {y: x for x, y in enumerate(id2w)}
    return id2w, w2id
Example 10
def word_count():
    data = cjdpy.load_csv(train_file)
    X, y = [], []
    for item in data:
        X.append(list(jieba.cut(item[0])))
        # y.append(item[1])
    vocab_list = [word for text in X for word in text]
    counter = Counter(vocab_list)
    counter = cjdpy.sort_list_by_freq(counter)
    cjdpy.save_csv(counter, vocab_file)
Example 11
def cal_diversity_metric():
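    # For a few test users, count how many retrieved videos have a first/second category, media, or tag not already present in the user's history.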
    pred_data_vis = visualize_dataset("../data/pred.txt")
    pred_data = cjdpy.load_csv("../data/pred.txt")
    kandian_today_id = video_meta_to_id("../data/kandian_today.json")
    kandian_today = cjdpy.load_list("../data/kandian_today.json")

    ie = cjdpy.load_csv("ie.txt")

    test_sample_num = 10
    pred_idx = [i * 30 for i in range(test_sample_num)]
    fc_cnt, sc_cnt, media_cnt, tag_cnt = 0, 0, 0, 0
    total_eval_video = 0
    for num in pred_idx:
        print(pred_data_vis[num])
        fc_set, sc_set, media_set, tag_set = set(), set(), set(), set()
        for line in pred_data_vis[num].split("\n"):
            items = line.split("\t")
            fc_set.add(items[0])
            sc_set.add(items[1])
            media_set.add(items[2])
            tag_set = tag_set | set(items[3].split("#"))
        input_x, _ = get_predict_data_serving(pred_data, num, kandian_today_id)
        topK = faiss_retrieve(input_x, kandian_today, ie)
        # print(topK)
        total_eval_video += len(topK)
        for line in topK:
            items = line.split("\t")
            # print(items)
            if items[0] not in fc_set: fc_cnt += 1
            if items[1] not in sc_set: sc_cnt += 1
            if items[2] not in media_set: media_cnt += 1
            for tag in items[3].split("|"):
                if tag not in tag_set:
                    tag_cnt += 1
        print()
        # break
    print("first_category", test_sample_num * fc_cnt / total_eval_video)
    print("second_category", test_sample_num * sc_cnt / total_eval_video)
    print("media", test_sample_num * media_cnt / total_eval_video)
    print("tag", test_sample_num * tag_cnt / total_eval_video)
Example 12
def pre_trained_embedding():
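    # Build an embedding matrix aligned with id2w from pre-trained 50-dim word vectors; words without a vector stay zero.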
    # embedding_matrix = np.random.normal(size=(vocab_size+2, 50))
    embedding_matrix = np.zeros((vocab_size + 2, 50))
    data = cjdpy.load_csv('data/vectors_word.txt', ' ')
    w2v = {
        item[0]: np.array(list(map(lambda x: float(x), item[1:])))
        for item in data[1:]
    }
    for i in range(vocab_size + 2):
        emb = w2v.get(id2w[i])
        if emb is not None: embedding_matrix[i, :] = emb
        else: print(id2w[i])
    return embedding_matrix
Example 13
def get_concept():
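    # Query the cnprobase concept API for every linked entity and append the '|||'-joined concepts to each line.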
    data = cjdpy.load_csv('st_ent.txt')
    res = []
    for i, (text, label, ents) in enumerate(data):
        if i % 500 == 0: print('processing %d...' % i)
        cons = []
        for item in ents.split('|||'):
            url = 'http://shuyantech.com/api/cnprobase/concept?q=%s&apikey=%s' % (
                item, 'ljqljqljq')
            response = json.loads(requests.get(url).text)
            cons += [item[0] for item in response.get('ret', [])]
        if len(cons) == 0: cons = ['PAD']
        res.append([text, label, ents, '|||'.join(cons)])
    cjdpy.save_csv(res, 'text.con.txt')
Example 14
def get_predict_data(file_name):
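    # Parse the id-encoded feature columns into numpy arrays; the label is 1 when the last rate bucket equals 4.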
    data = cjdpy.load_csv(file_name)

    print("len(data): ", len(data))

    first_category_feature = []
    second_category_feature = []
    tag_feature = []
    media_category_feature = []
    rate_discretize_feature = []
    position_feature = []
    y = []

    for line in data:
        first_category_feature.append(list(map(eval, line[0].split(" "))))
        second_category_feature.append(list(map(eval, line[1].split(" "))))
        media_category_feature.append(list(map(eval, line[2].split(" "))))
        vid_tag_list = []
        for vid_tag in line[3].split(" "):
            vid_tag_list.append(list(map(eval, vid_tag.split("#"))))
        tag_feature.append(vid_tag_list)
        rate_discretize_feature.append(list(map(eval, line[4].split(" "))))
        if rate_discretize_feature[-1][-1] == 4:
            y.append(1)
        else:
            y.append(0)
        position_feature.append(
            [i for i in range(len(first_category_feature[-1]))])

    first_category_feature = np.array(first_category_feature)
    second_category_feature = np.array(second_category_feature)
    tag_feature = np.array(tag_feature)
    media_category_feature = np.array(media_category_feature)
    rate_discretize_feature = np.array(rate_discretize_feature)
    position_feature = np.array(position_feature)
    y = np.array(y)

    # input_fn
    input_x = {
        "first_category": first_category_feature,
        "second_category": second_category_feature,
        "tag": tag_feature,
        "media": media_category_feature,
        "rate_discretize": rate_discretize_feature,
        "position": position_feature
    }
    return input_x, y
Example 15
def load_data(file):
    data = cjdpy.load_csv(file)
    X, y = [], []
    for text, label, ent, con in data:
        try:
            words = list(jieba.cut(text))
            cons = con.split('|||')
            X.append(
                Tokens2Intlist(w2id, words, maxSeqLen).tolist() +
                Tokens2Intlist(con2id, cons,
                               maxConLen).tolist())  # with prior concept
            y.append(label2id[label])
        except Exception as e:
            import traceback
            traceback.print_exc()

    # sample imbalance problem
    # class_weight = Counter([item[1] for item in data])
    # class_weight = {label2id[key]:val/len(y) for key, val in class_weight.items()}
    return X, y
Example 16
def entity_linking(path):
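    # Query the shuyantech entity-linking API for each text and write text, label, and linked entities to out1.txt.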
    data = cjdpy.load_csv(path)
    res = []
    fout = open('out1.txt', 'w', encoding='utf-8')
    for i, item in enumerate(data):
        if i <= 83429: continue
        if len(item) == 2:
            text, label = item[0], item[1]
        else:
            continue
        if i % 500 == 0: print('processing %d...' % i)
        url = 'http://shuyantech.com/api/entitylinking/cutsegment?q=%s&apikey=%s' % (
            text, 'ljqljqljq')
        try:
            response = json.loads(requests.get(url).text)
        except Exception:
            print('entity linking fail: ', text)
            continue
        ents = [item[1] for item in response.get('entities', [])]
        if len(ents) == 0: ents = ['PAD']
        fout.write(text + '\t' + label + '\t' + '|||'.join(ents) + '\n')
        fout.flush()
Example 17
def bag_of_words(X):  # NOTE: header and setup reconstructed; the original snippet started mid-function (assumes numpy as np)
    X_vec = np.zeros((len(X), vocab_size))
    for i in range(len(X)):
        for token in X[i]:
            if w2id.get(token):
                X_vec[i, w2id[token]] += 1
    return X_vec

def load_data(file):
    data = cjdpy.load_csv(file)
    X, y = [], []
    for item in data:
        X.append(list(jieba.cut(item[0])))
        y.append(item[1])
    X = bag_of_words(X)
    return X, y


vocab_list = cjdpy.load_csv(vocab_file)
id2w = [vocab_list[i][0] for i in range(vocab_size)]
w2id = {word: i for i, word in enumerate(id2w)}

X_train, y_train = load_data(train_file)
X_test, y_test = load_data(test_file)
print('training samples: ', len(X_train))
print('test samples: ', len(X_test))

clf = LinearSVC()
# clf = LogisticRegression()
clf.fit(X_train, y_train)


def evaluate(y_true, y_pred):
    print(classification_report_imbalanced(y_true, y_pred))
Example 18
def cal_map_metric():
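    # Compute MAP over the next FUTRUE_SEQ_LEN videos (plus per-category/tag MAP) by querying the TF Serving model for each sliding window.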
    data = cjdpy.load_csv("../data/metric_data.txt")

    FUTRUE_SEQ_LEN = 10
    HISTORY_SEQ_LEN = 25
    map_score, map_fc_score, map_sc_score, map_tag_score = 0, 0, 0, 0
    eval_sample_num = 100

    # uin, first_category, second_category, media, tags, rate, rate_discretize
    for case, line in enumerate(data):
        first_category = list(map(eval, line[1].split(" ")))
        second_category = list(map(eval, line[2].split(" ")))
        media = list(map(eval, line[3].split(" ")))
        rate = list(map(eval, line[5].split(" ")))
        rate_discretize = list(map(eval, line[6].split(" ")))
        vid_tag_list = []
        for vid_tag in line[4].split(" "):
            vid_tag_list.append(list(map(eval, vid_tag.split("#"))))
        if len(first_category) < 35:
            continue
        gt = [[i, rate[i + HISTORY_SEQ_LEN]] for i in range(FUTRUE_SEQ_LEN)]
        gt = sorted(gt, key=lambda x: x[1], reverse=True)
        gt_rank = [gt[i][0] for i in range(FUTRUE_SEQ_LEN)]

        gt_fc, gt_sc, gt_tag = {}, {}, {}
        for i in range(FUTRUE_SEQ_LEN):
            if first_category[i] not in gt_fc:
                gt_fc[first_category[i]] = 0
            gt_fc[first_category[i]] += rate[i + HISTORY_SEQ_LEN]
            if second_category[i] not in gt_sc:
                gt_sc[second_category[i]] = 0
            gt_sc[second_category[i]] += rate[i + HISTORY_SEQ_LEN]
            for tag in vid_tag_list[i]:
                if tag not in gt_tag:
                    gt_tag[tag] = 0
                gt_tag[tag] += rate[i + HISTORY_SEQ_LEN]

        gt_fc = sorted(gt_fc.items(), key=lambda x: x[1], reverse=True)
        gt_fc_rank = [gt_fc[i][0] for i in range(len(gt_fc))]
        gt_sc = sorted(gt_sc.items(), key=lambda x: x[1], reverse=True)
        gt_sc_rank = [gt_sc[i][0] for i in range(len(gt_sc))]
        gt_tag = sorted(gt_tag.items(), key=lambda x: x[1], reverse=True)
        gt_tag_rank = [gt_tag[i][0] for i in range(len(gt_tag))]
        # print(gt_fc_rank)

        # pd = []
        pred_rate = []
        for i in range(FUTRUE_SEQ_LEN):
            dict_data = {
                "instances": [{
                    "first_category": first_category[i:i + 26],
                    "second_category": second_category[i:i + 26],
                    "tag": vid_tag_list[i:i + 26],
                    "media": media[i:i + 26],
                    "rate_discretize": rate_discretize[i:i + 26],
                    "position": [i for i in range(26)]
                }]
            }

            resp = requests.post('http://localhost:8515/v1/models/ttm:predict',
                                 json=dict_data)
            res = json.loads(resp.text)
            # pd.append([i, res["predictions"][0]["y"]])
            pred_rate.append([i, res["predictions"][0]["y"]])
        pd = sorted(pred_rate, key=lambda x: x[1], reverse=True)
        pd_rank = [pd[i][0] for i in range(FUTRUE_SEQ_LEN)]
        map_score += cal_MAP(pd_rank, gt_rank)

        pd_fc, pd_sc, pd_tag = {}, {}, {}
        for i in range(FUTRUE_SEQ_LEN):
            if first_category[i] not in pd_fc:
                pd_fc[first_category[i]] = 0
            pd_fc[first_category[i]] += pred_rate[i][1]
            if second_category[i] not in pd_sc:
                pd_sc[second_category[i]] = 0
            pd_sc[second_category[i]] += pred_rate[i][1]
            for tag in vid_tag_list[i]:
                if tag not in pd_tag:
                    pd_tag[tag] = 0
                pd_tag[tag] += pred_rate[i][1]

        pd_fc = sorted(pd_fc.items(), key=lambda x: x[1], reverse=True)
        pd_fc_rank = [pd_fc[i][0] for i in range(len(pd_fc))]
        map_fc_score += cal_MAP(pd_fc_rank, gt_fc_rank)
        # print(pd_fc_rank)
        # break
        pd_sc = sorted(pd_sc.items(), key=lambda x: x[1], reverse=True)
        pd_sc_rank = [pd_sc[i][0] for i in range(len(pd_sc))]
        map_sc_score += cal_MAP(pd_sc_rank, gt_sc_rank)

        pd_tag = sorted(pd_tag.items(), key=lambda x: x[1], reverse=True)
        pd_tag_rank = [pd_tag[i][0] for i in range(len(pd_tag))]
        map_tag_score += cal_MAP(pd_tag_rank, gt_tag_rank)

        if case > eval_sample_num: break
    print("MAP for video score: ", map_score / eval_sample_num)
    print("MAP for video first category score: ",
          map_fc_score / eval_sample_num)
    print("MAP for video second category score: ",
          map_sc_score / eval_sample_num)
    print("MAP for video tag score: ", map_tag_score / eval_sample_num)
Example 19
def get_feature(sent):
    # get 1-gram and 2-gram feature from sentences
    chars = []
    for char in sent:
        if '\u4e00' <= char <= '\u9fff':  # keep CJK characters only
            chars.append(char)
    ngram = []
    for j in range(len(chars)-1):
        ngram.append(chars[j])
        ngram.append(chars[j] + chars[j+1])
    fname = ' '.join(ngram)
    return fname

import time
start = time.time()
data = cjdpy.load_csv("data/original_data")
random.shuffle(data)
res = []
for i in range(len(data)):
    res.append([get_feature(data[i][1]), "__label__"+data[i][0]])


# data format
# 毛 毛毛 毛 毛虫 虫 虫的 的 的意 意 意见    __label__1
cjdpy.save_csv(res[:300000], "data/train.txt")
cjdpy.save_csv(res[300000:], "data/test.txt")
classifier = fasttext.train_supervised("data/train.txt")

def print_results(N, p, r):
    print("N\t" + str(N))
    print("P@{}\t{:.3f}".format(1, p))
Example 20
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.4
set_session(tf.Session(config=config))

# parameters
embedding_dims = 300
batch_size = 32
epochs = 20
path_windows = r'\\10.141.208.22\data\Chinese_isA\corpus\wikicorpus_seg.txt'
path_linux = 'wikicorpus_seg.txt'

# dataset
texts, vocabulary = [], []

data = cjdpy.load_csv(path_windows)
# data = cjdpy.load_csv(path_linux)

for i, item in enumerate(data):
    if i > 10000: break
    vocabulary += list(item)
    texts.append(item)

vocabulary = set(vocabulary)
vocab_size = len(vocabulary)
word2id, id2word = {}, {}
for idx, word in enumerate(vocabulary):
    word2id[word] = idx
    id2word[idx] = word

text_seq = [word2id[word] for text in texts for word in text]
Example 21
def get_train_and_eval_data(file_name, save_flag=False):
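    # Parse the dataset, derive labels and sample weights from the last rate bucket, then shuffle and split 90/10 into train/eval.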
    data = cjdpy.load_csv(file_name)
    print("origin data size: ", len(data))

    first_category_feature = []
    second_category_feature = []
    tag_feature = []
    media_category_feature = []
    rate_discretize_feature = []
    weight_feature = []
    position_feature = []
    y = []

    for line in data:
        pos_first_category = list(map(eval, line[0].split(" ")))
        pos_second_category = list(map(eval, line[1].split(" ")))
        pos_media = list(map(eval, line[2].split(" ")))
        pos_vid_tag_list = []
        for vid_tag in line[3].split(" "):
            pos_vid_tag_list.append(list(map(eval, vid_tag.split("#"))))
        pos_rate_discretize = list(map(eval, line[4].split(" ")))
        if pos_rate_discretize[-1] == 0:
            y.append(0)
            weight_feature.append(1 - pos_sample_weight)
        else:
            y.append(1)
            weight_feature.append(pos_sample_weight)

        first_category_feature.append(pos_first_category)
        second_category_feature.append(pos_second_category)
        media_category_feature.append(pos_media)
        tag_feature.append(pos_vid_tag_list)
        rate_discretize_feature.append(pos_rate_discretize)
        position_feature.append([i for i in range(len(pos_first_category))])

    del data

    def save_as_file(first_category_feature, second_category_feature,
                     tag_feature, media_category_feature):
        print("begin save file")
        data = []
        for i in range(len(first_category_feature)):
            item_str = " ".join(map(str, first_category_feature[i])) + "\t" + " ".join(map(str, second_category_feature[i])) + \
                        "\t" + " ".join(map(str, media_category_feature[i])) + "\t" + " ".join(["#".join(map(str, tag_list)) for tag_list in tag_feature[i]])
            data.append(item_str)
            print(item_str)
        cjdpy.save_lst(data, "new_dataset.txt")

    if save_flag:
        save_as_file(first_category_feature, second_category_feature,
                     tag_feature, media_category_feature)

    TRAIN_EVAL_THRESOLD = int(len(y) / 10 * 9)
    print("train and eval: ", TRAIN_EVAL_THRESOLD,
          len(y) - TRAIN_EVAL_THRESOLD)

    randnum = random.randint(0, 100)
    random.seed(randnum)
    random.shuffle(first_category_feature)
    first_category_feature = np.array(first_category_feature)
    first_category_feature_train, first_category_feature_eval = first_category_feature[:TRAIN_EVAL_THRESOLD], first_category_feature[
        TRAIN_EVAL_THRESOLD:]
    del first_category_feature

    random.seed(randnum)
    random.shuffle(second_category_feature)
    second_category_feature = np.array(second_category_feature)
    second_category_feature_train, second_category_feature_eval = second_category_feature[:TRAIN_EVAL_THRESOLD], second_category_feature[
        TRAIN_EVAL_THRESOLD:]
    del second_category_feature

    random.seed(randnum)
    random.shuffle(tag_feature)
    tag_feature = np.array(tag_feature)
    tag_feature_feature_train, tag_feature_feature_eval = tag_feature[:TRAIN_EVAL_THRESOLD], tag_feature[
        TRAIN_EVAL_THRESOLD:]
    del tag_feature

    random.seed(randnum)
    random.shuffle(media_category_feature)
    media_category_feature = np.array(media_category_feature)
    media_category_feature_train, media_category_feature_eval = media_category_feature[:TRAIN_EVAL_THRESOLD], media_category_feature[
        TRAIN_EVAL_THRESOLD:]
    del media_category_feature

    random.seed(randnum)
    random.shuffle(rate_discretize_feature)
    rate_discretize_feature = np.array(rate_discretize_feature)
    rate_discretize_feature_train, rate_discretize_feature_eval = rate_discretize_feature[:TRAIN_EVAL_THRESOLD], rate_discretize_feature[
        TRAIN_EVAL_THRESOLD:]
    del rate_discretize_feature

    random.seed(randnum)
    random.shuffle(weight_feature)
    weight_feature = np.array(weight_feature, "float32")
    weight_feature_train, weight_feature_eval = weight_feature[:TRAIN_EVAL_THRESOLD], weight_feature[
        TRAIN_EVAL_THRESOLD:]
    del weight_feature

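    # NOTE: position_feature is not re-shuffled with the other features; this is only safe if every sequence has the same length.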
    position_feature = np.array(position_feature)
    position_feature_train, position_feature_eval = position_feature[:TRAIN_EVAL_THRESOLD], position_feature[
        TRAIN_EVAL_THRESOLD:]
    del position_feature

    random.seed(randnum)
    random.shuffle(y)
    y = np.array(y)
    y_train, y_eval = y[:TRAIN_EVAL_THRESOLD], y[TRAIN_EVAL_THRESOLD:]
    del y

    # input_fn
    train_input_x = {
        "first_category": first_category_feature_train,
        "second_category": second_category_feature_train,
        "tag": tag_feature_feature_train,
        "media": media_category_feature_train,
        "rate_discretize": rate_discretize_feature_train,
        "weight": weight_feature_train,
        "position": position_feature_train
    }
    eval_input_x = {
        "first_category": first_category_feature_eval,
        "second_category": second_category_feature_eval,
        "tag": tag_feature_feature_eval,
        "media": media_category_feature_eval,
        "rate_discretize": rate_discretize_feature_eval,
        "weight": weight_feature_eval,
        "position": position_feature_eval
    }

    return train_input_x, eval_input_x, y_train, y_eval
Example 22
def get_train_and_eval_data_method1(file_name, save_flag=False):
    # Negative-sampling rule: the target's first-level category must not appear among the first-level categories of the user's behavior sequence
    video_dict = video_meta_to_id(video_meta_path)
    print("negetive samples size: ", len(video_dict["first_category"]),
          len(video_dict["second_category"]), len(video_dict["media"]),
          len(video_dict["tag"]))

    data = cjdpy.load_csv(file_name)
    print("origin data size: ", len(data))

    first_category_feature = []
    second_category_feature = []
    tag_feature = []
    media_category_feature = []
    rate_discretize_feature = []
    weight_feature = []
    position_feature = []
    y = []

    for line in data:
        pos_first_category = list(map(eval, line[0].split(" ")))
        pos_second_category = list(map(eval, line[1].split(" ")))
        pos_media = list(map(eval, line[2].split(" ")))
        pos_vid_tag_list = []
        for vid_tag in line[3].split(" "):
            pos_vid_tag_list.append(list(map(eval, vid_tag.split("#"))))
        pos_rate_discretize = list(map(eval, line[4].split(" ")))
        if pos_rate_discretize[-1] == 0: continue

        first_category_feature.append(pos_first_category)
        second_category_feature.append(pos_second_category)
        media_category_feature.append(pos_media)
        tag_feature.append(pos_vid_tag_list)
        rate_discretize_feature.append(pos_rate_discretize)
        y.append(1)
        weight_feature.append(pos_sample_weight)
        position_feature.append([i for i in range(len(pos_first_category))])

        pos_first_category_set = set(pos_first_category)
        neg_sample = 0
        while neg_sample < neg_sample_num:
            idx = random.randint(0, len(video_dict["media"]) - 1)
            if video_dict["first_category"][
                    idx] in pos_first_category_set:  # and neg_sample > neg_sample_num//2:  # allow some hard-to-distinguish negatives
                continue
            neg_sample += 1

            neg_first_category = pos_first_category.copy()
            neg_first_category[-1] = video_dict["first_category"][idx]
            neg_second_category = pos_second_category.copy()
            neg_second_category[-1] = video_dict["second_category"][idx]
            neg_media = pos_media.copy()
            neg_media[-1] = video_dict["media"][idx]
            neg_vid_tag_list = pos_vid_tag_list.copy()
            neg_vid_tag_list[-1] = video_dict["tag"][idx]
            neg_rate_discretize = pos_rate_discretize.copy()
            neg_rate_discretize[-1] = 0

            first_category_feature.append(neg_first_category)
            second_category_feature.append(neg_second_category)
            media_category_feature.append(neg_media)
            tag_feature.append(neg_vid_tag_list)
            rate_discretize_feature.append(neg_rate_discretize)
            y.append(0)
            weight_feature.append(1 - pos_sample_weight)
            position_feature.append(
                [i for i in range(len(pos_first_category))])

    del video_dict, data

    def save_as_file(first_category_feature, second_category_feature,
                     tag_feature, media_category_feature):
        print("begin save file")
        data = []
        for i in range(len(first_category_feature)):
            item_str = " ".join(map(str, first_category_feature[i])) + "\t" + " ".join(map(str, second_category_feature[i])) + \
                        "\t" + " ".join(map(str, media_category_feature[i])) + "\t" + " ".join(["#".join(map(str, tag_list)) for tag_list in tag_feature[i]])
            data.append(item_str)
            print(item_str)
        cjdpy.save_lst(data, "new_dataset.txt")

    if save_flag:
        save_as_file(first_category_feature, second_category_feature,
                     tag_feature, media_category_feature)

    TRAIN_EVAL_THRESOLD = int(len(y) / 10 * 9)
    print("train and eval: ", TRAIN_EVAL_THRESOLD,
          len(y) - TRAIN_EVAL_THRESOLD)

    randnum = random.randint(0, 100)
    random.seed(randnum)
    random.shuffle(first_category_feature)
    first_category_feature = np.array(first_category_feature)
    first_category_feature_train, first_category_feature_eval = first_category_feature[:TRAIN_EVAL_THRESOLD], first_category_feature[
        TRAIN_EVAL_THRESOLD:]
    del first_category_feature

    random.seed(randnum)
    random.shuffle(second_category_feature)
    second_category_feature = np.array(second_category_feature)
    second_category_feature_train, second_category_feature_eval = second_category_feature[:TRAIN_EVAL_THRESOLD], second_category_feature[
        TRAIN_EVAL_THRESOLD:]
    del second_category_feature

    random.seed(randnum)
    random.shuffle(tag_feature)
    tag_feature = np.array(tag_feature)
    tag_feature_feature_train, tag_feature_feature_eval = tag_feature[:TRAIN_EVAL_THRESOLD], tag_feature[
        TRAIN_EVAL_THRESOLD:]
    del tag_feature

    random.seed(randnum)
    random.shuffle(media_category_feature)
    media_category_feature = np.array(media_category_feature)
    media_category_feature_train, media_category_feature_eval = media_category_feature[:TRAIN_EVAL_THRESOLD], media_category_feature[
        TRAIN_EVAL_THRESOLD:]
    del media_category_feature

    random.seed(randnum)
    random.shuffle(rate_discretize_feature)
    rate_discretize_feature = np.array(rate_discretize_feature)
    rate_discretize_feature_train, rate_discretize_feature_eval = rate_discretize_feature[:TRAIN_EVAL_THRESOLD], rate_discretize_feature[
        TRAIN_EVAL_THRESOLD:]
    del rate_discretize_feature

    random.seed(randnum)
    random.shuffle(weight_feature)
    weight_feature = np.array(weight_feature, "float32")
    weight_feature_train, weight_feature_eval = weight_feature[:TRAIN_EVAL_THRESOLD], weight_feature[
        TRAIN_EVAL_THRESOLD:]
    del weight_feature

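    # NOTE: as above, position_feature is not re-shuffled; this assumes all sequences share the same length.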
    position_feature = np.array(position_feature)
    position_feature_train, position_feature_eval = position_feature[:TRAIN_EVAL_THRESOLD], position_feature[
        TRAIN_EVAL_THRESOLD:]
    del position_feature

    random.seed(randnum)
    random.shuffle(y)
    y = np.array(y)
    y_train, y_eval = y[:TRAIN_EVAL_THRESOLD], y[TRAIN_EVAL_THRESOLD:]
    del y

    # input_fn
    train_input_x = {
        "first_category": first_category_feature_train,
        "second_category": second_category_feature_train,
        "tag": tag_feature_feature_train,
        "media": media_category_feature_train,
        "rate_discretize": rate_discretize_feature_train,
        "weight": weight_feature_train,
        "position": position_feature_train
    }
    eval_input_x = {
        "first_category": first_category_feature_eval,
        "second_category": second_category_feature_eval,
        "tag": tag_feature_feature_eval,
        "media": media_category_feature_eval,
        "rate_discretize": rate_discretize_feature_eval,
        "weight": weight_feature_eval,
        "position": position_feature_eval
    }

    return train_input_x, eval_input_x, y_train, y_eval
Example 23
def load_vocab(file_name):
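    # Load a two-column vocabulary file into word->id and id->word dictionaries.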
    data = cjdpy.load_csv(file_name)
    w2id = {line[0]: int(line[1]) for line in data if len(line) == 2}
    id2w = {int(line[1]): line[0] for line in data if len(line) == 2}
    return w2id, id2w
Example 24
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.4
set_session(tf.Session(config=config))

# parameters
embedding_dims = 200
batch_size = 32
epochs = 20
path_windows = r'\\10.141.208.22\data\Chinese_isA\corpus\wikicorpus_seg.txt'
path_linux = 'wikicorpus_seg.txt'

# dataset
texts, vocabulary = [], []

# data = cjdpy.load_csv(path_windows)
data = cjdpy.load_csv(path_linux)

for i, item in enumerate(data):
    if i > 50000: break
    vocabulary += list(item)
    texts.append(item)

vocabulary = set(vocabulary)
vocab_size = len(vocabulary)
print('vocabulary size: ', vocab_size)

word2id, id2word = {}, {}
for idx, word in enumerate(vocabulary):
    word2id[word] = idx
    id2word[idx] = word