def wordCount():
    data = cjdpy.load_csv(train_file)
    X = [list(jieba.cut(item[0])) for item in data]
    vocab_list = [word for text in X for word in text]
    counter = Counter(vocab_list)
    counter = cjdpy.sort_list_by_freq(counter)
    cjdpy.save_csv(counter, vocab_file)
def conCount():
    data = cjdpy.load_csv(train_file)
    vocab_list = [con for item in data if len(item) == 4
                  for con in item[3].split('|||')]
    counter = Counter(vocab_list)
    counter = cjdpy.sort_list_by_freq(counter)
    cjdpy.save_csv(counter, con_file)
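# The cjdpy module is used throughout but never shown. Below is a minimal
# sketch of its I/O helpers with signatures inferred from the call sites;
# this is an assumption, not the real implementation.
def load_csv(path, sep='\t'):
    # One record per line, fields separated by `sep` (tab by default).
    with open(path, encoding='utf-8') as f:
        return [line.rstrip('\n').split(sep) for line in f]

def save_csv(rows, path, sep='\t'):
    with open(path, 'w', encoding='utf-8') as f:
        for row in rows:
            f.write(sep.join(map(str, row)) + '\n')

def load_list(path):
    with open(path, encoding='utf-8') as f:
        return [line.rstrip('\n') for line in f]

def save_lst(lines, path):
    with open(path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(map(str, lines)))

def sort_list_by_freq(counter):
    # Counter -> [(item, count), ...] sorted by descending frequency.
    return sorted(counter.items(), key=lambda kv: kv[1], reverse=True)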
def get_vid_embedding():
    # Compute embeddings offline for the videos in kandian_today.
    print("get vid embedding begin")
    kandian_today = video_meta_to_id("../data/kandian_today.json")
    pred_data = cjdpy.load_csv("../data/pred.txt")
    # Any behavior sequence will do here; we only need the item-side features.
    input_x, _ = get_predict_data_serving(pred_data, 1, kandian_today)
    kandian = cjdpy.load_list("../data/kandian_today.json")
    kandian_str = []
    for line in kandian:
        line = json.loads(line)
        kandian_str.append(" ".join([
            line["first_category"], line["second_category"],
            line["tags"], line["media_name"]
        ]))
    idx_rate = []
    cmsid_embed = []
    cmsid_set = set()
    cmsid_list = []
    bad_case = 0
    for i in range(len(input_x["first_category"])):
        dict_data = {
            "instances": [{
                "first_category": input_x["first_category"][i],
                "second_category": input_x["second_category"][i],
                "tag": input_x["tag"][i],
                "media": input_x["media"][i],
                "rate_discretize": input_x["rate_discretize"][i],
                "position": input_x["position"][i]
            }]
        }
        try:
            resp = requests.post('http://localhost:8515/v1/models/ttm:predict',
                                 json=dict_data)
            res = json.loads(resp.text)
            idx_rate.append([i] + res["predictions"][0]["ie"])
            cmsid_val = json.loads(kandian[i])["cmsid"]
            if cmsid_val in cmsid_set:
                continue
            cmsid_embed.append(res["predictions"][0]["ie"])
            cmsid_set.add(cmsid_val)
            cmsid_list.append(cmsid_val)
        except Exception:
            # Request failed or the response had no "predictions" field.
            bad_case += 1
        if i % 5000 == 0:
            print("process", i)
    print("#fail to request tf serving", bad_case)
    cjdpy.save_csv(idx_rate, "ie.txt")
    cjdpy.save_csv(cmsid_embed, "cmsid_embedding.txt", " ")
    cjdpy.save_lst(cmsid_list, "cmsid.txt")
    print("get vid embedding done")
def make_label(file):
    global id2label, label2id
    data = cjdpy.load_csv(file)
    label = [item[1] for item in data if len(item) == 4]
    id2label = sorted(set(label))  # sort for a reproducible label order
    label2id = {y: x for x, y in enumerate(id2label)}
    return id2label, label2id
def visualize_dataset(file_name):
    _, id2first_category = load_vocab("../vocab/first_category_vocab.txt")
    _, id2second_category = load_vocab("../vocab/second_category_vocab.txt")
    _, id2media = load_vocab("../vocab/media_vocab.txt")
    _, id2tag = load_vocab("../vocab/tag_vocab.txt")
    data = cjdpy.load_csv(file_name)
    res = []
    for line in data[:5000]:
        first_category = [id2first_category[int(id)] for id in line[0].split(" ")]
        second_category = [id2second_category[int(id)] for id in line[1].split(" ")]
        media = [id2media[int(id)] for id in line[2].split(" ")]
        tag = []
        for tag_list in line[3].split(" "):
            tag_list = [id2tag[int(id)] for id in tag_list.split("#")]
            tag.append("#".join(tag_list))
        res_one = []
        rate_discretize = line[4].split(" ")
        for i in range(len(first_category)):
            res_one.append("\t".join([
                first_category[i], second_category[i], media[i], tag[i],
                str(rate_discretize[i])
            ]))
        res.append("\n".join(res_one))
    return res
def make_con_vocab():
    global id2con, con2id
    vocab_list = cjdpy.load_csv(con_file)
    cons = [vocab_list[i][0] for i in range(con_size)]
    if 'PAD' in cons:  # drop the placeholder before adding the real specials
        cons.remove('PAD')
    id2con = ['<PAD>', '<UNK>'] + cons
    con2id = {y: x for x, y in enumerate(id2con)}
    return id2con, con2id
def train_test_data():
    data = cjdpy.load_csv('data/topic_classification/text.con.txt')
    train_file = 'data/topic_classification/train.txt'
    test_file = 'data/topic_classification/test.txt'
    random.shuffle(data)
    test_size = 10000  # hold out 10k samples for testing
    cjdpy.save_csv(data[:test_size], test_file)
    cjdpy.save_csv(data[test_size:], train_file)
def load_data(file):
    data = cjdpy.load_csv(file)
    X, y = [], []
    for item in data:
        X.append(list(jieba.cut(item[0])))
        y.append(item[1])
    X = bag_of_words(X)
    return X, y
def make_vocab():
    global id2w, w2id
    vocab_list = cjdpy.load_csv(vocab_file)
    # tip: set the vocabulary size according to the dataset
    id2w = ['<PAD>', '<UNK>'] + [x[0] for x in vocab_list[:vocab_size]]
    w2id = {y: x for x, y in enumerate(id2w)}
    return id2w, w2id
def word_count():
    data = cjdpy.load_csv(train_file)
    X = []
    for item in data:
        X.append(list(jieba.cut(item[0])))
    vocab_list = [word for text in X for word in text]
    counter = Counter(vocab_list)
    counter = cjdpy.sort_list_by_freq(counter)
    cjdpy.save_csv(counter, vocab_file)
def cal_diversity_metric():
    pred_data_vis = visualize_dataset("../data/pred.txt")
    pred_data = cjdpy.load_csv("../data/pred.txt")
    kandian_today_id = video_meta_to_id("../data/kandian_today.json")
    kandian_today = cjdpy.load_list("../data/kandian_today.json")
    ie = cjdpy.load_csv("ie.txt")
    test_sample_num = 10
    pred_idx = [i * 30 for i in range(test_sample_num)]
    fc_cnt, sc_cnt, media_cnt, tag_cnt = 0, 0, 0, 0
    total_eval_video = 0
    for num in pred_idx:
        print(pred_data_vis[num])
        # Collect the attributes already seen in the user's behavior sequence.
        fc_set, sc_set, media_set, tag_set = set(), set(), set(), set()
        for line in pred_data_vis[num].split("\n"):
            items = line.split("\t")
            fc_set.add(items[0])
            sc_set.add(items[1])
            media_set.add(items[2])
            tag_set = tag_set | set(items[3].split("#"))
        input_x, _ = get_predict_data_serving(pred_data, num, kandian_today_id)
        topK = faiss_retrieve(input_x, kandian_today, ie)
        total_eval_video += len(topK)
        # Count retrieved videos whose attributes are new to the user.
        for line in topK:
            items = line.split("\t")
            if items[0] not in fc_set:
                fc_cnt += 1
            if items[1] not in sc_set:
                sc_cnt += 1
            if items[2] not in media_set:
                media_cnt += 1
            for tag in items[3].split("|"):
                if tag not in tag_set:
                    tag_cnt += 1
        print()
    print("first_category", test_sample_num * fc_cnt / total_eval_video)
    print("second_category", test_sample_num * sc_cnt / total_eval_video)
    print("media", test_sample_num * media_cnt / total_eval_video)
    print("tag", test_sample_num * tag_cnt / total_eval_video)
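# faiss_retrieve is called above but not defined in this file. Below is a
# simplified sketch of the likely retrieval step (the helper name suffix and
# the reduced signature are ours): search the candidate video embeddings by
# inner product and format the top-k hits like the visualized rows. Assumes
# numpy/json are imported as elsewhere in the file.
import faiss

def faiss_retrieve_sketch(query_embedding, kandian_today, cand_embeddings, k=30):
    xb = np.asarray(cand_embeddings, dtype='float32')
    index = faiss.IndexFlatIP(xb.shape[1])          # inner-product index
    index.add(xb)
    xq = np.asarray([query_embedding], dtype='float32')
    _, ids = index.search(xq, k)
    topk = []
    for idx in ids[0]:
        meta = json.loads(kandian_today[idx])
        topk.append("\t".join([meta["first_category"], meta["second_category"],
                               meta["media_name"], meta["tags"]]))
    return topk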
def pre_trained_embedding():
    # Rows default to zeros for words missing from the pre-trained vectors.
    # embedding_matrix = np.random.normal(size=(vocab_size + 2, 50))
    embedding_matrix = np.zeros((vocab_size + 2, 50))
    data = cjdpy.load_csv('data/vectors_word.txt', ' ')
    w2v = {
        item[0]: np.array(list(map(float, item[1:])))
        for item in data[1:]
    }
    for i in range(vocab_size + 2):
        emb = w2v.get(id2w[i])
        if emb is not None:
            embedding_matrix[i, :] = emb
        else:
            print(id2w[i])  # word not covered by the pre-trained vectors
    return embedding_matrix
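# How the matrix would typically be plugged into a Keras Embedding layer.
# This is an illustrative sketch; the actual model definition lives elsewhere
# in the repo.
from keras.layers import Embedding

embedding_layer = Embedding(input_dim=vocab_size + 2, output_dim=50,
                            weights=[pre_trained_embedding()],
                            trainable=False)  # or True to fine-tune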
def get_concept():
    data = cjdpy.load_csv('st_ent.txt')
    res = []
    for i, (text, label, ents) in enumerate(data):
        if i % 500 == 0:
            print('processing %d...' % i)
        cons = []
        for item in ents.split('|||'):
            url = ('http://shuyantech.com/api/cnprobase/concept?q=%s&apikey=%s'
                   % (item, 'ljqljqljq'))
            response = json.loads(requests.get(url).text)
            cons += [con[0] for con in response.get('ret', [])]
        if len(cons) == 0:
            cons = ['PAD']
        res.append([text, label, ents, '|||'.join(cons)])
    cjdpy.save_csv(res, 'text.con.txt')
def get_predict_data(file_name):
    data = cjdpy.load_csv(file_name)
    print("len(data): ", len(data))
    first_category_feature = []
    second_category_feature = []
    tag_feature = []
    media_category_feature = []
    rate_discretize_feature = []
    position_feature = []
    y = []
    for line in data:
        first_category_feature.append(list(map(eval, line[0].split(" "))))
        second_category_feature.append(list(map(eval, line[1].split(" "))))
        media_category_feature.append(list(map(eval, line[2].split(" "))))
        vid_tag_list = []
        for vid_tag in line[3].split(" "):
            vid_tag_list.append(list(map(eval, vid_tag.split("#"))))
        tag_feature.append(vid_tag_list)
        rate_discretize_feature.append(list(map(eval, line[4].split(" "))))
        # The last watch-rate bucket decides the binary label (4 = positive).
        if rate_discretize_feature[-1][-1] == 4:
            y.append(1)
        else:
            y.append(0)
        position_feature.append(list(range(len(first_category_feature[-1]))))
    first_category_feature = np.array(first_category_feature)
    second_category_feature = np.array(second_category_feature)
    tag_feature = np.array(tag_feature)
    media_category_feature = np.array(media_category_feature)
    rate_discretize_feature = np.array(rate_discretize_feature)
    position_feature = np.array(position_feature)
    y = np.array(y)
    # input_fn
    input_x = {
        "first_category": first_category_feature,
        "second_category": second_category_feature,
        "tag": tag_feature,
        "media": media_category_feature,
        "rate_discretize": rate_discretize_feature,
        "position": position_feature
    }
    return input_x, y
def load_data(file):
    data = cjdpy.load_csv(file)
    X, y = [], []
    for text, label, ent, con in data:
        try:
            words = list(jieba.cut(text))
            cons = con.split('|||')
            # Word ids followed by prior-concept ids.
            X.append(Tokens2Intlist(w2id, words, maxSeqLen).tolist() +
                     Tokens2Intlist(con2id, cons, maxConLen).tolist())
            y.append(label2id[label])
        except Exception:
            import traceback
            traceback.print_exc()
    # sample imbalance problem
    # class_weight = Counter([item[1] for item in data])
    # class_weight = {label2id[key]: val / len(y) for key, val in class_weight.items()}
    return X, y
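# Tokens2Intlist is referenced above but not defined in this file. A plausible
# sketch consistent with its call sites (it must return a fixed-length numpy
# array of ids); padding with <PAD>=0 and mapping OOV to <UNK>=1 follows the
# vocab layout built in make_vocab/make_con_vocab.
def Tokens2Intlist(token2id, tokens, max_len, pad_id=0, unk_id=1):
    ids = [token2id.get(t, unk_id) for t in tokens[:max_len]]
    ids += [pad_id] * (max_len - len(ids))   # right-pad to max_len
    return np.array(ids)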
def entity_linking(path):
    data = cjdpy.load_csv(path)
    fout = open('out1.txt', 'w', encoding='utf-8')
    for i, item in enumerate(data):
        if i <= 83429:  # resume after the last processed line
            continue
        if len(item) == 2:
            text, label = item[0], item[1]
        else:
            continue
        if i % 500 == 0:
            print('processing %d...' % i)
        url = ('http://shuyantech.com/api/entitylinking/cutsegment?q=%s&apikey=%s'
               % (text, 'ljqljqljq'))
        try:
            response = json.loads(requests.get(url).text)
        except Exception:
            print('entity linking fail: ', text)
            continue  # `response` would be undefined below
        ents = [item[1] for item in response.get('entities', [])]
        if len(ents) == 0:
            ents = ['PAD']
        fout.write(text + '\t' + label + '\t' + '|||'.join(ents) + '\n')
        fout.flush()
    fout.close()
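# The linking API occasionally fails or times out; a hedged retry wrapper
# (our helper, not part of the original code) that could replace the bare
# requests.get above:
import time

def get_json_with_retry(url, tries=3, backoff=1.0):
    for attempt in range(tries):
        try:
            return json.loads(requests.get(url, timeout=5).text)
        except (requests.RequestException, ValueError):
            time.sleep(backoff * (attempt + 1))   # linear back-off
    return {}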
def bag_of_words(X):
    # Function head reconstructed from the call site below; the original
    # fragment began inside the vectorization loop.
    X_vec = np.zeros((len(X), vocab_size))
    for i in range(len(X)):
        for token in X[i]:
            if token in w2id:  # `w2id.get(token)` would skip the word with id 0
                X_vec[i, w2id[token]] += 1
    return X_vec


def load_data(file):
    data = cjdpy.load_csv(file)
    X, y = [], []
    for item in data:
        X.append(list(jieba.cut(item[0])))
        y.append(item[1])
    X = bag_of_words(X)
    return X, y


vocab_list = cjdpy.load_csv(vocab_file)
id2w = [vocab_list[i][0] for i in range(vocab_size)]
w2id = {word: i for i, word in enumerate(id2w)}

X_train, y_train = load_data(train_file)
X_test, y_test = load_data(test_file)
print('training samples: ', len(X_train))
print('test samples: ', len(X_test))

clf = LinearSVC()
# clf = LogisticRegression()
clf.fit(X_train, y_train)


def evaluate(y_true, y_pred):
    print(classification_report_imbalanced(y_true, y_pred))
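# The script defines evaluate() but never calls it; the natural usage on the
# held-out split would be:
evaluate(y_test, clf.predict(X_test))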
def cal_map_metric():
    data = cjdpy.load_csv("../data/metric_data.txt")
    FUTURE_SEQ_LEN = 10
    HISTORY_SEQ_LEN = 25
    map_score, map_fc_score, map_sc_score, map_tag_score = 0, 0, 0, 0
    eval_sample_num = 100
    # columns: uin, first_category, second_category, media, tags, rate, rate_discretize
    for case, line in enumerate(data):
        first_category = list(map(eval, line[1].split(" ")))
        second_category = list(map(eval, line[2].split(" ")))
        media = list(map(eval, line[3].split(" ")))
        rate = list(map(eval, line[5].split(" ")))
        rate_discretize = list(map(eval, line[6].split(" ")))
        vid_tag_list = []
        for vid_tag in line[4].split(" "):
            vid_tag_list.append(list(map(eval, vid_tag.split("#"))))
        if len(first_category) < 35:
            continue
        # Ground-truth ranking of the future videos by watch rate.
        gt = [[i, rate[i + HISTORY_SEQ_LEN]] for i in range(FUTURE_SEQ_LEN)]
        gt = sorted(gt, key=lambda x: x[1], reverse=True)
        gt_rank = [gt[i][0] for i in range(FUTURE_SEQ_LEN)]
        gt_fc, gt_sc, gt_tag = {}, {}, {}
        for i in range(FUTURE_SEQ_LEN):
            if first_category[i] not in gt_fc:
                gt_fc[first_category[i]] = 0
            gt_fc[first_category[i]] += rate[i + HISTORY_SEQ_LEN]
            if second_category[i] not in gt_sc:
                gt_sc[second_category[i]] = 0
            gt_sc[second_category[i]] += rate[i + HISTORY_SEQ_LEN]
            for tag in vid_tag_list[i]:
                if tag not in gt_tag:
                    gt_tag[tag] = 0
                gt_tag[tag] += rate[i + HISTORY_SEQ_LEN]
        gt_fc = sorted(gt_fc.items(), key=lambda x: x[1], reverse=True)
        gt_fc_rank = [item[0] for item in gt_fc]
        gt_sc = sorted(gt_sc.items(), key=lambda x: x[1], reverse=True)
        gt_sc_rank = [item[0] for item in gt_sc]
        gt_tag = sorted(gt_tag.items(), key=lambda x: x[1], reverse=True)
        gt_tag_rank = [item[0] for item in gt_tag]
        # Predicted rate for each future position, queried from TF Serving.
        pred_rate = []
        for i in range(FUTURE_SEQ_LEN):
            dict_data = {
                "instances": [{
                    "first_category": first_category[i:i + 26],
                    "second_category": second_category[i:i + 26],
                    "tag": vid_tag_list[i:i + 26],
                    "media": media[i:i + 26],
                    "rate_discretize": rate_discretize[i:i + 26],
                    "position": list(range(26))
                }]
            }
            resp = requests.post('http://localhost:8515/v1/models/ttm:predict',
                                 json=dict_data)
            res = json.loads(resp.text)
            pred_rate.append([i, res["predictions"][0]["y"]])
        pd = sorted(pred_rate, key=lambda x: x[1], reverse=True)
        pd_rank = [pd[i][0] for i in range(FUTURE_SEQ_LEN)]
        map_score += cal_MAP(pd_rank, gt_rank)
        pd_fc, pd_sc, pd_tag = {}, {}, {}
        for i in range(FUTURE_SEQ_LEN):
            if first_category[i] not in pd_fc:
                pd_fc[first_category[i]] = 0
            pd_fc[first_category[i]] += pred_rate[i][1]
            if second_category[i] not in pd_sc:
                pd_sc[second_category[i]] = 0
            pd_sc[second_category[i]] += pred_rate[i][1]
            for tag in vid_tag_list[i]:
                if tag not in pd_tag:
                    pd_tag[tag] = 0
                pd_tag[tag] += pred_rate[i][1]
        pd_fc = sorted(pd_fc.items(), key=lambda x: x[1], reverse=True)
        pd_fc_rank = [item[0] for item in pd_fc]
        map_fc_score += cal_MAP(pd_fc_rank, gt_fc_rank)
        pd_sc = sorted(pd_sc.items(), key=lambda x: x[1], reverse=True)
        pd_sc_rank = [item[0] for item in pd_sc]
        map_sc_score += cal_MAP(pd_sc_rank, gt_sc_rank)
        pd_tag = sorted(pd_tag.items(), key=lambda x: x[1], reverse=True)
        pd_tag_rank = [item[0] for item in pd_tag]
        map_tag_score += cal_MAP(pd_tag_rank, gt_tag_rank)
        if case > eval_sample_num:
            break
    print("MAP for video score: ", map_score / eval_sample_num)
    print("MAP for video first category score: ", map_fc_score / eval_sample_num)
    print("MAP for video second category score: ", map_sc_score / eval_sample_num)
    print("MAP for video tag score: ", map_tag_score / eval_sample_num)
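# cal_MAP is not defined in this file. A standard mean-average-precision
# sketch consistent with how it is called above (two ranked id lists, with
# the ground-truth list treated as the relevant set):
def cal_MAP(pred_rank, gt_rank):
    gt_set = set(gt_rank)
    hits, precision_sum = 0, 0.0
    for i, item in enumerate(pred_rank):
        if item in gt_set:
            hits += 1
            precision_sum += hits / (i + 1)   # precision at this cut-off
    return precision_sum / max(len(gt_rank), 1)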
def get_feature(sent):
    # Extract 1-gram and 2-gram features from a sentence, Chinese chars only.
    chars = []
    for char in sent:
        if '\u4e00' <= char <= '\u9fff':  # CJK unified ideographs range
            chars.append(char)
    ngram = []
    for j in range(len(chars) - 1):
        ngram.append(chars[j])
        ngram.append(chars[j] + chars[j + 1])
    fname = ' '.join(ngram)
    return fname


import time

start = time.time()
data = cjdpy.load_csv("data/original_data")
random.shuffle(data)
res = []
for i in range(len(data)):
    res.append([get_feature(data[i][1]), "__label__" + data[i][0]])
# data format:
# 毛 毛毛 毛 毛虫 虫 虫的 的 的意 意 意见 __label__1
cjdpy.save_csv(res[:300000], "data/train.txt")
cjdpy.save_csv(res[300000:], "data/test.txt")

classifier = fasttext.train_supervised("data/train.txt")


def print_results(N, p, r):
    print("N\t" + str(N))
    print("P@{}\t{:.3f}".format(1, p))
    print("R@{}\t{:.3f}".format(1, r))
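# fasttext's test() returns (N, precision, recall), which matches the
# print_results signature above; `start` was set earlier but never used, so a
# timing line is a natural way to close the script.
print_results(*classifier.test("data/test.txt"))
print("took %.1fs" % (time.time() - start))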
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.4
set_session(tf.Session(config=config))

# parameters
embedding_dims = 300
batch_size = 32
epochs = 20
path_windows = r'\\10.141.208.22\data\Chinese_isA\corpus\wikicorpus_seg.txt'
path_linux = 'wikicorpus_seg.txt'

# dataset
texts, vocabulary = [], []
data = cjdpy.load_csv(path_windows)
# data = cjdpy.load_csv(path_linux)
for i, item in enumerate(data):
    if i > 10000:
        break
    vocabulary += list(item)
    texts.append(item)
vocabulary = set(vocabulary)
vocab_size = len(vocabulary)
word2id, id2word = {}, {}
for idx, word in enumerate(vocabulary):
    word2id[word] = idx
    id2word[idx] = word
text_seq = [word2id[word] for text in texts for word in text]
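# The script stops after building text_seq; a hedged continuation (not in the
# original) that turns it into (target, context) training pairs with Keras's
# skip-gram helper:
from keras.preprocessing.sequence import skipgrams

pairs, labels = skipgrams(text_seq, vocabulary_size=vocab_size,
                          window_size=4, negative_samples=1.0)
print('skip-gram pairs: ', len(pairs))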
def get_train_and_eval_data(file_name, save_flag=False):
    data = cjdpy.load_csv(file_name)
    print("origin data size: ", len(data))
    first_category_feature = []
    second_category_feature = []
    tag_feature = []
    media_category_feature = []
    rate_discretize_feature = []
    weight_feature = []
    position_feature = []
    y = []
    for line in data:
        pos_first_category = list(map(eval, line[0].split(" ")))
        pos_second_category = list(map(eval, line[1].split(" ")))
        pos_media = list(map(eval, line[2].split(" ")))
        pos_vid_tag_list = []
        for vid_tag in line[3].split(" "):
            pos_vid_tag_list.append(list(map(eval, vid_tag.split("#"))))
        pos_rate_discretize = list(map(eval, line[4].split(" ")))
        if pos_rate_discretize[-1] == 0:
            y.append(0)
            weight_feature.append(1 - pos_sample_weight)
        else:
            y.append(1)
            weight_feature.append(pos_sample_weight)
        first_category_feature.append(pos_first_category)
        second_category_feature.append(pos_second_category)
        media_category_feature.append(pos_media)
        tag_feature.append(pos_vid_tag_list)
        rate_discretize_feature.append(pos_rate_discretize)
        position_feature.append(list(range(len(pos_first_category))))
    del data

    def save_as_file(first_category_feature, second_category_feature,
                     tag_feature, media_category_feature):
        print("begin save file")
        data = []
        for i in range(len(first_category_feature)):
            item_str = "\t".join([
                " ".join(map(str, first_category_feature[i])),
                " ".join(map(str, second_category_feature[i])),
                " ".join(map(str, media_category_feature[i])),
                " ".join(["#".join(map(str, tag_list))
                          for tag_list in tag_feature[i]])
            ])
            data.append(item_str)
        cjdpy.save_lst(data, "new_dataset.txt")

    if save_flag:
        save_as_file(first_category_feature, second_category_feature,
                     tag_feature, media_category_feature)

    TRAIN_EVAL_THRESHOLD = int(len(y) / 10 * 9)  # 90/10 train/eval split
    print("train and eval: ", TRAIN_EVAL_THRESHOLD,
          len(y) - TRAIN_EVAL_THRESHOLD)
    # Shuffle every feature list with the same seed so rows stay aligned.
    randnum = random.randint(0, 100)
    random.seed(randnum)
    random.shuffle(first_category_feature)
    first_category_feature = np.array(first_category_feature)
    first_category_feature_train = first_category_feature[:TRAIN_EVAL_THRESHOLD]
    first_category_feature_eval = first_category_feature[TRAIN_EVAL_THRESHOLD:]
    del first_category_feature
    random.seed(randnum)
    random.shuffle(second_category_feature)
    second_category_feature = np.array(second_category_feature)
    second_category_feature_train = second_category_feature[:TRAIN_EVAL_THRESHOLD]
    second_category_feature_eval = second_category_feature[TRAIN_EVAL_THRESHOLD:]
    del second_category_feature
    random.seed(randnum)
    random.shuffle(tag_feature)
    tag_feature = np.array(tag_feature)
    tag_feature_train = tag_feature[:TRAIN_EVAL_THRESHOLD]
    tag_feature_eval = tag_feature[TRAIN_EVAL_THRESHOLD:]
    del tag_feature
    random.seed(randnum)
    random.shuffle(media_category_feature)
    media_category_feature = np.array(media_category_feature)
    media_category_feature_train = media_category_feature[:TRAIN_EVAL_THRESHOLD]
    media_category_feature_eval = media_category_feature[TRAIN_EVAL_THRESHOLD:]
    del media_category_feature
    random.seed(randnum)
    random.shuffle(rate_discretize_feature)
    rate_discretize_feature = np.array(rate_discretize_feature)
    rate_discretize_feature_train = rate_discretize_feature[:TRAIN_EVAL_THRESHOLD]
    rate_discretize_feature_eval = rate_discretize_feature[TRAIN_EVAL_THRESHOLD:]
    del rate_discretize_feature
    random.seed(randnum)
    random.shuffle(weight_feature)
    weight_feature = np.array(weight_feature, "float32")
    weight_feature_train = weight_feature[:TRAIN_EVAL_THRESHOLD]
    weight_feature_eval = weight_feature[TRAIN_EVAL_THRESHOLD:]
    del weight_feature
    random.seed(randnum)
    random.shuffle(position_feature)  # was missing; keeps positions aligned
    position_feature = np.array(position_feature)
    position_feature_train = position_feature[:TRAIN_EVAL_THRESHOLD]
    position_feature_eval = position_feature[TRAIN_EVAL_THRESHOLD:]
    del position_feature
    random.seed(randnum)
    random.shuffle(y)
    y = np.array(y)
    y_train, y_eval = y[:TRAIN_EVAL_THRESHOLD], y[TRAIN_EVAL_THRESHOLD:]
    del y
    # input_fn
    train_input_x = {
        "first_category": first_category_feature_train,
        "second_category": second_category_feature_train,
        "tag": tag_feature_train,
        "media": media_category_feature_train,
        "rate_discretize": rate_discretize_feature_train,
        "weight": weight_feature_train,
        "position": position_feature_train
    }
    eval_input_x = {
        "first_category": first_category_feature_eval,
        "second_category": second_category_feature_eval,
        "tag": tag_feature_eval,
        "media": media_category_feature_eval,
        "rate_discretize": rate_discretize_feature_eval,
        "weight": weight_feature_eval,
        "position": position_feature_eval
    }
    return train_input_x, eval_input_x, y_train, y_eval
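# A tidier, equivalent alternative to the repeated seed/shuffle blocks above
# (sketch only, not used by the code): permute one index array and reuse it.
# perm = np.random.permutation(len(y))
# first_category_feature = np.array(first_category_feature)[perm]
# ...apply the same perm to every other feature array and to y...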
def get_train_and_eval_data_method1(file_name, save_flag=False):
    # Negative sampling rule: the target video's first-level category must
    # not appear among the first-level categories in the user's behavior
    # sequence.
    video_dict = video_meta_to_id(video_meta_path)
    print("negative samples size: ", len(video_dict["first_category"]),
          len(video_dict["second_category"]), len(video_dict["media"]),
          len(video_dict["tag"]))
    data = cjdpy.load_csv(file_name)
    print("origin data size: ", len(data))
    first_category_feature = []
    second_category_feature = []
    tag_feature = []
    media_category_feature = []
    rate_discretize_feature = []
    weight_feature = []
    position_feature = []
    y = []
    for line in data:
        pos_first_category = list(map(eval, line[0].split(" ")))
        pos_second_category = list(map(eval, line[1].split(" ")))
        pos_media = list(map(eval, line[2].split(" ")))
        pos_vid_tag_list = []
        for vid_tag in line[3].split(" "):
            pos_vid_tag_list.append(list(map(eval, vid_tag.split("#"))))
        pos_rate_discretize = list(map(eval, line[4].split(" ")))
        if pos_rate_discretize[-1] == 0:
            continue
        first_category_feature.append(pos_first_category)
        second_category_feature.append(pos_second_category)
        media_category_feature.append(pos_media)
        tag_feature.append(pos_vid_tag_list)
        rate_discretize_feature.append(pos_rate_discretize)
        y.append(1)
        weight_feature.append(pos_sample_weight)
        position_feature.append(list(range(len(pos_first_category))))
        pos_first_category_set = set(pos_first_category)
        neg_sample = 0
        while neg_sample < neg_sample_num:
            idx = random.randint(0, len(video_dict["media"]) - 1)
            # Skip candidates whose first-level category the user has already
            # seen; relaxing this for part of the samples would produce some
            # harder-to-distinguish negatives.
            if video_dict["first_category"][idx] in pos_first_category_set:
                continue
            neg_sample += 1
            neg_first_category = pos_first_category.copy()
            neg_first_category[-1] = video_dict["first_category"][idx]
            neg_second_category = pos_second_category.copy()
            neg_second_category[-1] = video_dict["second_category"][idx]
            neg_media = pos_media.copy()
            neg_media[-1] = video_dict["media"][idx]
            neg_vid_tag_list = pos_vid_tag_list.copy()
            neg_vid_tag_list[-1] = video_dict["tag"][idx]
            neg_rate_discretize = pos_rate_discretize.copy()
            neg_rate_discretize[-1] = 0
            first_category_feature.append(neg_first_category)
            second_category_feature.append(neg_second_category)
            media_category_feature.append(neg_media)
            tag_feature.append(neg_vid_tag_list)
            rate_discretize_feature.append(neg_rate_discretize)
            y.append(0)
            weight_feature.append(1 - pos_sample_weight)
            position_feature.append(list(range(len(pos_first_category))))
    del video_dict, data

    def save_as_file(first_category_feature, second_category_feature,
                     tag_feature, media_category_feature):
        print("begin save file")
        data = []
        for i in range(len(first_category_feature)):
            item_str = "\t".join([
                " ".join(map(str, first_category_feature[i])),
                " ".join(map(str, second_category_feature[i])),
                " ".join(map(str, media_category_feature[i])),
                " ".join(["#".join(map(str, tag_list))
                          for tag_list in tag_feature[i]])
            ])
            data.append(item_str)
        cjdpy.save_lst(data, "new_dataset.txt")

    if save_flag:
        save_as_file(first_category_feature, second_category_feature,
                     tag_feature, media_category_feature)

    TRAIN_EVAL_THRESHOLD = int(len(y) / 10 * 9)  # 90/10 train/eval split
    print("train and eval: ", TRAIN_EVAL_THRESHOLD,
          len(y) - TRAIN_EVAL_THRESHOLD)
    # Shuffle every feature list with the same seed so rows stay aligned.
    randnum = random.randint(0, 100)
    random.seed(randnum)
    random.shuffle(first_category_feature)
    first_category_feature = np.array(first_category_feature)
    first_category_feature_train = first_category_feature[:TRAIN_EVAL_THRESHOLD]
    first_category_feature_eval = first_category_feature[TRAIN_EVAL_THRESHOLD:]
    del first_category_feature
    random.seed(randnum)
    random.shuffle(second_category_feature)
    second_category_feature = np.array(second_category_feature)
    second_category_feature_train = second_category_feature[:TRAIN_EVAL_THRESHOLD]
    second_category_feature_eval = second_category_feature[TRAIN_EVAL_THRESHOLD:]
    del second_category_feature
    random.seed(randnum)
    random.shuffle(tag_feature)
    tag_feature = np.array(tag_feature)
    tag_feature_train = tag_feature[:TRAIN_EVAL_THRESHOLD]
    tag_feature_eval = tag_feature[TRAIN_EVAL_THRESHOLD:]
    del tag_feature
    random.seed(randnum)
    random.shuffle(media_category_feature)
    media_category_feature = np.array(media_category_feature)
    media_category_feature_train = media_category_feature[:TRAIN_EVAL_THRESHOLD]
    media_category_feature_eval = media_category_feature[TRAIN_EVAL_THRESHOLD:]
    del media_category_feature
    random.seed(randnum)
    random.shuffle(rate_discretize_feature)
    rate_discretize_feature = np.array(rate_discretize_feature)
    rate_discretize_feature_train = rate_discretize_feature[:TRAIN_EVAL_THRESHOLD]
    rate_discretize_feature_eval = rate_discretize_feature[TRAIN_EVAL_THRESHOLD:]
    del rate_discretize_feature
    random.seed(randnum)
    random.shuffle(weight_feature)
    weight_feature = np.array(weight_feature, "float32")
    weight_feature_train = weight_feature[:TRAIN_EVAL_THRESHOLD]
    weight_feature_eval = weight_feature[TRAIN_EVAL_THRESHOLD:]
    del weight_feature
    random.seed(randnum)
    random.shuffle(position_feature)  # was missing; keeps positions aligned
    position_feature = np.array(position_feature)
    position_feature_train = position_feature[:TRAIN_EVAL_THRESHOLD]
    position_feature_eval = position_feature[TRAIN_EVAL_THRESHOLD:]
    del position_feature
    random.seed(randnum)
    random.shuffle(y)
    y = np.array(y)
    y_train, y_eval = y[:TRAIN_EVAL_THRESHOLD], y[TRAIN_EVAL_THRESHOLD:]
    del y
    # input_fn
    train_input_x = {
        "first_category": first_category_feature_train,
        "second_category": second_category_feature_train,
        "tag": tag_feature_train,
        "media": media_category_feature_train,
        "rate_discretize": rate_discretize_feature_train,
        "weight": weight_feature_train,
        "position": position_feature_train
    }
    eval_input_x = {
        "first_category": first_category_feature_eval,
        "second_category": second_category_feature_eval,
        "tag": tag_feature_eval,
        "media": media_category_feature_eval,
        "rate_discretize": rate_discretize_feature_eval,
        "weight": weight_feature_eval,
        "position": position_feature_eval
    }
    return train_input_x, eval_input_x, y_train, y_eval
def load_vocab(file_name):
    data = cjdpy.load_csv(file_name)
    w2id = {line[0]: int(line[1]) for line in data if len(line) == 2}
    id2w = {int(line[1]): line[0] for line in data if len(line) == 2}
    return w2id, id2w
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.4
set_session(tf.Session(config=config))

# parameters
embedding_dims = 200
batch_size = 32
epochs = 20
path_windows = r'\\10.141.208.22\data\Chinese_isA\corpus\wikicorpus_seg.txt'
path_linux = 'wikicorpus_seg.txt'

# dataset
texts, vocabulary = [], []
# data = cjdpy.load_csv(path_windows)
data = cjdpy.load_csv(path_linux)
for i, item in enumerate(data):
    if i > 50000:
        break
    vocabulary += list(item)
    texts.append(item)
vocabulary = set(vocabulary)
vocab_size = len(vocabulary)
print('vocabulary size: ', vocab_size)
word2id, id2word = {}, {}
for idx, word in enumerate(vocabulary):
    word2id[word] = idx
    id2word[idx] = word