def conCount():
    """Count concept frequencies in the training file and save them sorted by frequency."""
    rows = cjdpy.load_csv(train_file)
    # Concepts sit in column 3, '|||'-separated; skip rows with a different width.
    concepts = []
    for row in rows:
        if len(row) == 4:
            concepts.extend(row[3].split('|||'))
    freq = cjdpy.sort_list_by_freq(Counter(concepts))
    cjdpy.save_csv(freq, con_file)
def get_vid_embedding():
    """Offline-compute embeddings for the videos in kandian_today.

    Sends each candidate's features to a local TF Serving 'ttm' model and
    collects (a) one embedding row per input index ("ie.txt") and (b) one
    embedding per unique cmsid ("cmsid_embedding.txt" / "cmsid.txt").
    Request failures are counted and skipped (best effort).
    """
    print("get vid embedding begin")
    kandian_today = video_meta_to_id("../data/kandian_today.json")
    pred_data = cjdpy.load_csv("../data/pred.txt")
    # Any behavior sequence works here; only the candidate features matter.
    input_x, _ = get_predict_data_serving(pred_data, 1, kandian_today)
    kandian = cjdpy.load_list("../data/kandian_today.json")
    # NOTE(review): the original also built a `kandian_str` list of joined
    # category/tag/media strings here, but never used it — removed as dead code.

    idx_rate = []
    cmsid_embed = []
    cmsid_set = set()
    cmsid_list = []
    bad_case = 0
    for i in range(len(input_x["first_category"])):
        dict_data = {
            "instances": [{
                "first_category": input_x["first_category"][i],
                "second_category": input_x["second_category"][i],
                "tag": input_x["tag"][i],
                "media": input_x["media"][i],
                "rate_discretize": input_x["rate_discretize"][i],
                "position": input_x["position"][i]
            }]
        }
        try:
            # timeout so a wedged serving process cannot hang the whole job;
            # a timeout is caught below like any other per-item failure.
            resp = requests.post('http://localhost:8515/v1/models/ttm:predict',
                                 json=dict_data, timeout=10)
            res = json.loads(resp.text)
            embedding = res["predictions"][0]["ie"]
            idx_rate.append([i] + embedding)
            cmsid_val = json.loads(kandian[i])["cmsid"]
            # Deduplicate: keep the first embedding seen for each cmsid.
            if cmsid_val not in cmsid_set:
                cmsid_embed.append(embedding)
                cmsid_set.add(cmsid_val)
                cmsid_list.append(cmsid_val)
        except Exception:
            # Covers request errors and a response without "predictions".
            # Was a bare `except:`, which also swallowed KeyboardInterrupt.
            bad_case += 1
        if i % 5000 == 0:
            print("process", i)
    print("#fail to request tf serving", bad_case)
    cjdpy.save_csv(idx_rate, "ie.txt")
    cjdpy.save_csv(cmsid_embed, "cmsid_embedding.txt", " ")
    cjdpy.save_lst(cmsid_list, "cmsid.txt")
    print("get vid embedding done")
def wordCount():
    """Build a frequency-sorted word vocabulary from the training texts."""
    rows = cjdpy.load_csv(train_file)
    tokens = []
    # Segment each text (column 0) with jieba and pool every token.
    for row in rows:
        tokens.extend(jieba.cut(row[0]))
    freq = cjdpy.sort_list_by_freq(Counter(tokens))
    cjdpy.save_csv(freq, vocab_file)
def train_test_data():
    """Shuffle the concept-annotated data and split off 10k rows as the test set."""
    src_path = 'data/topic_classification/text.con.txt'
    train_path = 'data/topic_classification/train.txt'
    test_path = 'data/topic_classification/test.txt'
    rows = cjdpy.load_csv(src_path)
    random.shuffle(rows)
    split = 10000
    cjdpy.save_csv(rows[:split], test_path)
    cjdpy.save_csv(rows[split:], train_path)
def word_count():
    """Build a frequency-sorted word vocabulary from the training texts.

    Near-duplicate of wordCount(); kept because callers may use either name.
    Fix: the original ended with a leftover debug `assert False`, so the
    function always raised AssertionError after saving — removed.
    """
    data = cjdpy.load_csv(train_file)
    tokens = []
    for item in data:
        # Only the text column is needed; labels are not collected here.
        tokens.extend(jieba.cut(item[0]))
    counter = cjdpy.sort_list_by_freq(Counter(tokens))
    cjdpy.save_csv(counter, vocab_file)
def get_concept():
    """Query CN-Probase for each row's entities and append the concepts as a new column."""
    rows = cjdpy.load_csv('st_ent.txt')
    out = []
    for idx, (text, label, ents) in enumerate(rows):
        if idx % 500 == 0:
            print('processing %d...' % idx)
        concepts = []
        # One API call per '|||'-separated entity.
        for ent in ents.split('|||'):
            url = 'http://shuyantech.com/api/cnprobase/concept?q=%s&apikey=%s' % (
                ent, 'ljqljqljq')
            payload = json.loads(requests.get(url).text)
            concepts.extend(hit[0] for hit in payload.get('ret', []))
        if not concepts:
            concepts = ['PAD']  # placeholder when no concept is found
        out.append([text, label, ents, '|||'.join(concepts)])
    cjdpy.save_csv(out, 'text.con.txt')
for j in range(len(chars)-1): ngram.append(chars[j]) ngram.append(chars[j] + chars[j+1]) fname = ' '.join(ngram) return fname import time start = time.time() data = cjdpy.load_csv("data/original_data") random.shuffle(data) res = [] for i in range(len(data)): res.append([get_feature(data[i][1]), "__label__"+data[i][0]]) # data format # 毛 毛毛 毛 毛虫 虫 虫的 的 的意 意 意见 __label__1 cjdpy.save_csv(res[:300000], "data/train.txt") cjdpy.save_csv(res[300000:], "data/test.txt") classifier = fasttext.train_supervised("data/train.txt") def print_results(N, p, r): print("N\t" + str(N)) print("P@{}\t{:.3f}".format(1, p)) print("R@{}\t{:.3f}".format(1, r)) print_results(*classifier.test('data/test')) pred = classifier.predict(pred_list) print(time.time()-start)