Code example #1
def conCount():
    # Count concept frequencies (column 4, '|||'-separated) and save them
    # sorted by frequency.
    data = cjdpy.load_csv(train_file)
    vocab_list = [con for item in data if len(item) == 4
                  for con in item[3].split('|||')]
    counter = Counter(vocab_list)
    counter = cjdpy.sort_list_by_freq(counter)
    cjdpy.save_csv(counter, con_file)
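
The `cjdpy` helpers used above (`load_csv`, `sort_list_by_freq`, `save_csv`) are project-specific utilities from chenjindong/ML. As a rough stdlib-only sketch of the same pipeline (the tab-separated file format is my assumption, and `Counter.most_common()` stands in for `sort_list_by_freq`):

import csv
from collections import Counter

def con_count_stdlib(train_file, con_file):
    # Rows are assumed tab-separated, with '|||'-joined concepts in column 4.
    with open(train_file, encoding="utf-8") as f:
        rows = [r for r in csv.reader(f, delimiter="\t") if len(r) == 4]
    counter = Counter(con for r in rows for con in r[3].split("|||"))
    with open(con_file, "w", encoding="utf-8", newline="") as f:
        csv.writer(f, delimiter="\t").writerows(counter.most_common())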
Code example #2
File: make_data.py  Project: chenjindong/ML
def get_vid_embedding():
    # compute embeddings for the videos in kandian_today offline
    print("get vid embedding begin")
    kandian_today = video_meta_to_id("../data/kandian_today.json")
    pred_data = cjdpy.load_csv("../data/pred.txt")

    input_x, _ = get_predict_data_serving(
        pred_data, 1, kandian_today)  # any behavior sequence works here

    kandian = cjdpy.load_list("../data/kandian_today.json")
    kandian_str = []
    for line in kandian:
        line = json.loads(line)
        kandian_str.append(" ".join([
            line["first_category"], line["second_category"], line["tags"],
            line["media_name"]
        ]))

    idx_rate = []
    cmsid_embed = []
    cmsid_set = set()
    cmsid_list = []
    bad_case = 0
    for i in range(len(input_x["first_category"])):
        dict_data = {
            "instances": [{
                "first_category": input_x["first_category"][i],
                "second_category": input_x["second_category"][i],
                "tag": input_x["tag"][i],
                "media": input_x["media"][i],
                "rate_discretize": input_x["rate_discretize"][i],
                "position": input_x["position"][i]
            }]
        }

        try:
            resp = requests.post('http://localhost:8515/v1/models/ttm:predict',
                                 json=dict_data)
            res = json.loads(resp.text)
            idx_rate.append([i] + res["predictions"][0]["ie"])
            cmsid_val = json.loads(kandian[i])["cmsid"]
            if cmsid_val in cmsid_set: continue
            cmsid_embed.append(res["predictions"][0]["ie"])
            cmsid_set.add(cmsid_val)
            cmsid_list.append(cmsid_val)
        except Exception:
            # count requests that failed or returned no "predictions"
            bad_case += 1

        if i % 5000 == 0:
            print("process", i)

    print("#fail to request tf serving", bad_case)

    cjdpy.save_csv(idx_rate, "ie.txt")
    cjdpy.save_csv(cmsid_embed, "cmsid_embedding.txt", " ")
    cjdpy.save_lst(cmsid_list, "cmsid.txt")
    print("get vid embedding done")
Code example #3
def wordCount():
    # Count word frequencies over jieba-segmented training texts.
    data = cjdpy.load_csv(train_file)
    X = [list(jieba.cut(item[0])) for item in data]
    vocab_list = [word for text in X for word in text]
    counter = Counter(vocab_list)
    counter = cjdpy.sort_list_by_freq(counter)
    cjdpy.save_csv(counter, vocab_file)
Code example #4
def train_test_data():
    data = cjdpy.load_csv('data/topic_classification/text.con.txt')
    train_file = 'data/topic_classification/train.txt'
    test_file = 'data/topic_classification/test.txt'
    random.shuffle(data)
    n_test = 10000  # hold out the first 10,000 shuffled rows as the test set
    cjdpy.save_csv(data[:n_test], test_file)
    cjdpy.save_csv(data[n_test:], train_file)
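
random.shuffle makes the split different on every run; if the split needs to be reproducible, seed the RNG first. A minimal variant (the seed value is arbitrary):

import random

def train_test_data_seeded(seed=42):
    data = cjdpy.load_csv('data/topic_classification/text.con.txt')
    random.seed(seed)  # fixed seed => identical split on every run
    random.shuffle(data)
    n_test = 10000
    cjdpy.save_csv(data[:n_test], 'data/topic_classification/test.txt')
    cjdpy.save_csv(data[n_test:], 'data/topic_classification/train.txt')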
Code example #5
def word_count():
    # Same as wordCount above: count jieba tokens in the training texts.
    data = cjdpy.load_csv(train_file)
    X = [list(jieba.cut(item[0])) for item in data]
    vocab_list = [word for text in X for word in text]
    counter = Counter(vocab_list)
    counter = cjdpy.sort_list_by_freq(counter)
    cjdpy.save_csv(counter, vocab_file)
Code example #6
def get_concept():
    # Look up CN-Probase concepts for every entity in each row.
    data = cjdpy.load_csv('st_ent.txt')
    res = []
    for i, (text, label, ents) in enumerate(data):
        if i % 500 == 0: print('processing %d...' % i)
        cons = []
        for ent in ents.split('|||'):
            url = 'http://shuyantech.com/api/cnprobase/concept?q=%s&apikey=%s' % (
                ent, 'ljqljqljq')
            response = json.loads(requests.get(url).text)
            cons += [ret[0] for ret in response.get('ret', [])]
        if len(cons) == 0: cons = ['PAD']
        res.append([text, label, ents, '|||'.join(cons)])
    cjdpy.save_csv(res, 'text.con.txt')
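
Judging from the access pattern above, the CN-Probase concept endpoint returns JSON whose 'ret' field is a list of (concept, score) pairs. A small wrapper with basic error handling (the timeout and the empty-list fallback are my choices, not the project's):

import requests

def query_concepts(entity, apikey, timeout=5):
    # Returns the concept names for an entity, or [] on any failure.
    try:
        resp = requests.get('http://shuyantech.com/api/cnprobase/concept',
                            params={'q': entity, 'apikey': apikey},
                            timeout=timeout)
        resp.raise_for_status()
        # 'ret' is assumed to hold [concept, score] pairs, matching ret[0] above.
        return [pair[0] for pair in resp.json().get('ret', [])]
    except (requests.RequestException, ValueError):
        return []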
Code example #7
def get_feature(text):
    # Function header inferred from the calls below: build space-separated
    # unigram+bigram character features for fastText.
    chars = list(text)
    ngram = []
    for j in range(len(chars)-1):
        ngram.append(chars[j])
        ngram.append(chars[j] + chars[j+1])
    fname = ' '.join(ngram)
    return fname

import random
import time

import cjdpy
import fasttext

start = time.time()
data = cjdpy.load_csv("data/original_data")
random.shuffle(data)
res = []
for i in range(len(data)):
    res.append([get_feature(data[i][1]), "__label__"+data[i][0]])


# data format
# 毛 毛毛 毛 毛虫 虫 虫的 的 的意 意 意见    __label__1
cjdpy.save_csv(res[:300000], "data/train.txt")
cjdpy.save_csv(res[300000:], "data/test.txt")
classifier = fasttext.train_supervised("data/train.txt")

def print_results(N, p, r):
    print("N\t" + str(N))
    print("P@{}\t{:.3f}".format(1, p))
    print("R@{}\t{:.3f}".format(1, r))
            
print_results(*classifier.test('data/test.txt'))

# pred_list: feature strings built with get_feature (defined elsewhere)
pred = classifier.predict(pred_list)
print(time.time() - start)
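
For reference, fastText's predict on a list of strings returns a pair: a list of label tuples and a parallel array of probabilities, one entry per input. A usage sketch (the input strings here are hypothetical):

texts = [get_feature("毛毛虫的意见"), get_feature("今天天气不错")]
labels, probs = classifier.predict(texts)
for text, labs, ps in zip(texts, labels, probs):
    print(text, labs[0], "%.3f" % ps[0])  # top-1 label and its probability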