def main():
    search_par_h = open("data/search_params.csv", "w")
    writer = DictWriter(search_par_h, fieldnames=["SearchID", "SearchParams"])
    writer.writeheader()
    for t, row in read_tsv("data/SearchInfo.tsv"):
        sparams = row["SearchParams"]
        if not sparams:
            continue
        sid = int(row["SearchID"])
        # Rewrite the raw params string into valid JSON: quote bare keys,
        # switch to double quotes, and patch two known unquoted Cyrillic values.
        sparams = re.sub(r"([A-Za-z0-9]+):", r'"\1":', sparams)
        sparams = sparams.replace("'", "\"")
        sparams = sparams.replace("Минивэн\",", "\"Минивэн\",")
        sparams = sparams.replace("Микроавтобус\"]", "\"Микроавтобус\"]")
        sparams = unicode(sparams, "utf-8")
        try:
            sparams = json.loads(sparams)
            # Log each value type the first time it is seen.
            for k, v in sparams.items():
                vt = type(v)
                if vt not in type_set:
                    print vt, k, v
                    type_set.add(vt)
            sparams_str = json.dumps(sparams)
            writer.writerow({"SearchID": sid, "SearchParams": sparams_str})
        except Exception as e:
            print e
            print sparams
    search_par_h.close()
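To make the quoting pipeline concrete, here is a minimal Python 3 illustration on an invented raw SearchParams value (the key 177 and the values are made up for the example):

import re

raw = "{177:'Минивэн', 5:['Минивэн', 'Микроавтобус']}"
s = re.sub(r"([A-Za-z0-9]+):", r'"\1":', raw)  # quote the bare keys
s = s.replace("'", '"')                        # single -> double quotes
print(s)  # {"177":"Минивэн", "5":["Минивэн", "Микроавтобус"]}  -- now valid JSON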
def main():
    train_iter = next_row(read_tsv("data/stream_%s.tsv" % args.sz))
    test_iter = iter([])
    sinfo_iter = read_tsv("data/sinfo_%s.tsv" % args.sz)
    del_keys = ["HistCTR", "SearchID", "ObjectType"]
    for t, (data_type, rows, sinfo) in enumerate(
            data(train_iter=train_iter, test_iter=test_iter, sinfo_iter=sinfo_iter)):
        uid = int(sinfo["UserID"])
        date_str = sinfo["SearchDate"]
        ts = convert_ts(datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S.0"))
        # Keep only contextual ads (ObjectType == 3), drop the keys we no
        # longer need, and coerce everything else to int.
        rows = filter(lambda x: int(x["ObjectType"]) == 3, rows)
        for row in rows:
            for key in del_keys:
                del row[key]
            for key in row:
                row[key] = int(row[key]) if row[key] != "" else 0
        item = (
            ts,
            int(sinfo["SearchID"]),
            tuple([(row["AdID"], row["IsClick"], row["Position"]) for row in rows]),
        )
        uid_sid[uid].append(item)
    print "uid_sid: %s" % len(uid_sid)
    # Order each user's searches chronologically before computing count features.
    for uid in uid_sid:
        uid_sid[uid].sort()

    print "start user_cnt."
    file_name = "data/user_cnt_%s.csv" % args.sz
    with open(file_name, "w") as f:
        writer = DictWriter(f, fieldnames=["SearchID", "t_cnt", "bf_cnt", "af_cnt",
                                           "bf_3h_cnt", "af_3h_cnt", "bf_clk_cnt",
                                           "bag2", "bag1"])
        writer.writeheader()
        for uid in uid_sid:
            all_se = uid_sid[uid]
            writer.writerows(get_rows(all_se))
    # External sort by SearchID so downstream joins can stream the file.
    os.system('sort -t"," -k1 -g -S 2G %s -o %s_sorted' % (file_name, file_name))

    print "start user_aid_cnt."
    file_name = "data/user_aid_cnt_%s.csv" % args.sz
    with open(file_name, "w") as f:
        writer = DictWriter(f, fieldnames=["SearchID", "AdID", "clk_cnt", "show_cnt",
                                           "t_show_cnt", "pos_clk_cnt", "pos_show_cnt"])
        writer.writeheader()
        for uid in uid_sid:
            all_se = uid_sid[uid]
            writer.writerows(get_aid_rows(uid, all_se))
    os.system('sort -t"," -k1 -g -S 2G %s -o %s_sorted' % (file_name, file_name))
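`uid_sid` and `convert_ts` come from elsewhere in the module. A minimal stand-in for both, assuming `uid_sid` is a per-user accumulator and `convert_ts` maps a datetime to Unix epoch seconds (an assumption, not the repo's actual helpers):

import time
from collections import defaultdict

uid_sid = defaultdict(list)  # module-level accumulator filled by main() above

def convert_ts(dt):
    # Assumed behavior: datetime -> integer Unix timestamp, so tuples sort by time.
    return int(time.mktime(dt.timetuple()))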
def get_user_info():
    user_info_map = {}
    for t, row in read_tsv("data/UserInfo.tsv"):
        for k in row:
            row[k] = int(row[k])
        uid = row["UserID"]
        del row["UserID"]
        user_info_map[uid] = row
    return user_info_map
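`read_tsv` itself is not shown. A minimal sketch consistent with the `for t, row in read_tsv(path)` usage throughout the Avito scripts (an assumption about the helper, not its actual source; note the name-classifier snippets below use a different `util.read_tsv` that returns feature/label pairs):

import csv

def read_tsv(path, delimiter="\t"):
    # Assumed shape: yields (line_index, row_dict) pairs.
    with open(path) as f:
        for t, row in enumerate(csv.DictReader(f, delimiter=delimiter)):
            yield t, row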
def main(args: argparse.Namespace) -> None:
    x, y = util.read_tsv(args.test)
    with open(args.model, "rb") as source:
        classifier = pickle.load(source)
    correct = 0
    total = 0
    for gold, yhat in zip(y, classifier.predict(x)):
        if gold == yhat:
            correct += 1
        total += 1
    print(f"Accuracy:\t{correct / total:.4f}")
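Hypothetical CLI wiring for this evaluation entry point; the flag names `--test` and `--model` are inferred from the attribute accesses above and are an assumption:

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Evaluate a pickled classifier on a TSV test set.")
    parser.add_argument("--test", required=True, help="path to the held-out TSV")
    parser.add_argument("--model", required=True, help="path to the pickled model")
    main(parser.parse_args())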
def scan_match(sample_ui_list, path_list, comp_func, weight_list=None, threshold=0.6):
    """
    :param sample_ui_list: output of process_tsv()
    :param path_list: relative or absolute paths of the tsv files to scan
    :param comp_func: compare function
    :param weight_list: ui weight mask
    :param threshold: only pairs scoring above this value count as the same component
    :return: list of (path, score, score_distribution_list) tuples, best match first
    """
    logger = logging.getLogger("StreamLogger")
    out_dict = dict()
    for path in path_list:
        logger.debug(path)
        tmp_out = util.read_tsv(path)
        tmp_out = nlp_util.process_tsv(tmp_out)
        out_dict[os.path.basename(path)] = tmp_out
    count = 0
    score_list = []
    for j in range(len(path_list)):
        count += 1
        j_file = os.path.basename(path_list[j])
        name = path_list[j]
        if len(out_dict[j_file]) == 0:
            logger.debug(f"EMPTY {name}")
            continue
        score_distribution_list = match_name.weight_compare_list(
            sample_ui_list, out_dict[j_file], comp_func, weight_list)
        # score_distribution_list = util.get_col(score_distribution_list, 2)
        score = match_name.similar_index(score_distribution_list, threshold,
                                         col_index=2, rate=True)
        score_list.append((name, score, score_distribution_list))
        logger.debug(f"ADD {count} {name}")
    # sorted (path, score, score_distribution_list) list, highest score first
    return sorted(score_list, key=lambda k: k[1], reverse=True)
def main():
    random.seed(args.seed)
    # Dense features emitted as literal index:value pairs for xgboost.
    xgb_set = set([
        "price_pos", "ot1_cnt", "bf_cnt", "bf_clk_cnt", "u_aid_ctr", "record_cnt",
        "show_cnt", "clk_cnt", "t_cnt", "qe_w_pos", "HistCTR", "qe_ng_min_pos",
        "t_show_cnt", "bf_ctr", "ot2_cnt", "Price", "qe_ng_cnt", "title_len",
        "hl_ucnt", "price_ratio", "hl_lcnt", "t_match", "qe_w_ratio", "qe_ng_ratio",
        "Position", "bf_3h_cnt", "qe_w_cnt", "af_cnt", "ot3_cnt", "af_3h_cnt",
        "adid_cnt", "IsUserLoggedOn",
    ])
    # Categorical features hashed before being handed to xgboost.
    xgb_sparse_set = set([
        "pos_ot_type", "pos_type", "ca_match", "ca_pid_match", "CategoryID",
        "s_LocationID", "s_CategoryID", "UserAgentFamilyID", "UserAgentOSID",
        "UserDeviceID", "UserAgentID", "UserID", "IPID", "AdID", "SearchParams",
        "Params", "Title", "SearchQuery",
    ])
    if args.test:
        fh_list = [
            open("data/tr_%s.%s" % (args.test, args.type), "w"),
            open("data/cv_%s.%s" % (args.test, args.type), "w"),
            open("data/te_%s.%s" % (args.test, args.type), "w"),
        ]
    else:
        fh_list = [
            open("data/tr.%s" % args.type, "w"),
            open("data/cv.%s" % args.type, "w"),
            open("data/te.%s" % args.type, "w"),
        ]
    if args.sz is not None:
        train_iter = next_row(read_tsv("data/stream_%s.tsv" % args.sz))
        test_iter = iter([])
        sinfo_iter = read_tsv("data/sinfo_%s.tsv" % args.sz)
        data_iter = data(args.test, train_iter=train_iter, test_iter=test_iter,
                         sinfo_iter=sinfo_iter, maxlines=args.maxl)
    else:
        data_iter = data(args.test, maxlines=args.maxl)
    print "sr: %s" % args.sr
    avg_ctr = defaultdict(lambda: [0, 0])
    for line_cnt, (data_type, rows, sinfo) in enumerate(data_iter):
        sinfo["s_LocationID"] = int(sinfo["LocationID"])
        sinfo["s_CategoryID"] = int(sinfo["CategoryID"])
        extract_slot_feas(rows, sinfo)
        rows = filter(lambda x: filter_row(x, data_type, sr=args.sr), rows)
        if not rows:
            continue
        feature_map = get_features(sinfo, rows, data_type > 0)
        instances = extract(feature_map)
        if line_cnt == 0:
            # Dump the feature layout once so later runs can reuse it.
            for k, feas in feature_map.items():
                print "-" * 80
                print k
                print feas[0].keys()
            feas_name = sorted(instances[0].keys())
            print len(feas_name), feas_name
            if args.sz is not None:
                write_dump("feas_name.dump", feas_name)
            elif args.test:
                write_dump("feas_name%s.dump" % args.test, feas_name)
            else:
                write_dump("feas_name.dump", feas_name)
        # date_str = sinfo["SearchDate"]
        # ts = convert_ts(datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S.0"))
        fh = fh_list[data_type]
        for ins_map, row in zip(instances, rows):
            y = int(row.get("IsClick", 0))
            avg_ctr[data_type][0] += y
            avg_ctr[data_type][1] += 1
            ins = []
            for kt, k in enumerate(feas_name):
                if "xgb" in args.type:
                    if k in xgb_set:
                        hash_type = "xgb"
                    elif k in xgb_sparse_set:
                        hash_type = "xgb2"
                    else:
                        if line_cnt == 0:
                            print "drop %s" % k
                        continue
                else:
                    hash_type = ""
                feas = ins_map[k]
                if line_cnt == 0:
                    print kt, k, type(feas), feas
                if isinstance(feas, (list, tuple)):
                    for f in feas:
                        ins.append(hash_val(kt + 1, f, hash_type))
                else:
                    ins.append(hash_val(kt + 1, feas, hash_type))
            fh.write(unicode(y) + " " + " ".join(map(unicode, ins)) + "\n")
    for key, value in avg_ctr.items():
        print "%s, %s" % (key, value[0] * 1. / value[1])
    for fh in fh_list:
        fh.close()
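`write_dump` is not defined in this fragment. A minimal stand-in, assuming it simply pickles an object to a file (an assumption based on how the dump is consumed elsewhere):

import pickle

def write_dump(path, obj):
    # Assumed behavior of the `write_dump` helper used above.
    with open(path, "wb") as f:
        pickle.dump(obj, f)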
lt = [
    'Camera-Roll-Android-App-master',
    'PocketHub-master',
    'SimpleMobileTools_Simple_File_Manager_master',
    'zapp-master',
]
for nam in lt:
    # reload = util.Reload()
    _item = f"data/test_f/{nam}.tsv"
    # test code
    # src = "tsv/nextcloud_android_master.tsv"
    # src = select_dir(SRC_DIR)
    src = _item
    src_out = util.read_tsv(src)
    src_out = nlp_util.process_tsv(src_out)
    file_list = os.listdir(SRC_DIR)
    file_list = [os.path.join(SRC_DIR, f) for f in file_list]
    if src in file_list:
        file_list.remove(src)
    # file_list = ['tsv/owncloud_android_master.tsv']
    # one test
    # Overall similarity between the src app and every app in the database.
    scan_output = scan_match(src_out, file_list, match_name.ngram_compare,
                             [1, 0.5, 0.5], threshold=0.7)
    logger.debug(pp.pformat(util.get_col(scan_output, [0, 1])))

    rdb = issuedb.ISSuedb()
    sql = """select issue_num, comments, state, title, body, commit_id, labels
             from {} order by length(body) desc"""
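`util.get_col` is used here with a list of indices and in scan_match (commented out) with a single index. A minimal sketch consistent with both call sites, offered as an assumption about the helper:

def get_col(rows, idx):
    # Project one column (int index) or several (list of indices)
    # out of a list of tuples, e.g. get_col(scan_output, [0, 1]).
    if isinstance(idx, (list, tuple)):
        return [[row[i] for i in idx] for row in rows]
    return [row[idx] for row in rows]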
if num_thread > 1:
    num_iter = 1

#################################################################
# Prepare dataset
#################################################################

print('Preparing dataset {}'.format(dbname))

if dbname.startswith('SIFT'):
    dbsize = int(dbname[4:-1])
    xb = util.mmap_bvecs('{}bigann_base.bvecs'.format(DB_DIR))
    xq = util.mmap_bvecs('{}bigann_query.bvecs'.format(DB_DIR))
    xt = util.mmap_bvecs('{}bigann_learn.bvecs'.format(DB_DIR))
    # trim xb to correct size
    xb = xb[:dbsize * 1000 * 1000]
    gt = util.read_tsv('{}gtSIFT{}Mtest.tsv'.format(GT_DIR, dbsize))
    if search_mode == 0 and train_size > 0 and binary_search == 1:
        # Take a sample from the training vectors to find the minimum fixed
        # termination condition needed to reach different accuracy targets.
        # This is needed to choose the intermediate search result features
        # when generating training data.
        xq = xt[:10000]
        gt = util.read_tsv('{}gtSIFT{}Mtrain{}M.tsv'.format(
            GT_DIR, dbsize, train_size))[:10000]
    if search_mode == -2:
        xq = xt[:train_size * 1000 * 1000]
        gt = util.read_tsv('{}gtSIFT{}Mtrain{}M.tsv'.format(
            GT_DIR, dbsize, train_size))
elif dbname.startswith('DEEP'):
    dbsize = int(dbname[4:-1])
    xb = util.mmap_fvecs('{}deep1B_base.fvecs'.format(DB_DIR))
            params[par_key] = unicode(params[par_key], "utf-8")
        val = tuple([hash_val(0, (k, v)) for k, v in params.items()])
        if len(val) == 0:
            val = (-1,)
        ad_info[key] = val
    elif key == "Title":
        if not isinstance(ad_info[key], unicode):
            ad_info[key] = unicode(ad_info[key], "utf-8")
    else:
        if val == "":
            val = -1
        ad_info[key] = int(val)
    return ad_info


ad_info_list = []
ad_info_iter = read_tsv("data/AdsInfo.tsv")


def get_ad_info(aid):
    # Rows of AdsInfo.tsv are assumed to be in AdID order (AdID = 1-based
    # line number); the cache is extended lazily until the ad has been read.
    while aid - 1 >= len(ad_info_list):
        t, row = next(ad_info_iter, (None, None))
        if row is None:
            break
        ad_info_list.append(trans_ad_info(row))
    return ad_info_list[aid - 1]


se_params_iter = read_tsv("data/search_params.csv", delimiter=",")
se_param_list = [None]


def get_se_param(sid):
    # Stream search_params.csv (ordered by SearchID) until we catch up to sid.
    while se_param_list[0] is None or se_param_list[0]["SearchID"] < sid:
        t, se_param = next(se_params_iter, (None, None))
        if se_param is None:
            break
        se_param["SearchID"] = int(se_param["SearchID"])
        params = json.loads(se_param["SearchParams"])
    xq = util.mmap_fvecs('{}deep1B_query.fvecs'.format(DB_DIR))
    xt = util.mmap_fvecs('{}deep1B_learn.fvecs'.format(DB_DIR))
    # trim xb to correct size
    xb = xb[:dbsize * 1000 * 1000]
    gt = util.read_tsv('{}gtDEEP{}Mtest.tsv'.format(GT_DIR, dbsize))
    if search_mode == -2:
        xq = xt[:train_size * 1000 * 1000]
        gt = util.read_tsv('{}gtDEEP{}Mtrain{}M.tsv'.format(
            GT_DIR, dbsize, train_size))
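`util.mmap_bvecs` and `util.mmap_fvecs` are not shown in this fragment. The readers below are the ones commonly used for the .bvecs/.fvecs formats (for example in the faiss benchmark scripts); the repo's util module likely mirrors them, but treat this as an assumption:

import numpy as np

def mmap_bvecs(fname):
    # .bvecs: each record is a 4-byte int dimension d followed by d uint8 values.
    x = np.memmap(fname, dtype="uint8", mode="r")
    d = x[:4].view("int32")[0]
    return x.reshape(-1, d + 4)[:, 4:]

def mmap_fvecs(fname):
    # .fvecs: each record is a 4-byte int dimension d followed by d float32 values.
    x = np.memmap(fname, dtype="int32", mode="r")
    d = x[0]
    return x.view("float32").reshape(-1, d + 1)[:, 1:]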
def main(args: argparse.Namespace) -> None:
    x, y = util.read_tsv(args.train)
    classifier = model.NameClassifier()
    classifier.train(x, y)
    with open(args.model, "wb") as sink:
        pickle.dump(classifier, sink)
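Hypothetical CLI wiring for the training entry point, mirroring the evaluation script; the flag names are inferred from the attribute accesses and are an assumption:

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Train and pickle a name classifier.")
    parser.add_argument("--train", required=True, help="path to the training TSV")
    parser.add_argument("--model", required=True, help="where to write the pickled model")
    main(parser.parse_args())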
def hash_val(t, v, dtype=None, D=22):
    # "xgb" features are written as literal index:value pairs; everything else
    # goes through the hashing trick, with the slot id t kept in the high bits.
    if dtype == "xgb":
        return u"%s:%s" % (t, v)
    else:
        return (t << D) | (hash(unicode(v)) & ((1 << D) - 1))
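Concretely, the two modes behave as follows. This is a Python 3 restatement for illustration only (the original is Python 2 and hashes the `unicode` form of the value):

D = 22

def hash_val_sketch(t, v, dtype=None):
    if dtype == "xgb":
        return "%s:%s" % (t, v)  # literal index:value pair
    # hashing trick: slot id in the high bits, 22-bit value hash in the low bits,
    # so features from different slots can never collide with each other
    return (t << D) | (hash(str(v)) & ((1 << D) - 1))

print(hash_val_sketch(3, 0.0412, "xgb"))  # '3:0.0412'
print(hash_val_sketch(3, "Минивэн") >> D)  # 3: the slot id is recoverable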