def get_userlist(path, logpath=None):
    """Return the list of user ids.

    If *path* already exists it is loaded directly; otherwise column 0 is
    extracted from *logpath*, cached to *path*, and returned.

    :param path: cache file holding one user id per line
    :param logpath: raw log file to derive the ids from when the cache is absent
    :raises ValueError: if the cache is missing and no *logpath* was given
        (the original code would fail later with an opaque TypeError)
    """
    if os.path.exists(path):
        return util.load2list(path)
    if logpath is None:
        raise ValueError('logpath is required when %s does not exist' % path)
    ul = util.load2list(logpath, get1column=0)
    util.list2txt(ul, path)  # cache so the next call is a plain file load
    return ul
def get_fnlist(path, logpath):
    """Return the deduplicated list of file names, caching it to *path*.

    Loads the cache at *path* when present; otherwise flattens *logpath*
    (skipping the first column), dedupes, writes the cache, and returns it.
    Note: ``set`` makes the result order non-deterministic.
    """
    if os.path.exists(path):
        return util.load2list(path)
    raw_names = util.load2list(logpath, to1column=True, start=1)
    unique_names = list(set(raw_names))
    util.list2txt(unique_names, path)
    return unique_names
def get_samplevec_gensimmodel(vecpath1, vecpath2, samplefile, prefix, respath='./', stopcnt=100, progress_per=10000):
    """Build concatenated sample vectors from two vector files.

    Each sample line looks like ``uid+fn label``; for every sample whose
    user id and file name both have vectors, the two vectors are
    concatenated (uid+fn ==> [uvec + fnvec]) and saved via ``np.savetxt``,
    together with the list of samples actually resolved.

    :param stopcnt: stop after this many samples (0/None disables the cap)
    :param progress_per: progress print interval
    """
    data, labels, realexamp = [], [], []
    logger.info('loading vecfile : %s' % vecpath1)
    v_user = load_vec(vecpath1)
    logger.info('loading vecfile : %s' % vecpath2)
    v_file = load_vec(vecpath2)
    for cnt, exam in enumerate(util.load2list(samplefile)):
        if cnt % progress_per == 0:
            print("getting example vecs : %d" % cnt)
        if stopcnt and stopcnt == cnt:
            break
        fields = exam.strip().split()
        label0 = fields[1]
        pair = fields[0].split("+")
        uid = '*dt_' + pair[0]
        fn = '*dt_' + pair[1]
        if uid in v_user and fn in v_file:
            # concatenate user vector and file vector
            data.append(list(v_user[uid]) + list(v_file[fn]))
            realexamp.append(fields[0])
            labels.append(label0)
    del v_file
    del v_user
    np.savetxt('%s/exampvecs_%s.txt' % (respath, prefix), np.array(data))
    util.list2txt(realexamp, '%s/realexamples_%s.txt' % (respath, prefix))
def data2csv():
    """Convert the fn18 feature text file into a CSV.

    Each valid input line has 14 whitespace-separated fields; only the first
    12 are exported as columns.

    Bug fixed: the original unpacked THIRTEEN empty lists into TWELVE names,
    which raises ``ValueError: too many values to unpack`` before any work is
    done.  Using a dict of lists keyed by the column names makes the column
    count impossible to desynchronize.
    """
    fnfeatpath = './data/highq_5w/fn18_5w_features.txt'
    fnfeas = uc.load2list(fnfeatpath)
    columns = ['fns', 'cites', 'cites_w', 'authcodes', 'fundcodes',
               'jigoucodes', 'productcodes', 'dates', 'pages', 'downs',
               'citeds', 'ifs']
    cols = {name: [] for name in columns}
    for line in fnfeas:
        if isinstance(line, str):
            fields = line.split()
            # only well-formed 14-field records are kept
            if len(fields) == 14:
                for name, value in zip(columns, fields):
                    cols[name].append(value)
    exs = pd.DataFrame(cols)
    exs.to_csv('./data/highq_5w/fn18_5w_features.csv')
def get_highquality_ulog(inpath, outpath, actmin=2, actmax=300):
    """Filter user histories to "high quality" users.

    Keeps only lines with more than *actmin* and fewer than *actmax* actions
    (too many actions probably means a crawler).  The first token of each
    line is the user id and is not counted as an action.
    """
    kept = [
        line for line in util.load2list(inpath)
        if actmin < len(line.strip().split()) - 1 < actmax
    ]
    util.list2txt(kept, outpath)
def getfiledtop(cnter, filedfile, top=50):
    '''
    Rank the words listed in filedfile by their frequency in cnter.

    Bug fixed: ``Counter.has_key`` is Python 2 only and raises
    AttributeError on Python 3 (which the rest of this file targets);
    replaced with an ``in`` membership test, which works on both.

    :param cnter: counter of all words
    :type cnter: Counter
    :param filedfile: text file with one candidate word per line
    :param top: maximum number of words to keep
    :return: list of "word count" strings, also written to
        <filedfile-stem>_top<N>.txt
    '''
    inwords = util.load2list(filedfile)
    worddic = {w: cnter[w] for w in inwords if w in cnter}
    newcnter = Counter(worddic)
    top = min(len(newcnter), top)
    topnwords = ["%s %d" % (w, c) for (w, c) in newcnter.most_common(top)]
    respath = "%s_top%d.txt" % (os.path.splitext(filedfile)[0], top)
    util.list2txt(topnwords, respath)
    return topnwords
def get_intersec_log(user_interseclist, alllog_b, alllog_d, prefix, rootpath=datapath):
    '''
    Find the users present in both the d-log and the b-log, then save each
    log restricted to that shared user group (d as positive, b as negative).

    :param user_interseclist: cache file for the intersection user list;
        computed and written when absent, loaded otherwise
    :param alllog_b: path of the full b log (dict file)
    :param alllog_d: path of the full d log (dict file)
    :param prefix: prefix for the output json files
    :param rootpath: directory the json results are written to
    '''
    blog = util.load2dic(alllog_b)
    dlog = util.load2dic(alllog_d)
    if not os.path.exists(user_interseclist):
        logger.info("caculating two logs` intersection user...")
        uintersec = list(set(blog.keys()).intersection(set(dlog.keys())))
        util.list2txt(uintersec, user_interseclist)
    else:
        logger.info("loading two logs` intersection user file : %s" % user_interseclist)
        uintersec = util.load2list(user_interseclist)
    interseced_d = get_sub_dic(dlog, uintersec)
    interseced_b = get_sub_dic(blog, uintersec)
    # free the full logs before serializing the (smaller) intersections
    del dlog
    del blog
    logger.info("saving ress...")
    util.savejson("%s/%s_posi.json" % (rootpath, prefix), interseced_d)
    util.savejson("%s/%s_neg.json" % (rootpath, prefix), interseced_b)
    logger.info("done!")
def get_vec_gensimmodel(vecpath1, samplefile, prefix, respath='./', stopcnt=100, progress_per=10000):
    """Look up the vector of every id in *samplefile* and pickle the mapping.

    Writes ``{id: vector}`` to ``<respath>/<prefix>.pkl``.  Ids are prefixed
    with ``*dt_`` before the lookup; ids without a vector are skipped.
    *stopcnt* is currently unused (the early-stop was disabled).
    """
    resdata = {}
    logger.info('loading vecfile : %s' % vecpath1)
    v_user = load_vec(vecpath1)
    for cnt, line in enumerate(util.load2list(samplefile)):
        if cnt % progress_per == 0:
            print("getting example vecs : %d" % cnt)
        examid = line.strip()
        tagged = '*dt_' + examid
        if tagged in v_user:
            resdata[examid] = list(v_user[tagged])
    with open(respath + '/' + prefix + '.pkl', 'wb') as f:
        pickle.dump(resdata, f)
def mergefns(path1, path2, respath):
    """Union the lines of two list files and write the result to *respath*.

    Duplicates are removed; order is not preserved (set union).
    """
    merged = set(util.load2list(path1)) | set(util.load2list(path2))
    util.list2txt(list(merged), respath)
def exam2traindata_(examples, vecdic_users, vecdic_fns, fn_features,
                    process_per=50000, earlystop=100,
                    resname='traindata_nofnvec_01.pkl', encoding=True,
                    sample=''):
    """Turn "uid+fn+label" example lines into train/test feature matrices.

    Pipeline: optional resampling of the label distribution, 70/30
    train/test split, then per-example feature construction
    (user doc-vector ++ fn-vector placeholder ++ encoded fn features).
    The four arrays are pickled to ``path.path_datahighq5w/<resname>``.

    Bug fixed: the sampling mode was compared with ``is`` (string IDENTITY,
    not equality) — unreliable and a SyntaxWarning on Python 3.8+; now ``==``.

    :param examples: path to the example file (one "uid+fn+label" per line)
    :param vecdic_users: pickle path or dict of user id -> vector
    :param vecdic_fns: unused — fn doc-vectors are currently disabled
    :param fn_features: csv path or DataFrame of per-fn features
    :param process_per: progress log interval
    :param earlystop: unused (kept for interface compatibility)
    :param resname: output pickle name; '' disables saving
    :param encoding: one-hot/multi-hot encode fn features when True
    :param sample: '' | 'up' | 'down' | 'simpledown' resampling strategy
    :return: x_train, x_test, y_train, y_test
    """
    examps = uc.load2list(examples)
    if sample:
        # split "uid+fn+label" into X ("uid+fn") and Y (int label) for resampling
        examps_X, examps_Y = [], []
        for line in examps:
            (uid, fn, label) = line.strip().split('+')
            examps_X.append('%s+%s' % (uid, fn))
            examps_Y.append(int(label))
        logger.info("raw example y:")
        logger.info(Counter(examps_Y))
        if sample == 'up':
            ros = RandomOverSampler(random_state=0)
            X_resampled, y_resampled = ros.fit_sample(
                np.array(examps_X).reshape(-1, 1), examps_Y)
            print(X_resampled.shape)
            X_resampled = X_resampled.reshape(-1)
            print(X_resampled.shape)
        elif sample == 'down':
            rus = RandomUnderSampler(random_state=0, replacement=True)
            X_resampled, y_resampled = rus.fit_sample(
                np.array(examps_X).reshape(-1, 1), examps_Y)
            print(X_resampled.shape)
            X_resampled = X_resampled.reshape(-1)
            print(X_resampled.shape)
        elif sample == 'simpledown':
            # the first 641683 examples are positive, the rest negative:
            # keep an equal-sized positive prefix plus all negatives
            X_resampled = examps_X[:287464] + examps_X[641683:]
            y_resampled = examps_Y[:287464] + examps_Y[641683:]
        else:
            logger.info('sample methord not found : %s' % sample)
            X_resampled, y_resampled = examps_X, examps_Y
        examps_X, examps_Y = X_resampled, y_resampled
        logger.info("after example y:")
        logger.info(Counter(examps_Y))
        # re-join X and Y into "uid+fn+label" for the feature step below
        examps = ['%s+%s' % (x, str(y)) for x, y in zip(examps_X, examps_Y)]
    logger.info('generating features for model...')
    train_exs, test_exs = train_test_split(examps,
                                           test_size=0.3,
                                           random_state=2)
    # load user vectors; fn doc-vectors are intentionally disabled (see fnvec)
    vecdicu = uc.pickle_load(vecdic_users) if isinstance(vecdic_users, str) else vecdic_users
    logger.info('loadding fn_features...')
    fnfeats = pd.read_csv(fn_features) if isinstance(fn_features, str) else fn_features
    fn_indexdic = {}
    for index, fn in enumerate(list(fnfeats.fns)):
        fn_indexdic[fn] = index
    fe_nums4onehot = ['cites', 'cites_w', 'citeds', 'downs', 'pages', 'ifs']
    fe_strs4mulhot = ['fundcodes']
    if encoding:
        logger.info('encoding onehot for number features')
        onehots_num, onehots_num_model = col2onehot_numbers(
            fnfeats[fe_nums4onehot])
        logger.info('encoding multihot for str features')
        mulhots_str, mulhots_model = col2multibinar(fnfeats[fe_strs4mulhot[0]])
    logger.info('training data split get traindata %d, testdata %d' %
                (len(train_exs), len(test_exs)))

    def examples2x_y(exampls, ifencoding=encoding):
        # Map each example to (feature vector, int label); examples whose fn
        # has no feature row are silently skipped.
        X, Y = [], []
        for index, ex in enumerate(exampls):
            if index % process_per == 0:
                logger.info('examples2x_y process %d' % index)
            (uid, fn, label) = ex.strip().split('+')
            if fn not in fn_indexdic:
                continue
            fnindex = fn_indexdic[fn]
            uidvec = np.array(vecdicu[uid])
            fnvec = [0]  # placeholder: fn doc-vector currently disabled
            if ifencoding:
                x = np.concatenate((uidvec, fnvec, onehots_num[fnindex],
                                    mulhots_str[fnindex]))
            else:
                feature_notencoded = list(fnfeats.iloc[fnindex][[
                    'fns', 'cites', 'cites_w', 'citeds', 'downs', 'pages',
                    'ifs'
                ]])
                x = np.concatenate((uidvec, fnvec, feature_notencoded[1:]))
            X.append(x)
            Y.append(int(label))
        return X, Y

    x_train, y_train = examples2x_y(train_exs)  # training-set features
    x_test, y_test = examples2x_y(test_exs)  # test-set features
    logger.info('training data actrauly get traindata %d, testdata %d' %
                (len(y_train), len(y_test)))
    alldata = [x_train, x_test, y_train, y_test]
    if resname:
        traindatapath = os.path.join(path.path_datahighq5w, resname)
        if not os.path.exists(traindatapath):
            uc.pickle_dump(alldata, traindatapath)
        else:
            # never clobber an existing result; save under a new name instead
            newtraindatapath = os.path.join(path.path_datahighq5w,
                                            'newres_%s' % resname)
            logger.info(
                'triandatapath %s allready exists,save this batch traindata to %s'
                % (traindatapath, newtraindatapath))
            uc.pickle_dump(alldata, newtraindatapath)
    return x_train, x_test, y_train, y_test
def exam2features(examples, vecdic_users, vecdic_fns, fn_features,
                  respath=None, process_per=10000, earlystop=100):
    """Build InputFeature2 objects for "uid+fn+label" example lines.

    For each example the user vector, fn vector, and the encoded fn features
    (one-hot numbers, one-hot product code, multi-hot fund codes) are looked
    up and wrapped in an ``InputFeature2``.  The first 10 examples are logged
    for inspection.  When *respath* is given the features are pickled there.

    :param earlystop: stop after this many examples (falsy disables the cap)
    :return: list of InputFeature2
    """
    examps = uc.load2list(examples)
    vecdicu = uc.pickle_load(vecdic_users) if isinstance(vecdic_users, str) else vecdic_users
    vecdicf = uc.pickle_load(vecdic_fns) if isinstance(vecdic_fns, str) else vecdic_fns
    logger.info('loadding fn_features...')
    fnfeats = pd.read_csv(fn_features) if isinstance(fn_features, str) else fn_features
    fe_nums4onehot = ['cites', 'cites_w', 'citeds', 'downs', 'pages', 'ifs']
    fe_strs4onehot = ['productcodes']
    fe_strs4mulhot = ['fundcodes']
    logger.info('encoding onehot for number features')
    onehots_num, onehots_num_model = col2onehot_numbers(fnfeats[fe_nums4onehot])
    logger.info('encoding onehot for str features')
    onehots_str, onehots_str_model = col2onehot_str(fnfeats[fe_strs4onehot[0]])
    logger.info('encoding multihot for str features')
    mulhots_str, mulhots_model = col2multibinar(fnfeats[fe_strs4mulhot[0]])
    # row index of every fn in the feature frame
    fn_indexdic = {fn: idx for idx, fn in enumerate(list(fnfeats.fns))}
    features = []
    logger.info('generating features for model...')
    for index, ex in enumerate(examps):
        if index % process_per == 0:
            logger.info('process %d' % index)
        (uid, fn, label) = ex.strip().split('+')
        fnindex = fn_indexdic[fn]
        uidvec = np.array(vecdicu[uid])
        fnvec = np.array(vecdicf[fn])
        features.append(
            InputFeature2(uniqid=index,
                          uid=uid,
                          fn=fn,
                          uidvec=uidvec,
                          fnvec=fnvec,
                          fnnumsf=onehots_num[fnindex],
                          fnfuncode=mulhots_str[fnindex],
                          fnprodcode=onehots_str[fnindex],
                          label=label))
        if index < 10:
            # log the first 10 examples for sanity checking
            logger.info("\n*** Example ***")
            logger.info('uniqid=%d' % index)
            logger.info('uid=%s' % uid)
            logger.info('fn=%s' % fn)
            logger.info('uidvec=%s' % (' '.join([str(i) for i in uidvec])))
            logger.info('fnvec=%s' % (' '.join([str(i) for i in fnvec])))
            logger.info('fnnumsf=%s' % (' '.join([str(i) for i in onehots_num[fnindex]])))
            logger.info('fnfuncode=%s' % (' '.join([str(i) for i in mulhots_str[fnindex]])))
            logger.info('fnprodcode=%s' % (' '.join([str(i) for i in onehots_str[fnindex]])))
            logger.info('label=%s' % str(label))
        if earlystop and index == earlystop - 1:
            break
    if respath:
        uc.pickle_dump(features, respath=respath)
    return features