def train():
    # Build the graph once, outside the training loop (the original rebuilt it every step
    # and opened a second Session inside the `with` block).
    train_op = build_model()
    check_prediction = tf.equal(tf.argmax(model, 1), tf.argmax(Y, 1))
    accuracy = tf.reduce_mean(tf.cast(check_prediction, tf.float32))

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        test_feed_dict = {
            X: mnist.test.images.reshape(-1, 28, 28, 1),
            Y: mnist.test.labels,
            p_keep_conv: p_keep_conv_value,
            p_keep_hidden: p_keep_hidden_value
        }
        for epoch in range(training_epochs):
            total_batch = int(mnist.train.num_examples / batch_size)
            for step in range(total_batch):
                batch_xs, batch_ys = mnist.train.next_batch(batch_size)
                feed_dict = {
                    X: batch_xs.reshape(-1, 28, 28, 1),
                    Y: batch_ys,
                    p_keep_conv: p_keep_conv_value,
                    p_keep_hidden: p_keep_hidden_value
                }
                sess.run(train_op, feed_dict=feed_dict)
            accuracy_rates = sess.run(accuracy, feed_dict=test_feed_dict)
            log.info("Epoch: %04d / Accuracy = %s" % (epoch + 1, accuracy_rates))
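# The loop above assumes a module-level build_model() that creates the CNN (referenced
# here as the global `model`) and returns its training op. A minimal sketch under that
# assumption -- the placeholders X, Y, p_keep_conv, p_keep_hidden and the layer layout
# are illustrative, not the original implementation:
def build_model_sketch():
    global model
    conv = tf.layers.conv2d(X, 32, 3, padding='same', activation=tf.nn.relu)
    pool = tf.layers.max_pooling2d(conv, 2, 2)
    drop = tf.nn.dropout(pool, p_keep_conv)
    flat = tf.layers.flatten(drop)
    dense = tf.nn.dropout(tf.layers.dense(flat, 128, activation=tf.nn.relu), p_keep_hidden)
    model = tf.layers.dense(dense, 10)  # logits over the 10 MNIST classes
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=Y, logits=model))
    return tf.train.AdamOptimizer(0.001).minimize(loss)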
def message_Clustering():
    data = pd.read_csv("antbot/datasets/question_45/root_q_a").values.tolist()
    search_msg = [j for i in data for j in i]
    for hit_0 in tqdm(search_msg):
        root_msg = hit_0
        body = {"query": {"bool": {"filter": {"match_phrase": {"q": "{}".format(root_msg)}}}}}
        questions_one = es.search(index="bot_entity_tmp_new", body=body, size=100)
        for hit_1 in questions_one['hits']['hits']:
            res = hit_1['_source']['q']
            body = {"query": {"bool": {"filter": {"match_phrase": {"q": "{}".format(res)}}}}}
            questions_two = es.search(index="bot_entity_tmp", body=body, size=100)
            out = []
            for hit_2 in questions_two['hits']['hits']:
                res_two = hit_2['_source']
                rows = {
                    'q': str(res_two['q']).replace("\n", ""),
                    'a': str(res_two['a']).replace("\n", ""),
                    'roomId': res_two['roomId'],
                    'tenantId': res_two['tenantId'],
                    'lanlordId': res_two['lanlordId'],
                    'id': str(res_two['id']).replace("\n", "")
                }
                out.append(rows)
            if len(out) < 1:
                continue
            df = pd.DataFrame(out)
            save_name = '{}.csv'.format(replace_symbol(out[0]['q']))
            save_dir = "/home/duyp/mayi_datasets/seed/entity"
            if not os.path.exists(save_dir):
                os.makedirs(save_dir)
            save_path = os.path.join(save_dir, "{}".format(save_name))
            log.info("{}".format(save_path))
            df.to_csv(save_path, index=None)
def get_features_corr():
    data = pd.read_csv(
        os.path.join("yancheng/datasets/results/train", "total_by_day_ex_sorted.csv"))
    X = data['cnt']
    y = data['week']
    corr = np.corrcoef(X, y)
    log.info("{}".format(corr))
def get_features():
    data = pd.read_csv(
        os.path.join("yancheng/datasets/results", "total_by_day.csv"))
    X = data['cnt']
    y = data['week']
    corr = np.corrcoef(X, y)
    log.info("{}".format(corr))
    data_grouped = data.groupby(by='week')
    for i, j in data_grouped:
        j.to_csv("yancheng/datasets/results/week/{}.csv".format(i), index=None)
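# Note for the two functions above: np.corrcoef(X, y) returns the full 2x2 correlation
# matrix; the scalar Pearson correlation between cnt and week is the off-diagonal
# element. A small illustration (values are made up):
#   corr = np.corrcoef([1, 2, 3, 4], [2, 4, 5, 9])
#   corr[0, 1]  # -> about 0.96, the single coefficient of interest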
def save_question_45_to_es(index_name="question_cd_update"):
    # Bulk insert into Elasticsearch.
    try:
        es.indices.delete(index_name)
        log.info("{} has been deleted".format(index_name))
        setting = {"number_of_shards": 6, "number_of_replicas": 0}
        mapping = {
            "timestamp": {"enabled": "true"},
            "properties": {
                "logdate": {"type": "date", "format": "dd/MM/yyy HH:mm:ss"}
            }
        }
        settings = {"settings": setting, "mappings": mapping}  # key was "mapping"
        es.indices.create(index=index_name, ignore=400, body=settings)
    except Exception:
        pass
    file_dir = "antbot/datasets/city_questions_740432.csv"
    if not os.path.isfile(file_dir):
        raise FileNotFoundError("data file not found")
    data = pd.read_csv(file_dir).values.tolist()
    line_number = 0
    all_data = []
    for m in tqdm(data):
        body = {
            '_index': '{}'.format(index_name),
            '_type': 'post',
            '_id': line_number,     # was the builtin `id`; use the running counter instead
            '_source': {'q': m[0]}  # was an empty string; field name 'q' is an assumption
        }
        all_data.append(body)
        line_number += 1
        if line_number % 10000 == 0:
            try:
                success, _ = bulk(es, all_data, index=index_name, raise_on_error=True)
                all_data = []
                log.info("==================== success :{}/{} ====================".format(
                    line_number, len(data)))
            except Exception as e:
                log.debug("bulk insert failed: {}".format(e))
    # flush any remaining documents
    if all_data:
        bulk(es, all_data, index=index_name, raise_on_error=True)
def cut_data():
    out = []
    data_name = os.path.join(root_path, 'datasets/cd_by_nosplit.txt')
    with open(data_name, 'r') as fr:
        lines = fr.readlines()
        for line in tqdm(lines):
            line_cut = cut(replace_symbol(line), add_stopwords=True)
            for x in line_cut:
                out.append(x)
    log.info(" Length: {} ".format(len(out)))
    with open(os.path.join(root_path, "datasets/cd.txt"), 'w') as fw:
        fw.writelines(" ".join(out))
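# cut() and replace_symbol() come from the project's utils and are not shown here.
# A minimal sketch of what they are assumed to do (jieba segmentation plus punctuation
# stripping; the stopword file path is illustrative, not from the project):
import re
import jieba

def replace_symbol_sketch(text):
    # drop punctuation and whitespace, keep CJK characters, letters and digits
    return re.sub(r"[^\w\u4e00-\u9fff]+", "", text)

def cut_sketch(text, add_stopwords=True):
    stopwords = set()
    if add_stopwords:
        with open("datasets/stopwords.txt", "r", encoding="utf-8") as f:
            stopwords = set(w.strip() for w in f)
    return [w for w in jieba.lcut(text) if w.strip() and w not in stopwords]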
def geturls():
    all_urls = []
    files = ['shehui', 'tiyu', 'renwu', 'ziran', 'wenhua', 'lishi', 'dili', 'keji']
    for file in files:
        rooturl = 'http://baike.baidu.com/{}'.format(file)
        log.info("{}".format(rooturl))
        html = urlhelper(rooturl)
        soup = BeautifulSoup(html, "lxml")
        for a in soup.find_all('a', href=True):
            href = a['href']
            if "view" in href:
                log.info("href: {}".format(href))
                all_urls.append(href)
                hrefhtml = urlhelper(href)
                hrefsoup = BeautifulSoup(hrefhtml, "lxml")
                for a in hrefsoup.find_all('a', href=True):
                    href = a['href']
                    if "view" in href and "http" not in href:
                        nexthref = "http://baike.baidu.com" + href
                        log.info("nexthref: {}".format(nexthref))
                        all_urls.append(nexthref)
                    elif "view" in href and "http" in href:
                        nexthref = href
                        log.info("nexthref: {}".format(nexthref))
                        all_urls.append(nexthref)
                    else:
                        continue
    df = pd.DataFrame(all_urls)
    df.to_csv("./localdatasets/baike/urls.txt", index=None)
def GP():
    data = pd.read_csv("./datasets/results/data_train.csv").values
    kernel = C(0.1, (0.001, 0.1)) * RBF(0.5, (1e-4, 10))
    reg = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10, alpha=0.1)
    train_x, train_y = data[:, :-1], data[:, -1]
    log.info("{} {}".format(train_x.shape, train_y.shape))
    reg.fit(train_x, train_y)
    test_data = pd.read_csv("./datasets/test_A_20171225.txt", sep="\t")
    # TODO: keep retraining the model as new data arrives and predict the next value
    results = []
    for x in tqdm(test_data.values):
        predict_x = np.array([[x[0], x[1]]])
        p = reg.predict(predict_x)[0]
        results.append(int(p))
        train_x_update = list_reverse_pop(train_x, x)
        new_train_x = np.array(train_x_update)
        train_y_update = list_reverse_pop(train_y, p)
        new_train_y = np.array(train_y_update)
        log.info("{}, {}".format(new_train_x.shape, new_train_y.shape))
        reg.fit(new_train_x, new_train_y)
    log.info("predict: {}".format(results))
    test_data_copy = test_data.copy()
    test_data_copy['predict'] = results
    test_data_copy.to_csv("./datasets/results/predict_GP.csv", index=None)
    log.info('{}'.format(results))
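# list_reverse_pop() is a project helper that is not shown here. From its use above it
# is assumed to implement a fixed-size sliding window: drop the oldest sample and append
# the newest observation (or prediction). A minimal sketch under that assumption:
def list_reverse_pop_sketch(history, new_item):
    window = list(history)
    window.pop(0)            # discard the oldest entry
    window.append(new_item)  # append the newly observed sample / prediction
    return window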
def get_date_features():
    root = 'yancheng/datasets/results/train/'
    log.info("Total files :{}".format(len(os.listdir(root))))
    res = []
    for file in tqdm(os.listdir(root)):
        rows = OrderedDict()
        file_name = os.path.join(root, file)
        data = pd.read_csv(file_name)
        cnt = np.sum(data['cnt'].values)
        rows['date'] = data['date'].values[0]
        rows['week'] = data['day_of_week'].values[0]
        rows['cnt'] = cnt
        res.append(rows)
    df = pd.DataFrame(res).sort_values(by='date')
    df.to_csv(os.path.join("yancheng/datasets/results", "total_by_day.csv"), index=None)
def conact_csv():
    out = []
    path = 'word2vector_code/datasets/rawdata'
    for file in tqdm(os.listdir(path)):
        data_name = os.path.join(path, file)
        with open(data_name, 'r') as fr:
            lines = fr.readlines()
            for line in lines:
                for x in line.split():
                    if x.isdigit():
                        continue
                    out.append(x)
        log.info(" NEXT ")
    log.info("{}".format(len(out)))
    with open("word2vector_code/datasets/train.csv", 'w') as fw:
        fw.writelines(" ".join(out))
def nntest(model_dir="datasets/results/models", class_number=2):
    root = os.path.dirname(os.path.realpath(__file__))
    pos = pd.read_csv("Order_predicts/datasets/results/test/action_pos_features.csv")
    neg = pd.read_csv("Order_predicts/datasets/results/test/action_neg_features.csv")
    data = pd.concat([pos, neg])
    data = shuffle(data)
    ids = data['id']
    data = data.fillna(-1).replace(np.inf, 100)
    for col in ['16_tmode', '10_t9', '28_tmode', '27_atmedian', '29_atptp',
                'continent', 'province', 'country', 'city', 'age']:
        del data[col]
    x_data_holder = tf.placeholder(tf.float32, [None, 33], name='inputs_x')
    y_prediction = neural_networks(x_data_holder, 33, class_number)
    # Create the session and restore the checkpoint once, not once per id.
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    ckpt = tf.train.get_checkpoint_state(os.path.join(root, model_dir))
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)
    for i in ids:
        batch_x = data[data['id'].isin([i])]
        del batch_x['id']
        y_pre = sess.run(y_prediction, feed_dict={x_data_holder: batch_x.values})
        log.info("{}".format(y_pre))
        normal, spam = y_pre[0][0], y_pre[0][1]
        log.info("{}, {}".format(normal, spam))
        res = {}
        if normal > spam:
            res['pos'] = normal
        elif normal < spam:
            res['neg'] = spam
        print(res)
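# neural_networks() is defined elsewhere in the project (also used by train_nnmodel()
# further down). From its call sites it is assumed to build a small feed-forward
# classifier mapping `input_dim` features to `class_number` scores, returned as logits
# (train_nnmodel() feeds the output to softmax_cross_entropy_with_logits). A minimal
# sketch under that assumption -- the hidden width of 64 is illustrative:
def neural_networks_sketch(x, input_dim, class_number):
    w1 = tf.Variable(tf.truncated_normal([input_dim, 64], stddev=0.1))
    b1 = tf.Variable(tf.zeros([64]))
    h1 = tf.nn.relu(tf.matmul(x, w1) + b1)
    w2 = tf.Variable(tf.truncated_normal([64, class_number], stddev=0.1))
    b2 = tf.Variable(tf.zeros([class_number]))
    return tf.matmul(h1, w2) + b2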
def model(input_tensor):
    log.info("input_tensor: {}".format(input_tensor))
    with tf.device("/gpu:0"):
        weights = []
        conv_00_w = tf.get_variable(
            "conv_00_w", [3, 3, 3, 64],
            initializer=tf.random_normal_initializer(stddev=np.sqrt(2.0 / 9)))
        conv_00_b = tf.get_variable("conv_00_b", [64],
                                    initializer=tf.constant_initializer(0))
        weights.append(conv_00_w)
        weights.append(conv_00_b)
        tensor = tf.nn.relu(
            tf.nn.bias_add(
                tf.nn.conv2d(input_tensor, conv_00_w, strides=[1, 1, 1, 1],
                             padding='SAME'), conv_00_b))
        for i in range(18):
            conv_w = tf.get_variable(
                "conv_%02d_w" % (i + 1), [3, 3, 64, 64],
                initializer=tf.random_normal_initializer(stddev=np.sqrt(2.0 / 9 / 64)))
            conv_b = tf.get_variable("conv_%02d_b" % (i + 1), [64],
                                     initializer=tf.constant_initializer(0))
            weights.append(conv_w)
            weights.append(conv_b)
            tensor = tf.nn.relu(
                tf.nn.bias_add(
                    tf.nn.conv2d(tensor, conv_w, strides=[1, 1, 1, 1],
                                 padding='SAME'), conv_b))
        conv_w = tf.get_variable(
            "conv_20_w", [3, 3, 64, 1],
            initializer=tf.random_normal_initializer(stddev=np.sqrt(2.0 / 9 / 64)))
        conv_b = tf.get_variable("conv_20_b", [1],
                                 initializer=tf.constant_initializer(0))
        weights.append(conv_w)
        weights.append(conv_b)
        tensor = tf.nn.bias_add(
            tf.nn.conv2d(tensor, conv_w, strides=[1, 1, 1, 1], padding='SAME'), conv_b)
        tensor = tf.add(tensor, input_tensor)
        log.info("out tensor :{}".format(tensor))
        return tensor, weights
def randomforest():
    # Feature selection with a random forest.
    pos = pd.read_csv("Order_predicts/datasets/results/train/action_pos_features.csv")
    posfillna = pos.fillna(pos.median()).replace(np.inf, 100)
    neg = pd.read_csv("Order_predicts/datasets/results/train/action_neg_features.csv")
    negfillna = neg.fillna(neg.median()).replace(np.inf, 100)
    data = pd.concat([posfillna, negfillna])
    data = shuffle(data)
    data.to_csv("Order_predicts/datasets/results/train.csv", index=None)
    log.info("train data saved successfully ...")
    del data['id']
    Y = data['label']
    del data['label']
    X = data
    names = data.columns
    rf = RandomForestRegressor(n_estimators=10, criterion="mse", max_depth=None,
                               min_samples_split=2, min_samples_leaf=1,
                               min_weight_fraction_leaf=0., max_features="log2",
                               max_leaf_nodes=None, min_impurity_decrease=0.,
                               min_impurity_split=None, bootstrap=True, oob_score=False,
                               n_jobs=1, random_state=1, verbose=0, warm_start=False)
    rf.fit(X, Y)
    res = sorted(zip(map(lambda x: round(x, 4), rf.feature_importances_), names),
                 reverse=True)
    for x in res:
        log.info("{}: {}".format(x[0], x[1]))
def read_newdata():
    path = '/home/duyp/mayi_datasets/question/question_new'
    number = 0
    for file in os.listdir(path):
        filename = os.path.join(path, file)
        data = pd.read_csv(filename, lineterminator="\n").values
        out = []
        for x in tqdm(data):
            msg = x[0]
            if isinstance(msg, str):
                msgcut = cut(replace_symbol(msg), add_stopwords=True)
                for i in msgcut:
                    out.append(i)
                number += 1
            else:
                continue
        with open(os.path.join(root_path, "datasets/rawdata/{}".format(file)), 'w') as fw:
            fw.writelines(" ".join(out))
    log.info("{}".format(number))
def linear_model():
    data = pd.read_csv("yancheng/datasets/results/train/total_by_day_ex_sorted.csv")
    x, y = data['week'].values, data['cnt'].values
    train_x = np.array(x[:int(len(x) * 0.8)]).reshape(782, 1)
    train_y = np.array(y[:int(len(x) * 0.8)]).reshape(782, 1)
    test_x = np.array(x[int(len(x) * 0.8):]).reshape(196, 1)
    test_y = np.array(y[int(len(x) * 0.8):]).reshape(196, )  # was sliced from x by mistake
    x_scaler = preprocessing.StandardScaler().fit(train_x)
    train_x = x_scaler.transform(train_x).reshape(782, 1)
    test_x = x_scaler.transform(test_x).reshape(196, 1)  # scale test inputs with the same scaler
    y_scaler = preprocessing.StandardScaler().fit(train_y)
    train_y = y_scaler.transform(train_y).reshape(782, )
    test_y = y_scaler.transform(test_y.reshape(-1, 1)).reshape(196, )  # keep targets in the same scaled space
    lm = LinearRegression()
    lm.fit(train_x, train_y)
    joblib.dump(lm, "yancheng/datasets/results/linear_model.m")
    score = lm.score(test_x, test_y)
    mse = mean_squared_error(test_y, lm.predict(test_x))
    log.info("{}, {}".format(score, mse))
def modeltest():
    batch_size = 4
    run_config = tf.ConfigProto()
    run_config.gpu_options.allow_growth = True
    sess = tf.Session(config=run_config)
    is_training = tf.placeholder(tf.bool, [])
    x = tf.placeholder(tf.float32, [None, image_size, image_size, 3])
    downscaled = downscale(x)
    imitation, G_vars = generator(downscaled, is_training, False)
    real_output, _ = discriminator(x, is_training, False)
    fake_output, D_vars = discriminator(imitation, is_training, True)
    saver = tf.train.Saver(max_to_keep=100)
    ckpt = tf.train.get_checkpoint_state('./checkpoints/srgan_new/')
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)
        log.info("Model load success ... {}".format(ckpt.model_checkpoint_path))
    x_test = np.load('yaogantest.npy')
    log.info("{}".format(x_test.shape))
    k = 0
    epoch = 100
    for x_batch_images in minibatches(inputs=x_test, batch_size=batch_size):
        raw = normalize(x_batch_images)
        mos, fake = sess.run([downscaled, imitation],
                             feed_dict={x: raw, is_training: False})
        log.info("{},{},{}".format(mos.shape, fake.shape, raw.shape))
        imgs = [mos, fake, raw]
        for i in range(batch_size):
            fig = plt.figure(figsize=(290, 110))
            label = ['input', 'output', 'original image']
            for j, img in enumerate(imgs):
                im = np.uint8((img[i] + 1) * 127.5)
                ax = fig.add_subplot(1, len(imgs), j + 1)
                plt.imshow(im)
                plt.tick_params(labelbottom='off')
                plt.tick_params(labelleft='off')
                plt.gca().get_xaxis().set_ticks_position('none')
                plt.gca().get_yaxis().set_ticks_position('none')
                ax.set_xlabel(label[j])
            epoch_ = "{0:09d}".format(epoch)
            path = os.path.join('result', '{}_{}_{}.jpg'.format(k, i, epoch_))
            plt.savefig(path)  # `path` was computed but unused; save the figure instead of only showing it
            plt.close()
        k += 1
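# minibatches() is a shared project utility (also used by the training functions below)
# and is not defined in this file. From its call sites it is assumed to yield successive
# fixed-size slices of array-like inputs, optionally paired with targets. A minimal
# sketch under that assumption:
def minibatches_sketch(inputs, targets=None, batch_size=32, shuffle=False):
    indices = np.arange(len(inputs))
    if shuffle:
        np.random.shuffle(indices)
    for start in range(0, len(inputs) - batch_size + 1, batch_size):
        batch_idx = indices[start:start + batch_size]
        if targets is None:
            yield inputs[batch_idx]
        else:
            yield inputs[batch_idx], np.asarray(targets)[batch_idx]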
def sort_file_by_dict(data_dir, input_filename, output_filename, delete=True):
    """
    The output file is saved in the same directory as the input file.
    :param data_dir: data root directory
    :param input_filename: name of the file to sort
    :param output_filename: name of the output file
    :param delete: whether to strip punctuation
    :return: 0
    """
    locale.setlocale(locale.LC_ALL, locale='zh_CN.UTF-8')
    files = []
    line_number = 0
    inputs_dir = os.path.join(data_dir, input_filename)
    with open(inputs_dir, 'r') as fr:
        lines = fr.readlines()
        for line in lines:
            if delete:
                line_new = replace_symbol(line.replace("\n", '').strip())
                if len(line_new) > 1:
                    files.append(line_new)
                    line_number += 1
                    # TODO: alternatively, save every 500000 lines to speed up saving.
                    if line_number % 10000 == 0:
                        log.info("=============== process : {} ===============".format(line_number))
            else:
                line_new = line.replace("\n", '').strip()
                files.append(line_new)
                line_number += 1
                if line_number % 10000 == 0:
                    pass
    log.info(" Total lines : {}".format(line_number))
    b = sorted(files, key=cmp_to_key(locale.strcoll))
    df = pd.DataFrame(b)
    df.columns = ['message']
    output_dir = os.path.join(data_dir, output_filename)
    log.info("Save file : {}".format(output_dir))
    df.to_csv(output_dir, index=None)
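# Example invocation (the directory and file names below are illustrative, not taken
# from the project):
#   sort_file_by_dict(data_dir="antbot/datasets",
#                     input_filename="messages.txt",
#                     output_filename="messages_sorted.csv",
#                     delete=True)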
                                           staircase=True)
tf.summary.scalar("learning rate", learning_rate)
optimizer = tf.train.AdamOptimizer(learning_rate)
opt = optimizer.minimize(loss, global_step=global_step)
saver = tf.train.Saver(weights, max_to_keep=0)
config = tf.ConfigProto()
with tf.Session(config=config) as sess:
    if not os.path.exists('logs'):
        os.mkdir('logs')
    merged = tf.summary.merge_all()
    file_writer = tf.summary.FileWriter('logs', sess.graph)
    tf.global_variables_initializer().run()  # tf.initialize_all_variables() is deprecated
    for epoch in trange(0, max_epoch):
        log.info(" ................... Start Training ...................")
        batch_count = train_list_length // batch_size
        log.info("{}".format(batch_count))
        for bc in range(batch_count):
            offset = bc * batch_size
            for hr, lr in get_image_batch_forpng(bc, batch_size):
                input_data, gt_data = read_data2arr(lr), read_data2arr(hr)
                log.debug("{}, {}".format(input_data.shape, gt_data.shape))
                feed_dict = {train_input: input_data, train_gt: gt_data}
                run_obj = [opt, loss, train_output, learning_rate, global_step
            prob = clf_weights.predict(batch_x.values)
        except Exception:
            prob = clf_weights.predict_proba(batch_x.values)[0][1]
        # log.info("{}, {:0.8f}".format(p[0], prob))
        df_push.loc[linenumber, 'userid'] = int(i)
        df_push.loc[linenumber, 'orderType'] = "{}".format(prob)
        linenumber += 1
    df_push.to_csv("Order_predicts/datasets/results_push.csv", index=None)


if __name__ == "__main__":
    import sys
    if len(sys.argv) < 2:
        exit(1)
    method = sys.argv[1]
    if method == 'select':
        randomforest()
    if method == 'train':
        m_names = [
            'svm', 'svr', 'lasso', 'mlpr', 'rf', 'adaboost', 'gbr', 'qda',
            'lda', 'n_n', 'gnb', 'bnb', 'dcc', 'RAN', 'SGDR'
        ]
        log.info("Total number of models: {}".format(len(m_names)))
        train_models(model_name='logistic', epoch=50, batch_size=2000)
    if method == "test":
        modeltest(model_name='logistic')
def train_models(model_name, epoch=5, batch_size=100):
    log.info("current model:{}".format(model_name))
    pos = pd.read_csv("Order_predicts/datasets/results/train/action_pos_features.csv")
    posfillna = pos.fillna(pos.median()).replace(np.inf, 100)
    neg = pd.read_csv("Order_predicts/datasets/results/train/action_neg_features.csv")
    negfillna = neg.fillna(neg.median()).replace(np.inf, 100)
    data = pd.concat([posfillna, negfillna])
    data = shuffle(data)
    del data['id']
    y = data['label']
    del data['label']
    scaler = preprocessing.StandardScaler().fit(data)
    X = scaler.transform(data)
    pd.DataFrame(X).to_csv("Order_predicts/datasets/results/scale_x.csv", index=None)
    data_scaled = preprocessing.scale(X)
    log.info("data shape: {}".format(data_scaled.shape))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
    log.info("{}, {}".format(X_train.shape, X_test.shape))
    i = 0
    for e in range(epoch):
        for train_x, train_y in minibatches(X_train, y_train, batch_size=batch_size, shuffle=False):
            if model_name == 'svc':
                clf_weights = svm.SVC(C=1.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0,
                                      shrinking=True, probability=False, tol=1e-3, cache_size=200,
                                      class_weight={1: 10}, verbose=False, max_iter=-1,
                                      decision_function_shape='ovr', random_state=0)
            elif model_name == 'svr':
                clf_weights = svm.SVR(kernel='rbf', degree=3, gamma='auto', coef0=0.0, tol=1e-3,
                                      C=1.0, epsilon=0.1, shrinking=True, cache_size=200,
                                      verbose=False, max_iter=-1)
            elif model_name == 'lasso':
                clf_weights = Lasso(alpha=1.0, fit_intercept=True, normalize=False,
                                    precompute=False, copy_X=True, max_iter=1000, tol=1e-4,
                                    warm_start=False, positive=False, random_state=0,
                                    selection='cyclic')
            elif model_name == 'logistic':
                clf_weights = LogisticRegression(penalty='l2', dual=False, tol=1e-4, C=1.0,
                                                 fit_intercept=True, intercept_scaling=1,
                                                 class_weight={0: 0.1, 1: 0.9}, random_state=0,
                                                 solver='newton-cg', max_iter=100,
                                                 multi_class='ovr', verbose=0, warm_start=False,
                                                 n_jobs=1)
            elif model_name == 'mlpr':
                # learning_rate: {'constant', 'invscaling', 'adaptive'}
                clf_weights = MLPRegressor(hidden_layer_sizes=(100,), activation="logistic",
                                           solver='adam', alpha=0.0001, batch_size='auto',
                                           learning_rate="constant", learning_rate_init=0.001,
                                           power_t=0.5, max_iter=200, shuffle=True, random_state=0,
                                           tol=1e-4, verbose=False, warm_start=False, momentum=0.9,
                                           nesterovs_momentum=True, early_stopping=False,
                                           validation_fraction=0.1, beta_1=0.9, beta_2=0.999,
                                           epsilon=1e-8)
            elif model_name == 'rf':
                clf_weights = RandomForestClassifier(n_estimators=20, criterion="entropy",
                                                     max_depth=None, min_samples_split=2,
                                                     min_samples_leaf=1,
                                                     min_weight_fraction_leaf=0.,
                                                     max_features="auto", max_leaf_nodes=None,
                                                     min_impurity_decrease=0.,
                                                     min_impurity_split=None, bootstrap=True,
                                                     oob_score=False, n_jobs=1, random_state=0,
                                                     verbose=0, warm_start=False,
                                                     class_weight={0: 0.1, 1: 0.9})
            elif model_name == 'adaboost':
                base_estimator = RandomForestClassifier(n_estimators=20, criterion="entropy",
                                                        max_depth=None, min_samples_split=2,
                                                        min_samples_leaf=1,
                                                        min_weight_fraction_leaf=0.,
                                                        max_features="auto", max_leaf_nodes=None,
                                                        min_impurity_decrease=0.,
                                                        min_impurity_split=None, bootstrap=True,
                                                        oob_score=False, n_jobs=1, random_state=0,
                                                        verbose=0, warm_start=False,
                                                        class_weight={0: 0.1, 1: 0.9})
                base_estimator1 = LogisticRegression(penalty='l2', dual=False, tol=1e-4, C=1.0,
                                                     fit_intercept=True, intercept_scaling=1,
                                                     class_weight={0: 0.1, 1: 0.9}, random_state=0,
                                                     solver='newton-cg', max_iter=100,
                                                     multi_class='ovr', verbose=0,
                                                     warm_start=False, n_jobs=1)
                clf_weights = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=50,
                                                 learning_rate=0.6666, algorithm='SAMME.R',
                                                 random_state=0)
            elif model_name == 'gbr':
                clf_weights = GradientBoostingRegressor(loss='ls', learning_rate=0.1,
                                                        n_estimators=100, subsample=1.0,
                                                        criterion='friedman_mse',
                                                        min_samples_split=2, min_samples_leaf=1,
                                                        min_weight_fraction_leaf=0., max_depth=3,
                                                        min_impurity_decrease=0.,
                                                        min_impurity_split=None, init=None,
                                                        random_state=0, max_features=None,
                                                        alpha=0.9, verbose=0, max_leaf_nodes=None,
                                                        warm_start=False, presort='auto')
            elif model_name == 'qda':
                clf_weights = QuadraticDiscriminantAnalysis(priors=None, reg_param=0.,
                                                            store_covariance=False, tol=1.0e-4,
                                                            store_covariances=None)
            elif model_name == 'lda':
                clf_weights = LinearDiscriminantAnalysis(solver='svd', shrinkage=None, priors=None,
                                                         n_components=None, store_covariance=False,
                                                         tol=1e-4)
            elif model_name == 'n_n':
                clf_weights = NearestNeighbors(n_neighbors=5, radius=1.0, algorithm='auto',
                                               leaf_size=30, metric='minkowski', p=2,
                                               metric_params=None, n_jobs=1)
            elif model_name == 'gnb':
                clf_weights = GaussianNB(priors=None)
            elif model_name == 'bnb':
                clf_weights = BernoulliNB(alpha=1.0, binarize=.0, fit_prior=True, class_prior=None)
            elif model_name == 'dcc':
                clf_weights = DecisionTreeClassifier(criterion="gini", splitter="best",
                                                     max_depth=None, min_samples_split=2,
                                                     min_samples_leaf=1,
                                                     min_weight_fraction_leaf=0.,
                                                     max_features=None, random_state=0,
                                                     max_leaf_nodes=None, min_impurity_decrease=0.,
                                                     min_impurity_split=None, class_weight=None,
                                                     presort=False)
            elif model_name == 'dcr':
                clf_weights = DecisionTreeRegressor(criterion="mse", splitter="best",
                                                    max_depth=None, min_samples_split=2,
                                                    min_samples_leaf=1,
                                                    min_weight_fraction_leaf=0., max_features=None,
                                                    random_state=0, max_leaf_nodes=None,
                                                    min_impurity_decrease=0.,
                                                    min_impurity_split=None, presort=False)
            elif model_name == 'RAN':
                base_estimator = LinearRegression()
                clf_weights = RANSACRegressor(base_estimator=base_estimator, min_samples=None,
                                              residual_threshold=None, is_data_valid=None,
                                              is_model_valid=None, max_trials=100,
                                              max_skips=np.inf, stop_n_inliers=np.inf,
                                              stop_score=np.inf, stop_probability=0.99,
                                              residual_metric=None, loss='absolute_loss',
                                              random_state=0)
            elif model_name == 'adar':
                clf_weights = AdaBoostRegressor(base_estimator=None, n_estimators=50,
                                                learning_rate=1., loss='linear',
                                                random_state=None)
            else:  # model_name == 'SGDR'
                clf_weights = SGDRegressor(loss="squared_loss", penalty="l2", alpha=0.0001,
                                           l1_ratio=0.15, fit_intercept=True, max_iter=None,
                                           tol=None, shuffle=True, verbose=0, epsilon=0.1,
                                           random_state=None, learning_rate="invscaling",
                                           eta0=0.01, power_t=0.25, warm_start=False,
                                           average=False, n_iter=None)
            # fit on the current mini-batch
            clf_weights.fit(train_x, train_y)
            i += 1
            if i % 20 == 0:
                mse = mean_squared_error(y_test, clf_weights.predict(X_test))
                log.info("mean squared error: {}".format(mse))
                avgscores = cross_val_score(clf_weights, train_x, train_y).mean()
                log.info("{}/{} mean cross-validation score on the training set: {}".format(
                    e, i, avgscores))
                model_path = os.path.join("Order_predicts/datasets/results/models",
                                          '{}'.format(model_name))
                if not os.path.exists(model_path):
                    os.makedirs(model_path)
                joblib.dump(clf_weights,
                            os.path.join(model_path, "{}_{}.model".format(e, i)))
                log.info(" Saved ")
            if i % 50 == 0:
                scores = clf_weights.score(X_test, y_test)
                log.info("validation score: {}".format(scores))
        if x not in da2list:
            out.append(x)
    df = pd.Series(out)
    df.to_csv("word2vector_code/datasets/diff.csv", index=None)


if __name__ == "__main__":
    import sys
    if len(sys.argv) < 2:
        raise Exception("[!] You should put more args")
    method = sys.argv[1]
    if method == 'cut':
        cut_data()
        log.info(" ! Build Success ! ")
    if method == 'train':
        import logging
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                            level=logging.INFO)
        max_words = 10000
        data_name = 'word2vector_code/datasets/train.csv'
        sentences = TextBatch(fname=data_name, max_sentence_length=max_words)
        model = word2vec.Word2Vec(sentences, iter=5, workers=4, size=1000,
                                  min_count=1, alpha=0.025,
def train():
    epoch = 100
    is_training = tf.placeholder(tf.bool, [])
    x = tf.placeholder(tf.float32, [None, image_size, image_size, 3])
    downscaled = downscale(x)
    imitation, G_vars = generator(downscaled, is_training, False)
    real_output, _ = discriminator(x, is_training, False)
    fake_output, D_vars = discriminator(imitation, is_training, True)
    g_loss, d_loss = inference_losses(x, imitation, real_output, fake_output)
    run_config = tf.ConfigProto()
    run_config.gpu_options.allow_growth = True
    sess = tf.Session(config=run_config)
    with tf.variable_scope('srgan'):
        global_step = tf.Variable(0, name='global_step', trainable=False)
        opt = tf.train.AdamOptimizer(learning_rate=learning_rate)
        g_train_op = opt.minimize(g_loss, global_step=global_step, var_list=G_vars)
        d_train_op = opt.minimize(d_loss, global_step=global_step, var_list=D_vars)
    init = tf.global_variables_initializer()
    sess.run(init)
    saver = tf.train.Saver(max_to_keep=100)
    ckpt = tf.train.get_checkpoint_state('./checkpoints/srgan/')
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)
        log.info("ckpt:{}".format(ckpt.model_checkpoint_path))
        log.info("Model load success ... ")
    x_train = np.load('x_train.npy')
    total_batch_count = len(range(0, len(x_train) - batch_size + 1, batch_size))
    log.info("total bc: {}".format(total_batch_count))
    for i in range(epoch):
        bc = -1
        for x_batch_images in minibatches(inputs=x_train, batch_size=batch_size):
            bc += 1
            x_batch = normalize(x_batch_images)
            log.info("{}".format(x_batch.shape))
            sess.run([g_train_op, d_train_op],
                     feed_dict={x: x_batch, is_training: True})
            g, d = sess.run([g_loss, d_loss],
                            feed_dict={x: x_batch, is_training: True})
            if bc % 50 == 0:
                # log the current epoch index, not the total epoch count
                log.info("epoch:{}, bc:{}, gloss:{}, dloss:{}".format(i, bc, g, d))
            if bc % 500 == 0:
                model_name = "srgan"
                checkpoint_dir = './checkpoints/srgan/'
                if not os.path.exists(checkpoint_dir):
                    os.makedirs(checkpoint_dir)
                savename = os.path.join(checkpoint_dir, model_name)
                saver.save(sess, savename)
                log.info("Model saved successfully: {} ".format(savename))
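# normalize() and downscale() are project helpers shared with modeltest() above and are
# not defined in this file. Minimal sketches under the assumption that images arrive as
# uint8 arrays and are mapped to [-1, 1] (modeltest() inverts this with (img + 1) * 127.5),
# and that the generator input is a pooled reduction of the high-resolution batch; the
# downscale factor of 4 is an assumption, not taken from the project:
def normalize_sketch(images):
    return images.astype(np.float32) / 127.5 - 1.0

def downscale_sketch(x, factor=4):
    # average pooling as a simple stand-in for the original downscaling operator
    return tf.layers.average_pooling2d(x, pool_size=factor, strides=factor)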
import os
import time
from glob import glob
from six.moves import xrange
import pprint
import numpy as np
import tensorflow as tf

from pyduyp.utils.dl.ops.Convolution import conv2d as conv2d
from pyduyp.utils.dl.ops.Convolution import deconv2d as deconv2d
from pyduyp.utils.dl.ops.batchnorm import Contrib_batch_norm as batch_norm
from pyduyp.utils.image_utils import save_images, get_image
from pyduyp.utils.dl.ops.variable_ops import lrelu, conv_out_size_same, show_all_variables
from pyduyp.utils.dl.ops.Linear import linear as linear
from pyduyp.logger.log import log

log.info("================= DCGAN Running =================")
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

flags = tf.app.flags
flags.DEFINE_float("learning_rate", 0.0002, "Learning rate for adam [0.0002]")
flags.DEFINE_float("beta1", 0.5, "Momentum term of adam [0.5]")
flags.DEFINE_integer("epoch", 15, "Epoch to train [15]")
flags.DEFINE_integer("train_size", np.inf, "The size of train images [np.inf]")
flags.DEFINE_integer("batch_size", 64, "The size of batch images [64]")
flags.DEFINE_integer("sample_num", 64, "The size of batch images [64]")
flags.DEFINE_integer(
    "input_height", 108,
def train_nnmodel(epoch, learning_rate, batch_size, data_path='datasets/results',
                  data_name="train.csv", class_number=2,
                  checkpoint_dir="datasets/results/models"):
    root = os.path.dirname(os.path.realpath(__file__))
    data_path = os.path.join(root, data_path, data_name)
    df_ohe = pd.read_csv(data_path)
    log.info("{}".format(df_ohe.shape))
    df_ohe = shuffle(df_ohe)
    train_y = df_ohe['label']
    train_y = pd.get_dummies(train_y)
    del df_ohe['label']
    train_x = df_ohe
    x_data_holder = tf.placeholder(tf.float32, [None, train_x.shape[1]], name='inputs_x')
    y_data_holder = tf.placeholder(tf.float32, [None, class_number], name='inputs_y')
    y_prediction = neural_networks(x_data_holder, train_x.shape[1], class_number)
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=y_data_holder, logits=y_prediction))
    train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
    y_pre_max = tf.argmax(y_prediction, axis=1)     # index of the largest predicted score
    y_train_max = tf.argmax(y_data_holder, axis=1)  # index of the true class
    correct_prediction = tf.equal(y_pre_max, y_train_max)  # boolean vector
    bool2float = tf.cast(correct_prediction, tf.float32)   # bool -> float32
    accuracy = tf.reduce_mean(bool2float)                  # accuracy
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(max_to_keep=5)
        for e in range(epoch):
            counter = 0
            batch_count = len(train_x) // batch_size
            for batch_x, batch_y in minibatches(inputs=train_x, targets=train_y,
                                                batch_size=batch_size, shuffle=False):
                sess.run(train_step,
                         feed_dict={x_data_holder: batch_x, y_data_holder: batch_y})
                train_loss = sess.run(loss,
                                      feed_dict={x_data_holder: batch_x, y_data_holder: batch_y})
                train_acc = sess.run(accuracy,
                                     feed_dict={x_data_holder: batch_x, y_data_holder: batch_y})
                if np.mod(counter, 10) == 1:
                    log_out = "Epoch:{} Batch Count: {}/{}, Train Accuracy: {:06f}; Loss: {:06f}"
                    log.info(log_out.format(e, counter, batch_count, train_acc, train_loss))
                counter += 1
                if np.mod(counter, 10) == 1:
                    if not os.path.exists(checkpoint_dir):
                        os.makedirs(checkpoint_dir)
                    checkpoint_name = os.path.join(root, checkpoint_dir)
                    saver.save(sess, save_path=os.path.join(checkpoint_name,
                                                            "{}.model".format(counter)))
                    log.debug(" Model {} saved successfully ...".format(checkpoint_name))
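# Example invocation (hyperparameter values are illustrative, not taken from the project):
#   train_nnmodel(epoch=10, learning_rate=0.01, batch_size=128,
#                 data_path='datasets/results', data_name="train.csv",
#                 class_number=2, checkpoint_dir="datasets/results/models")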
import pandas as pd
import os
from tqdm import tqdm
from collections import OrderedDict
import numpy as np

from pyduyp.utils.utils import time2day, time2mouth, time2week
from pyduyp.utils.utils import compute_interval_of_day
from pyduyp.utils.utils import get_freq_of_day_and_month, get_week_freq, get_type_freq
from pyduyp.utils.utils import com_mode, pandas_quantile
from pyduyp.logger.log import log
from pyduyp.utils.large_datastruts import citysdict, continentdicts, countrydicts, provincedicts, agesdicts

log.info("Start running ...")


def get_pos_action_by_id(step='train'):
    # From the order history, collect the user ids that placed premium orders
    # and split their rows out of the action table.
    pos_root = 'Order_predicts/datasets/results/{}/action_pos/'.format(step)
    if not os.path.exists(pos_root):
        os.makedirs(pos_root)
    action = pd.read_csv("Order_predicts/datasets/{}/action_{}.csv".format(step, step))
    orderHistory = pd.read_csv(
        "Order_predicts/datasets/{}/orderHistory_{}.csv".format(step, step),
        usecols=['userid', 'orderTime', 'orderType'])
    data_pos = orderHistory[orderHistory['orderType'].isin(['1'])]['userid'].values
    pos_ids = []
    for posid in tqdm(data_pos):
        pos_features = {}
def get_action_features(step):
    pos_root = 'Order_predicts/datasets/results/{}/action_pos/'.format(step)
    neg_root = 'Order_predicts/datasets/results/{}/action_neg/'.format(step)
    if step == 'test':
        neg_root = 'Order_predicts/datasets/results/{}/action/'.format(step)
    if not os.path.exists(pos_root):
        os.makedirs(pos_root)
    if not os.path.exists(neg_root):
        os.makedirs(neg_root)
    for root in [pos_root, neg_root]:
        if 'pos' in root and step == 'test':
            continue
        base_name = root.split("/")[-2]
        history_path = "Order_predicts/datasets/results/{}/history_{}".format(
            step, base_name.split('_')[-1])
        actions = []
        for file in tqdm(os.listdir(root)):
            rows = OrderedDict()
            aid = file.split(".")[0]
            rows['id'] = aid
            if step == 'train':
                rows['label'] = 1 if base_name == 'action_pos' else 0
            else:
                rows['label'] = 0
            data = pd.read_csv(os.path.join(root, file))
            data_types = data['actionType'].values.tolist()
            data_copy = data.copy()
            data_copy['time2days'] = data['actionTime'].apply(time2day)
            data_copy['time2mouth'] = data['actionTime'].apply(time2mouth)
            data_copy['time2week'] = data['actionTime'].apply(time2week)
            data_copy_grouped_day = data_copy.groupby(by='time2days')
            data_copy_grouped_month = data_copy.groupby(by='time2mouth')
            time_counts, two_interval = compute_interval_of_day(data_copy)
            quantiles = pandas_quantile(time_counts)
            if len(time_counts) > 4:
                last_times = time_counts[-4:-1]
            else:
                last_times = [0, 0, 0]
            type_freq, types_sum = get_type_freq(data)  # count per action type, total number of actions
            if os.path.isfile(os.path.join(history_path, file)):
                history = pd.read_csv(os.path.join(history_path, file))
                historydata_copy = history.copy()
                # was indexing `data` here, which belongs to the action table; use the history table
                historydata_copy['time2days'] = history['actionTime'].apply(time2day)
                historyinterval, two_historyinterval = compute_interval_of_day(historydata_copy)
                if len(historyinterval) > 1:
                    historyinterval = historyinterval
                else:
                    historyinterval = [0, 0, 0, 0, 0]
            else:
                historyinterval = [0, 0, 0, 0, 0]
            rows['2_t1'] = type_freq['2_t1']  # click counts for action types 1 through 9 (2_t1 .. 10_t9)
            rows['3_t2'] = type_freq['3_t2']
            rows['4_t3'] = type_freq['4_t3']
            rows['5_t4'] = type_freq['5_t4']
            rows['6_t5'] = type_freq['6_t5']
            rows['7_t6'] = type_freq['7_t6']
            rows['8_t7'] = type_freq['8_t7']
            rows['9_t8'] = type_freq['9_t8']
            rows['10_t9'] = type_freq['10_t9']
            rows['11_rate1'] = type_freq['2_t1'] / types_sum   # fraction of app opens
            rows['12_rate9'] = type_freq['10_t9'] / types_sum  # fraction of order placements
            rows['13_atmean'] = np.mean(time_counts)           # time mean
            rows['14_atstd'] = np.std(time_counts)             # time standard deviation
            rows['15_atmedian'] = np.median(time_counts)       # time median
            rows['16_tmode'] = com_mode(time_counts)           # time mode
            rows['17_atptp'] = np.max(time_counts) - np.min(time_counts) if len(time_counts) > 0 else 0  # time range
            rows['18_atvar'] = np.var(time_counts)             # time variance
            rows['19_xishu'] = np.mean(time_counts) / np.std(time_counts) if len(time_counts) > 1 else 0  # time coefficient of variation
            rows['20_lastmean'] = np.mean(last_times)  # mean of the last three days' intervals
            rows['21_laststd'] = np.std(last_times)    # std of the last three days' intervals
            rows['22_dayrate'] = get_freq_of_day_and_month(data_copy_grouped_day)      # daily average
            rows['23_monthrate'] = get_freq_of_day_and_month(data_copy_grouped_month)  # monthly average
            rows['24_weekrate'] = get_week_freq(data_copy['time2week'].values)         # weekly average
            rows['25_atmean'] = np.mean(data_types)      # type mean
            rows['26_atstd'] = np.std(data_types)        # type standard deviation
            rows['27_atmedian'] = np.median(data_types)  # type median
            rows['28_tmode'] = com_mode(data_types)      # type mode
            rows['29_atptp'] = np.max(data_types) - np.min(data_types) if len(data_types) > 0 else 0  # type range
            rows['30_atvar'] = np.var(data_types)        # type variance
            rows['31_xishu'] = np.mean(data_types) / np.std(data_types) if len(data_types) > 1 else 0  # type coefficient of variation
            rows['32_rate2'] = type_freq['3_t2'] / types_sum  # fraction of type 2
            rows['33_rate3'] = type_freq['4_t3'] / types_sum  # fraction of type 3
            rows['34_rate4'] = type_freq['5_t4'] / types_sum  # fraction of type 4
            rows['35_rate5'] = type_freq['6_t5'] / types_sum  # fraction of type 5
            rows['36_rate6'] = type_freq['7_t6'] / types_sum  # fraction of type 6
            rows['37_rate7'] = type_freq['8_t7'] / types_sum  # fraction of type 7
            rows['38_rate8'] = type_freq['9_t8'] / types_sum  # fraction of type 8
            rows['39_htptp'] = np.max(historyinterval) - np.min(historyinterval)  # range of order-history times
            rows['40_atmean'] = np.mean(two_interval)      # time mean
            rows['41_atstd'] = np.std(two_interval)        # time standard deviation
            rows['42_atmedian'] = np.median(two_interval)  # time median
            rows['43_tmode'] = com_mode(two_interval)      # time mode
            rows['44_atptp'] = np.max(two_interval) - np.min(two_interval) if len(two_interval) > 0 else 0  # time range
            rows['45_atvar'] = np.var(two_interval)        # time variance
            rows['46_xishu'] = np.mean(two_interval) / np.std(two_interval) if len(two_interval) > 1 else 0  # time coefficient of variation
            rows['47_quantile2'] = quantiles[0]
            rows['48_quantile4'] = quantiles[1]
            actions.append(rows)
        df = pd.DataFrame(actions)
        df = df.replace(np.inf, 100)
        df = df.round(7)
        df = df.round({'label': 0, 'id': 0})
        save_name = "Order_predicts/datasets/results/{}/{}_features.csv".format(step, base_name)
        if step == 'test':
            del df['label']
        df.to_csv(save_name, index=None)
        log.info(" !!! {}".format(save_name))