def crawl_avatar(zhihu_user_token_name, avatar_url):
    """
    Crawl a zhihu user's avatar image and save it under ./result/avatar/.

    :param zhihu_user_token_name: url token of the user; used as the file name
    :param avatar_url: direct URL of the avatar image
    :return: None
    """
    print('start crawling avatar of %s ...' % str(zhihu_user_token_name))
    response = requests.get(avatar_url)
    zhihu_util.mkdirs_if_not_exist('./result/avatar/')
    if response.status_code == 200:
        # the 'with' statement flushes and closes the file automatically;
        # the explicit f.flush()/f.close() calls were redundant
        with open('./result/avatar/%s.jpg' % str(zhihu_user_token_name),
                  mode='wb') as f:
            f.write(response.content)
def train_and_test_xgboost(train, test, train_Y, test_Y):
    """
    Train and evaluate an XGBoost booster through text files on disk.

    Dumps the splits to ./xgb/ZhihuLive.txt.{train,test}, trains a
    binary:logistic booster for 2 rounds, and predicts on the test set.

    :param train: 2-D array-like of training features
    :param test: 2-D array-like of test features
    :param train_Y: training labels
    :param test_Y: test labels
    :return: predictions on the test set
    """
    mkdirs_if_not_exist('./xgb')

    # Hoist the label flattening out of the loops; the original recomputed
    # np.array(...).ravel().tolist() on every iteration (accidental O(n^2)).
    train_labels = np.array(train_Y).ravel().tolist()
    test_labels = np.array(test_Y).ravel().tolist()

    # Convert each feature to str explicitly: " ".join() raises TypeError on
    # raw floats (the test branch previously joined test[i] unconverted,
    # inconsistent with the train branch).
    zhihu_live_train = [
        "%f %s" % (train_labels[i], " ".join(str(_) for _ in train[i]))
        for i in range(len(train_labels))
    ]
    with open('./xgb/ZhihuLive.txt.train', mode='wt', encoding='utf-8') as f:
        f.write("\r\n".join(zhihu_live_train))

    zhihu_live_test = [
        str(test_labels[i]) + " " + " ".join(str(_) for _ in test[i])
        for i in range(len(test_labels))
    ]
    with open('./xgb/ZhihuLive.txt.test', mode='wt', encoding='utf-8') as f:
        f.write("\r\n".join(zhihu_live_test))

    dtrain = xgb.DMatrix('./xgb/ZhihuLive.txt.train')
    dtest = xgb.DMatrix('./xgb/ZhihuLive.txt.test')
    # specify parameters via map
    param = {
        'max_depth': 2,
        'eta': 1,
        'silent': 1,
        'objective': 'binary:logistic'
    }
    num_round = 2
    bst = xgb.train(param, dtrain, num_round)
    # make prediction; return it so callers can evaluate (previously the
    # result was computed and discarded)
    preds = bst.predict(dtest)
    return preds
def train_and_test_model(train, test, train_Y, test_Y):
    """
    Train an SVR (RBF kernel) regressor, persist it to ./model/svr.pkl and
    report MAE / RMSE on the held-out test split.

    :param train: training features
    :param test: test features
    :param train_Y: training labels (DataFrame-like; ``.values`` is read)
    :param test_Y: test labels
    :return: None
    """
    import math

    model = SVR(kernel='rbf')
    model.fit(train, train_Y.values.ravel())

    mkdirs_if_not_exist('./model')
    joblib.dump(model, './model/svr.pkl')

    predicted_score = model.predict(test)
    mae_lr = round(mean_absolute_error(test_Y, predicted_score), 4)
    # math.sqrt instead of np.math.sqrt: np.math is a private alias that was
    # removed in recent NumPy releases.
    rmse_lr = round(math.sqrt(mean_squared_error(test_Y, predicted_score)), 4)
    print('===============The Mean Absolute Error is {0}===================='.
          format(mae_lr))
    print(
        '===============The Root Mean Square Error is {0}===================='.
        format(rmse_lr))

    from util.zhihu_util import out_result
    out_result(predicted_score, test_Y)
# fast_text_classifier.train_and_eval() # fast_text_classifier.train_word_repr() # print(fast_text_classifier.get_word_repr("知乎")) texts, rates = read_corpus() print("There are {0} records in total...".format(len(rates))) X, y = get_fast_text_repr(fasttext.load_model('fastTextRepr.bin'), texts, rates) print(X.shape) print('start training classifier...') X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y) rf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0) rf.fit(X_train, y_train) mkdirs_if_not_exist('./model') joblib.dump(rf, './model/rf.pkl') y_pred = rf.predict(X_test) cm = confusion_matrix(y_test, y_pred) print(cm) print('finish training classifier...')
def train_and_test_mtnet(train, test, train_Y, test_Y, epoch):
    """
    Train MTNet on Zhihu Live data, save the weights, then evaluate on the
    test split and dump ground-truth/prediction pairs to ./result/.

    :param train: training features
    :param test: test features
    :param train_Y: training labels
    :param test_Y: test labels
    :param epoch: number of training epochs
    :return: None
    """
    import math

    trainloader = torch.utils.data.DataLoader(ZhihuLiveDataset(train, train_Y),
                                              batch_size=cfg['batch_size'],
                                              shuffle=True,
                                              num_workers=4)
    testloader = torch.utils.data.DataLoader(ZhihuLiveDataset(test, test_Y),
                                             batch_size=cfg['batch_size'],
                                             shuffle=False,
                                             num_workers=4)

    mtnet = MTNet()
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        mtnet = nn.DataParallel(mtnet)
    # move the model to the target device once, instead of once per batch
    mtnet = mtnet.to(DEVICE)

    criterion = nn.MSELoss()
    optimizer = optim.Adam(mtnet.parameters(),
                           lr=cfg['init_lr'],
                           weight_decay=cfg['weight_decay'])
    # learning_rate_scheduler = lr_scheduler.StepLR(optimizer, step_size=cfg['lr_decay_step'], gamma=0.1)

    # renamed loop variable: it previously shadowed the `epoch` parameter
    for epoch_idx in range(epoch):
        # learning_rate_scheduler.step()
        running_loss = 0.0
        for i, data_batch in enumerate(trainloader, 0):
            inputs, labels = data_batch['data'], data_batch['label']
            inputs = inputs.to(DEVICE)
            labels = labels.to(DEVICE)

            optimizer.zero_grad()
            # call the module itself, not .forward(), so registered hooks run
            outputs = mtnet(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            if i % 10 == 0:
                # average over the 10-iteration window (was divided by 100
                # while printing every 10 batches)
                print('[%d, %5d] loss: %.3f' %
                      (epoch_idx + 1, i + 1, running_loss / 10))
                running_loss = 0.0

    print('Finished Training\n')
    print('save trained model...')
    model_path_dir = './model'
    if not os.path.isdir(model_path_dir) or not os.path.exists(model_path_dir):
        os.makedirs(model_path_dir)
    # NOTE(review): when wrapped in DataParallel the class name becomes
    # 'DataParallel', which changes the saved file name — confirm intended.
    torch.save(
        mtnet.state_dict(),
        os.path.join(model_path_dir,
                     'ZhihuLive_{0}.pth'.format(mtnet.__class__.__name__)))

    mtnet.eval()
    predicted_labels = []
    gt_labels = []
    # inference only: disable autograd bookkeeping for speed and memory
    with torch.no_grad():
        for data_batch in testloader:
            inputs, labels = data_batch['data'], data_batch['label']
            inputs = inputs.to(DEVICE)
            outputs = mtnet(inputs)
            predicted_labels += outputs.to("cpu").data.numpy().tolist()
            gt_labels += labels.numpy().tolist()

    mae_lr = round(
        mean_absolute_error(np.array(gt_labels), np.array(predicted_labels)),
        4)
    # math.sqrt instead of np.math.sqrt: np.math is a private alias that was
    # removed in recent NumPy releases.
    rmse_lr = round(
        math.sqrt(
            mean_squared_error(np.array(gt_labels),
                               np.array(predicted_labels))), 4)
    print(
        '===============The Mean Absolute Error of MTNet is {0}===================='
        .format(mae_lr))
    print(
        '===============The Root Mean Square Error of MTNet is {0}===================='
        .format(rmse_lr))

    mkdirs_if_not_exist('./result')
    col = ['gt', 'pred']
    df = pd.DataFrame([[gt_labels[i][0], predicted_labels[i][0]]
                       for i in range(len(predicted_labels))],
                      columns=col)
    df.to_csv("./result/output-%s.csv" % mtnet.__class__.__name__, index=False)
def crawl_zhihu_followee(follow_base='xulu-0620', followee_num=100):
    """
    Crawl followees of a zhihu user via the v4 members API, saving their
    profile fields (and avatars) page by page to ./result/<follow_base>.xlsx.

    :param follow_base: url token of the user whose followees are crawled
    :param followee_num: upper bound on the number of followees to fetch
    :return: None
    """
    headers = {
        'accept': 'application/json, text/plain, */*',
        'DNT': '1',
        'Host': 'www.zhihu.com',
        'Referer': 'https://www.zhihu.com/people/excited-vczh/following',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36',
        'X-UDID': 'AHCvxRvVYw2PTphuWeHxakvGHH0KDysWXA8='
    }
    # column layout matches the field order appended to followee_list below;
    # defined once instead of being rebuilt on every page
    col = [
        'url_token', 'name', 'gender', 'answer_count', 'articles_count',
        'badge', 'follower_count', 'avatar_url'
    ]
    followee_list = []
    cookies = dict(cookies_are='')

    for offset in range(0, followee_num, 20):  # the API pages 20 entries
        payload = {
            'include':
            'data[*].answer_count, articles_count, gender, follower_count, is_followed, is_following,badge[?(type=best_answerer)].topics',
            'offset': str(offset),
            'limit': '20'
        }
        time.sleep(2)  # throttle requests to stay under rate limits
        print('start crawling page %d' % int(offset / 20))
        response = requests.get(
            'https://www.zhihu.com/api/v4/members/%s/followees' %
            str(follow_base),
            params=payload,
            headers=headers,
            cookies=cookies)

        if response.status_code != 200:
            print('No access!!')
            continue

        result = response.json()
        if len(result['data']) == 0:
            print('All followees have been crawled~~')
            continue

        for fle in result['data']:
            url_token = fle['url_token']
            answer_count = fle['answer_count']
            articles_count = fle['articles_count']
            avatar_url = fle['avatar_url'].replace('_is.jpg', '_xll.jpg')
            badge = str(fle['badge'])
            follower_count = fle['follower_count']
            gender = fle['gender']
            name = fle['name']
            followee_list.append([
                url_token, name, gender, answer_count, articles_count,
                badge, follower_count, avatar_url
            ])
            # fetch every followee's avatar (previously only the last one on
            # each page was crawled); avatar failures are best-effort only,
            # but no longer silently swallowed by a bare except
            try:
                crawl_avatar(url_token, avatar_url)
            except Exception as e:
                print('failed to crawl avatar of %s: %s' % (url_token, e))

        # rewrite the sheet after each page so partial progress survives a
        # crash mid-crawl
        df = pd.DataFrame(followee_list, columns=col)
        zhihu_util.mkdirs_if_not_exist('./result/')
        df.to_excel('./result/' + str(follow_base) + ".xlsx",
                    sheet_name=str(follow_base),
                    index=False)