def crawl_avatar(zhihu_user_token_name, avatar_url):
    """
    crawl a zhihu user's avatar and save it under ./result/avatar/
    :param zhihu_user_token_name: the user's url_token, used as the file name
    :param avatar_url: direct URL of the avatar image
    :return:
    """

    print('start crawling avatar of %s ...' % str(zhihu_user_token_name))
    response = requests.get(avatar_url)
    zhihu_util.mkdirs_if_not_exist('./result/avatar/')
    if response.status_code == 200:
        # the with-statement flushes and closes the file automatically
        with open('./result/avatar/%s.jpg' % str(zhihu_user_token_name),
                  mode='wb') as f:
            f.write(response.content)
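# Hedged usage sketch (not in the original source): the url_token and the
# avatar URL below are placeholders for illustration; crawl_avatar assumes
# `requests` and `zhihu_util` are imported at module level.
def demo_crawl_avatar():
    sample_token = 'excited-vczh'  # hypothetical url_token
    sample_url = 'https://pic1.zhimg.com/sample_xll.jpg'  # hypothetical URL
    crawl_avatar(sample_token, sample_url)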
def train_and_test_xgboost(train, test, train_Y, test_Y):
    """
    train and test XGBoost
    :param train: training features (2-D array-like)
    :param test: test features (2-D array-like)
    :param train_Y: training labels
    :param test_Y: test labels
    :return: predictions on the test set
    """
    mkdirs_if_not_exist('./xgb')

    # xgb.DMatrix loads text files in LIBSVM format: "<label> <index>:<value> ..."
    train_labels = np.array(train_Y).ravel().tolist()
    zhihu_live_train = []
    for i in range(len(train_labels)):
        features = " ".join("%d:%f" % (idx, val)
                            for idx, val in enumerate(train[i]))
        zhihu_live_train.append("%f %s" % (train_labels[i], features))

    with open('./xgb/ZhihuLive.txt.train', mode='wt', encoding='utf-8') as f:
        f.write("\n".join(zhihu_live_train))

    test_labels = np.array(test_Y).ravel().tolist()
    zhihu_live_test = []
    for i in range(len(test_labels)):
        features = " ".join("%d:%f" % (idx, val)
                            for idx, val in enumerate(test[i]))
        zhihu_live_test.append("%f %s" % (test_labels[i], features))

    with open('./xgb/ZhihuLive.txt.test', mode='wt', encoding='utf-8') as f:
        f.write("\n".join(zhihu_live_test))

    # note: recent xgboost releases expect an explicit '?format=libsvm' URI suffix
    dtrain = xgb.DMatrix('./xgb/ZhihuLive.txt.train')
    dtest = xgb.DMatrix('./xgb/ZhihuLive.txt.test')
    # specify parameters via map
    param = {
        'max_depth': 2,
        'eta': 1,
        'silent': 1,
        'objective': 'binary:logistic'
    }
    num_round = 2
    bst = xgb.train(param, dtrain, num_round)
    # make prediction
    preds = bst.predict(dtest)

    return preds
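# Hedged usage sketch (not in the original source): drive train_and_test_xgboost
# with synthetic data. It assumes `train`/`test` are 2-D arrays of numeric
# features and that the labels are binary to match the 'binary:logistic'
# objective above; `numpy`, `xgboost` and `mkdirs_if_not_exist` must be
# importable as in the original module.
def demo_train_and_test_xgboost():
    import numpy as np
    from sklearn.model_selection import train_test_split

    X = np.random.rand(200, 8)
    y = (np.random.rand(200) > 0.5).astype(float)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)
    preds = train_and_test_xgboost(X_train, X_test, y_train, y_test)
    print(preds[:5])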
def train_and_test_model(train, test, train_Y, test_Y):
    """
    train and test mainstream ML regressors
    :param train: training features
    :param test: test features
    :param train_Y: training labels (pandas object, ravelled before fitting)
    :param test_Y: test labels
    :return:
    """
    # model = Pipeline([('poly', PolynomialFeatures(degree=3)),
    #                   ('linear', LinearRegression(fit_intercept=False))])

    # model = LinearRegression()
    # model = LassoCV(alphas=[_ * 0.1 for _ in range(1, 1000, 1)])
    # model = RidgeCV(alphas=[_ * 0.1 for _ in range(1, 1000, 1)])
    # model = RandomForestRegressor()
    model = SVR(kernel='rbf')
    # model = SVR(kernel='linear', C=1e3)
    # model = SVR(kernel='poly', C=1e3, degree=2)
    # model = KNeighborsRegressor(n_neighbors=10, n_jobs=4)

    # model = MLPRegressor(hidden_layer_sizes=(16, 8, 8, 4), early_stopping=True, alpha=1e-4,
    #                      batch_size=16, learning_rate='adaptive')
    model.fit(train, train_Y.values.ravel())
    mkdirs_if_not_exist('./model')
    joblib.dump(model, './model/svr.pkl')
    predicted_score = model.predict(test)
    mae_lr = round(mean_absolute_error(test_Y, predicted_score), 4)
    rmse_lr = round(np.sqrt(mean_squared_error(test_Y, predicted_score)), 4)
    print('===============The Mean Absolute Error is {0}===================='.
          format(mae_lr))
    print(
        '===============The Root Mean Square Error is {0}===================='.
        format(rmse_lr))

    from util.zhihu_util import out_result
    out_result(predicted_score, test_Y)
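# Hedged usage sketch (not in the original source): feed train_and_test_model
# random pandas data, since the function calls train_Y.values.ravel() and thus
# expects pandas labels. It also assumes the original project's
# util.zhihu_util module (providing out_result) is importable.
def demo_train_and_test_model():
    import numpy as np
    import pandas as pd
    from sklearn.model_selection import train_test_split

    X = pd.DataFrame(np.random.rand(300, 10))
    y = pd.DataFrame(np.random.rand(300, 1), columns=['score'])
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)
    train_and_test_model(X_train, X_test, y_train, y_test)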
# NOTE: the snippet below was an indented fragment whose enclosing definition
# was lost; the wrapper name here is a reconstruction for readability.
def train_and_test_fasttext_classifier():
    # fast_text_classifier.train_and_eval()

    # fast_text_classifier.train_word_repr()
    # print(fast_text_classifier.get_word_repr("知乎"))

    texts, rates = read_corpus()

    print("There are {0} records in total...".format(len(rates)))
    X, y = get_fast_text_repr(fasttext.load_model('fastTextRepr.bin'), texts,
                              rates)

    print(X.shape)

    print('start training classifier...')
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42,
                                                        stratify=y)

    rf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
    rf.fit(X_train, y_train)
    mkdirs_if_not_exist('./model')
    joblib.dump(rf, './model/rf.pkl')

    y_pred = rf.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    print(cm)

    print('finish training classifier...')
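# Hedged sketch (not in the original source): reload the persisted random
# forest and the fastText embedding model to score new, already-segmented
# text. `get_fast_text_repr` and 'fastTextRepr.bin' come from the snippet
# above; the sample sentence and dummy rate are placeholders.
def demo_predict_with_saved_rf():
    import fasttext
    import joblib

    ft_model = fasttext.load_model('fastTextRepr.bin')
    rf = joblib.load('./model/rf.pkl')
    sample_texts = ['这个 Live 内容 很 充实']  # hypothetical segmented text
    X_new, _ = get_fast_text_repr(ft_model, sample_texts, [0])
    print(rf.predict(X_new))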
def train_and_test_mtnet(train, test, train_Y, test_Y, epoch):
    """
    train and test with MTNet
    :param train: training features
    :param test: test features
    :param train_Y: training labels
    :param test_Y: test labels
    :param epoch: number of training epochs
    :return:
    """
    trainloader = torch.utils.data.DataLoader(ZhihuLiveDataset(train, train_Y),
                                              batch_size=cfg['batch_size'],
                                              shuffle=True,
                                              num_workers=4)
    testloader = torch.utils.data.DataLoader(ZhihuLiveDataset(test, test_Y),
                                             batch_size=cfg['batch_size'],
                                             shuffle=False,
                                             num_workers=4)

    mtnet = MTNet()

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        mtnet = nn.DataParallel(mtnet)

    criterion = nn.MSELoss()
    optimizer = optim.Adam(mtnet.parameters(),
                           lr=cfg['init_lr'],
                           weight_decay=cfg['weight_decay'])
    # learning_rate_scheduler = lr_scheduler.StepLR(optimizer, step_size=cfg['lr_decay_step'], gamma=0.1)

    mtnet = mtnet.to(DEVICE)

    for e in range(epoch):
        # learning_rate_scheduler.step()

        running_loss = 0.0
        for i, data_batch in enumerate(trainloader, 0):
            inputs, labels = data_batch['data'], data_batch['label']

            inputs = inputs.to(DEVICE)
            labels = labels.to(DEVICE)

            optimizer.zero_grad()

            outputs = mtnet(inputs)
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            if i % 10 == 9:
                # report the average loss over the last 10 mini-batches
                print('[%d, %5d] loss: %.3f' %
                      (e + 1, i + 1, running_loss / 10))
                running_loss = 0.0

    print('Finished Training\n')
    print('save trained model...')
    model_path_dir = './model'
    mkdirs_if_not_exist(model_path_dir)
    torch.save(
        mtnet.state_dict(),
        os.path.join(model_path_dir,
                     'ZhihuLive_{0}.pth'.format(mtnet.__class__.__name__)))

    mtnet.eval()
    predicted_labels = []
    gt_labels = []
    with torch.no_grad():
        for data_batch in testloader:
            inputs, labels = data_batch['data'], data_batch['label']
            inputs = inputs.to(DEVICE)

            outputs = mtnet(inputs)
            predicted_labels += outputs.to("cpu").data.numpy().tolist()
            gt_labels += labels.numpy().tolist()

    mae_lr = round(
        mean_absolute_error(np.array(gt_labels), np.array(predicted_labels)),
        4)
    rmse_lr = round(
        np.sqrt(
            mean_squared_error(np.array(gt_labels),
                               np.array(predicted_labels))), 4)
    print(
        '===============The Mean Absolute Error of MTNet is {0}===================='
        .format(mae_lr))
    print(
        '===============The Root Mean Square Error of MTNet is {0}===================='
        .format(rmse_lr))

    mkdirs_if_not_exist('./result')
    col = ['gt', 'pred']
    df = pd.DataFrame([[gt_labels[i][0], predicted_labels[i][0]]
                       for i in range(len(predicted_labels))],
                      columns=col)
    df.to_csv("./result/output-%s.csv" % mtnet.__class__.__name__, index=False)
def crawl_zhihu_followee(follow_base='xulu-0620', followee_num=100):
    """
    crawl the followees of a zhihu user via the web API
    :param follow_base: url_token of the user whose followees are crawled
    :param followee_num: maximum number of followees to crawl (20 per page)
    :return:
    """
    headers = {
        'accept': 'application/json, text/plain, */*',
        'DNT': '1',
        'Host': 'www.zhihu.com',
        'Referer': 'https://www.zhihu.com/people/excited-vczh/following',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36',
        'X-UDID': 'AHCvxRvVYw2PTphuWeHxakvGHH0KDysWXA8='
    }

    followee_list = []

    cookies = dict(cookies_are='')

    for offset in range(0, followee_num, 20):
        payload = {
            'include':
            'data[*].answer_count, articles_count, gender, follower_count, is_followed, is_following,badge[?(type=best_answerer)].topics',
            'offset': str(offset),
            'limit': '20'
        }

        time.sleep(2)
        print('start crawling page %d' % int(offset / 20))
        response = requests.get(
            'https://www.zhihu.com/api/v4/members/%s/followees' %
            str(follow_base),
            params=payload,
            headers=headers,
            cookies=cookies)

        if response.status_code == 200:
            result = response.json()
            if len(result['data']) > 0:
                for fle in result['data']:
                    url_token = fle['url_token']
                    answer_count = fle['answer_count']
                    articles_count = fle['articles_count']
                    avatar_url = fle['avatar_url'].replace(
                        '_is.jpg', '_xll.jpg')
                    badge = str(fle['badge'])
                    follower_count = fle['follower_count']
                    gender = fle['gender']
                    name = fle['name']

                    followee_list.append([
                        url_token, name, gender, answer_count, articles_count,
                        badge, follower_count, avatar_url
                    ])

                    try:
                        crawl_avatar(url_token, avatar_url)
                    except Exception:
                        # a failed avatar download should not abort the crawl
                        pass

                # dump the accumulated followees once per page instead of
                # rewriting the spreadsheet for every single followee
                col = [
                    'url_token', 'name', 'gender', 'answer_count',
                    'articles_count', 'badge', 'follower_count', 'avatar_url'
                ]
                df = pd.DataFrame(followee_list, columns=col)
                zhihu_util.mkdirs_if_not_exist('./result/')
                df.to_excel('./result/' + str(follow_base) + ".xlsx",
                            sheet_name=str(follow_base),
                            index=False)
            else:
                print('All followees have been crawled~~')
                break
        else:
            print('No access!!')
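# Hedged usage sketch (not in the original source): crawl the first two pages
# (40 followees) of a user. A valid Zhihu cookie normally has to be filled
# into `cookies_are` inside crawl_zhihu_followee for the API to answer; the
# url_token below is the same placeholder used as the function's default.
def demo_crawl_zhihu_followee():
    crawl_zhihu_followee(follow_base='xulu-0620', followee_num=40)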