def __init__(self):
    """Initialise crawler state: URLs, a Chrome WebDriver and the data store.

    Side effects: launches Chrome through the chromedriver binary at the
    hard-coded path, and calls ``SqlHelper.init_db('zhihu', 'zhihu_all')``.
    """
    # Zhihu's "unhuman" verification page; the ``message`` query parameter
    # is URL-encoded text.
    # NOTE(review): presumably compared with the browser's current URL to
    # detect that the crawler has been blocked -- confirm in the callers.
    self.black_page = 'https://www.zhihu.com/account/unhuman?type=unhuman&message=%E7%B3%BB%E7%BB%9F%E6%A3%80%E6%B5%8B%E5%88%B0%E6%82%A8%E7%9A%84%E5%B8%90%E5%8F%B7%E6%88%96IP%E5%AD%98%E5%9C%A8%E5%BC%82%E5%B8%B8%E6%B5%81%E9%87%8F%EF%BC%8C%E8%AF%B7%E8%BE%93%E5%85%A5%E4%BB%A5%E4%B8%8B%E5%AD%97%E7%AC%A6%E7%94%A8%E4%BA%8E%E7%A1%AE%E8%AE%A4%E8%BF%99%E4%BA%9B%E8%AF%B7%E6%B1%82%E4%B8%8D%E6%98%AF%E8%87%AA%E5%8A%A8%E7%A8%8B%E5%BA%8F%E5%8F%91%E5%87%BA%E7%9A%84'
    self.start_url = 'https://zhuanlan.zhihu.com/yinjiaoshou886/answer'

    # Browser first, then storage -- the same order of side effects as before.
    driver_path = '/home/caidong/developProgram/selenium/chromedriver'
    self.browser = webdriver.Chrome(executable_path=driver_path)

    self.SqlH = SqlHelper()
    self.SqlH.init_db('zhihu', 'zhihu_all')

    # Plain bookkeeping attributes.
    self.base_url = 'https://www.zhihu.com'
    self.user_home_url = ''
    self.current = 1
def __init__(self):
    """Initialise URLs, the category list, the data store and crawl state.

    Side effect: calls ``SqlHelper.init_db('zhihu')``.
    """
    self.start_url = 'https://www.zhihu.com/people/kaifulee/activities'
    self.base_url = 'https://www.zhihu.com'

    # NOTE(review): 'fashionbang' and 'technology' each appear twice;
    # kept exactly as found -- confirm whether the duplicates are intended.
    self.type = [
        'hot',
        'local',
        'shehui',
        'guonei',
        'guoji',
        'recomment',
        'junshi',
        'finance',
        'technology',
        'sports',
        'fashionbang',
        'fashionbang',
        'auto_moto',
        'fangcan',
        'technology',
        'yangshengtang',
    ]

    self.SqlH = SqlHelper()
    self.SqlH.init_db('zhihu')

    self.page = 2
    # Attribute name spelled "totla" (sic); left untouched because code
    # outside this method may refer to it by that name.
    self.totla_url_set = set()
    self.wait_use_url_set = set()
    self.current_type = ''
def __init__(self):
    """Prepare the follower crawler: URLs, data store, browser and page bounds.

    Side effects: calls ``SqlHelper.init_db('zhihu', 'zhihu_48000')``,
    launches Chrome through the chromedriver binary at the hard-coded path,
    sleeps for five seconds and prints the browser's cookies.
    """
    # Zhihu's "unhuman" verification page; the ``message`` query parameter
    # is URL-encoded text.
    self.black_page = 'https://www.zhihu.com/account/unhuman?type=unhuman&message=%E7%B3%BB%E7%BB%9F%E6%A3%80%E6%B5%8B%E5%88%B0%E6%82%A8%E7%9A%84%E5%B8%90%E5%8F%B7%E6%88%96IP%E5%AD%98%E5%9C%A8%E5%BC%82%E5%B8%B8%E6%B5%81%E9%87%8F%EF%BC%8C%E8%AF%B7%E8%BE%93%E5%85%A5%E4%BB%A5%E4%B8%8B%E5%AD%97%E7%AC%A6%E7%94%A8%E4%BA%8E%E7%A1%AE%E8%AE%A4%E8%BF%99%E4%BA%9B%E8%AF%B7%E6%B1%82%E4%B8%8D%E6%98%AF%E8%87%AA%E5%8A%A8%E7%A8%8B%E5%BA%8F%E5%8F%91%E5%87%BA%E7%9A%84'
    self.start_url = 'https://www.zhihu.com/people/kaifulee/followers?page=25583'
    self.base_url = 'https://www.zhihu.com'

    self.SqlH = SqlHelper()
    self.SqlH.init_db('zhihu','zhihu_48000')

    # Start exactly one driver. A PhantomJS driver used to be created here
    # and was overwritten by the Chrome driver on the very next statement,
    # so it could never be quit.
    self.browser = webdriver.Chrome(executable_path='/home/caidong/developProgram/selenium/chromedriver')

    # Commented-out experiments (a '--proxy-server' Chrome option, cookies
    # from ZhihuLogin().login(), and a pasted raw cookie header) were
    # removed: they were dead code and embedded session cookie values.
    time.sleep(5)
    print('cookie',self.browser.get_cookies())

    self.start_page = 48000
    self.end_page = 47000
def __init__(self):
    """Create the data store helper and remember the Zhihu base URL.

    Side effect: calls ``SqlHelper.init_db('zhihu')``.
    """
    store = SqlHelper()
    store.init_db('zhihu')
    self.SqlH = store

    self.base_url = 'https://www.zhihu.com'
def __init__(self):
    """Create the data store helper and initialise it for 'baiduNews'.

    Side effect: calls ``SqlHelper.init_db('baiduNews')``.
    """
    store = SqlHelper()
    store.init_db('baiduNews')
    self.SqlH = store
def __init__(self):
    """Create a ``SqlHelper`` and keep it as ``self.sqlhelper``.

    ``init_db`` is not called here.
    """
    helper = SqlHelper()
    self.sqlhelper = helper
return results print(items) return items def close_client(self): self.client.close() def count(self, condition=None): condition = dict(condition) return self.collection.find(condition).count() if __name__ == '__main__': from MongoHelp import MongoHelper as SqlHelper sqlhelper = SqlHelper() sqlhelper.init_db('zhihu', 'zhihu_all') pre = sqlhelper.count({}) print('sum:', str(sqlhelper.count({}))) time.sleep(10) now = sqlhelper.count({}) # url = sqlhelper.select_home_url({"$and":[{"special_url":{"$exists":True}},{"special_url":{"$ne":"none"}}]},count=100,page=1) # print("content",url) # for item in url: # print(item) ##### # url = sqlhelper.select_home_url({"special_name":{"$exists":True}},count=100,page=1) # for item in url: # print(item) #
def __init__(self):
    """Initialise the category list, the data store and paging state.

    Side effect: calls ``SqlHelper.init_db('weixin')``.
    """
    # NOTE(review): 'fashionbang' and 'technology' each appear twice;
    # kept exactly as found -- confirm whether the duplicates are intended.
    self.type = [
        'hot',
        'local',
        'shehui',
        'guonei',
        'guoji',
        'recomment',
        'junshi',
        'finance',
        'technology',
        'sports',
        'fashionbang',
        'fashionbang',
        'auto_moto',
        'fangcan',
        'technology',
        'yangshengtang',
    ]

    self.SqlH = SqlHelper()
    self.SqlH.init_db('weixin')

    self.page = 2
    self.current_type = ''
from MongoHelp import MongoHelper as SqlHelper import csv, time SqlH = SqlHelper() SqlH.init_db('zhiHu', 'zhihu_all') headers = [ 'user_name', 'answer_comment_1', 'answer_comment_2', 'answer_comment_3', 'article_comment_1', 'article_comment_2', 'article_comment_3', 'answer', 'user_home_url', 'article', 'flowing', 'followers', 'collect', 'answer', 'article' ] con = { "$and": [ { 'article_comment': { "$exists": True } }, { 'answer_comment': { "$exists": True } }, { 'flowing': { "$exists": True } }, # {'export_flag': {"$exists": False}} ]