def main(use_proxies=False):
    """Crawl the profile page and timeline page of every configured uid.

    Reads uids / cookies / user_agents from conf.yaml, reflects the
    ``wb_user`` (profile) and ``wb_data`` (posts) tables, and for each uid
    fetches ``https://weibo.cn/<uid>/info`` and ``https://weibo.cn/<uid>``,
    delegating parsing/persistence to ``getinfo`` and ``getmain``.

    :param use_proxies: forwarded to ``gethtml``; enables proxy usage.
    """
    conf, engine = Connect('conf.yaml')  # load configuration file
    uids = list(conf.get('uids').values())
    cookies = list(conf.get('cookies').values())
    user_agents = list(conf.get('user_agents').values())
    conn = engine.connect()
    metadata = MetaData(engine)
    wb_user = Table('wb_user', metadata, autoload=True)  # profile table (reflection)
    wb_data = Table('wb_data', metadata, autoload=True)  # posts table (reflection)
    try:
        for uid in uids:
            # Rotate cookie and UA per request to reduce the chance of a ban.
            cookie = getcookies(random.choice(cookies))
            # FIX: the header name must be 'User-Agent' (hyphen). The original
            # 'User_Agent' is not a valid HTTP header name, so servers never saw
            # the rotated agent string.
            headers = {'User-Agent': random.choice(user_agents)}
            infourl = 'https://weibo.cn/' + str(uid) + '/info'  # profile page
            mainurl = 'https://weibo.cn/' + str(uid)            # timeline page
            resinfo = gethtml(infourl, headers, cookie, conf, use_proxies)
            resmain = gethtml(mainurl, headers, cookie, conf, use_proxies)
            getinfo(resinfo, uid, wb_user, conn)
            getmain(resmain, uid, wb_data, conn, mainurl, user_agents,
                    cookies, conf, use_proxies)
    finally:
        # FIX: close the connection even if a fetch/parse step raises.
        conn.close()
def main():
    """Crawl the timeline of every configured uid using pickled login cookies.

    Loads uids from conf.yaml and browser cookies from ``cookies.pkl``
    (produced by the login helper), reflects the ``wb_user`` and ``wb_data``
    tables, and hands each uid to ``getmain``.
    """
    conf, engine = Connect('conf.yaml')  # load configuration file
    uids = list(conf.get('uids').values())
    # FIX: the original `pickle.load(open(...))` leaked the file handle;
    # use a context manager so the file is always closed.
    with open('cookies.pkl', 'rb') as f:
        cookies = pickle.load(f)
    conn = engine.connect()
    metadata = MetaData(engine)
    wb_user = Table('wb_user', metadata, autoload=True)  # profile table (reflection)
    wb_data = Table('wb_data', metadata, autoload=True)  # posts table (reflection)
    try:
        for uid in uids:
            getmain(cookies, uid, conn, wb_data, wb_user)
    finally:
        # FIX: close the connection even if getmain raises mid-loop.
        conn.close()
def get():
    """Log in to weibo.com with Selenium for each configured account and
    pickle the resulting browser cookies into ``cookies.pkl``.

    Credentials come from conf.yaml (``loginname`` / ``password``). The
    captcha is typed manually by the operator at the prompt; one wrong
    captcha gets a single retry.
    """
    conf, engine = Connect('conf.yaml')  # load configuration file
    loginname = list(conf.get('loginname').values())
    password = list(conf.get('password').values())

    def _submit_captcha(driver):
        # Ask the operator for the captcha, fill it in and submit the form.
        driver.find_element_by_xpath(
            '//*[@id="pl_login_form"]/div/div[3]/div[3]/div/input'
        ).send_keys(input("输入验证码: "))
        time.sleep(1)
        driver.find_element_by_xpath(
            '//*[@id="pl_login_form"]/div/div[3]/div[6]/a').click()

    with open('cookies.pkl', 'wb') as f:
        for i in range(len(password)):  # save the cookies of every account
            # FIX: create the driver OUTSIDE the try block — in the original,
            # a failure in webdriver.Chrome() made the except handler raise
            # NameError on the unbound `driver` variable.
            driver = webdriver.Chrome()
            try:
                # A too-small window makes elements report is_displayed=False
                # (i.e. invisible), so enlarge it first.
                driver.set_window_size(1124, 850)
                driver.get("http://www.weibo.com/login.php")
                time.sleep(5)
                try:
                    elem = driver.find_element_by_xpath('//*[@id="loginname"]')
                    elem.clear()
                    elem.send_keys(loginname[i])
                    pwd = driver.find_element_by_xpath(
                        '//*[@id="pl_login_form"]/div/div[3]/div[2]/div/input')
                    pwd.clear()
                    time.sleep(2)
                    pwd.send_keys(password[i])
                    driver.find_element_by_xpath(
                        '//*[@id="pl_login_form"]/div/div[3]/div[6]/a').click()
                    _submit_captcha(driver)
                except Exception:
                    # Captcha (or an intermediate element) failed — retry once.
                    print("验证码输入错误,请重新输入!")
                    _submit_captcha(driver)
                cookies = driver.get_cookies()
                print(cookies)
                pickle.dump(cookies, f)
            finally:
                # FIX: the original never closed the browser, leaking one
                # Chrome process per account.
                driver.quit()
def get_time_str(uid):
    """Fetch all wb_data rows for *uid* and return their timestamps and text.

    :param uid: user id whose posts are selected from ``wb_data``.
    :returns: ``(time_lists, text)`` where ``time_lists`` is the list of
        column-3 values (post times) and ``text`` is every column-2 value
        (post content) joined with a trailing ``'\\n'`` after each post.
    """
    _, engine = Connect('../conf.yaml')  # connect to the database
    conn = engine.connect()
    metadata = MetaData(engine)
    wb_data = Table('wb_data', metadata, autoload=True)
    s = select([wb_data]).where(wb_data.c.uid == uid)
    res = conn.execute(s)
    conn.close()
    # FIX: the original accumulated into a local named `str`, shadowing the
    # builtin, and used quadratic `+=` concatenation; collect then join.
    texts = []
    time_lists = []
    for row in res:
        texts.append(row[2])
        time_lists.append(row[3])
    text = ''.join(t + '\n' for t in texts)
    return time_lists, text
def DeleteUsers():
    """Remove database rows for users no longer listed in conf.yaml.

    Compares every uid present in ``wb_user`` with the uids configured in
    conf.yaml and deletes the posts (``wb_data``), topics (``wb_topic``)
    and profile (``wb_user``) of any uid that was removed from the config.
    """
    conf, engine = Connect('conf.yaml')
    conn = engine.connect()
    metadata = MetaData(engine)
    wb_data = Table('wb_data', metadata, autoload=True)
    wb_user = Table('wb_user', metadata, autoload=True)
    wb_topic = Table('wb_topic', metadata, autoload=True)
    empty = select([wb_user.c.uid])
    res = conn.execute(empty)  # every uid currently stored in wb_user
    uids = list(conf.get('uids').values())  # uids still configured
    deluid = [r[0] for r in res if int(r[0]) not in uids]  # uids to delete
    for uid in deluid:
        # FIX: the original filtered wb_data's delete on wb_user.c.uid,
        # which references the wrong table; each delete must filter on its
        # own table's uid column.
        conn.execute(wb_data.delete().where(wb_data.c.uid == str(uid)))    # posts
        conn.execute(wb_topic.delete().where(wb_topic.c.uid == str(uid)))  # topics
        conn.execute(wb_user.delete().where(wb_user.c.uid == str(uid)))    # profile
    conn.close()
def Save_Topic_Words(model, feature_names, uid, n_top_words=20):
    """Persist the top words of each fitted LDA topic for *uid*.

    For every topic in ``model.components_`` the ``n_top_words``
    highest-weighted terms are printed and upserted into ``wb_topic``
    (MySQL ``ON DUPLICATE KEY UPDATE`` keeps reruns idempotent).

    :param model: fitted LatentDirichletAllocation instance.
    :param feature_names: vocabulary list indexed by term id.
    :param uid: user id the topics belong to.
    :param n_top_words: how many top terms to store per topic.
    """
    _, engine = Connect('../conf.yaml')
    conn = engine.connect()
    metadata = MetaData(engine)
    wb_topic = Table('wb_topic', metadata, autoload=True)
    for topic_idx, weights in enumerate(model.components_):
        # Indices of the n_top_words highest-weighted terms, best first.
        top_term_ids = weights.argsort()[:-n_top_words - 1:-1]
        top_words = [feature_names[term_id] for term_id in top_term_ids]
        print("Topic #%d:" % topic_idx)
        print(top_words)
        for word in top_words:
            stmt = insert(wb_topic).values(
                uid=uid, topic=topic_idx, topic_cont=word)
            stmt = stmt.on_duplicate_key_update(topic=topic_idx)
            conn.execute(stmt)
    conn.close()
    # NOTE(review): the enclosing function (presumably `word2vec`) begins
    # before this chunk; the statements below are its tail and keep its
    # body indentation.
    lda = LatentDirichletAllocation(
        n_components=topics,      # number of topics to extract
        learning_method='batch',  # with a small corpus used only for learning,
                                  # "batch" needs far less parameter tuning
    )
    # Train the model with the variational Bayes method.
    lda.fit(tf)
    # Emit the keyword table of each topic in turn.
    tf_feature_names = tf_vectorizer.get_feature_names()
    return lda, tf, tf_feature_names, tf_vectorizer


def pyLDAvisUI(lda, tf, tf_vectorizer):
    """Render the fitted LDA topics as an interactive visualization."""
    page = pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)
    pyLDAvis.save_html(page, 'lda.html')  # save the visualization as an html file
    # pyLDAvis.save_json(page,'lda.json')  # save the visualization data as json


def main(uid):
    """Run the full topic pipeline for one user: words -> LDA -> store -> html."""
    wordlists, uid = getwords(uid)
    lda, tf, tf_feature_names, tf_vectorizer = word2vec(wordlists)
    Save_Topic_Words(lda, tf_feature_names, uid)
    pyLDAvisUI(lda, tf, tf_vectorizer)


if __name__ == '__main__':
    conf, _ = Connect('../conf.yaml')
    uid = conf.get('uids')
    uid = list(uid.values())[0]
    # Analyze the specified uid (it must already exist in conf.yaml and the
    # sina_spider program must have been run once); defaults to the first
    # uid listed in conf.yaml.
    main(uid)