def getSeedUserList():
    """Return every row of the ``seed_users`` table.

    Runs ``select id, uName, Wid from seed_users`` through a fresh
    ``dbOperP.dbOperator`` and returns whatever ``selectData`` yields.

    The connection is released in a ``finally`` clause so that it is closed
    even when the query raises.
    """
    dbo = dbOperP.dbOperator()
    try:
        return dbo.selectData("select id, uName, Wid from seed_users")
    finally:
        dbo.closeDb()
def extractTopic(self, searchStr, pid, unicodePage, firstPage):
    """Extract every weibo post from one result page and store it.

    Each ``<div class="c">`` that carries an ``id`` attribute is treated as
    one post.  For every post that is not already recorded, the method
    follows its repost link (through ``self.getretweet``), normalises its
    timestamp, strips the markup and inserts
    ``[None, pid, idstr, userName, content, timeStr]`` via ``dbo.insert``.

    Args:
        searchStr: Not used by this method; kept so callers need no change.
        pid: Value stored with every inserted row and compared with the
            ``pid`` already recorded for a weibo id.
        unicodePage: HTML of the page, handed to ``BeautifulSoup``.
        firstPage: When true, the first id-bearing item of the page is
            skipped.

    Returns:
        ``False`` when the duplicate counter has reached 3 at the check
        below, ``True`` once the whole page has been processed.

    NOTE(review): this file reached review with its indentation collapsed.
    The nesting used here (the ``else`` that resets ``total`` belongs to the
    ``if seleResult`` test) is a reconstruction - confirm it against the
    original layout.
    """
    soup = BeautifulSoup(unicodePage)
    items = soup.findAll("div", {"class": "c"})
    dbo = dbOperP.dbOperator()
    try:
        total = 0  # posts already stored under this same pid
        removeFirst = False
        for item in items:
            idstr = item.get("id", None)  # weibo id, e.g. 'M_AeJs808NB'
            if not idstr:
                continue
            if firstPage and not removeFirst:
                removeFirst = True
                continue

            # NOTE(review): idstr comes from scraped HTML and is interpolated
            # straight into the SQL text; use a parameterised query if
            # dbo.selectData offers one.
            seleResult = dbo.selectData(
                "select pid from data where wbid = \'%s\'" % idstr)
            if seleResult:
                opid = seleResult[0][0]
                if opid == pid:
                    total += 1
                    continue
            else:
                total = 0
            if total >= 3:
                return False

            # --- reposts: follow the repost link page by page ---
            contentStr1 = item.renderContents()
            i = contentStr1.find(r'>转发[')
            if i > 0:
                # Assumes a UTF-8 byte string (Python 2), where the marker is
                # 8 bytes long and is followed by the first digit of the
                # repost count - TODO confirm.
                retweetNum = int(contentStr1[i + 8])
                if retweetNum > 0:
                    hr = contentStr1[:i].split(" ")[-1]
                    rturl = hr.split(r'"')[1]
                    if rturl.find('cmt') < 0:
                        rturl = rturl.replace('amp;', '')
                        page = 1
                        go = True
                        # getretweet's return value decides whether the next
                        # page is requested.
                        while go:
                            url = rturl + '&page=' + str(page)
                            go = self.getretweet(url, idstr)
                            page += 1

            # --- timestamp ---
            subs = item.findAll("span", {"class": "ct"})
            if not subs:
                # Without a time span subs[0] would raise IndexError and
                # abort the whole page; skip just this item instead.
                continue
            for t in subs[0].findAll(True):
                t.hidden = True
            substr = subs[0].renderContents()
            # NOTE(review): the separator literals in this method show up as
            # a plain space in the reviewed text.  If this one and the
            # split(" ")[1] calls below really use the same character, those
            # [1] lookups can never succeed - check the original file (an
            # HTML entity may have been lost).
            substr = substr.strip().split(" ")[0]
            if substr.find('分钟前') > -1:
                # "N minutes ago": recorded as the current minute.
                timeStr = time.strftime("%Y-%m-%d %H:%M:00", time.localtime())
            elif substr.find('今天') > -1:
                # "today HH:MM"
                timeStr = (time.strftime("%Y-%m-%d ", time.localtime())
                           + substr.split(" ")[1])
            elif substr.find('月') > -1:
                # Month/day form: keep only the digits, then take the first
                # two as the month and the rest as the day.  Joining
                # explicitly keeps dateStr a sliceable string on every
                # Python version.
                dateStr = "".join(
                    ch for ch in substr.split(" ")[0] if ch.isdigit())
                timeStr = (time.strftime("%Y-", time.localtime())
                           + dateStr[0:2] + "-" + dateStr[2:] + " "
                           + substr.split(" ")[1] + ":00")
            else:
                timeStr = substr

            # --- text: drop the time spans, hide all tags, render ---
            for sub in subs:
                sub.extract()
            for tag in item.findAll(True):
                tag.hidden = True
            contentStr = item.renderContents()
            if contentStr.find("转发了") >= 0:
                # The item itself is a repost; it is not stored here.
                continue
            contentStr = contentStr.replace(" ", "")
            contentStr = contentStr.replace("<!-- -->", "")
            result = re.sub('赞.*收藏', "", contentStr)

            splitIndex = result.find(':')
            if splitIndex < 0:
                # No separator: previously find()'s -1 chopped the last
                # character off and stored the remainder as the user name.
                userName = ''
                content = result
            else:
                userName = result[:splitIndex]
                content = result[splitIndex + 1:]

            value = [None, pid, idstr, userName, content, timeStr]
            try:
                dbo.insert(value)
            except Exception:
                # Best effort: one failed insert must not abort the page.
                pass
        return True
    finally:
        # Runs on both return paths and on any exception.
        dbo.closeDb()
def getSearchList(self):
    """Return the list of search targets; override this method as needed.

    Runs ``select name,pid from peopleinfor`` through a fresh
    ``dbOperP.dbOperator`` and returns whatever ``selectData`` yields.

    The connection is released in a ``finally`` clause so that it is closed
    even when the query raises.
    """
    dbo = dbOperP.dbOperator()
    try:
        return dbo.selectData("select name,pid from peopleinfor")
    finally:
        dbo.closeDb()
def getSearchList(self):
    """Return the list of search targets; override this method as needed.

    The rows come from ``select name,pid from peopleinfor``, executed on a
    newly created ``dbOperP.dbOperator``; the result of ``selectData`` is
    returned unchanged.

    ``closeDb`` is called from a ``finally`` clause, so the connection is
    not left open when the query fails.
    """
    dbo = dbOperP.dbOperator()
    try:
        results = dbo.selectData("select name,pid from peopleinfor")
    finally:
        dbo.closeDb()
    return results
def extractTopic(self, searchStr, pid, unicodePage, firstPage):
    """Extract every weibo post from one result page and store it.

    Each ``<div class="c">`` that carries an ``id`` attribute is treated as
    one post.  For every post that is not already recorded, the method
    follows its repost link (through ``self.getretweet``), normalises its
    timestamp, strips the markup and inserts
    ``[None, pid, idstr, userName, content, timeStr]`` via ``dbo.insert``.

    Args:
        searchStr: Not used by this method; kept so callers need no change.
        pid: Value stored with every inserted row and compared with the
            ``pid`` already recorded for a weibo id.
        unicodePage: HTML of the page, handed to ``BeautifulSoup``.
        firstPage: When true, the first id-bearing item of the page is
            skipped.

    Returns:
        ``False`` when the duplicate counter has reached 3 at the check
        below, ``True`` once the whole page has been processed.

    NOTE(review): this file reached review with its indentation collapsed.
    The nesting used here (the ``else`` that resets ``total`` belongs to the
    ``if seleResult`` test) is a reconstruction - confirm it against the
    original layout.
    """
    soup = BeautifulSoup(unicodePage)
    items = soup.findAll("div", {"class": "c"})
    dbo = dbOperP.dbOperator()
    try:
        total = 0  # posts already stored under this same pid
        removeFirst = False
        for item in items:
            idstr = item.get("id", None)  # weibo id, e.g. 'M_AeJs808NB'
            if not idstr:
                continue
            if firstPage and not removeFirst:
                removeFirst = True
                continue

            # NOTE(review): idstr comes from scraped HTML and is interpolated
            # straight into the SQL text; use a parameterised query if
            # dbo.selectData offers one.
            seleResult = dbo.selectData(
                "select pid from data where wbid = \'%s\'" % idstr)
            if seleResult:
                opid = seleResult[0][0]
                if opid == pid:
                    total += 1
                    continue
            else:
                total = 0
            if total >= 3:
                return False

            # --- reposts: follow the repost link page by page ---
            contentStr1 = item.renderContents()
            i = contentStr1.find(r'>转发[')
            if i > 0:
                # Assumes a UTF-8 byte string (Python 2), where the marker is
                # 8 bytes long and is followed by the first digit of the
                # repost count - TODO confirm.
                retweetNum = int(contentStr1[i + 8])
                if retweetNum > 0:
                    hr = contentStr1[:i].split(" ")[-1]
                    rturl = hr.split(r'"')[1]
                    if rturl.find('cmt') < 0:
                        rturl = rturl.replace('amp;', '')
                        page = 1
                        go = True
                        # getretweet's return value decides whether the next
                        # page is requested.
                        while go:
                            url = rturl + '&page=' + str(page)
                            go = self.getretweet(url, idstr)
                            page += 1

            # --- timestamp ---
            subs = item.findAll("span", {"class": "ct"})
            if not subs:
                # Without a time span subs[0] would raise IndexError and
                # abort the whole page; skip just this item instead.
                continue
            for t in subs[0].findAll(True):
                t.hidden = True
            substr = subs[0].renderContents()
            # NOTE(review): the separator literals in this method show up as
            # a plain space in the reviewed text.  If this one and the
            # split(" ")[1] calls below really use the same character, those
            # [1] lookups can never succeed - check the original file (an
            # HTML entity may have been lost).
            substr = substr.strip().split(" ")[0]
            if substr.find('分钟前') > -1:
                # "N minutes ago": recorded as the current minute.
                timeStr = time.strftime("%Y-%m-%d %H:%M:00", time.localtime())
            elif substr.find('今天') > -1:
                # "today HH:MM"
                timeStr = (time.strftime("%Y-%m-%d ", time.localtime())
                           + substr.split(" ")[1])
            elif substr.find('月') > -1:
                # Month/day form: keep only the digits, then take the first
                # two as the month and the rest as the day.  Joining
                # explicitly keeps dateStr a sliceable string on every
                # Python version.
                dateStr = "".join(
                    ch for ch in substr.split(" ")[0] if ch.isdigit())
                timeStr = (time.strftime("%Y-", time.localtime())
                           + dateStr[0:2] + "-" + dateStr[2:] + " "
                           + substr.split(" ")[1] + ":00")
            else:
                timeStr = substr

            # --- text: drop the time spans, hide all tags, render ---
            for sub in subs:
                sub.extract()
            for tag in item.findAll(True):
                tag.hidden = True
            contentStr = item.renderContents()
            if contentStr.find("转发了") >= 0:
                # The item itself is a repost; it is not stored here.
                continue
            contentStr = contentStr.replace(" ", "")
            contentStr = contentStr.replace("<!-- -->", "")
            result = re.sub('赞.*收藏', "", contentStr)

            splitIndex = result.find(':')
            if splitIndex < 0:
                # No separator: previously find()'s -1 chopped the last
                # character off and stored the remainder as the user name.
                userName = ''
                content = result
            else:
                userName = result[:splitIndex]
                content = result[splitIndex + 1:]

            value = [None, pid, idstr, userName, content, timeStr]
            try:
                dbo.insert(value)
            except Exception:
                # Best effort: one failed insert must not abort the page.
                pass
        return True
    finally:
        # Runs on both return paths and on any exception.
        dbo.closeDb()