コード例 #1
0
def getSeedUserList():
    """Return the rows of the ``seed_users`` table.

    Runs ``select id, uName, Wid from seed_users`` through a fresh
    ``dbOperP.dbOperator`` and returns whatever ``selectData`` yields.
    The operator is closed in a ``finally`` clause so that a failing
    query does not leave it open.
    """
    dbo = dbOperP.dbOperator()
    try:
        return dbo.selectData("select id, uName, Wid from seed_users")
    finally:
        dbo.closeDb()
コード例 #2
0
def getSeedUserList():
    """Return the rows of the ``seed_users`` table.

    Runs ``select id, uName, Wid from seed_users`` through a fresh
    ``dbOperP.dbOperator`` and returns whatever ``selectData`` yields.
    The operator is closed in a ``finally`` clause so that a failing
    query does not leave it open.
    """
    dbo = dbOperP.dbOperator()
    try:
        return dbo.selectData("select id, uName, Wid from seed_users")
    finally:
        dbo.closeDb()
コード例 #3
0
    def extractTopic(self, searchStr, pid, unicodePage, firstPage):
        """Parse one result page and store every weibo post found on it.

        Each ``<div class="c">`` element that carries an ``id`` attribute
        is treated as one post.  For every post the retweet pages are
        crawled through ``self.getretweet`` (for as long as it returns a
        true value), the timestamp is normalised, and a
        ``[None, pid, wbid, userName, content, time]`` row is handed to
        ``dbo.insert``.

        Args:
            searchStr: Not used by this method.
            pid: Id stored with every row and compared with the ``pid`` of
                rows already present in the ``data`` table.
            unicodePage: HTML of the page, passed to ``BeautifulSoup``.
            firstPage: When true, the first element that has an ``id`` is
                skipped.

        Returns:
            ``False`` as soon as three posts already stored under ``pid``
            have been counted, ``True`` when the page was processed to
            the end.
        """
        soup = BeautifulSoup(unicodePage)
        items = soup.findAll("div", {"class": "c"})
        dbo = dbOperP.dbOperator()

        # Posts found in the database under this pid; reset to zero when a
        # stored post turns out to belong to a different pid.
        total = 0
        removeFirst = False

        try:
            for item in items:
                idstr = item.get("id", None)  # weibo id, e.g. 'M_AeJs808NB'
                if not idstr:
                    continue
                if firstPage and not removeFirst:
                    removeFirst = True
                    continue

                # NOTE(review): idstr comes from the scraped page and is
                # interpolated into the SQL text; use a parameterised query
                # if dbOperator.selectData offers one.
                seleResult = dbo.selectData(
                    "select pid from data where wbid = \'%s\'" % idstr)
                if seleResult:
                    if seleResult[0][0] == pid:
                        total += 1
                        # The limit must be tested before moving on to the
                        # next item, otherwise it can never trigger.
                        if total >= 3:
                            return False
                        continue
                    total = 0

                # --- retweets ---
                contentStr1 = item.renderContents()
                marker = r'>转发['
                i = contentStr1.find(marker)
                if i > 0:
                    # Only the first digit after the marker is read; that
                    # is enough to tell zero from non-zero.
                    retweetNum = int(contentStr1[i + len(marker)])
                    if retweetNum > 0:
                        # The last attribute chunk before the marker is
                        # expected to hold the quoted retweet URL.
                        hr = contentStr1[:i].split(" ")[-1]
                        rturl = hr.split(r'"')[1]
                        if 'cmt' not in rturl:
                            rturl = rturl.replace('amp;', '')
                            page = 1
                            go = True
                            while go:
                                go = self.getretweet(
                                    rturl + '&page=' + str(page), idstr)
                                page += 1

                # --- timestamp ---
                subs = item.findAll("span", {"class": "ct"})
                for t in subs[0].findAll(True):
                    t.hidden = True
                substr = subs[0].renderContents()
                substr = substr.strip().split("&nbsp;")[0]
                if '分钟前' in substr:
                    # "N minutes ago": approximated by the current time,
                    # the minute offset is not subtracted.
                    timeStr = time.strftime("%Y-%m-%d %H:%M:00")
                elif '今天' in substr:
                    # NOTE(review): unlike the other branches no ":00"
                    # seconds suffix is appended here - confirm intended.
                    timeStr = time.strftime("%Y-%m-%d ") + substr.split(" ")[1]
                elif '月' in substr:
                    # Keep only the digits of the date part (month, day).
                    dateStr = "".join(
                        ch for ch in substr.split(" ")[0] if ch.isdigit())
                    timeStr = (time.strftime("%Y-") + dateStr[0:2] + "-" +
                               dateStr[2:] + " " + substr.split(" ")[1] +
                               ":00")
                else:
                    timeStr = substr

                # --- text content ---
                for sub in subs:
                    sub.extract()
                for tag in item.findAll(True):
                    tag.hidden = True
                contentStr = item.renderContents()
                if "转发了" in contentStr:
                    continue
                contentStr = contentStr.replace("&nbsp;", "")
                contentStr = contentStr.replace("<!-- -->", "")
                result = re.sub('赞.*收藏', "", contentStr)
                userName, colon, content = result.partition(':')
                if not colon:
                    # No "name:" prefix: keep the whole text as content
                    # instead of slicing with a -1 index.
                    userName, content = '', result
                try:
                    dbo.insert([None, pid, idstr, userName, content, timeStr])
                except Exception:
                    # Best effort: a row that cannot be stored is skipped.
                    pass
        finally:
            # Runs on the early return and on errors as well.
            dbo.closeDb()
        return True
コード例 #4
0
 def getSearchList(self):
     """Return the list of things to search for.

     Runs ``select name,pid from peopleinfor`` through a fresh
     ``dbOperP.dbOperator`` and returns whatever ``selectData`` yields.
     Override this method to supply a different search list.  The
     operator is closed in a ``finally`` clause so that a failing query
     does not leave it open.
     """
     dbo = dbOperP.dbOperator()
     try:
         return dbo.selectData("select name,pid from peopleinfor")
     finally:
         dbo.closeDb()
コード例 #5
0
 def getSearchList(self):
     """Return the list of things to search for.

     Runs ``select name,pid from peopleinfor`` through a fresh
     ``dbOperP.dbOperator`` and returns whatever ``selectData`` yields.
     Override this method to supply a different search list.  The
     operator is closed in a ``finally`` clause so that a failing query
     does not leave it open.
     """
     dbo = dbOperP.dbOperator()
     try:
         return dbo.selectData("select name,pid from peopleinfor")
     finally:
         dbo.closeDb()
コード例 #6
0
    def extractTopic(self, searchStr, pid, unicodePage, firstPage):
        """Parse one result page and store every weibo post found on it.

        Each ``<div class="c">`` element that carries an ``id`` attribute
        is treated as one post.  For every post the retweet pages are
        crawled through ``self.getretweet`` (for as long as it returns a
        true value), the timestamp is normalised, and a
        ``[None, pid, wbid, userName, content, time]`` row is handed to
        ``dbo.insert``.

        Args:
            searchStr: Not used by this method.
            pid: Id stored with every row and compared with the ``pid`` of
                rows already present in the ``data`` table.
            unicodePage: HTML of the page, passed to ``BeautifulSoup``.
            firstPage: When true, the first element that has an ``id`` is
                skipped.

        Returns:
            ``False`` as soon as three posts already stored under ``pid``
            have been counted, ``True`` when the page was processed to
            the end.
        """
        soup = BeautifulSoup(unicodePage)
        items = soup.findAll("div", {"class": "c"})
        dbo = dbOperP.dbOperator()

        # Posts found in the database under this pid; reset to zero when a
        # stored post turns out to belong to a different pid.
        total = 0
        removeFirst = False

        try:
            for item in items:
                idstr = item.get("id", None)  # weibo id, e.g. 'M_AeJs808NB'
                if not idstr:
                    continue
                if firstPage and not removeFirst:
                    removeFirst = True
                    continue

                # NOTE(review): idstr comes from the scraped page and is
                # interpolated into the SQL text; use a parameterised query
                # if dbOperator.selectData offers one.
                seleResult = dbo.selectData(
                    "select pid from data where wbid = \'%s\'" % idstr)
                if seleResult:
                    if seleResult[0][0] == pid:
                        total += 1
                        # The limit must be tested before moving on to the
                        # next item, otherwise it can never trigger.
                        if total >= 3:
                            return False
                        continue
                    total = 0

                # --- retweets ---
                contentStr1 = item.renderContents()
                marker = r'>转发['
                i = contentStr1.find(marker)
                if i > 0:
                    # Only the first digit after the marker is read; that
                    # is enough to tell zero from non-zero.
                    retweetNum = int(contentStr1[i + len(marker)])
                    if retweetNum > 0:
                        # The last "&nbsp;"-separated chunk before the
                        # marker is expected to hold the quoted retweet URL.
                        hr = contentStr1[:i].split("&nbsp;")[-1]
                        rturl = hr.split(r'"')[1]
                        if 'cmt' not in rturl:
                            rturl = rturl.replace('amp;', '')
                            page = 1
                            go = True
                            while go:
                                go = self.getretweet(
                                    rturl + '&page=' + str(page), idstr)
                                page += 1

                # --- timestamp ---
                subs = item.findAll("span", {"class": "ct"})
                for t in subs[0].findAll(True):
                    t.hidden = True
                substr = subs[0].renderContents()
                substr = substr.strip().split("&nbsp;")[0]
                if '分钟前' in substr:
                    # "N minutes ago": approximated by the current time,
                    # the minute offset is not subtracted.
                    timeStr = time.strftime("%Y-%m-%d %H:%M:00")
                elif '今天' in substr:
                    # NOTE(review): unlike the other branches no ":00"
                    # seconds suffix is appended here - confirm intended.
                    timeStr = time.strftime("%Y-%m-%d ") + substr.split(" ")[1]
                elif '月' in substr:
                    # Keep only the digits of the date part (month, day).
                    dateStr = "".join(
                        ch for ch in substr.split(" ")[0] if ch.isdigit())
                    timeStr = (time.strftime("%Y-") + dateStr[0:2] + "-" +
                               dateStr[2:] + " " + substr.split(" ")[1] +
                               ":00")
                else:
                    timeStr = substr

                # --- text content ---
                for sub in subs:
                    sub.extract()
                for tag in item.findAll(True):
                    tag.hidden = True
                contentStr = item.renderContents()
                if "转发了" in contentStr:
                    continue
                contentStr = contentStr.replace("&nbsp;", "")
                contentStr = contentStr.replace("<!-- -->", "")
                result = re.sub('赞.*收藏', "", contentStr)
                userName, colon, content = result.partition(':')
                if not colon:
                    # No "name:" prefix: keep the whole text as content
                    # instead of slicing with a -1 index.
                    userName, content = '', result
                try:
                    dbo.insert([None, pid, idstr, userName, content, timeStr])
                except Exception:
                    # Best effort: a row that cannot be stored is skipped.
                    pass
        finally:
            # Runs on the early return and on errors as well.
            dbo.closeDb()
        return True