Python Queue.add_url_tweets Examples

Programming Language: Python

Namespace/Package Name: Queue

Class/Type: Queue

Method/Function: add_url_tweets

Examples at hotexamples.com: 1

Python Queue.add_url_tweets - 1 examples found. These are the top rated real world Python examples of Queue.Queue.add_url_tweets extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

Queue(30)

add(30)

__init__(30)

_get(14)

_put(11)

append(8)

Dequeue(6)

Enqueue(6)

check_last(5)

check_first(5)

_init(3)

_qsize(2)

addBack(2)

addDataset(2)

clear(2)

__repr__(2)

Append(2)

add_url_host(2)

back(2)

IsEmpty(2)

sleep(2)

Contains(2)

not_empty(2)

checkSize(1)

canAcceptAndIsRequested(1)

canAccept(1)

Add(1)

closed(1)

connect_with(1)

consoleOut(1)

consumeQ(1)

find(1)

isPresent(1)

last(1)

maxsize(1)

not_full(1)

peak(1)

popleft(1)

progress(1)

quit(1)

average(1)

add_connection(1)

after(1)

_Push(1)

GetQueue(1)

NQVD_Counter(1)

Pop(1)

PriorityQueue(1)

Remove(1)

Size(1)

Example #1

Show file

File: Parser.py Project: xxpand/weibo_spider

class Parser:
    def __init__(self):
        self.list = list()
        self.comments = []  # 评论结果
        self.comments_userurl = []  # 从评论里面爬取到的用户url
        self.likes = []  # 赞爬取结果
        self.queue = Queue()  # 爬取队列对象
        self.db = DB()

    def parse_numberOfTweet(self, html):  # 获取每一页微博的url
        dic = {}
        soup = BeautifulSoup(html, 'lxml')
        base_url = soup.find('td').contents[0]['href']
        url = re.search('/[\d*]{1,}', base_url).group() + "?page="
        base_page = soup.find('span', attrs={"class": "tc"}).text
        page_amount = int(
            re.search('[\d]{1,}', base_page, re.M | re.I).group()) / 10 + 1
        dic["url"] = "https://weibo.cn/u" + url
        dic["page_amount"] = page_amount
        self.queue.add_url_tweets(dic)

    # TODO:解析整个页面
    def parse_fpage(self, html):  # 获取评论页面url
        soup = BeautifulSoup(html, 'lxml')

        for i in soup.find_all('a', attrs={"class": "cc"}):
            if "原文评论" in i.text:
                pass
            else:
                dic = {}
                dic["url"] = re.search('https://[a-z]{1,}.cn/comment/.*[?]',
                                       i["href"],
                                       re.M | re.I).group() + "page="
                num = int(re.search('[\d]+', i.text).group())
                if num == 0:
                    pass
                else:
                    dic["page_amount"] = num / 10 + 1
                    print(dic)
                    self.queue.add_url_comments(dic)

    # TODO:解析评论页面
    def parse_comments(self, html):  # 获取评论页面的内容

        soup = BeautifulSoup(html, "lxml")
        self.comments = []  # 将评论list清空
        self.comments_userurl = []  # 把用户urllist清空
        like_dic = {}
        comments_html = soup.find_all('div', id=re.compile("C_"))
        host_name = soup.find("a", attrs={"class": ""}).text[2:-3]
        like_soup = soup.find('a', attrs={'href': re.compile('/atti')})
        like_dic['url'] = "https://weibo.cn" + str(
            re.search("/attitu[\w]{1,}/[\w]{1,}\?",
                      like_soup['href']).group()) + "page="
        like_dic['page_amount'] = int(
            re.search('[\d]+', like_soup.text).group()) / 10 + 1

        print(like_dic)
        self.queue.add_url_likes(like_dic)

        for content in comments_html:
            comment_dict = dict()
            comment_dict["comment_content"] = content.find(attrs={
                "class": "ctt"
            }).text  # 获取评论内容
            comment_dict["comment_username"] = content.find(
                "a").text  # 获取评论者的昵称
            comment_user_url = content.find("a").attrs["href"]  # 获取评论者的链接
            self.queue.add_url_host("https://weibo.cn" +
                                    comment_user_url)  # 取评论用户url做为爬取队列
            print(comment_dict, comment_user_url)

            self.comments.append(comment_dict)

        self.db.insert(host_name, self.comments)  # 插入数据库

    # TODO:解析点赞页面
    def parse_likes(self, html):  # 获取点赞页面的内容
        self.likes = []
        likes = []
        soup = BeautifulSoup(html, "lxml")
        host_name = soup.find("div", attrs={"class": ""}).text.split(":")[0]
        n = 0
        for i in soup.find_all(attrs={"class": "ct"}):
            n += 1
            if n == 1:
                continue
            else:
                likes.append(i.previous_sibling.previous_sibling.text)
        self.db.insert_likes(host_name, likes)