class Scrapyer(object):
    """Crawler helpers for http://www.tvmao.com.

    1. search programs on tvmao and collect related-program info
    2. crawl program detail pages from tvmao to help classify programs

    NOTE(review): relies on module-level names defined elsewhere in this
    file: ``ProxyPool``, ``requests``, ``BeautifulSoup``, ``quote``,
    ``re``, ``time``, ``randint``, ``choice``, ``DEBUG``,
    ``empty_times`` and ``all_categories``.
    """

    def __init__(self):
        self.retry_count = 3          # remaining attempts for the current request
        self.empty_count = 0          # consecutive empty search results seen
        self.pre_empty_flag = False   # whether the previous result was empty
        self.enabled_programs = []    # programs whose search returned results
        self.unabled_programs = []    # programs whose search came back empty
        self.collected_programs = []
        self.proxypool = ProxyPool()
        self.proxy = self.proxypool.get_proxy()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/'
                          '537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
        }

    def change_proxy(self):
        """Discard the current proxy and fetch a fresh one from the pool."""
        self.proxypool.delete_proxy(self.proxy)
        self.proxy = self.proxypool.get_proxy()

    def _fetch_soup(self, url, require_ok=False):
        """Fetch *url* through the current proxy and parse it with bs4.

        Retries until a page is obtained, rotating the proxy after every
        3 consecutive failures (the original retry scheme: the counter was
        reset to 3 after each proxy change, so the loop never gave up).

        :param url: URL to fetch
        :param require_ok: treat a non-200 status as a failure
        :return: a BeautifulSoup of the response body
        """
        self.retry_count = 3
        while True:
            try:
                response = requests.get(url, proxies={'http': self.proxy},
                                        headers=self.headers, timeout=2)
                if require_ok and response.status_code != 200:
                    raise IOError('bad status %s' % response.status_code)
                return BeautifulSoup(response.text, 'html.parser')
            except Exception:
                self.retry_count -= 1
                if self.retry_count <= 0:
                    if DEBUG:
                        print("waiting...")
                    self.change_proxy()
                    self.retry_count = 3

    def check_empty(self, num, source_programs, lock):
        """Track consecutive empty search results to detect a dead proxy.

        When 5 empty results arrive in a row the proxy is assumed dead:
        the 5 most recently failed programs are re-queued (at most twice
        each, tracked in the module-level ``empty_times``) and the proxy
        is rotated.

        :param num: number of columns parsed from the current result page
        :param source_programs: queue of programs still to crawl
        :param lock: lock guarding access to source_programs
        """
        if num == 0:
            if self.pre_empty_flag:
                self.empty_count += 1
                if self.empty_count >= 5:
                    # BUGFIX: the original indexed unabled_programs[5..1]
                    # (the *oldest* entries, off-by-one from the front) and
                    # popped by shifting index; re-queue the 5 newest instead.
                    for program in self.unabled_programs[-5:]:
                        if empty_times[program] < 2:
                            self.unabled_programs.remove(program)
                            with lock:
                                source_programs.put(program)
                            empty_times[program] += 1
                    self.change_proxy()
                    self.empty_count = 0
            else:
                self.pre_empty_flag = True
                self.empty_count = 1
        elif self.pre_empty_flag:
            # a non-empty result breaks the streak
            self.pre_empty_flag = False
            self.empty_count = 0

    def collect_programs(self, page_uls, page_columns):
        """Parse programs from a search-result page, grouped by column.

        :param page_uls: the <ul> result lists, one per category column
        :param page_columns: the category names, aligned with page_uls
        :return: {column: [(href, name), ...]} for the recognized columns
        """
        prefix = 'http://www.tvmao.com'
        collected = {}
        for column, uls in zip(page_columns, page_uls):
            lis = uls.find_all('li', class_='mr10')
            if not lis:
                continue
            if re.search('^(电视剧|电影)', column):
                href_names = [(prefix + li.p.a['href'], li.p.a.get_text())
                              for li in lis]
            elif re.search('^(综艺|明星|赛事)', column):
                href_names = [(prefix + li.a['href'], li.a['title'])
                              for li in lis]
            else:
                continue
            # BUGFIX: the original appended to a list and zipped it against
            # *all* columns at the end, so any skipped column shifted every
            # later (column -> programs) pairing; key the dict directly.
            collected[column] = href_names
        return collected

    def crawl_relative_program(self, program, source_programs, lock):
        """Search *program* on tvmao and collect its related programs.

        :param program: program name to search for
        :param source_programs: queue of programs still to crawl
        :param lock: lock guarding access to source_programs
        :return: {program: {column: [(href, name), ...]}} on success,
                 None when the result page was empty or unparsable
        """
        url = 'http://www.tvmao.com/query.jsp?keys=%s&ed=' % quote(program) + \
              'bOWkp%2BeZveWkq%2BWmh%2BS4iua8lOazoeayq%2BS5i%2BWQu28%3D'
        bsObj = self._fetch_soup(url)

        try:
            page_content = bsObj.find_all('div', class_='page-content')[0]
            page_columns = [item.a.get_text()
                            for item in page_content.dl.find_all('dd')]
            # drop the schedule/channel pseudo-columns
            page_columns = [column for column in page_columns
                            if not re.search('^(播出时间|电视频道)', column)]
            page_content_uls = page_content.div.find_all(
                'ul', class_=re.compile('^.+qtable$'), recursive=False)

            # BUGFIX: the original called check_empty after an unconditional
            # return, so the dead-proxy detector never ran (and its
            # "non-empty result" reset branch was unreachable). Run it on
            # every parsed page, empty or not.
            self.check_empty(len(page_columns), source_programs, lock)

            if not page_columns:
                self.unabled_programs.append(program)
                return None
            self.enabled_programs.append(program)
            column_programs = self.collect_programs(page_content_uls,
                                                    page_columns)
            return {program: column_programs}
        except Exception:
            # parsing failed (e.g. a block page served by a dying proxy):
            # re-queue the program and rotate the proxy
            with lock:
                source_programs.put(program)
            self.change_proxy()
            return None

    def run_crawl_relative_programs(self, source_programs, lock, limit=False):
        """Worker loop: drain *source_programs* and crawl each program.

        :param source_programs: queue of programs still to crawl
        :param lock: lock guarding access to source_programs
        :param limit: kept for interface compatibility (currently unused)
        :return: (collected program info, enabled programs, unabled programs)
        """
        collected_programs = []
        while True:
            try:
                with lock:
                    program = source_programs.get_nowait()
                    if DEBUG:
                        print(source_programs.qsize())
                # stop early once the queue has been drained far enough
                if source_programs.qsize() < 1500:
                    return (collected_programs, self.enabled_programs,
                            self.unabled_programs)
                result = self.crawl_relative_program(program,
                                                     source_programs, lock)
                if result:
                    collected_programs.append(result)
                time.sleep(randint(0, 1))
            except Exception:
                # queue.Empty (or any other failure) terminates the worker
                return (collected_programs, self.enabled_programs,
                        self.unabled_programs)

    def _keyword_classify(self, text):
        """Shared keyword lookup behind category/intro classification.

        :return: a category name, or None when nothing matched
        """
        for keyword, category in (('军旅', '军事'), ('纪录片', '纪实'),
                                  ('动漫', '少儿'), ('戏剧', '戏曲'),
                                  ('真人秀', '综艺')):
            if re.search(keyword, text):
                return category
        res = re.search('|'.join(all_categories), text)
        return res.group() if res else None

    def category_classify(self, category):
        """Classify by the category string from xingchen.

        :param category: program intro or program category from xingchen
        :return: a category name, or None when nothing matched
        """
        return self._keyword_classify(category)

    def intro_classify(self, intro):
        """Refine the coarse '生活' category using the introduction text.

        :param intro: introduction of the related program in xingchen
        :return: a category name, defaulting to '生活'
        """
        return self._keyword_classify(intro) or "生活"

    def search_to_classify_program(self, href):
        """Classify one program by crawling its tvmao detail page.

        :param href: link of the related program in xingchen
        :return: a category name; a random guess between '综艺' and
                 '电视剧' when parsing fails; None for unknown link kinds
        """
        bsObj = self._fetch_soup(href, require_ok=True)

        try:
            if re.search('tvcolumn', href):
                res_1 = bsObj.find_all('td', class_='gray pl15')
                if not res_1:
                    return '综艺'
                category = res_1[0].findNext('td').get_text()
                if category != "生活":
                    category = self.category_classify(category)
                    return category if category else '综艺'
                # '生活' is too coarse: refine it from the introduction text
                div = bsObj.find_all('div', class_='clear more_c')[0]
                intro = '; '.join([p.get_text() for p in div.find_all('p')])
                return self.intro_classify(intro)
            elif re.search('drama', href):
                mark = bsObj.find(text='类别:')
                td = mark.parent.findNext('td')
                category = ' '.join([a.get_text() for a in
                                     td.find_all('a', recursive=False)])
                category = self.category_classify(category)
                return category if category else '电视剧'
        except Exception:
            if DEBUG:
                print("f**k", href)
            return choice(['综艺', '电视剧'])

    def run_search_to_classify_programs(self, source_items, lock):
        """Worker loop: classify every item drained from *source_items*.

        :param source_items: queue of item tuples; item[0] is the program
                             name, item[2] its detail-page href
        :param lock: lock guarding access to source_items
        :return: list of (program, category) pairs
        """
        program_category = []  # fixed local typo: was 'program_cateogry'
        while True:
            try:
                with lock:
                    item = source_items.get_nowait()
                    if DEBUG:
                        print(source_items.qsize())
                category = self.search_to_classify_program(item[2])
                program_category.append((item[0], category))
                time.sleep(randint(0, 1))
            except Exception:
                # queue.Empty ends the worker; return what we have so far
                return program_category