def catch(account_id):
        # The core logic lives here
        u"""

        :param account_id: account id of the target column (e.g. https://xueqiu.com/4065977305)
        :return:
        """
        mock_sleep_time = 0.5

        article_url_index_list = []
        #   Get the maximum page number
        # url = 'http://chuansong.me/account/{}'.format(account_id)
        # front_page_content = Http.get_content(url)
        # max_page = XueQiuWorker.parse_max_page(front_page_content)

        column_info = {}
        column_info[u'column_id'] = account_id
        column_info[u'title'] = ""
        column_info['article_count'] = 0
        column_info['follower_count'] = 0
        column_info['description'] = ''
        column_info['image_url'] = ''

        from src.worker import Worker
        Worker.save_record_list(u'Column', [column_info])

        strtT = '1558513651020'

        # https://api.wallstreetcn.com/apiv1/content/themes/stream/1005680?type=newest&cursor=1558066610478&limit=20
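        # The theme stream endpoint above appears to page with a millisecond-timestamp
        # cursor (cursor=...&limit=20); strtT below looks like such a timestamp and is
        # passed to resuorcecatch() as the starting cursor.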

        max_page = 2
        index_work_set = OrderedDict()
        #   Collect the article URLs from each page
        for raw_front_page_index in range(1, max_page):
            resuorcecatch(account_id, strtT)
Example n. 2
    def catch(account_id):
        # The core logic lives here

        mock_sleep_time = 0.5
        base_sleep_time = 10
        max_sleep_time = 10

        article_url_index_list = []
        #   Get the maximum page number

        column_info = FileColumnParser('').get_column_info()
        column_info[u'column_id'] = account_id
        column_info[u'title'] = "毛泽东军事文选"

        from src.worker import Worker
        Worker.save_record_list(u'Column', [column_info])
        star_page = 0
        max_page = 1

        Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))
        index_work_set = OrderedDict()
        # Collect the article URLs from each page

        path = '/Users/ink/Desktop/ht'

        file_list = os.listdir(path)  # list every file and directory under the folder
        for i in file_list:
            # print i

            if str(i).endswith('htm') or str(i).endswith('html'):
                filename = u'/Users/ink/Desktop/ht/{}'.format(i)
                convert_encoding(filename, 'utf-8')
                f = open(filename)
                contents = f.read()
                # print(contents)
                # converted from gb2312 to utf-8
                article_info = FileArticleParser(contents).get_article_info()
                if len(article_info) > 0:
                    article_info['article_id'] = i
                    article_info['column_id'] = account_id
                    Worker.save_record_list(u'Article', [article_info])

                f.close()
    def catch(account_id):
        # The core logic lives here

        article_url_index_list = []
        #   Get the maximum page number
        url = 'http://www.taoguba.com.cn/Article/' + account_id + '/1'
        front_page_content = Http.get_content(url)
        star_page = 1

        with open('ReadList.txt', 'r') as read_list:
            read_list = read_list.readlines()
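            # Each ReadList.txt line seems to follow "<listing-url>#<extra fields>", where the
            # listing URL ends in /<account_id>/<start_page>; the start page is resumed from it.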
            for line in read_list:
                if '#' in line:
                    split_url = line.split('#')[0]
                    trgId = split_url.split('/')[-2]
                    if trgId == account_id:
                        pg = (split_url.split('/')[-1])
                        print pg
                        star_page = int(pg)

                        if star_page == 0:
                            star_page = 1
                        else:
                            print star_page

        max_page = 2
        dom = BeautifulSoup(front_page_content, "lxml")
        list_pcyc_l_ = dom.find_all('div', class_="left t_page01")
        try:
            for tgo_tgo_ in list_pcyc_l_:
                linkl = tgo_tgo_.findAll('a')
                tarUrl = linkl[0].get('href')
                max_page = int(tarUrl.split('/')[3])

        except IndexError as e:
            max_page = 1
        column_info = TGBColumnParser(front_page_content).get_column_info()

        from src.worker import Worker
        Worker.save_record_list(u'Column', [column_info])

        Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))
        index_work_set = OrderedDict()
        # Collect the article URLs from each page
        # star_page = 100
        for raw_front_page_index in range(star_page, max_page + 1):
            request_url = 'http://www.taoguba.com.cn/Article/' + account_id + '/' + str(
                raw_front_page_index)
            article_url_index_list.append(request_url)

        article_count = len(article_url_index_list)
        Debug.logger.info(u"文章链接抓取完毕,共{article_count}篇文章待抓取".format(
            article_count=article_count))

        index_work_set = OrderedDict()
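        # Skip articles whose id is already in the local Article table, so re-runs only fetch new posts.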
        for article_url_index in article_url_index_list:
            print 'query : ' + article_url_index
            article_db = DB.query_row(
                'select count(*) as article_count from Article where article_id = "{}"'
                .format(article_url_index))
            if article_db['article_count'] > 0:
                continue

            request_url = article_url_index

            index_work_set[article_url_index] = request_url

        re_catch_counter = 0
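        # Re-catch loop: make up to 20 passes over the remaining URLs, dropping each entry
        # from the work set once it has been fetched and saved.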
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for article_url_index in index_work_set:
                request_url = index_work_set[article_url_index]
                Debug.logger.info(
                    u"开始抓取{countert}号文章,剩余{article_count}篇".format(
                        countert=article_url_index,
                        article_count=len(index_work_set)))
                request_url_content = Http.get_content(request_url)

                article_info = TGBArticleParser(
                    request_url_content).get_article_info()
                if len(article_info) > 0:
                    article_info['article_id'] = article_url_index
                    article_info['column_id'] = account_id
                    Worker.save_record_list(u'Article', [article_info])
                del index_work_set[article_url_index]
        return
Example n. 4
    def catch(account_id):
        # The core logic lives here

        mock_sleep_time = 0.5
        base_sleep_time = 10
        max_sleep_time = 10

        article_url_index_list = []
        #   Get the maximum page number

        column_info = Todo3ColumnParser('').get_column_info()
        column_info[u'column_id'] = account_id
        column_info[u'title'] = "新能源汽车"
        column_info['article_count'] = 0
        column_info['follower_count'] = 0
        column_info['description'] = ''
        column_info['image_url'] = ''

        from src.worker import Worker
        Worker.save_record_list(u'Column', [column_info])
        star_page = 1
        max_page = 1

        Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))
        index_work_set = OrderedDict()
        # Collect the article URLs from each page
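        # smzdm's 新能源汽车 (new-energy vehicle) category is paged as /fenlei/xinnengyuanche/p<N>/;
        # each listing page is queued here and its article links are extracted below.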
        for raw_front_page_index in range(star_page, max_page):
            request_url = u'https://post.smzdm.com/fenlei/xinnengyuanche/p{}/'.format(
                raw_front_page_index)
            index_work_set[raw_front_page_index] = request_url

        re_catch_counter = 0
        catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for raw_front_page_index in index_work_set:
                catch_counter += 1
                Debug.logger.info(u'第『{}』遍抓取数据'.format(re_catch_counter))
                request_url = index_work_set[raw_front_page_index]
                Debug.logger.info(
                    u"开始抓取第{raw_front_page_index}页中的文章链接,剩余{max_page}页".format(
                        raw_front_page_index=raw_front_page_index,
                        max_page=len(index_work_set)))
                request_url_content = Http.get_content(request_url)

                soup = BeautifulSoup(request_url_content, 'lxml')
                list_p_list = soup.find_all('div',
                                            class_='list-border clearfix')
                for p in list_p_list:
                    # print p
                    list_pcyc_li = p.find_all('a')
                    li = list_pcyc_li[0]

                    tarUrl = li.get('href')
                    ttt = str(tarUrl).split("#")[-1]
                    print ttt
                    if ttt is not None:
                        article_url_index_list.append(ttt)

                del index_work_set[raw_front_page_index]

        article_count = len(article_url_index_list)
        Debug.logger.info(u"文章链接抓取完毕,共{article_count}篇文章待抓取".format(
            article_count=article_count))

        index_work_set = OrderedDict()
        for article_url_index in article_url_index_list:
            print 'query : ' + article_url_index
            article_db = DB.query_row(
                'select count(*) as article_count from Article where article_id = "{}"'
                .format(article_url_index))
            if article_db['article_count'] > 0:
                continue

            request_url = article_url_index

            index_work_set[article_url_index] = request_url

        re_catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for article_url_index in index_work_set:
                request_url = index_work_set[article_url_index]
                Debug.logger.info(
                    u"开始抓取{countert}号文章,剩余{article_count}篇".format(
                        countert=article_url_index,
                        article_count=len(index_work_set)))
                request_url_content = Http.get_content(request_url)

                article_info = Todo3ArticleParser(
                    request_url_content).get_article_info()
                if len(article_info) > 0:
                    article_info['article_id'] = article_url_index
                    article_info['column_id'] = account_id
                    Worker.save_record_list(u'Article', [article_info])
                del index_work_set[article_url_index]
        return
Example n. 5
    def catch(account_id):
        # The core logic lives here
        u"""

        :param account_id: Xueqiu user id (e.g. https://xueqiu.com/4065977305)
        :return:
        """
        mock_sleep_time = 0.5

        article_url_index_list = []
        #   Get the maximum page number
        # url = 'http://chuansong.me/account/{}'.format(account_id)
        # front_page_content = Http.get_content(url)
        # max_page = XueQiuWorker.parse_max_page(front_page_content)

        # _url = "http://xueqiu.com/v4/statuses/user_timeline.json?user_id={0}&page={1}&type=2"  # type: '' = all, 2 = original posts, 5 = replies
        _url = "https://xueqiu.com/v4/statuses/user_timeline.json?user_id={0}&page={1}&type=0"
        first = _url.format(account_id, 1)
        r = Http.get_json_content(first)
        max_page = 1
        try:
            jdata = json.loads(r.text, encoding='utf-8')
            max_page = jdata['maxPage'] + 1
        except KeyError as e:
            print '打开失败 >>>>>>> Cookie'  # "failed to open -- check the Cookie"
        # max_page = 1
        #   Parse the page content and store it in the database
        #   a captcha may be required

        content_profile = Http.get_content(u'https://xueqiu.com/u/{}/profile'.format(account_id))

        column_info = XueQiuColumnParser(content_profile).get_column_info()
        column_info[u'column_id'] = account_id
        column_info[u'title'] = ""
        with open('ReadList.txt', 'r') as read_list:
            read_list = read_list.readlines()
            for line in read_list:
                split_url = line.split('#')[0]
                if split_url.split('/')[-1] == account_id:
                    column_info[u'title'] = line.split('#')[1]
                    column_info[u'image_url'] = str(line.split('#')[2]).strip('\n')

        from src.worker import Worker
        Worker.save_record_list(u'Column', [column_info])

        Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))

        #

        index_work_set = OrderedDict()
        #   Collect the article URLs from each page
        for raw_front_page_index in range(1, max_page):
            request_url = _url.format(account_id, raw_front_page_index)
            index_work_set[raw_front_page_index] = request_url

        re_catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for article_url_index in index_work_set:
                request_url = index_work_set[article_url_index]
                Debug.logger.info(u"开始抓取{countert}号文章,剩余{article_count}篇".format(countert=article_url_index,
                                                                                 article_count=len(index_work_set)))

                content = Http.get_content(request_url)
                if not content:
                    return
                jdata = json.loads(content)
                articles = jdata['statuses']
                for article in articles:
                    # print article

                    article_info = XueQiuArticleParser(article).get_article_info()
                    if len(article_info) > 0:
                        article_info['column_id'] = account_id
                        Worker.save_record_list(u'Article', [article_info])
                del index_work_set[article_url_index]

                Debug.logger.debug(u' {} 的内容抓取完成'.format(request_url))

        return
    def catch(account_id):
        # The core logic lives here

        mock_sleep_time = 0.5
        base_sleep_time = 1
        max_sleep_time = 1

        article_url_index_list = []
        #   Get the maximum page number

        url = 'http://www.jintiankansha.me/tag/{}?page=1'.format(account_id)

        column_info = JinWanKanSaEmptColumnParser('').get_column_info()

        column_info[u'column_id'] = account_id
        dt = datetime.datetime.now()
        column_info[u'title'] = u"AI_{}".format(dt.strftime("%Y-%m-%d"))
        max_page = 1

        typeToTry = 'tag'
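        # jintiankansha.me seems to expose both /tag/<id> and /column/<id> listings; the path
        # segment defaults to 'tag' and may be overridden by the last field of the matching
        # ReadList.txt line.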

        with open('ReadList.txt', 'r') as read_list:
            read_list = read_list.readlines()
            for line in read_list:
                split_url = line.split('#')[0]
                if split_url.split('/')[-1] == account_id:
                    dt = datetime.datetime.now()
                    column_info[u'title'] = u"{}_{}".format(
                        line.split('#')[1], dt.strftime("%Y-%m-%d"))

                    max_page = int(line.split('#')[2])

                    typeToTry = str(int(line.split('#')[-1])).strip('\n')

        from src.worker import Worker
        Worker.save_record_list(u'Column', [column_info])

        Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))
        index_work_set = OrderedDict()
        #   Collect the article URLs from each page
        for raw_front_page_index in range(0, max_page + 1):
            # request_url = u'http://www.jintiankansha.me/column/{}?page={}'.format(account_id, raw_front_page_index)
            request_url = u'http://www.jintiankansha.me/{}/{}?page={}'.format(
                typeToTry, account_id, raw_front_page_index)
            print request_url
            index_work_set[raw_front_page_index] = request_url

        re_catch_counter = 0
        catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for raw_front_page_index in index_work_set:
                catch_counter += 1
                Debug.logger.info(u'第『{}』遍抓取数据'.format(re_catch_counter))
                request_url = index_work_set[raw_front_page_index]
                Debug.logger.info(
                    u"开始抓取第{raw_front_page_index}页中的文章链接,剩余{max_page}页".format(
                        raw_front_page_index=raw_front_page_index,
                        max_page=len(index_work_set)))
                request_url_content = Http.get_content(request_url)

                soup = BeautifulSoup(request_url_content, 'html.parser')
                list_p_list = soup.find_all('span', class_="item_title")

                for tgo_right in list_p_list:
                    for link in tgo_right.findAll('a'):
                        ttt = str(link.get('href'))
                        print ttt
                        if ttt is not None:
                            article_url_index_list.append(ttt)

                del index_work_set[raw_front_page_index]

        # article_url_index_list.append('http://www.jintiankansha.me/t/u8MygoqKI8')

        article_count = len(article_url_index_list)
        Debug.logger.info(u"文章链接抓取完毕,共{article_count}篇文章待抓取".format(
            article_count=article_count))

        index_work_set = OrderedDict()
        for article_url_index in article_url_index_list:
            print 'query : ' + article_url_index
            article_db = DB.query_row(
                'select count(*) as article_count from Article where article_id = "{}"'
                .format(article_url_index))
            if article_db['article_count'] > 0:
                continue

            request_url = article_url_index

            index_work_set[article_url_index] = request_url

        re_catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for article_url_index in index_work_set:
                request_url = index_work_set[article_url_index]
                Debug.logger.info(
                    u"开始抓取  {countert} 号文章,剩余{article_count}篇".format(
                        countert=article_url_index,
                        article_count=len(index_work_set)))
                request_url_content = Http.get_content(request_url)
                time.sleep(mock_sleep_time)
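                # An empty response is treated as throttling: back off with a small randomized
                # sleep and leave the URL in the work set for a later pass.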
                if len(request_url_content) == 0:
                    random_sleep_time = base_sleep_time + random.randint(
                        0, max_sleep_time) / 100.0
                    Debug.logger.info(u"随机休眠{}秒".format(random_sleep_time))
                    time.sleep(random_sleep_time)
                    continue
                article_info = JinWanKanSaArticleParser(
                    request_url_content).get_article_info()
                if len(article_info) > 0:
                    article_info['article_id'] = article_url_index
                    article_info['column_id'] = account_id
                    Worker.save_record_list(u'Article', [article_info])
                del index_work_set[article_url_index]
        return
    def catch(account_id):
        # The core logic lives here

        mock_sleep_time = 0.5
        base_sleep_time = 10
        max_sleep_time = 10

        article_url_index_list = []
        #   Get the maximum page number
        url = 'http://www.gushequ.com/{}/'.format(account_id)
        front_page_content = Http.get_content(url)

        column_info = TodoColumnParser(front_page_content).get_column_info()
        column_info[u'column_id'] = account_id
        column_info[u'title'] = "股社区"

        from src.worker import Worker
        Worker.save_record_list(u'Column', [column_info])
        star_page = 0
        max_page = 24
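        # Hard-coded listing page ranges on gushequ.com for each year's archive; account_id
        # here effectively acts as a year selector.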
        if account_id == '2018':
            star_page = 0
            max_page = 24

        elif account_id == '2017':
            star_page = 24
            max_page = 58

        elif account_id == '2016':
            star_page = 58
            max_page = 92

        Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))
        index_work_set = OrderedDict()
        # Collect the article URLs from each page
        for raw_front_page_index in range(star_page, max_page):
            request_url = u'http://www.gushequ.com/page/{}/'.format(
                raw_front_page_index)
            index_work_set[raw_front_page_index] = request_url

        re_catch_counter = 0
        catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for raw_front_page_index in index_work_set:
                catch_counter += 1
                Debug.logger.info(u'第『{}』遍抓取数据'.format(re_catch_counter))
                request_url = index_work_set[raw_front_page_index]
                Debug.logger.info(
                    u"开始抓取第{raw_front_page_index}页中的文章链接,剩余{max_page}页".format(
                        raw_front_page_index=raw_front_page_index,
                        max_page=len(index_work_set)))
                request_url_content = Http.get_content(request_url)

                soup = BeautifulSoup(request_url_content, 'lxml')
                list_p_list = soup.find_all('article')
                for p in list_p_list:
                    # print p
                    list_pcyc_li = p.find_all('a')
                    for li in list_pcyc_li:

                        tarUrl = li.get('href')
                        ttt = str(tarUrl).split("#")[-1]
                        print ttt
                        if ttt is not None:
                            article_url_index_list.append(ttt)

                del index_work_set[raw_front_page_index]

        article_count = len(article_url_index_list)
        Debug.logger.info(u"文章链接抓取完毕,共{article_count}篇文章待抓取".format(
            article_count=article_count))

        index_work_set = OrderedDict()
        for article_url_index in article_url_index_list:
            print 'query : ' + article_url_index
            article_db = DB.query_row(
                'select count(*) as article_count from Article where article_id = "{}"'
                .format(article_url_index))
            if article_db['article_count'] > 0:
                continue

            request_url = article_url_index

            index_work_set[article_url_index] = request_url

        re_catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for article_url_index in index_work_set:
                request_url = index_work_set[article_url_index]
                Debug.logger.info(
                    u"开始抓取{countert}号文章,剩余{article_count}篇".format(
                        countert=article_url_index,
                        article_count=len(index_work_set)))
                request_url_content = Http.get_content(request_url)

                article_info = TodoArticleParser(
                    request_url_content).get_article_info()
                if len(article_info) > 0:
                    article_info['article_id'] = article_url_index
                    article_info['column_id'] = account_id
                    Worker.save_record_list(u'Article', [article_info])
                del index_work_set[article_url_index]
        return
Example n. 8
    def catch(account_id):
        # The core logic lives here
        u"""

        :param account_id: Xueqiu stock symbol or user id used in the search query (e.g. https://xueqiu.com/4065977305)
        :return:
        """
        mock_sleep_time = 0.5
        base_sleep_time = 5
        max_sleep_time = 30

        article_url_index_list = []

        # https://xueqiu.com/statuses/search.json?count=10&comment=0&symbol=SZ000333&hl=0&source=all&sort=&page=1&q=

        _url = "https://xueqiu.com/statuses/search.json?count=10&comment=0&symbol={0}&hl=0&source=all&sort=alpha&page={1}&q="

        # Search for 霍华德·马克斯 (Howard Marks)
        # _url = "https://xueqiu.com/statuses/search.json?sort=relevance&source=all&q={0}&count=10&page={1}"

        first = _url.format(account_id, 1)
        r = Http.get_json_content(first)
        max_page = 1
        try:
            jdata = json.loads(r.text, encoding='utf-8')
            max_page = jdata['maxPage'] + 1
        except KeyError as e:
            print '打开失败 >>>>>>> Cookie'  # "failed to open -- check the Cookie"
        # max_page = 1
        #   Parse the page content and store it in the database
        #   a captcha may be required

        max_page = 1
        # print max_page


        column_info = XueQiuColumnParser('').get_column_info()
        column_info[u'column_id'] = account_id
        column_info[u'title'] = ""

        with open('ReadList.txt', 'r') as read_list:
            read_list = read_list.readlines()
            for line in read_list:
                split_url = line.split('#')[0]
                if split_url.split('/')[-1] == account_id:

                    dt = datetime.datetime.now()
                    tit = line.split('#')[1]
                    column_info[u'title'] = u"{}_{}".format(tit, dt.strftime("%Y-%m-%d"))

                    column_info[u'image_url'] = str(line.split('#')[2]).strip('\n')

        from src.worker import Worker
        Worker.save_record_list(u'Column', [column_info])

        Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))

        #

        index_work_set = OrderedDict()
        #   Collect the article URLs from each page
        for raw_front_page_index in range(1, max_page):
            request_url = _url.format(account_id, raw_front_page_index)
            index_work_set[raw_front_page_index] = request_url

        re_catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for article_url_index in index_work_set:
                request_url = index_work_set[article_url_index]
                Debug.logger.info(u"开始抓取{countert}号文章,剩余{article_count}篇".format(countert=article_url_index,
                                                                                 article_count=len(index_work_set)))


                print request_url
                content = Http.get_content(request_url)
                if not content:

                    random_sleep_time = base_sleep_time + random.randint(2, max_sleep_time) / 10.0
                    Debug.logger.info(u"随机休眠{}秒".format(random_sleep_time))
                    time.sleep(random_sleep_time)
                    continue


                    # {"error_description":"您的请求过于频繁,请稍后再试","error_uri":"/statuses/search.json","error_code":"22612"}


                jdata = json.loads(content)
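                # Xueqiu may answer with the rate-limit payload shown in the comment above
                # (error_code 22612); in that case sleep for a random interval and retry later.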


                if 'error_code' in jdata:
                    random_sleep_time = base_sleep_time + random.randint(3, max_sleep_time) / 10.0
                    Debug.logger.info(u"error_description {}秒".format(jdata['error_description']))
                    time.sleep(random_sleep_time)
                    continue

                articles = jdata['list']

                for article in articles:
                    # print article

                    article_info = XueQiuArticleParser(article).get_article_info()
                    if len(article_info) > 0:
                        article_info['column_id'] = account_id
                        Worker.save_record_list(u'Article', [article_info])
                del index_work_set[article_url_index]

                random_sleep_time = 1 + random.randint(3, max_sleep_time) / 10.0
                Debug.logger.info(u"随机休眠{}秒".format(random_sleep_time))
                time.sleep(random_sleep_time)

                Debug.logger.debug(u' {} 的内容抓取完成'.format(request_url))

        return
    def catch(account_id):
        # The core logic lives here

        mock_sleep_time = 28
        base_sleep_time = 62
        max_sleep_time = 80

        article_url_index_list = []
        #   Get the maximum page number
        url = 'http://chuansong.me/account/{}'.format(account_id)
        # front_page_content = Http.get_content(url)
        front_page_content = ''
        # max_page =WechatWorker.parse_max_page(front_page_content)
        # if max_page > 200:
        #     max_page =200
        max_page = 0
        #   Parse the page content and store it in the database
        column_info = WechatColumnParser(front_page_content).get_column_info()
        column_info[u'column_id'] = account_id

        with open('ReadList.txt', 'r') as read_list:
            read_list = read_list.readlines()
            for line in read_list:
                split_url = line.split('#')[0]
                if account_id in split_url:
                    column_info[u'title'] = str(line.split('#')[1])


        from src.worker import Worker
        Worker.save_record_list(u'Column', [column_info])

        Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))
        index_work_set = OrderedDict()
        #   Collect the article URLs from each page
        for raw_front_page_index in range(0, max_page):
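            # chuansong.me appears to list 12 articles per page, so the start offset is page index * 12.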
            front_page_index = raw_front_page_index * 12
            request_url = url + '?start={}'.format(front_page_index)
            index_work_set[raw_front_page_index] = request_url

        re_catch_counter = 0
        catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for raw_front_page_index in index_work_set:
                catch_counter += 1
                Debug.logger.info(u'第『{}』遍抓取数据'.format(re_catch_counter))
                request_url = index_work_set[raw_front_page_index]
                Debug.logger.info(
                        u"开始抓取第{raw_front_page_index}页中的文章链接,剩余{max_page}页".format(raw_front_page_index=raw_front_page_index, max_page=len(index_work_set)))
                request_url_content = Http.get_content(request_url)
                time.sleep(mock_sleep_time)
                if len(request_url_content) == 0 or catch_counter % 5 == 0:
                    random_sleep_time = base_sleep_time + random.randint(0, max_sleep_time) / 100.0
                    Debug.logger.info(u"随机休眠{}秒".format(random_sleep_time))
                    time.sleep(random_sleep_time)
                    continue


                random_sleep_time = base_sleep_time + random.randint(0, max_sleep_time) / 10.0
                Debug.logger.info(u"随机休眠{}秒".format(random_sleep_time))
                time.sleep(random_sleep_time)

                article_url_index_list += Match.wechat_article_index(content=request_url_content)
                del index_work_set[raw_front_page_index]

        article_count = len(article_url_index_list)
        Debug.logger.info(u"文章链接抓取完毕,共{article_count}篇文章待抓取".format(article_count=article_count))

        index_work_set = OrderedDict()
        for article_url_index in article_url_index_list:
            article_db = DB.query_row('select count(*) as article_count from Article where article_id = {}'.format(article_url_index))
            if article_db['article_count'] > 0:
                continue

            request_url = 'http://chuansong.me/n/{}'.format(article_url_index)
            index_work_set[article_url_index] = request_url

        re_catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for article_url_index in index_work_set:
                request_url = index_work_set[article_url_index]
                Debug.logger.info(u"开始抓取{countert}号文章,剩余{article_count}篇".format(countert=article_url_index,
                                                                                 article_count=len(index_work_set)))
                request_url_content = Http.get_content(request_url)
                time.sleep(mock_sleep_time)
                if len(request_url_content) == 0:
                    random_sleep_time = base_sleep_time + random.randint(0, max_sleep_time) / 100.0
                    Debug.logger.info(u"随机休眠{}秒".format(random_sleep_time))
                    time.sleep(random_sleep_time)
                    continue

                random_sleep_time = base_sleep_time + random.randint(0, max_sleep_time) / 10.0
                Debug.logger.info(u"随机休眠{}秒".format(random_sleep_time))
                time.sleep(random_sleep_time)

                article_info = WechatArticleParser(request_url_content).get_article_info()
                if len(article_info) > 0:
                    article_info['article_id'] = article_url_index
                    article_info['column_id'] = account_id
                    Worker.save_record_list(u'Article', [article_info])
                del index_work_set[article_url_index]
        return
    def catch(account_id):
        # The core logic lives here

        mock_sleep_time = 0.5
        base_sleep_time = 10
        max_sleep_time = 10

        article_url_index_list = []
        #   Get the maximum page number
        url = 'https://www.wuxiareview.com/category/{}'.format(account_id)
        front_page_content = Http.get_content(url)

        column_info = WuXiaColumnParser(front_page_content).get_column_info()
        column_info[u'column_id'] = account_id
        max_page = 2
        if account_id == 'daidai':

            column_info[u'title'] = "吃瓜群众岱岱"
            max_page = 1
        elif account_id == 'gzmdzst':

            column_info[u'title'] = "顾子明的政事堂"
            max_page = 1
        else:

            column_info[u'title'] = "时文"
            max_page = 2

        from src.worker import Worker
        Worker.save_record_list(u'Column', [column_info])

        Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))
        index_work_set = OrderedDict()
        #   Collect the article URLs from each page
        for raw_front_page_index in range(0, max_page):
            request_url = u'https://www.wuxiareview.com/category/{}/{}/'.format(
                account_id, raw_front_page_index)
            index_work_set[raw_front_page_index] = request_url

        re_catch_counter = 0
        catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for raw_front_page_index in index_work_set:
                catch_counter += 1
                Debug.logger.info(u'第『{}』遍抓取数据'.format(re_catch_counter))
                request_url = index_work_set[raw_front_page_index]
                Debug.logger.info(
                    u"开始抓取第{raw_front_page_index}页中的文章链接,剩余{max_page}页".format(
                        raw_front_page_index=raw_front_page_index,
                        max_page=len(index_work_set)))
                request_url_content = Http.get_content(request_url)

                soup = BeautifulSoup(request_url_content, 'lxml')
                list_p_list = soup.find_all('article', class_="excerpt")
                for p in list_p_list:
                    # print p
                    list_pcyc_li = p.find_all('a')
                    for li in list_pcyc_li:
                        # print li.text
                        tarUrl = li.get('href')
                        ttt = str(tarUrl).split("#")[-1]
                        print ttt
                        if ttt is not None:
                            article_url_index_list.append(ttt)

                del index_work_set[raw_front_page_index]

        article_count = len(article_url_index_list)
        Debug.logger.info(u"文章链接抓取完毕,共{article_count}篇文章待抓取".format(
            article_count=article_count))

        index_work_set = OrderedDict()
        for article_url_index in article_url_index_list:
            print 'query : ' + article_url_index
            article_db = DB.query_row(
                'select count(*) as article_count from Article where article_id = "{}"'
                .format(article_url_index))
            if article_db['article_count'] > 0:
                continue

            request_url = article_url_index

            index_work_set[article_url_index] = request_url

        re_catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for article_url_index in index_work_set:
                request_url = index_work_set[article_url_index]
                Debug.logger.info(
                    u"开始抓取{countert}号文章,剩余{article_count}篇".format(
                        countert=article_url_index,
                        article_count=len(index_work_set)))
                request_url_content = Http.get_content(request_url)

                article_info = WuXiaArticleParser(
                    request_url_content).get_article_info()
                if len(article_info) > 0:
                    article_info['article_id'] = article_url_index
                    article_info['column_id'] = account_id
                    Worker.save_record_list(u'Article', [article_info])
                del index_work_set[article_url_index]
        return
    def catch(account_id):
        # The core logic lives here

        mock_sleep_time = 0.5
        base_sleep_time = 10
        max_sleep_time = 10

        article_url_index_list = []
        #   Get the maximum page number

        star_page = 1
        max_page = 1
        column_info = Todo1ColumnParser("").get_column_info()
        column_info[u'column_id'] = account_id

        with open('ReadList.txt', 'r') as read_list:
            read_list = read_list.readlines()
            for line in read_list:
                split_url = line.split('#')[0]
                if account_id in split_url:
                    # Config.now_id_likeName = line.split('#')[1]
                    max_page = int(line.split('#')[-1]) + 1
                    column_info[u'title'] = str(line.split('#')[1])

                    # max_page = 1
                    print max_page

        from src.worker import Worker
        Worker.save_record_list(u'Column', [column_info])

        Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))
        index_work_set = OrderedDict()
        # Collect the article URLs from each page
        for raw_front_page_index in range(star_page, max_page):
            request_url = u'https://www.guancha.cn/{}/list_{}.shtml'.format(
                account_id, raw_front_page_index)
            index_work_set[raw_front_page_index] = request_url

        re_catch_counter = 0
        catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for raw_front_page_index in index_work_set:
                catch_counter += 1
                Debug.logger.info(u'第『{}』遍抓取数据'.format(re_catch_counter))
                request_url = index_work_set[raw_front_page_index]
                Debug.logger.info(
                    u"开始抓取第{raw_front_page_index}页中的文章链接,剩余{max_page}页".format(
                        raw_front_page_index=raw_front_page_index,
                        max_page=len(index_work_set)))
                request_url_content = Http.get_content(request_url)

                soup = BeautifulSoup(request_url_content, 'lxml')
                list_p_list = soup.find_all('h4', class_="module-title")
                for p in list_p_list:
                    # print p
                    list_pcyc_li = p.find_all('a')
                    for li in list_pcyc_li:
                        ttt = li.get('href')
                        print ttt
                        if ttt is not None:
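                            # Guancha list links look like /<author>/<date>_<id>.shtml; inserting the
                            # _s suffix presumably requests the single-page (full text) version.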

                            ss = str(ttt).split('.')
                            article_url_index_list.append(
                                u"https://www.guancha.cn{}_s.{}".format(
                                    ss[0], ss[1]))

                del index_work_set[raw_front_page_index]

        article_count = len(article_url_index_list)
        Debug.logger.info(u"文章链接抓取完毕,共{article_count}篇文章待抓取".format(
            article_count=article_count))

        index_work_set = OrderedDict()
        for article_url_index in article_url_index_list:
            print 'query : ' + article_url_index
            article_db = DB.query_row(
                'select count(*) as article_count from Article where article_id = "{}"'
                .format(article_url_index))
            if article_db['article_count'] > 0:
                continue

            request_url = article_url_index

            index_work_set[article_url_index] = request_url

        re_catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for article_url_index in index_work_set:
                request_url = index_work_set[article_url_index]
                Debug.logger.info(
                    u"开始抓取{countert}号文章,剩余{article_count}篇".format(
                        countert=article_url_index,
                        article_count=len(index_work_set)))
                request_url_content = Http.get_content(request_url)

                article_info = Todo1ArticleParser(
                    request_url_content).get_article_info()
                if len(article_info) > 0:
                    article_info['article_id'] = article_url_index
                    article_info['column_id'] = account_id
                    Worker.save_record_list(u'Article', [article_info])
                del index_work_set[article_url_index]
        return
Example n. 12
    def catch(account_id):
        # The core logic lives here

        mock_sleep_time = 0.5
        base_sleep_time = 10
        max_sleep_time = 10

        article_url_index_list = []
        #
        url = 'http://xinsheng.huawei.com/{}'.format(account_id)
        front_page_content = Http.get_content(url)

        column_info = HuaWeiColumnParser(front_page_content).get_column_info()
        column_info[u'column_id'] = account_id
        column_info[u'title'] = "华为家事"
        column_info[
            u'image_url'] = 'file:///Users/ex-liyan010/Desktop/share/hcover.jpeg'

        from src.worker import Worker
        Worker.save_record_list(u'Column', [column_info])

        max_page = 0

        Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))
        index_work_set = OrderedDict()
        #   Collect the article URLs from each page
        for raw_front_page_index in range(0, max_page):
            request_url = u'http://xinsheng.huawei.com/cn/index.php?app=forum&mod=List&act=index&class=461&order=cTime&type=&sign=&special=&cate=155&p={}'.format(
                raw_front_page_index)
            index_work_set[raw_front_page_index] = request_url

        re_catch_counter = 0
        catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for raw_front_page_index in index_work_set:
                catch_counter += 1
                Debug.logger.info(u'第『{}』遍抓取数据'.format(re_catch_counter))
                request_url = index_work_set[raw_front_page_index]
                Debug.logger.info(
                    u"开始抓取第{raw_front_page_index}页中的文章链接,剩余{max_page}页".format(
                        raw_front_page_index=raw_front_page_index,
                        max_page=len(index_work_set)))
                request_url_content = Http.get_content(request_url)

                soup = BeautifulSoup(request_url_content, "lxml")
                content_dom = soup.find_all('div', class_="bbs_list")[0]

                # print content_dom.text
                #     t_dom = dom.find_all('tbody')
                #     # print t_dom

                font_box_dom = content_dom.find_all('div', class_="font_box")
                # print time_dom
                for xx in font_box_dom:
                    linkl = xx.findAll('a')

                    tarUrl = linkl[0].get('href')
                    print tarUrl
                    article_url_index_list.append(tarUrl)

                del index_work_set[raw_front_page_index]

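        # A few individual thread URLs are appended by hand in addition to whatever the list pages yield.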
        article_url_index_list.append(
            'http://xinsheng.huawei.com/cn/index.php?app=forum&mod=Detail&act=index&id=4343641'
        )
        article_url_index_list.append(
            'http://xinsheng.huawei.com/cn/index.php?app=forum&mod=Detail&act=index&id=4340813'
        )
        article_url_index_list.append(
            'http://xinsheng.huawei.com/cn/index.php?app=group&mod=Bbs&act=detail&tid=4346331'
        )
        article_url_index_list.append(
            'http://xinsheng.huawei.com/cn/index.php?app=group&mod=Bbs&act=detail&tid=4347493'
        )
        article_url_index_list.append(
            'http://xinsheng.huawei.com/cn/index.php?app=group&mod=Bbs&act=detail&tid=4342141'
        )

        article_count = len(article_url_index_list)
        Debug.logger.info(u"文章链接抓取完毕,共{article_count}篇文章待抓取".format(
            article_count=article_count))

        index_work_set = OrderedDict()
        for article_url_index in article_url_index_list:
            print 'query : ' + article_url_index
            article_db = DB.query_row(
                'select count(*) as article_count from Article where article_id = "{}"'
                .format(article_url_index))
            if article_db['article_count'] > 0:
                continue

            request_url = article_url_index

            index_work_set[article_url_index] = request_url

        re_catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for article_url_index in index_work_set:
                request_url = index_work_set[article_url_index]
                Debug.logger.info(
                    u"开始抓取{countert}号文章,剩余{article_count}篇".format(
                        countert=article_url_index,
                        article_count=len(index_work_set)))
                request_url_content = Http.get_content(request_url)

                article_info = HuaWeiArticleParser(
                    request_url_content).get_article_info()
                if len(article_info) > 0:
                    article_info['article_id'] = article_url_index
                    article_info['column_id'] = account_id
                    Worker.save_record_list(u'Article', [article_info])
                del index_work_set[article_url_index]
        return
    def write_txt(self):
        try:
            if self.filter:
                result_header = u"\n\n原创微博内容: \n"
            else:
                result_header = u"\n\n微博内容: \n"
            result = (u"用户信息\n用户昵称:" + self.username + u"\n用户id: " +
                      str(self.user_id) + u"\n微博数: " + str(self.weibo_num) +
                      u"\n关注数: " + str(self.following) + u"\n粉丝数: " +
                      str(self.followers) + result_header)

            column_info = {}
            column_info[u'column_id'] = str(self.user_id)
            column_info[u'title'] = self.username
            column_info['article_count'] = str(self.weibo_num)
            column_info['follower_count'] = str(self.following)
            column_info['description'] = ''
            column_info['image_url'] = ''
            from src.worker import Worker
            Worker.save_record_list(u'Column', [column_info])

            for i in range(1, self.weibo_num2 + 1):
                text = (str(i) + ":" + self.weibo_content[i - 1] + "\n" +
                        u"微博位置: " + self.weibo_place[i - 1] + "\n" +
                        u"发布时间: " + self.publish_time[i - 1] + "\n" +
                        u"点赞数: " + str(self.up_num[i - 1]) + u"   转发数: " +
                        str(self.retweet_num[i - 1]) + u"   评论数: " +
                        str(self.comment_num[i - 1]) + "\n" + u"发布工具: " +
                        self.publish_tool[i - 1] + "\n\n")
                result = result + text
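                # Besides the plain-text summary, each weibo entry below is also saved as an
                # Article record (keyed by its position) so it can be exported with the Column record.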

                article_info = {}
                from src.worker import Worker

                article_info['article_id'] = u'{}'.format(i)
                article_info['column_id'] = str(self.user_id)
                article_info['title'] = u'{}'.format(self.weibo_num2 + 1 - i)
                article_info['content'] = self.weibo_content[i - 1]
                article_info['updated_time'] = self.publish_time[i - 1]

                article_info['voteup_count'] = ""
                article_info['comment_count'] = ""
                article_info['image_url'] = ''
                article_info['author_id'] = str(self.user_id)
                article_info['author_name'] = self.username
                article_info['author_headline'] = ''
                article_info[
                    'author_avatar_url'] = 'https://pic4.zhimg.com/v2-38a89e42b40baa7d26d99cab9a451623_xl.jpg'
                article_info['author_gender'] = '0'

                Worker.save_record_list(u'Article', [article_info])

            # file_dir = os.path.split(os.path.realpath(__file__))[
            #                0] + os.sep + "weibo"
            # if not os.path.isdir(file_dir):
            #     os.mkdir(file_dir)
            # file_path = file_dir + os.sep + "%d" % self.user_id + ".txt"
            # f = open(file_path, "wb")
            # f.write(result.encode(sys.stdout.encoding))
            # f.close()
            # print(file_path)
            print(u"微博写入文件完毕")
        except Exception as e:
            print("Error: ", e)
            traceback.print_exc()
Example n. 14
    def catch(account_id):
        # The core logic lives here

        mock_sleep_time = 0.5
        base_sleep_time = 10
        max_sleep_time = 10

        article_url_index_list = []
        #   Get the maximum page number

        column_info = Todo2ColumnParser("").get_column_info()
        column_info[u'column_id'] = account_id
        column_info[u'title'] = "纽约时报"

        from src.worker import Worker
        Worker.save_record_list(u'Column', [column_info])
        star_page = 0
        max_page = 0

        Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))
        index_work_set = OrderedDict()
        # Collect the article URLs from each page
        for raw_front_page_index in range(star_page, max_page):
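            # The cn.nytimes.com search API is queried for "DAVID BARBOZA", 10 results per call;
            # 'from' is the result offset, so it advances in steps of 10.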
            request_url = u'https://cn.nytimes.com/search/data/?query=DAVID%20BARBOZA&lang=&dt=json&from={}&size=10'.format(
                raw_front_page_index * 10)
            # request_url = u'https://cn.nytimes.com/real-estate/{}/'.format(raw_front_page_index)
            index_work_set[raw_front_page_index] = request_url

        re_catch_counter = 0
        catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for raw_front_page_index in index_work_set:
                catch_counter += 1
                Debug.logger.info(u'第『{}』遍抓取数据'.format(re_catch_counter))
                request_url = index_work_set[raw_front_page_index]
                Debug.logger.info(
                    u"开始抓取第{raw_front_page_index}页中的文章链接,剩余{max_page}页".format(
                        raw_front_page_index=raw_front_page_index,
                        max_page=len(index_work_set)))
                content = Http.get_content(request_url)

                # article URLs come from the JSON response

                if not content:
                    return
                jdata = json.loads(content)
                articles = jdata['items']
                for article in articles:
                    print article['headline']
                    uur = article['web_url_with_host']

                    print uur

                    article_url_index_list.append(uur)

                # soup = BeautifulSoup(content, 'lxml')
                # list_p_list = soup.find_all('h3' ,class_="regularSummaryHeadline")
                # for p in list_p_list:
                #     # print p
                #     list_pcyc_li = p.find_all('a')
                #     for li in list_pcyc_li:
                #
                #         tarUrl = str(li.get('href'))
                #         print  tarUrl
                #
                #         if not (tarUrl is None):
                #             if str(tarUrl).__contains__("cn.nytimes.com"):
                #                 article_url_index_list.append(u"https:{}".format(tarUrl))
                #             else:
                #                 article_url_index_list.append(u"https://cn.nytimes.com{}".format(tarUrl))

                del index_work_set[raw_front_page_index]

        article_count = len(article_url_index_list)
        Debug.logger.info(u"文章链接抓取完毕,共{article_count}篇文章待抓取".format(
            article_count=article_count))

        index_work_set = OrderedDict()
        for article_url_index in article_url_index_list:
            print 'query : ' + article_url_index
            article_db = DB.query_row(
                'select count(*) as article_count from Article where article_id = "{}"'
                .format(article_url_index))
            if article_db['article_count'] > 0:
                continue

            request_url = article_url_index

            index_work_set[article_url_index] = request_url

        re_catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for article_url_index in index_work_set:
                request_url = index_work_set[article_url_index]
                Debug.logger.info(
                    u"开始抓取{countert}号文章,剩余{article_count}篇".format(
                        countert=article_url_index,
                        article_count=len(index_work_set)))
                request_url_content = Http.get_content(request_url)

                article_info = Todo2ArticleParser(
                    request_url_content).get_article_info()
                if len(article_info) > 0:
                    article_info['article_id'] = article_url_index
                    article_info['column_id'] = account_id
                    Worker.save_record_list(u'Article', [article_info])
                del index_work_set[article_url_index]
        return
    def catch(account_id):
        # The core logic lives here

        mock_sleep_time = 0.5
        base_sleep_time = 10
        max_sleep_time = 10

        article_url_index_list = []
        #   Get the maximum page number
        url = 'http://www.360doc.com/userhome/{}'.format(account_id)
        front_page_content = Http.get_content(url)

        # Config.now_id_likeName = account_id
        # Config.save()

        column_info = Doc360ColumnParser(front_page_content).get_column_info()
        column_info[u'column_id'] = account_id
        column_info[u'title'] = "明公"
        max_page = 2
        # if account_id == 'daidai':
        #
        #     column_info[u'title'] = "吃瓜群众岱岱"
        #     max_page = 1
        # elif account_id == 'gzmdzst':
        #
        #     column_info[u'title'] = "顾子明的政事堂"
        #     max_page = 1
        # else:
        #
        #     column_info[u'title'] = "时文"
        #     max_page = 1

        from src.worker import Worker
        Worker.save_record_list(u'Column', [column_info])

        Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))
        index_work_set = OrderedDict()
        #   Collect the article URLs from each page
        for raw_front_page_index in range(0, max_page):
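            # 360doc's getUserArticle.aspx returns up to 50 entries per call (pagenum=50); curnum is
            # the page index and the trailing "_" parameter is likely a cache-busting millisecond timestamp.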
            request_url = u"http://www.360doc.com/ajax/getUserArticle.aspx?pagenum=50&curnum={}&icid=13&ishowabstract=null&word=&userid={}&isoriginal=0&_={}"
            urequest_url = (request_url.format(raw_front_page_index,
                                               account_id,
                                               int(time.time() * 1000)))

            index_work_set[raw_front_page_index] = urequest_url

        re_catch_counter = 0
        catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for raw_front_page_index in index_work_set:
                catch_counter += 1
                Debug.logger.info(u'第『{}』遍抓取数据'.format(re_catch_counter))
                request_url = index_work_set[raw_front_page_index]
                Debug.logger.info(
                    u"开始抓取第{raw_front_page_index}页中的文章链接,剩余{max_page}页".format(
                        raw_front_page_index=raw_front_page_index,
                        max_page=len(index_work_set)))
                request_url_content = Http.get_content(request_url)

                soup = BeautifulSoup(request_url_content, 'lxml')
                list_p_list = soup.find_all('div',
                                            class_="list listwz1 font14")
                for p in list_p_list:
                    # print p
                    list_pcyc_li = p.find_all('a')
                    for li in list_pcyc_li:

                        # print li.text
                        tarUrl = li.get('href')
                        # deep level
                        print tarUrl
                        article_url_index_list.append(tarUrl)

                del index_work_set[raw_front_page_index]

        article_count = len(article_url_index_list)
        Debug.logger.info(u"文章链接抓取完毕,共{article_count}篇文章待抓取".format(
            article_count=article_count))

        index_work_set = OrderedDict()
        for article_url_index in article_url_index_list:
            print 'query : ' + article_url_index
            article_db = DB.query_row(
                'select count(*) as article_count from Article where article_id = "{}"'
                .format(article_url_index))
            if article_db['article_count'] > 0:
                continue

            request_url = article_url_index

            index_work_set[article_url_index] = request_url

        re_catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for article_url_index in index_work_set.keys():
                request_url = index_work_set[article_url_index]
                Debug.logger.info(
                    u"Fetching article {countert}, {article_count} article(s) left".format(
                        countert=article_url_index,
                        article_count=len(index_work_set)))
                request_url_content = Http.get_content(request_url)

                article_info = Doc360ArticleParser(
                    request_url_content).get_article_info()
                if len(article_info) > 0:
                    article_info['article_id'] = article_url_index
                    article_info['column_id'] = account_id
                    Worker.save_record_list(u'Article', [article_info])
                del index_work_set[article_url_index]
        return
Esempio n. 16
    def catch(account_id):
        # This is where the real work happens

        mock_sleep_time = 0.5
        base_sleep_time = 1
        max_sleep_time = 1

        article_url_index_list = []
        #   Get the maximum page number

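        # The column record starts from an empty WeiXinColumnParser template; the
        # account id doubles as the title and the cover image is a fixed placeholder.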
        column_info = WeiXinColumnParser('').get_column_info()
        column_info[u'column_id'] = account_id
        column_info[u'title'] = account_id
        column_info[u'image_url'] = 'https://wpimg.wallstcn.com/3598b719-ab0d-4be7-bc09-30c3ae29a3cc.jpg?imageView2/1/w/240/h/240'
        max_page = 1
        # with open('ReadList.txt', 'r') as read_list:
        #     read_list = read_list.readlines()
        #     for line in read_list:
        #         split_url = line.split('#')[0]
        #         if str(split_url).__contains__(account_id):
        #             # Config.now_id_likeName = line.split('#')[1]
        #             max_page = int(line.split('#')[-1]) + 1
        #             column_info[u'title'] = str(line.split('#')[1])
        #
        #             # max_page = 1
        #             print max_page



        from src.worker import Worker
        Worker.save_record_list(u'Column', [column_info])

        Debug.logger.info(u"Maximum page count determined: {max_page} page(s) in total".format(max_page=max_page))


        # article_url_index_list.append('https://mp.weixin.qq.com/s?__biz=MjM5MjczNDc0Mw==&mid=2650847984&idx=2&sn=b7b111e5964d2f2fb568ba0d419e3edf&chksm=bd55d1888a22589e2f3bab0613b346427079efc6b82fac869d4f78244a500c3e5cc8cb8402ed&scene=21#wechat_redirect')
        # article_url_index_list.append('https://mp.weixin.qq.com/s/yj1BT3jWyxLjlEnzz0vEtQ')

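        # Article URLs are read from a pre-collected local list file (hard-coded
        # path), one URL per line.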
        with open('/Users/0/Desktop/list.txt', 'r') as read_list:
            read_list = read_list.readlines()
            for line in read_list:
                article_url_index_list.append(str(line).strip('\n'))

        article_count = len(article_url_index_list)
        Debug.logger.info(u"Article links collected: {article_count} article(s) queued for download".format(article_count=article_count))

        index_work_set = OrderedDict()
        for article_url_index in article_url_index_list:
            print 'query : ' + article_url_index
            article_db = DB.query_row(
                    'select count(*) as article_count from Article where article_id = "{}"'.format(article_url_index))
            if article_db['article_count'] > 0:
                continue

            request_url = article_url_index

            index_work_set[article_url_index] = request_url

        re_catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for article_url_index in index_work_set.keys():
                request_url = index_work_set[article_url_index]
                Debug.logger.info(u"Fetching article {countert}, {article_count} article(s) left".format(countert=article_url_index,
                                                                                    article_count=len(index_work_set)))
                request_url_content = Http.get_content(request_url)
                time.sleep(mock_sleep_time)
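                # An empty response here most likely means the request was throttled;
                # back off for a short randomized interval and leave the URL queued
                # so a later pass retries it.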
                if len(request_url_content) == 0:
                    random_sleep_time = base_sleep_time + random.randint(0, max_sleep_time) / 100.0
                    Debug.logger.info(u"Random sleep for {} seconds".format(random_sleep_time))
                    time.sleep(random_sleep_time)
                    continue
                #article_info = Todo2ArticleParser(request_url_content).get_article_info()
                # article_info = HuXiuArticleParser(request_url_content).get_article_info()
                article_info = WeiXinArticleParser(request_url_content).get_article_info()
                # article_info = WallStreetArticleParser(request_url_content).get_article_info()
                if len(article_info) > 0:
                    article_info['article_id'] = article_url_index
                    article_info['column_id'] = account_id
                    Worker.save_record_list(u'Article', [article_info])
                del index_work_set[article_url_index]
        return
    def catch(account_id):
        # This is where the real work happens

        mock_sleep_time = 0.5
        base_sleep_time = 10
        max_sleep_time = 10

        article_url_index_list = []
        #   Get the maximum page number
        url = 'https://www.huxiu.com/{}'.format(account_id)
        front_page_content = Http.get_content(url)

        # Config.now_id_likeName = account_id
        # Config.save()

        column_info = HuXiuColumnParser(front_page_content).get_column_info()
        column_info[u'column_id'] = account_id
        column_info[u'title'] = account_id

        from src.worker import Worker
        Worker.save_record_list(u'Column', [column_info])

        u_result = urllib.quote(
            account_id.decode(sys.stdin.encoding).encode('utf8'))
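        # URL-encode the account name (read from the console, hence sys.stdin.encoding)
        # so it can be used as the search keyword in the requests below.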
        print account_id
        max_page = 2

        idds = ''
        #
        with open('ReadList.txt', 'r') as read_list:
            read_list = read_list.readlines()
            for line in read_list:
                split_url = line.split('#')[0]
                if split_url.split('/')[-1] == account_id:
                    # Config.now_id_likeName = line.split('#')[1]
                    max_page = int(line.split('#')[-1]) + 1
                    idds = str(line.split('#')[1])
                    print max_page
        max_page = -1
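        # Hard-coded override: with max_page = -1 the range(0, max_page + 1) below is
        # empty, so the search pages are skipped and only the manually appended
        # article URL further down gets fetched.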
        #   Parse the page content and store it in the database

        Debug.logger.info(u"Maximum page count determined: {max_page} page(s) in total".format(max_page=max_page))
        index_work_set = OrderedDict()
        #   Collect the article-list URL for each page
        for raw_front_page_index in range(0, max_page + 1):
            #https://www.huxiu.com/search.html?s=%E5%B7%B4%E8%8F%B2%E7%89%B9&sort=dateline:desc
            request_url = u'https://www.huxiu.com/search.html?s={}&sort=dateline%3Adesc&per_page={}'.format(
                u_result, raw_front_page_index)
            #request_url = u'https://www.huxiu.com/member/{}/article/{}.html'.format(idds,raw_front_page_index)
            # request_url = 'https://www.huxiu.com/member/1872007.html'
            index_work_set[raw_front_page_index] = request_url

        re_catch_counter = 0
        catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for raw_front_page_index in index_work_set.keys():
                catch_counter += 1
                Debug.logger.info(u'Crawl pass #{}'.format(re_catch_counter))
                request_url = index_work_set[raw_front_page_index]
                Debug.logger.info(
                    u"Fetching article links on page {raw_front_page_index}, {max_page} page(s) left".format(
                        raw_front_page_index=raw_front_page_index,
                        max_page=len(index_work_set)))
                request_url_content = Http.get_content(request_url)

                soup = BeautifulSoup(request_url_content, "lxml")

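                # Walk every <li> on the search results page and keep only links whose
                # href starts with /article/, i.e. HuXiu article pages.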
                list_pcyc_l_ = soup.find_all('li')
                # list_pcyc_l_ = soup.find_all('div',class_='mob-ctt')
                for tgo_right in list_pcyc_l_:
                    for link in tgo_right.findAll('a'):
                        hre = str(link.get('href'))
                        if hre.startswith('/article/', 0, 10):
                            print u'https://www.huxiu.com{}'.format(
                                link.get('href'))
                            article_url_index_list.append(
                                'https://www.huxiu.com{}'.format(
                                    link.get('href')))

                del index_work_set[raw_front_page_index]

        article_url_index_list.append(
            'https://www.huxiu.com/article/299355.html')

        article_count = len(article_url_index_list)
        Debug.logger.info(u"Article links collected: {article_count} article(s) queued for download".format(
            article_count=article_count))

        index_work_set = OrderedDict()
        for article_url_index in article_url_index_list:
            print 'query : ' + article_url_index
            article_db = DB.query_row(
                'select count(*) as article_count from Article where article_id = "{}"'
                .format(article_url_index))
            if article_db['article_count'] > 0:
                continue

            request_url = article_url_index

            index_work_set[article_url_index] = request_url

        re_catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for article_url_index in index_work_set.keys():
                request_url = index_work_set[article_url_index]
                Debug.logger.info(
                    u"Fetching article {countert}, {article_count} article(s) left".format(
                        countert=article_url_index,
                        article_count=len(index_work_set)))
                request_url_content = Http.get_content(request_url)

                article_info = HuXiuArticleParser(
                    request_url_content).get_article_info()
                if len(article_info) > 0:
                    article_info['article_id'] = article_url_index
                    article_info['column_id'] = account_id
                    Worker.save_record_list(u'Article', [article_info])
                del index_work_set[article_url_index]
        return
Esempio n. 18
    def catch(account_id):
        # This is where the real work happens

        mock_sleep_time = 0.5

        article_url_index_list = []
        #   Get the maximum page number
        url = 'http://blog.sina.com.cn/s/articlelist_{}_11_1.html'.format(account_id)
        front_page_content = Http.get_content(url)
        article_num = SinaWorker.parse_max_page(front_page_content)

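        # Column metadata is parsed from the author's profile page rather than from
        # the article listing.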
        href_profile = 'http://blog.sina.com.cn/s/profile_{}.html'.format(account_id)
        content_profile = Http.get_content(href_profile)

        column_info = SinaColumnParser(content_profile).get_column_info()
        column_info[u'column_id'] = account_id

        from src.worker import Worker
        Worker.save_record_list(u'Column', [column_info])


        index_work_set = OrderedDict()
        #   Collect the article URLs from each listing page


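        # Each listing page carries up to 50 article links, so round the page count
        # up (integer division under Python 2).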
        if article_num % 50 != 0:
            page_num = article_num / 50 + 1  # 50 href on 1 page
        else:
            page_num = article_num / 50
        Debug.logger.info(u"Maximum page count determined: {max_page} page(s) in total".format(max_page=page_num))
        index_work_set = OrderedDict()
        for page in range(page_num):
            url = 'http://blog.sina.com.cn/s/articlelist_{}_11_{}.html'.format(account_id, page + 1)
            content_article_list = Http.get_content(url)

            soup = BeautifulSoup(content_article_list, "lxml")

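            # Article links sit inside <span class="atc_title"> elements on the
            # listing page.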
            article_list = soup.select('span.atc_title a')
            for item in range(len(article_list)):
                article_href = ParserTools.get_attr(article_list[item], 'href')

                # Key the work set by the article URL (as the other workers do) so
                # links from different listing pages do not overwrite each other
                index_work_set[article_href] = article_href

        re_catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for article_url_index in index_work_set.keys():
                request_url = index_work_set[article_url_index]
                Debug.logger.info(u"Fetching article {countert}, {article_count} article(s) left".format(countert=article_url_index,
                                                                                 article_count=len(index_work_set)))
                request_url_content = Http.get_content(request_url)
                time.sleep(mock_sleep_time)
                if len(request_url_content) == 0:
                    Debug.logger.info(u"Sleeping for {} seconds".format(mock_sleep_time))
                    time.sleep(mock_sleep_time)
                    continue

                article_info = SinaArticleParser(request_url_content).get_article_info()
                if len(article_info) > 0:
                    article_info['article_id'] = article_url_index
                    article_info['column_id'] = account_id
                    Worker.save_record_list(u'Article', [article_info])
                # Remove the finished article from the work set; URLs skipped by the
                # empty-content check above stay queued for the next pass
                del index_work_set[article_url_index]
        return