Example #1
0
 async def search_super_fan(self, super_fan_job_info):
     LOGGER.info('try to grab html: ' + super_fan_job_info['url'])
     html_content = await self.grab_html(super_fan_job_info['url'])
     userjson = json.loads(html_content)  # json.loads() takes no encoding argument in Python 3
     userjson = userjson['data']
     LOGGER.info('succeed to grab html: ' + super_fan_job_info['url'])
     for group in userjson['cards']:
         for card in group['card_group']:
             user_id = card['user']['id']
             await self.redis_job.push_job(JobType.user.value, {'user_id': user_id, 'source': 'super'})
 async def fetch_job(self, job_type):
     if not self._pool:
         await self.init_pool()
     with await self._pool as conn:
         job_info = await conn.execute('rpop', job_type)
         if job_info:
             LOGGER.info('fetched job: %s' % job_info)
             return json.loads(job_info)
         else:
             return None
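fetch_job() above pops JSON-encoded job dicts from a Redis list; the matching push_job() coroutine (shown in a later example) lpushes them, so the list behaves as a FIFO job queue. A minimal standalone sketch of the same pattern with the synchronous redis client, assuming a local Redis instance; the queue name and job payload are invented for illustration:
import json
import redis

r = redis.Redis(host='localhost', port=6379)

# producer side: serialize the job dict and push it onto the head of the list
r.lpush('example_jobs', json.dumps({'user_id': '12345', 'source': 'super'}))

# consumer side: pop from the tail; rpop returns None when the queue is empty
raw = r.rpop('example_jobs')
job = json.loads(raw) if raw else None
print(job)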
Example #3
0
    def save_cookies(cls, user_name, cookies):

        pickled_cookies = json.dumps({
            'user_name': user_name,
            'cookies': cookies,
            'login_time': datetime.datetime.now().timestamp()
        })
        LOGGER.info('save cookie in redis: %s' % str(pickled_cookies))
        r = redis.Redis(connection_pool=cls.redis_pool)
        r.hset('account', user_name, pickled_cookies)
        cls.user_in_queue(user_name)
Example #4
0
 async def search_topic_user(self, topic_job_info):
     LOGGER.info('try to grab html: ' + topic_job_info['url'])
     html_content = await self.grab_html(topic_job_info['url'])
     userjson = json.loads(html_content)
     userjson = userjson['data']
     LOGGER.info('succeed to grab html: ' + topic_job_info['url'])
     for group in userjson['cards']:
         if 'show_type' in group:
             for card in group['card_group']:
                 user_id = card['mblog']['user']['id']
                 await self.redis_job.push_job(JobType.user.value, {'user_id': user_id, 'source': 'comment'})
Example #5
0
 def __init__(self, username, password):
     self.YDMApi = windll.LoadLibrary(ROOT_PATH +
                                      '\\dll\\yundamaAPI-x64.dll')
     self.appId = 4296  # software ID
     self.appKey = b'fdacec8d9f1c2deb86346bfcf64e95f2'  # software key
     LOGGER.info('app id: %d\r\napp key: %s' % (self.appId, self.appKey))
     self.username = username.encode()
     self.password = password.encode()
     self.code_type = 1005
     self.timeout = 60
     self.YDMApi.YDM_SetAppInfo(self.appId, self.appKey)
Example #6
0
 async def search(self):
     while True:
         search_job_info = await self.redis_job.fetch_job(JobType.search.value)
         if search_job_info:
             try:
                 await self.search_tweet(search_job_info)
             except TimeoutError as e:
                 pass
             except:
                 LOGGER.error(traceback.format_exc())
                 sleep(5 * 60)
Example #7
0
 def search_topic_user_now(self, topic_job_info):
     LOGGER.info('try to grab html: ' + topic_job_info['url'])
     userjson = self.grab_html_now(topic_job_info['url'])
     LOGGER.info('succeed to grab html: ' + topic_job_info['url'])
     for group in userjson['cards']:
         if 'show_type' in group:
             for card in group['card_group']:
                 user_id = card['mblog']['user']['id']
                 self.redis_job_now.push_job(JobType.user.value, {
                     'user_id': user_id,
                     'source': 'comment'
                 })
Example #8
0
 async def crawl_repost(self):
     while True:
         repost_job_info = await self.redis_job.fetch_job(JobType.repost.value)
         if repost_job_info:
             try:
                 await self.grab_tweet_repost(repost_job_info)
             except TimeoutError as e:
                 pass
             except:
                 LOGGER.error("something error")
                 LOGGER.error(traceback.format_exc())
                 sleep(5 * 60)
def del_proc():
    status_file = os.path.join(BASE_DIR, "proc/status")
    if os.path.exists(status_file):
        os.unlink(status_file)
    if not os.path.exists(status_file):
        LOGGER.info("remove status sucessfully")
    else:
        LOGGER.error("Failed to remove status file")
    prometheus_dir = os.path.join(BASE_DIR, ".prometheus_multiproc_dir")
    for file_obj in os.listdir(prometheus_dir):
        file_path = os.path.join(prometheus_dir, file_obj)
        os.remove(file_path)
Example #10
0
 async def crawl_follow(self):
     while True:
         follow_dict = self.redis_job_now.fetch_job(JobType.follower.value)
         if follow_dict:
             try:
                 await self.grab_follow(follow_dict)
                 LOGGER.info('finished follow crawl for %s' % follow_dict['uid'])
             except TimeoutError as e:
                 print(e)
             except:
                 LOGGER.error(traceback.format_exc())
                 sleep(5 * 60)
 async def push_job(self, job_type, job_info):
     if not self._pool:
         await self.init_pool()
     url = job_info.get('url', '')
     if url and url in self.url_filter:
         LOGGER.warn("%s job filtered. %s" % (job_type, str(job_info)))
         return
     else:
         self.url_filter.add(url)
     with await self._pool as conn:
         await conn.execute('lpush', str(job_type), json.dumps(job_info))
         LOGGER.info("push %s job into redis: %s" % (job_type, str(job_info)))
Example #12
0
    def recognize(self, filename):
        if not isinstance(filename, bytes):
            filename = filename.encode()
        result = c_char_p(b"                              ")
        LOGGER.info('>>> logging in...')
        captcha_id = self.YDMApi.YDM_EasyDecodeByPath(self.username,
                                                      self.password,
                                                      self.appId, self.appKey,
                                                      filename, self.code_type,
                                                      self.timeout, result)

        return captcha_id, result.value
Example #13
0
 def fetch_cookies(cls):
     # LOGGER.info('get cookies from redis')
     r = redis.Redis(connection_pool=cls.redis_pool)
     while True:
         user = r.spop('users')
         r.sadd('users', user)
         c = r.hget('account', user)
         if c:
             user_cookies = c.decode('utf-8')
             cookies_json = json.loads(user_cookies)
             # LOGGER.info(cookies_json)
             return cookies_json
         LOGGER.warning('cookies not found')
Example #14
0
 async def crawl_comment(self):
     while True:
         comment_job_info = await self.redis_job.fetch_job(JobType.comment.value)
         if comment_job_info:
             try:
                 # asyncio.run_coroutine_threadsafe(self.grab_tweet_comments(comment_job_info), self.loop)
                 await self.grab_tweet_comments(comment_job_info)
             except TimeoutError as e:
                 pass
             except:
                 LOGGER.error("something error")
                 LOGGER.error(traceback.format_exc())
                 sleep(5 * 60)
Example #15
0
def getStockList(force_update=False):
    if force_update:
        twstock.__update_codes()

    stocks = []
    for code in twstock.codes:
        stock = twstock.codes[code]
        if stock.type == '股票':  # '股票' means listed (common) stock
            stocks.append(stock)

    LOGGER.info(f'Got {len(stocks)} stocks in the list')

    return stocks
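A small usage sketch for getStockList(), assuming the twstock package is installed and the surrounding module defines LOGGER; entries in twstock.codes are namedtuples, so code and name fields should be available:
stocks = getStockList()
for stock in stocks[:5]:
    # print the ticker code and company name for the first few listed stocks
    print(stock.code, stock.name)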
Example #16
0
    async def crawl_follow(self):
        while True:
            follow_dict = await self.redis_job.fetch_job(JobType.follower.value)
            if follow_dict:

                try:
                    await self.grab_follow(follow_dict)
                except TimeoutError as e:
                    pass
                except:
                    LOGGER.error(traceback.format_exc())
                    sleep(5 * 60)
Example #17
0
 def grab_html_now(self, url):
     cookies = self.redis_cookie_now.fetch_cookies()
     headers = self.get_header()
     headers['Upgrade-Insecure-Requests'] = '1'
     headers['Proxy-Connection'] = 'keep-alive'
     LOGGER.info('using cookies: ' + str(cookies))
     ok = True
     while ok:  # keep retrying until the API reports ok == 1
         resp_text = requests.get(url=url, cookies=cookies['cookies'], verify=False).text
         userjson = json.loads(resp_text)
         # userjson = json.loads(resp_text,'GBK')
         if userjson['ok'] == 1:
             ok = False
     return userjson['data']
Example #18
0
def genStockList(output_file=None, force_update=False):
    if force_update:
        twstock.__update_codes()

    stocks = []
    for code in twstock.codes:
        stock = twstock.codes[code]
        if stock.type == '股票':  # '股票' means listed (common) stock
            # stocks.append(f'{code},{stock.name}')
            # stocks.append((code, stock.name))
            stocks.append(code)

    LOGGER.info(f'Got {len(stocks)} stocks in the list')
    return sorted(stocks)
Example #19
0
    def get_cookie_from_login_sina_com_cn(account, password):
        """ 获取一个账号的Cookie """
        login_url = "https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.19)"
        username = base64.b64encode(account.encode("utf-8")).decode("utf-8")
        headers = {
            'Referer':
            'https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.19)',
            'Upgrade-Insecure-Requests':
            '1',
            'Host':
            'login.sina.com.cn',
            'Connection':
            'keep-alive',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
            'Accept-Language':
            'zh-CN,zh;q=0.9',
            'Accept-Encoding':
            'gzip, deflate, br',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
        }
        post_data = {
            "entry": "sso",
            "gateway": "1",
            "from": "null",
            "savestate": "30",
            "useticket": "0",
            "pagerefer": "",
            "vsnf": "1",
            "su": username,
            "service": "sso",
            "sp": password,
            "sr": "1440*900",
            "encoding": "UTF-8",
            "cdult": "3",
            "domain": "sina.com.cn",
            "prelt": "0",
            "returntype": "TEXT",
        }
        session = requests.Session()
        r = session.post(login_url,
                         headers=headers,
                         data=post_data,
                         verify=False)
        json_str = r.content.decode("gbk")
        info = json.loads(json_str)
        LOGGER.info('get cookies for %s' % account)
        if info["retcode"] == "0":
            LOGGER.info("Get Cookie Success!( Account:%s )" % account)

            cookies = session.cookies.get_dict()
            for k, v in cookies.items():
                print(k, v)
            return cookies
        else:
            LOGGER.warning("Get Cookie failed!( Account:%s )" % account)
            LOGGER.warning(info)
            return None
Example #20
0
    async def grab_tweet_comments(self, comment_job):
        LOGGER.info('start grab comment: %s' % str(comment_job))
        html_content = await self.grab_html(comment_job['url'])
        comment_html = BeautifulSoup(html_content, "lxml")

        comment_divs = comment_html.find_all(id=re.compile(r'C_[\d]'),
                                             class_='c')
        for comment_div in comment_divs:
            comment_info = {}
            comment_id = comment_div.get('id')
            user_a = comment_div.find('a')
            if user_a:
                user_href = user_a.get('href')
                if user_href.startswith('/u/'):
                    user_id = user_href[3:]
                else:
                    user_id = await self.get_user_id_from_homepage(
                        self.weibo_host + user_href)
                await self.user_id_in_queue(user_id)
                comment_info['userId'] = user_id
                comment_info['content'] = comment_div.find(
                    class_='ctt').get_text()
                others = comment_div.find(class_='ct').get_text()
                if others:
                    others = others.split('\u6765\u81ea')
                    comment_info['pubTime'] = self.get_time(others[0])
                    if len(others) == 2:
                        comment_info['source'] = others[1]
                comment_info['id'] = comment_id
                comment_info['tweetId'] = comment_job['tweetId']
                comment_info['type'] = 'comment_info'
                await self.weibo_producer.send(comment_info,
                                               comment_job['url'])

        if 'page=' not in comment_job['url']:
            await self.parse_tweet_content(comment_html, comment_job)
            page_div = comment_html.find(id='pagelist')
            if page_div:

                max_page = int(page_div.input.get('value'))
                for page in range(2, max_page + 1):
                    await self.redis_job.push_job(
                        JobType.comment.value, {
                            'url': self.tweet_comment_url2 % (comment_job['tweetId'], page),
                            'tweetId': comment_job['tweetId']
                        })
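grab_tweet_comments() above relies on weibo.cn's mobile HTML layout (comment divs with ids like C_<digits> and class 'c'). A self-contained sketch of the same BeautifulSoup selection on an invented snippet, assuming bs4 and lxml are installed:
import re
from bs4 import BeautifulSoup

html = ('<div id="C_123" class="c"><a href="/u/42">user</a>'
        '<span class="ctt">hello</span></div>')
soup = BeautifulSoup(html, 'lxml')
for div in soup.find_all(id=re.compile(r'C_\d'), class_='c'):
    # prints: C_123 /u/42 hello
    print(div.get('id'), div.find('a').get('href'), div.find(class_='ctt').get_text())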
Example #21
0
 def __init__(self, username, password):
     self.YDMApi = windll.LoadLibrary(ROOT_PATH+'\\dll\\yundamaAPI-x64.dll')
     self.appId = 5064  # software ID
     self.appKey = b'57f477ba2a00eeb3e2fcb392474305d4'  # software key
     LOGGER.info('app id: %d\r\napp key: %s' % (self.appId, self.appKey))
     self.username = username.encode()
     self.password = password.encode()
     print(self.username)
     print(self.password)
     self.code_type = 1005
     self.timeout = 60
     self.YDMApi.YDM_SetAppInfo(self.appId, self.appKey)
     self.uid = self.YDMApi.YDM_Login(self.username, self.password)
     balance = self.YDMApi.YDM_GetBalance(self.username, self.password)
     LOGGER.info('logged in to YunDaMa successfully, balance: %d', balance)
Example #22
0
def shutdown():
    LOGGER.info('Stopping http server')
    server.stop()
 
    LOGGER.info('Will Shutdown in %s seconds ...', 60)
    io_loop = tornado.ioloop.IOLoop.instance()
 
    deadline = time.time() + 60
    # remove status file
    del_proc()
    def stop_loop():
        now = time.time()
        if now < deadline and (io_loop._callbacks or io_loop._timeouts):
            io_loop.add_timeout(now + 1, stop_loop)
        else:
            io_loop.stop()
            LOGGER.info('Shutdown')
    stop_loop()
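shutdown() above drains pending IOLoop callbacks before stopping. A minimal sketch of how it might be wired to process signals, assuming the same module-level shutdown() and a running IOLoop; add_callback_from_signal hands the call off to the IOLoop instead of doing work inside the signal handler:
import signal
import tornado.ioloop

def sig_handler(signum, frame):
    # schedule shutdown() on the IOLoop; safe to call from a signal handler
    tornado.ioloop.IOLoop.instance().add_callback_from_signal(shutdown)

signal.signal(signal.SIGTERM, sig_handler)
signal.signal(signal.SIGINT, sig_handler)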
Example #23
0
    async def grab_tweet_repost(self, repost_job_info):
        LOGGER.info('start grab tweet repost: %s' % str(repost_job_info))

        html_content = await self.grab_html(repost_job_info['url'])
        tweet_repost_html = BeautifulSoup(html_content, "lxml")
        repost_divs = tweet_repost_html.find_all(class_='c')
        for div in repost_divs:
            span_cc = div.find('span', class_='cc')
            if span_cc:
                attitube_a = span_cc.find('a')
                if attitube_a:
                    href = attitube_a.get('href')
                    if len(href.split('/')) > 2:
                        await self.redis_job.push_job(
                            JobType.comment.value, {
                                'url': self.tweet_comment_url % href.split('/')[2],
                                'tweetId': href.split('/')[2],
                                'parentTid': repost_job_info['tweetId']
                            })
                        await self.redis_job.push_job(
                            JobType.repost.value, {
                                'url': self.user_repost_url % href.split('/')[2],
                                'tweetId': href.split('/')[2],
                                'parentTid': repost_job_info['tweetId']
                            })
        if 'page=' not in repost_job_info['url']:
            await self.parse_tweet_content(tweet_repost_html, repost_job_info)
            page_div = tweet_repost_html.find(id='pagelist')
            if page_div:

                max_page = int(page_div.input.get('value'))
                for page in range(2, max_page + 1):
                    await self.redis_job.push_job(
                        JobType.repost.value, {
                            'url': self.user_repost_url2 % (repost_job_info['tweetId'], page),
                            'tweetId': repost_job_info['tweetId']
                        })
        pass
 async def search_topic_user(self, topic_job_info):
     LOGGER.info('try to grab html: ' + topic_job_info['url'])
     html_content = await self.grab_html(topic_job_info['url'])
     userjson = json.loads(html_content)
     userjson = userjson['data']
     LOGGER.info('succeed to grab html: ' + topic_job_info['url'])
     topic_tweet = {}
     topic_tweet['type'] = 'topic_tweet'
     for group in userjson['cards']:
         if 'show_type' in group:
             for card in group['card_group']:
                 topic_tweet['tweet_time'] = card['mblog']['created_at']
                 topic_tweet['latest_update'] = card['mblog']['latest_update']
                 topic_tweet['user_id'] = card['mblog']['user']['id']
                 topic_tweet['user_gende'] = card['mblog']['user']['gender']
                 topic_tweet['reposts'] = card['mblog']['reposts_count']
                 topic_tweet['comments'] = card['mblog']['comments_count']
                 await self.weibo_producer.send(topic_tweet,
                                                topic_job_info['url'])
Example #25
0
    def save_verify_code_img(browser, weibo_user):

        screen_shot_path = '.\\img\\%s-screenshot.png' % weibo_user
        code_img_path = '.\\img\\%s-verify_code.png' % weibo_user
        LOGGER.info('get verify code img for %s' % weibo_user)
        browser.save_screenshot(screen_shot_path)
        code_img = browser.find_element_by_xpath(
            '//img[@node-type="verifycode_image"]')
        left = code_img.location['x']
        top = code_img.location['y']
        right = code_img.location['x'] + code_img.size['width']
        bottom = code_img.location['y'] + code_img.size['height']
        # print(left, top, right, bottom)
        picture = Image.open(screen_shot_path)
        # picture = picture.crop((1422, 300, 1533, 334))
        picture = picture.crop((left, top, right, bottom))
        picture.save(code_img_path)
        os.remove(screen_shot_path)
        LOGGER.info('code img saved(%s)' % code_img_path)
        return code_img_path
Example #26
0
    def user_in_queue(cls, user_name):
        r = redis.Redis(connection_pool=cls.redis_pool)

        if not r.sismember('users', user_name):
            LOGGER.info('user in queue: %s' % user_name)
            r.sadd("users", user_name)
        else:

            LOGGER.info('user already in queue: %s' % user_name)
            LOGGER.info("remove it")
            r.srem("users", user_name)
            LOGGER.info('user in queue: %s' % user_name)
            r.sadd("users", user_name)
Example #27
0
    async def crawl_user(self):
        while True:
            user_job_info = await self.redis_job.fetch_job(JobType.user.value)
            if user_job_info:
                try:
                    # asyncio.run_coroutine_threadsafe(self.grab_user_info(user_job_info['user_id']), self.loop)
                    await self.grab_user_info(user_job_info['user_id'])
                    # await self.redis_job.push_job(JobType.tweet.value,
                    #                               {'url': 'https://weibo.cn/' + user_job_info['user_id'],
                    #                                'uid': user_job_info['user_id']})

                    # await self.redis_job.push_job(JobType.follower.value,
                    #                               {'url': self.follow_url % user_job_info['user_id'],
                    #                                'uid': user_job_info['user_id']})
                    # self.weibo_queue.put({'url': self.user_tweet_url % user_id, 'uid': user_id})
                    # self.follow_queue.put({'uid': user_id, 'url': self.follow_url % user_id})
                except TimeoutError as e:
                    pass
                except:
                    LOGGER.error(traceback.format_exc())
                    sleep(5 * 60)
Example #28
0
 async def grab_view(self, user_id):
     """
     Get the tweet count, fans count and follow count for a user id.
     :param user_id: user id
     :return: dict
     """
     LOGGER.info('grab user view: %s' % str(user_id))
     html_content = await self.grab_html(self.weibo_host + '/' + str(user_id))
     home_page_html = BeautifulSoup(html_content, "lxml")
     v = home_page_html.find('div', class_='tip2')
     result = {}
     if v:
         content = v.get_text(';')
     else:
         content = ''
     tweet_r = re.findall(r'微博\[(\d+)\];', content)
     result['tweetNum'] = tweet_r[0] if tweet_r else -1
     fans_r = re.findall(r'粉丝\[(\d+)\];', content)
     result['fansNum'] = fans_r[0] if fans_r else -1
     follow_r = re.findall(r'关注\[(\d+)\];', content)
     result['followNum'] = follow_r[0] if follow_r else -1
     return result
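grab_view() extracts the three counters from the profile's 'tip2' text with regular expressions. A tiny self-contained sketch of the same extraction on an invented sample string:
import re

content = '微博[321];关注[45];粉丝[678];'
tweet_r = re.findall(r'微博\[(\d+)\];', content)
fans_r = re.findall(r'粉丝\[(\d+)\];', content)
follow_r = re.findall(r'关注\[(\d+)\];', content)
print(tweet_r, fans_r, follow_r)  # ['321'] ['678'] ['45']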
Example #29
0
    async def crawl_weibo(self):
        r = re.compile(r'https://weibo.cn/(\d*)\?page=(\d*)')
        while True:
            tweet_job_info = await self.redis_job.fetch_job(JobType.tweet.value)
            if tweet_job_info:
                m = r.findall(tweet_job_info['url'])
                if m:
                    page_no = int(m[0][1])
                    if page_no > 200:
                        LOGGER.info('job passed %s' % str(tweet_job_info))
                        continue
                # if 'page=' in tweet_job_info['url']:
                #     LOGGER.info('job passed %s' % str(tweet_job_info))
                #     continue

                try:
                    await self.grab_user_tweet(tweet_job_info)
                except TimeoutError as e:
                    pass
                except:
                    LOGGER.error(traceback.format_exc())
                    sleep(5 * 60)
Example #30
0
 async def grab_follow(self, follow_dict):
     LOGGER.info('start grab user follow: %s' % str(follow_dict))
     html_content = await self.grab_html(follow_dict['url'])
     follow_html = BeautifulSoup(html_content, "lxml")
     all_td = follow_html.find_all('td', style=True)
     follow_id = []
     for td in all_td:
         a = td.find('a').get('href')
         usr_id_result = self.user_id_pattern.findall(a)
         if usr_id_result:
             usr_id = usr_id_result[0]
         else:
             usr_id = await self.get_user_id_from_homepage(a)
         if int(usr_id) not in follow_id:  # compare as int; follow_id stores ints
             follow_id.append(int(usr_id))
     user_follow_dict = {}
     follow_id_key_list = [i for i in range(len(follow_id))]
     follow_id = dict(zip(follow_id_key_list, follow_id))
     user_follow_dict['type'] = 'follow'
     user_follow_dict['uid'] = follow_dict['uid']
     user_follow_dict['fans_id'] = follow_id
     await self.weibo_producer.send(user_follow_dict,
                                    self.follow_url % follow_dict['uid'])
     if 'page=' not in follow_dict['url']:
         page_div = follow_html.find(id='pagelist')
         if page_div:
             max_page = int(page_div.input.get('value'))
             if max_page > 20:
                 max_page = 20
             for page in range(2, max_page + 1):
                 await self.redis_job.push_job(
                     JobType.follower.value, {
                         'url': (self.follow_url % follow_dict['uid']) + '?page=' + str(page),
                         'uid': follow_dict['uid']
                     })