Example #1
def insert_wechat_account(official_account_name):
    # Create a cursor object with the cursor() method
    db = connect()
    cursor = db.cursor()
    # SQL insert statement
    sql = """INSERT INTO wechat_account_list (official_account_name) VALUES (%s)"""
    try:
        # Execute the SQL statement; the parameter must be a one-element tuple
        cursor.execute(sql, (official_account_name,))
        db.commit()
        # Look up the id of the row just inserted (parameterized to avoid SQL injection)
        sql = "SELECT id FROM wechat_account_list WHERE official_account_name = %s"
        cursor.execute(sql, (official_account_name,))
        results = cursor.fetchall()
        cursor.close()
        db.close()
        official_account_id = results[0][0]
        return official_account_id
    except Exception as e:
        db.rollback()  # roll back on error
        logger.error(str(e))
        logger.warning("Failed to add an official account.")
        return False
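One pitfall this snippet works around: DB-API drivers take query parameters as a sequence, and `(official_account_name)` without a trailing comma is just a parenthesized string, not a one-element tuple. A minimal standalone sketch of the corrected call, assuming a pymysql connection with placeholder credentials:

import pymysql  # assumed driver behind connect()

# hypothetical connection parameters, purely illustrative
db = pymysql.connect(host="localhost", user="user", password="secret", db="wechat")
cursor = db.cursor()
sql = "INSERT INTO wechat_account_list (official_account_name) VALUES (%s)"
# the trailing comma makes this a one-element tuple, so %s binds the whole string
cursor.execute(sql, ("some_account",))
db.commit()
cursor.close()
db.close()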
Example #2
def q_save_a_profile(profile):
    collection = get_collection()
    try:
        f = {'user_id': profile['id']}
        u = {
            '$set': {
                'username': profile['username'],
                'name': profile['name'],
                'bio': profile['bio'],
                'join_datetime': profile['join_datetime'],
                'join_date': profile['join_date'],
                'join_time': profile['join_time'],
                'url': profile['url'],
                'location': profile['location'],
                'private': profile['private'],
                'verified': profile['verified'],
                'background_image': profile['background_image'],
                'avatar': profile['avatar'],
            },
            '$push': {
                'timestamp': datetime.now(),
                'followers': int(profile['followers']),
                'following': int(profile['following']),
                'likes': int(profile['likes']),
                'tweets': int(profile['tweets']),
                'media': int(profile['media']),
            }
        }
        # update_one with upsert=True inserts the document when no match exists;
        # a DuplicateKeyError from a racing upsert is handled like any other error below
        collection.update_one(f, u, upsert=True)
    except Exception:
        logger.error(f'Unknown error: {sys.exc_info()[0]}')
        raise
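The update document above keeps slowly changing profile fields under $set while $push appends each scrape's counters to growing arrays, so one document holds both the current state and its history. A minimal standalone sketch of the same pattern; the connection string, database, and collection names are placeholders:

from datetime import datetime
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")  # hypothetical server
col = client["twitter"]["profiles"]                # hypothetical db/collection

col.update_one(
    {"user_id": 42},                    # filter: one document per user
    {
        "$set": {"username": "alice"},  # overwrite the current value
        "$push": {                      # append a history point
            "timestamp": datetime.now(),
            "followers": 1234,
        },
    },
    upsert=True,  # create the document the first time this user_id is seen
)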
Example #3
def run(shop_code):
    loop = asyncio.get_event_loop()
    try:
        login, browser, page, from_store = loop.run_until_complete(LoginTB.run(**STORE_INFO[shop_code]))
    except Exception as e:
        logger.error(str(e))
        return

    list_page = page
    list_page_spider = OrderListPageSpider(login, browser, list_page, from_store)

    detail_page = loop.run_until_complete(login.new_page())
    detail_page_spider = OrderDetailPageSpider(login, browser, detail_page, from_store)

    link_id_page = loop.run_until_complete(login.new_page())
    link_id_spider = OrderDetailLinkIDSpider(login, browser, link_id_page, from_store)

    delay_order_page = loop.run_until_complete(login.new_page())
    delay_order_spider = DelayOrderUpdate(login, browser, delay_order_page, from_store)

    manager_page = loop.run_until_complete(login.new_page())
    item_page = loop.run_until_complete(login.new_page())
    manager_page_spider = ItemManagePageSpider(login, browser, manager_page, item_page, from_store)
    tasks = [
        task_1(browser, delay_order_spider, detail_page_spider, manager_page_spider, from_store, link_id_spider,
               list_page_spider),
        # CaptchaCheck.run()
    ]
    loop.run_until_complete(asyncio.wait(tasks))
Example #4
 def execute(self):
     while True:
         try:
             wb = WeiboCrawler(self.using_account, self.uid,
                               self.filter_flag)
             if wb.crawl():
                 logger.info('execute weibo crawler success for user %s' %
                             str(self.uid))
             else:
                 logger.info('execute weibo crawler failed for user %s' %
                             str(self.uid))
             break
         except Exception as e:
             logger.error('execute weibo crawler failed %s with account %s' %
                          (str(e), self.using_account))
             if self.retry:
                 re_choose_account = self.re_choose_using_account()
                 if re_choose_account:
                     self.using_account = re_choose_account
                 else:
                     logger.info('all accounts tried, execute failed')
                     break
             else:
                 break
Example #5
 def trans_text(self, sentence: str):
     """
     Translate text through the API; calls may fail, so retry up to three times.
     :param sentence: text to translate
     :return: translated text
     """
     text = sentence
     for i in range(3):
         try:
             if self.strict:
                 text = self.translator.translate(sentence,
                                                  lang_src=self.src,
                                                  lang_tgt=self.dest)
             else:
                 text = self.translator.translate(sentence,
                                                  lang_tgt=self.dest)
             return text
         except ConnectionError as ce:
             logger.error("[copy-translate] connection failed: %s" % ce)
             self.translator = self.get_translator()  # rebuild the client before retrying
             text = str(ce)
         except Exception as e:
             logger.error("[copy-translate] translation error: %s" % e)
             text = str(e)
     # after three failed attempts the last error message is returned
     return text
Example #6
async def task_1(browser, delay_order_spider, detail_page_spider, manager_page_spider, from_store, link_id_spider,
                 list_page_spider):
    page_num = 1
    while True:
        try:
            completed = await list_page_spider.get_page(page_num)
            if completed == 1:
                page_num += 1
            elif completed == 2:
                MySql.cls_update(t="tb_order_spider", set={"isDetaildown": 0},
                                 c={"isDetaildown": 2, "fromStore": from_store})
                MySql.cls_update(t="tb_order_spider", set={"isVerify": 0},
                                 c={"isVerify": 2, "fromStore": from_store})
                page_num = 1
            elif completed == 'exit':
                break
            await my_async_sleep(20, random_sleep=True)
            await link_id_spider.save_link_id()
            await manager_page_spider.do_it()
            await detail_page_spider.get_page()
            exit_loop = await delay_order_spider.get_page()
            if exit_loop == 'exit':
                break
        except Exception as e:
            logger.error(str(e))
            break
    await browser.close()
Example #7
 async def get_page(self, page_num):
     await self.page.bringToFront()
     logger.info("Order list page spider: page " + str(page_num) + " started")
     self.completed = 0
     try:
         await self.page.waitForSelector(".pagination-options-go")
         await self.page.focus(".pagination-options input")
         for _ in range(3):
             await self.page.keyboard.press("Delete")
             await self.page.keyboard.press("Backspace")
         await self.listening(self.page)
         await self.page.type(".pagination-options input", str(page_num))
         await self.page.keyboard.press("Enter")
         # await self.page.waitForResponse(self.url)
         # while self.captcha:
         #     t = await self.login.slider(self.page)
         #     if t:
         #         return t
     except Exception as e:
         # a missing "go" button usually means a slider captcha is blocking the page
         if re.search(r'"\.pagination-options-go"', str(e)):
             t = await self.login.slider(self.page)
             if t:
                 return t
         else:
             logger.error(str(e))
     while not self.completed:
         await self.login.slider(self.page)
         await asyncio.sleep(2)
     logger.info("Order list page spider: page " + str(page_num) + " finished")
     await my_async_sleep(15, True)
     return self.completed
Example #8
def request_sys(req_url, request_data, method, reqheaders):
    logger.info("request_data is {0}".format(request_data))
    logger.info("headers is {0}".format(reqheaders))
    try:
        if 'GET' == method:
            result = requests.get(url=req_url,
                                  params=request_data,
                                  headers=reqheaders).content
            logger.info("res content is {0}".format(result))
            return json.loads(result)
        elif 'POST' == method:
            reqheaders['Accept'] = 'application/json'
            if reqheaders.get('Content-Type') == 'application/json':
                request_data = json.dumps(request_data, cls=APIEncoder)
            result = requests.post(url=req_url,
                                   data=request_data,
                                   headers=reqheaders).content
            logger.info("res content is {0}".format(result))
            return json.loads(result)
        else:
            logger.error("method error, current method is {0}".format(method))
    except Exception:
        logger.error('request_order_sys access error: %s' %
                     traceback.format_exc())
    return None
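A short call sketch for the dispatcher above; the endpoint URL, payload, and headers here are placeholders, not part of the original code:

# hypothetical endpoint and payload, purely illustrative
resp = request_sys('https://api.example.com/orders',
                   {'order_id': 42},
                   'GET',
                   {'Accept': 'application/json'})
if resp is None:
    print('request failed or method not supported')
else:
    print(resp)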
Example #9
def get_session(name, password):
    url, cid, session = do_login(name, password)

    if url != '':
        _headers = dict(headers)  # copy so the shared headers dict is not mutated
        _headers['Host'] = 'passport.weibo.com'
        # _response = requests.get(url, headers=_headers, verify=False, allow_redirects=False)
        rs_cont = session.get(url, verify=False, allow_redirects=False)
        # login_info = rs_cont.text
        #
        # u_pattern = r'"uniqueid":"(.*)",'
        # m = re.search(u_pattern, login_info)
        if rs_cont.status_code == 302:
            # visit an official Weibo account page to check that the session works
            check_url = 'http://weibo.com/2671109275/about'
            # check_url = 'https://weibo.cn/u/1669879400'
            resp = session.get(check_url)
            # in practice, accounts that never passed phone verification cannot be recovered...
            if resp.status_code == 403:
                logger.error(u'account {} has been frozen'.format(name))
                # freeze_account(name, 0)
                return None
            logger.info(u'logged in this time with account: {}'.format(name))
            return session

    return None
Example #10
def search_one_account_passage_by_id(id):
    # Create a cursor object with the cursor() method
    db = connect()
    cursor = db.cursor()
    # mc = MysqlClient()
    # SQL query, parameterized to avoid SQL injection
    sql = "SELECT title, passage_link, official_account_id FROM passage_link_list \
              WHERE official_account_id = %s"
    try:
        # results = mc.select_many(sql)
        # fetch all matching rows
        cursor.execute(sql, (id,))
        results = cursor.fetchall()
        for row in results:
            print(row)
        # return json.dumps(results[1], ensure_ascii=False)
        cursor.close()
        db.close()
        return results
    except Exception as e:
        db.rollback()  # roll back on error
        # mc.end()
        logger.error(str(e))
        logger.warning("Failed to search the history passage.")
        return False
Example #11
    def reload_recursive_ex(module):
        try:
            importlib.reload(module)
        except ImportError as err:
            logger.exception(err)
            logger.error("module '{}' could not be reloaded.".format(module))
            return
        if reload_func:
            for func_names in ls_func_names:
                if func_names in dir(module):
                    ls_functions = getattr(module, func_names)
                    reload_functions(module, ls_functions)
        logger.debug("module '{}' reloaded!".format(module.__name__))

        # search submodules found in vars(module)
        for module_child in vars(module).values():
            if isinstance(module_child, types.ModuleType):  # if it is a module
                fn_child = getattr(module_child, "__file__", None)
                # only recurse into modules that live under this package's directory
                if (fn_child is not None) and fn_child.startswith(fn_dir):
                    if fn_child not in module_visit:  # if module has not been reloaded yet
                        # print("reloading:", fn_child, "from", module)
                        module_visit.add(fn_child)
                        reload_recursive_ex(module_child)  # reload submodules of this module
Example #12
    def _get_user_info(self):
        # getting user name
        try:
            selector = etree.HTML(self.html)
            self.user_info['userName'] = selector.xpath(
                '//table//div[@class="ut"]/span/text()')[0]
            logger.info('user name is %s' % self.user_info['userName'])
        except Exception as e:
            logger.error('getting user name failed for:{}'.format(str(e)))

        # getting user other info
        try:
            selector = etree.HTML(self.html)
            pattern = r"\d+\.?\d*"
            str_wb = selector.xpath('//span[@class="tc"]/text()')[0]
            guid = re.findall(pattern, str_wb, re.S | re.M)
            # the first number found is the weibo count
            self.user_info['weiboNum'] = int(guid[0])

            str_gz = selector.xpath("//div[@class='tip2']/a/text()")[0]
            guid = re.findall(pattern, str_gz, re.M)
            self.user_info['following'] = int(guid[0])

            str_fs = selector.xpath("//div[@class='tip2']/a/text()")[1]
            guid = re.findall(pattern, str_fs, re.M)
            self.user_info['followers'] = int(guid[0])
            logger.info(
                'current user all weibo num {}, following {}, followers {}'.
                format(self.user_info['weiboNum'], self.user_info['following'],
                       self.user_info['followers']))
        except Exception as e:
            logger.error('getting user info failed for:{}'.format(str(e)))
Example #13
def reload_functions(module, functions: Union[list, tuple, Callable]):
    """Reloads a function or a list/tuple of functions."""
    if not isinstance(functions, (list, tuple)):
        functions = [functions]
    for func in functions:
        # format [(_, func1, _), (_, func2, _), ...]
        if isinstance(func, (list, tuple)):
            fn = func[1] if len(func) > 1 else None
        else:  # format [func1, func2, ...]
            fn = func
        if '__name__' in dir(fn):  # case of functions and classes
            name = fn.__name__
            if name == '<lambda>':
                continue
        elif isinstance(fn, str) and fn in dir(module):
            # particular case of an __all__ list of names
            name = fn
        else:
            logger.error("object '{}' not reloaded, wrong type or name.".format(fn))
            return
        # accessing the attribute appears sufficient to reload it
        getattr(module, name)
        logger.debug("object '{}' reloaded".format(name))
Example #14
def _handle_duplicates(dico, key, value, flag='first', inplace=False):
    """Handle duplicates in dico.

    :param dico: dico to update
    :param key: key to check
    :param value: value to set
    :param flag: 'first', 'last' or 'rename'; any other value is treated as 'error'
    :param inplace: modification of dico inplace if True
    :return: None if inplace is True, else dico updated
    """
    n_dico = type(dico)()
    if key in dico:
        logger.debug("Key '{}' is duplicated.".format(key))
        if flag == 'first':
            pass
        elif flag == 'last':
            n_dico[key] = value
        elif flag == 'rename':
            i = 0
            exists = True
            while exists:
                i += 1
                n_key = "{}_{}".format(key, i)
                exists = n_key in dico
            n_dico[n_key] = value
        else:
            err_msg = "Duplicate keys '{}' found! Conversion process aborting.".format(key)
            logger.error(err_msg)
            raise ValueError(err_msg)
    else:
        n_dico[key] = value
    if inplace:
        dico.update(n_dico)
        return
    return n_dico
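A quick illustration of how the three flags behave, assuming _handle_duplicates as defined above is in scope (along with its logger); the dictionaries are made up:

d = {'a': 1}

# 'first' keeps the existing value, so nothing is returned for the key
print(_handle_duplicates(d, 'a', 99, flag='first'))   # {}

# 'last' overwrites: the returned dict carries the new value
print(_handle_duplicates(d, 'a', 99, flag='last'))    # {'a': 99}

# 'rename' stores the value under the first free 'a_<i>' key
print(_handle_duplicates(d, 'a', 99, flag='rename'))  # {'a_1': 99}

# inplace=True merges the result into the original dict and returns None
_handle_duplicates(d, 'a', 99, flag='last', inplace=True)
print(d)                                              # {'a': 99}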
Example #15
    def scrape_a_user_profile(self, username):  # Todo:  proxy stats
        log_scraping_profile(self.session_id, 'begin', 'profile', username)
        time.sleep(
            random() * 5
        )  # Todo: DOESN'T WORK. Otherwise other objects in other processes also start checking proxies, filling the queue with the same proxies
        self._check_proxy_queue()
        fail_counter = 0
        while fail_counter < self.max_fails:
            proxy = self.proxy_queue.get()
            profile_scraper = ProfileScraper(self.c)
            profile_scraper.proxy_server = proxy
            logger.info(
                f'Start scraping profiles | {username}, {proxy["ip"]}:{proxy["port"]}, queue={self.proxy_queue.qsize()}, fail={fail_counter}'
            )
            # Todo: When I don't add raise to the get.py / def User(...) / line 197, it fails silently.
            #       No distinction between an existing user with a proxy failure and a canceled account.
            #       When I add raise, twint / asyncio show an error traceback in the terminal.
            #       ? What happens with proxies when the username is canceled? Sometimes TimeoutError or TypeError
            try:  # Todo: Refactor: make a method and use it also in scrape_a_user_tweets
                profile_df = profile_scraper.execute_scraping(username)
            except Exception:
                print('x' * 100)
                print(sys.exc_info()[0])
                print(sys.exc_info())
                print('x' * 100)
                raise
            else:
                if profile_df.empty:  # ProfileScrapingError
                    logger.error(
                        f'Empty profile | {username}, {proxy["ip"]}:{proxy["port"]}, queue={self.proxy_queue.qsize()}, fail={fail_counter}'
                    )
                    update_proxy_stats('ProfileScrapingError', proxy)
                    fail_counter += 1
                    time.sleep(random() * 5)
                else:  # ok
                    logger.info(
                        f'Saving profile | {username}, {proxy["ip"]}:{proxy["port"]}, queue={self.proxy_queue.qsize()}, fail={fail_counter}'
                    )
                    log_scraping_profile(self.session_id,
                                         'ok',
                                         'profile',
                                         username,
                                         proxy=proxy)
                    save_a_profile(profile_df)
                    update_proxy_stats('ok', proxy)
                    self._release_proxy_server(proxy)
                    break
            finally:
                if fail_counter >= self.max_fails:  # Dead
                    txt = f'dead | {username}, {proxy["ip"]}:{proxy["port"]}, queue={self.proxy_queue.qsize()}, fail={fail_counter}'
                    logger.error(txt)
                    log_scraping_profile(self.session_id,
                                         'dead',
                                         'profile',
                                         username,
                                         proxy=proxy)
        log_scraping_profile(self.session_id, 'end', 'profile', username)
Example #16
 def firefox():
     try:
         driver = webdriver.Firefox(executable_path=FIREFOX_DRIVER)
         driver.maximize_window()
         driver.close()
         logger.info('firefox driver ok')
     except Exception as e:
         logger.error(f'firefox driver failed; {e}')
Example #17
 def chrome():
     try:
         driver = webdriver.Chrome(executable_path=CHROME_DRIVER)
         driver.maximize_window()
         driver.close()
         logger.info('chrome driver ok')
     except Exception as e:
          logger.error(f'chrome driver failed; {e}')
Example #18
 def request(self):
     try:
         while True:
             data = self.queue.get()
             logger.info("got an item from the queue")
             self.route(data)
     except Exception:
         logger.error(traceback.format_exc())
Example #19
def save_error_graph(plot_path, err_msg=None, **save_file_kwargs):
    if err_msg is None:
        err_msg = 'Unknown error.'
    logger.error("Plot of {} failed. Error: '{}'".format(plot_path, err_msg))
    _fig = plt.figure(figsize=(21.28, 12))
    plt.text(0.35, 0.5, 'Error on this graph', dict(size=30))
    output_path = save_plot(plot_path, **save_file_kwargs)
    return output_path
Example #20
 def ie():
     try:
         driver = webdriver.Ie(executable_path=IE_DRIVER)
         driver.maximize_window()
         driver.close()
         logger.info('ie driver ok')
     except Exception as e:
         logger.error(f'ie driver failed; {e}')
Example #21
def get_api_data(url: str, data: dict = None):
    # avoid a mutable default argument: create a fresh dict per call
    if data is None:
        data = {}
    try:
        answer = requests.get(url=FUNCTIONS_API_URL + url, json=data)
        json_answer = answer.json()
    except Exception as e:
        logger.error(e)
        error = APIError(error_msg=e)
        return error
    return json_answer
Example #22
def q_save_a_tweet(tweet):
    collection = get_collection()
    try:
        collection.insert_one(tweet)
    except DuplicateKeyError:
        logger.debug(f"Duplicate: {tweet['tweet_id']} - {tweet['date']} - {tweet['name']}")
    except Exception:
        logger.error(f'Unknown error: {sys.exc_info()[0]}')
        raise
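For insert_one to raise DuplicateKeyError on a re-scraped tweet, the collection needs a unique index; a sketch of creating one (the field name tweet_id is taken from the log line above, everything else is a placeholder):

from pymongo import ASCENDING, MongoClient

client = MongoClient("mongodb://localhost:27017")  # hypothetical server
col = client["twitter"]["tweets"]                  # hypothetical db/collection

# a unique index on tweet_id makes a second insert of the same tweet
# raise DuplicateKeyError instead of storing a duplicate document
col.create_index([("tweet_id", ASCENDING)], unique=True)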
Example #23
def connect():
    # connect to the database
    try:
        db = pymysql.connect(host=db_host, port=db_port, user=db_user, password=db_password, db=db_name)
        return db
    except Exception as e:
        logger.error(str(e))
        logger.error("Database connect failed, please check the database configuration.")
        exit()
Example #24
 async def intercept_response(self, res):
     if re.search(r'https://item.taobao.com/item.htm', res.url):
         try:
             content = await res.text()
         except errors.NetworkError:
             logger.error("network error; response body not parsed, requesting again")
             await self._goto_the_next()
         else:
             await self.parse(content)
Example #25
    def scrape_a_user_tweets(self, username, session_begin_date, session_end_date):
        log_scraping_tweets(self.session_id, 'begin', 'session', username, self.session_begin_date, self.session_end_date)
        self._check_proxy_queue()
        periods_to_scrape = self._calculate_scrape_periods(username, session_begin_date, session_end_date)
        for period_begin_date, period_end_date in periods_to_scrape:
            fail_counter = 0
            while fail_counter < self.max_fails:
                proxy = self.proxy_queue.get()
                logger.info(
                    f'Start scraping tweets | {username}, {period_begin_date} | {period_end_date}, {proxy["ip"]}:{proxy["port"]}, queue={self.proxy_queue.qsize()}, fail={fail_counter}')
                tweet_scraper = TweetScraper(username, period_begin_date, period_end_date)
                tweet_scraper.proxy_server = proxy
                try:
                    tweets_df = tweet_scraper.execute_scraping()
                except (ValueError, ServerDisconnectedError, ClientOSError,
                        TimeoutError, ClientHttpProxyError, IndexError) as e:
                    fail_counter += 1
                    self.handle_error(type(e).__name__, e, username, period_begin_date, period_end_date, proxy, fail_counter)
                except Empty:  # proxy queue empty
                    logger.error(
                        f'Empty Error | {username}, {period_begin_date} | {period_end_date}, {proxy["ip"]}:{proxy["port"]}, queue={self.proxy_queue.qsize()}, fail={fail_counter}')
                    self._populate_proxy_queue()
                except Exception:
                    print('x' * 3000)
                    print(sys.exc_info()[0])
                    print(sys.exc_info())
                else:
                    logger.info(
                        f'Saving {len(tweets_df)} tweets | {username}, {period_begin_date} | {period_end_date}, {proxy["ip"]}:{proxy["port"]}, queue={self.proxy_queue.qsize()}')
                    if not tweets_df.empty: save_tweets(tweets_df)
                    log_scraping_tweets(self.session_id, 'ok', 'period', username, period_begin_date, end_date=period_end_date, n_tweets=len(tweets_df), proxy=proxy)
                    update_proxy_stats('ok', proxy)
                    break  # the while-loop
                finally:
                    self._release_proxy_server(proxy)
                    if fail_counter >= self.max_fails:
                        txt = f'FAIL | {username}, {period_begin_date} | {period_end_date}, {proxy["ip"]}:{proxy["port"]}, queue={self.proxy_queue.qsize()}, fail={fail_counter}'
                        logger.error(txt)
                        log_scraping_tweets(self.session_id, 'fail', 'period', username, period_begin_date, period_end_date, proxy=proxy)

        # All periods scraped.
        log_scraping_tweets(self.session_id, 'end', 'session', username, self.session_begin_date, self.session_end_date)
Example #26
def killServer():
	try:
		with open(opt_config["pid-file"]) as f:
			buf = f.read()
			pid = daemon.to_str(buf)
			pid = int(pid)
			os.kill(pid, signal.SIGTERM)
	except Exception:
		logger.error("kill server error")
		logger.error(traceback.format_exc())
Example #27
def find_docs(query, col_name):
    col_session = build_db_col_session(DB_NAME, col_name, DB_URL)
    try:
        find_doc = col_session.find(query)
        return find_doc
    except Exception as e:
        logger.error(str(e))
        logger.error("Query failed.")
        print(str(e))
        print("Query failed.")
Example #28
def insert_docs(docs_list, passage_link, col_name=COL_NAME):
    _, col_session = build_db_col_session(DB_NAME, col_name, DB_URL)
    try:
        filter_name = {"passage_link": passage_link}
        col_session.update_many(filter=filter_name, update=docs_list, upsert=True)
    except Exception as e:
        logger.error(str(e))
        logger.error("Failed to update passages.")
        print(str(e))
        print("Failed to update passages.")
Example #29
 def get_download_url(self, asset_list, asset_type):
     download_url = None
     try:
         download_url = asset_list[asset_type]['location']
     except KeyError:
         logger.error('analytic not available')
     except Exception as exc:
         logger.error(exc)
     status = (download_url is not None)
     return download_url, status
Example #30
def find_docs(query, col_name=COL_NAME):
    _, col_session = build_db_col_session(DB_NAME, col_name, DB_URL)
    try:
        find_doc = col_session.find(query).sort([("passage_create_time", -1)])
        return find_doc
    except Exception as e:
        logger.error(str(e))
        logger.error("Failed to search passages.")
        print(str(e))
        print("Failed to search passages.")