def insert_wechat_account(official_account_name):
    # Create a cursor object with the cursor() method
    db = connect()
    cursor = db.cursor()
    # SQL insert statement
    sql = """INSERT INTO wechat_account_list (official_account_name) VALUES (%s)"""
    try:
        # Execute the SQL statement (parameters must be a tuple, hence the trailing comma)
        cursor.execute(sql, (official_account_name,))
        # Look up the id of the newly inserted account
        sql = "SELECT id FROM wechat_account_list \
               WHERE official_account_name = '{}'".format(official_account_name)
        cursor.execute(sql)
        db.commit()
        # Fetch all matching rows
        results = cursor.fetchall()
        cursor.close()
        db.close()
        official_account_id = results[0][0]
        return official_account_id
    except Exception as e:
        db.rollback()  # roll back on error
        logger.error(str(e))
        logger.warning("Failed to add an official account.")
        return False
def q_save_a_profile(profile):
    collection = get_collection()
    try:
        f = {'user_id': profile['id']}
        u = {
            '$set': {
                'username': profile['username'],
                'name': profile['name'],
                'bio': profile['bio'],
                'join_datetime': profile['join_datetime'],
                'join_date': profile['join_date'],
                'join_time': profile['join_time'],
                'url': profile['url'],
                'location': profile['location'],
                'private': profile['private'],
                'verified': profile['verified'],
                'background_image': profile['background_image'],
                'avatar': profile['avatar'],
            },
            '$push': {
                'timestamp': datetime.now(),
                'followers': int(profile['followers']),
                'following': int(profile['following']),
                'likes': int(profile['likes']),
                'tweets': int(profile['tweets']),
                'media': int(profile['media']),
            }
        }
        try:
            collection.update_one(f, u, upsert=True)
        except DuplicateKeyError:
            raise
    except:
        logger.error(f'Unknown error: {sys.exc_info()[0]}')
        raise
def run(shop_code):
    loop = asyncio.get_event_loop()
    try:
        login, browser, page, from_store = loop.run_until_complete(LoginTB.run(**STORE_INFO[shop_code]))
    except Exception as e:
        logger.error(str(e))
        return
    list_page = page
    list_page_spider = OrderListPageSpider(login, browser, list_page, from_store)
    detail_page = loop.run_until_complete(login.new_page())
    detail_page_spider = OrderDetailPageSpider(login, browser, detail_page, from_store)
    link_id_page = loop.run_until_complete(login.new_page())
    link_id_spider = OrderDetailLinkIDSpider(login, browser, link_id_page, from_store)
    delay_order_page = loop.run_until_complete(login.new_page())
    delay_order_spider = DelayOrderUpdate(login, browser, delay_order_page, from_store)
    manager_page = loop.run_until_complete(login.new_page())
    item_page = loop.run_until_complete(login.new_page())
    manager_page_spider = ItemManagePageSpider(login, browser, manager_page, item_page, from_store)
    tasks = [
        taks_1(browser, delay_order_spider, detail_page_spider, manager_page_spider,
               from_store, link_id_spider, list_page_spider),
        # CaptchaCheck.run()
    ]
    loop.run_until_complete(asyncio.wait(tasks))
def execute(self):
    while 1:
        try:
            wb = WeiboCrawler(self.using_account, self.uid, self.filter_flag)
            if wb.crawl():
                logger.info('execute weibo crawler success for user %s' % str(self.uid))
                break
            else:
                logger.info('execute weibo crawler failed for user %s' % str(self.uid))
                break
        except Exception as e:
            logger.error('execute weibo crawler failed %s with account %s' % (str(e), self.using_account))
            if self.retry:
                re_choose_account = self.re_choose_using_account()
                if re_choose_account:
                    self.used_account = re_choose_account
                else:
                    logger.info('all account tried, execute failed')
                    break
            else:
                break
def trans_text(self, sentence: str):
    """
    Translate text through the translation API. Because a call may fail, retry up to three times.
    :param sentence: the text to translate
    :return: the translated text
    """
    text = sentence
    for i in range(3):
        try:
            if self.strict:
                text = self.translator.translate(sentence, lang_src=self.src, lang_tgt=self.dest)
            else:
                text = self.translator.translate(sentence, lang_tgt=self.dest)
            return text
        except ConnectionError as ce:
            logger.error("[复制翻译] 连接失败: %s" % ce)
            self.translator = self.get_translator()
            text = str(ce)
        except Exception as e:
            logger.error("[复制翻译] 翻译出错: %s" % e)
            text = str(e)
    return text
async def taks_1(browser, delay_order_spider, detail_page_spider, manager_page_spider,
                 from_store, link_id_spider, list_page_spider):
    page_num = 1
    while 1:
        try:
            completed = await list_page_spider.get_page(page_num)
            if completed == 1:
                page_num += 1
            elif completed == 2:
                MySql.cls_update(t="tb_order_spider", set={"isDetaildown": 0},
                                 c={"isDetaildown": 2, "fromStore": from_store})
                MySql.cls_update(t="tb_order_spider", set={"isVerify": 0},
                                 c={"isVerify": 2, "fromStore": from_store})
                page_num = 1
            elif completed == 'exit':
                break
            await my_async_sleep(20, random_sleep=True)
            await link_id_spider.save_link_id()
            await manager_page_spider.do_it()
            await detail_page_spider.get_page()
            exit_loop = await delay_order_spider.get_page()
            if exit_loop == 'exit':
                break
        except Exception as e:
            logger.error(str(e))
            break
    await browser.close()
async def get_page(self, page_num):
    await self.page.bringToFront()
    logger.info("订单列表页爬虫,第 " + str(page_num) + " 页开始")
    self.completed = 0
    try:
        await self.page.waitForSelector(".pagination-options-go")
        await self.page.focus(".pagination-options input")
        # clear the page-number input before typing the target page
        for _ in range(3):
            await self.page.keyboard.press("Delete")
            await self.page.keyboard.press("Backspace")
        await self.listening(self.page)
        await self.page.type(".pagination-options input", str(page_num))
        await self.page.keyboard.press("Enter")
        # await self.page.waitForResponse(self.url)
        # while self.captcha:
        #     t = await self.login.slider(self.page)
        #     if t:
        #         return t
    except Exception as e:
        if re.search(r'"\.pagination-options-go"', str(e)):
            t = await self.login.slider(self.page)
            if t:
                return t
        else:
            logger.error(str(e))
    while not self.completed:
        await self.login.slider(self.page)
        await asyncio.sleep(2)
    logger.info("订单列表页爬虫,第 " + str(page_num) + " 页完成")
    await my_async_sleep(15, True)
    return self.completed
def request_sys(req_url, request_data, method, reqheaders):
    logger.info("request_data is {0}".format(request_data))
    logger.info("headers is {0}".format(reqheaders))
    try:
        if 'GET' == method:
            # read the response body so json.loads receives bytes/str, matching the POST branch
            result = requests.get(url=req_url, params=request_data, headers=reqheaders).content
            logger.info("res content is {0}".format(result))
            return json.loads(result)
        elif 'POST' == method:
            reqheaders['Accept'] = 'application/json'
            if reqheaders.get('Content-Type') == 'application/json':
                request_data = json.dumps(request_data, cls=APIEncoder)
            result = requests.post(url=req_url, data=request_data, headers=reqheaders).content
            logger.info("res content is {0}".format(result))
            return json.loads(result)
        else:
            logger.info("method error, current method is {0}".format(method))
    except Exception:
        logger.error('request_order_sys access error:%s' % (traceback.format_exc(),))
    return None
def get_session(name, password):
    url, cid, session = do_login(name, password)
    if url != '':
        _headers = headers
        _headers['Host'] = 'passport.weibo.com'
        # _response = requests.get(url, headers=_headers, verify=False, allow_redirects=False)
        rs_cont = session.get(url, verify=False, allow_redirects=False)
        # login_info = rs_cont.text
        # # u_pattern = r'"uniqueid":"(.*)",'
        # m = re.search(u_pattern, login_info)
        if rs_cont.status_code == 302:
            # Visit a Weibo official account page to check whether the session works
            check_url = 'http://weibo.com/2671109275/about'
            # check_url = 'https://weibo.cn/u/1669879400'
            resp = session.get(check_url)
            # In practice, accounts that have not passed phone verification cannot be recovered...
            if resp.status_code == 403:
                logger.error(u'账号{}已被冻结'.format(name))
                # freeze_account(name, 0)
                return None
            logger.info(u'本次登陆账号为:{}'.format(name))
            return session
    return None
def search_one_account_passage_by_id(id):
    # Create a cursor object with the cursor() method
    db = connect()
    cursor = db.cursor()
    # mc = MysqlClient()
    # SQL query statement
    sql = "SELECT title, passage_link, official_account_id FROM passage_link_list \
           WHERE official_account_id = '{}'".format(id)
    try:
        # results = mc.select_many(sql)
        # Fetch all matching rows
        cursor.execute(sql)
        results = cursor.fetchall()
        for i in range(len(results)):
            print(results[i])
        # return json.dumps(results[1], ensure_ascii=False)
        cursor.close()
        db.close()
        return results
    except Exception as e:
        db.rollback()  # roll back on error
        # mc.end()
        logger.error(str(e))
        logger.warning("Failed to search the history passage.")
        return False
def reload_recursive_ex(module):
    try:
        importlib.reload(module)
    except ImportError as err:
        logger.exception(err)
        logger.error("module '{}' could not be reloaded.".format(module))
        return
    if reload_func:
        for func_names in ls_func_names:
            if func_names in dir(module):
                ls_functions = getattr(module, func_names)
                reload_functions(module, ls_functions)
    logger.debug("module '{}' reloaded!".format(module.__name__))
    for module_child in vars(module).values():  # search subpackages in vars(module)
        if isinstance(module_child, types.ModuleType):  # if it is a module
            fn_child = getattr(module_child, "__file__", None)
            if (fn_child is not None) and fn_child.startswith(fn_dir):  # if it is a subpackage
                if fn_child not in module_visit:  # if the module has not been reloaded yet
                    # print("reloading:", fn_child, "from", module)
                    module_visit.add(fn_child)
                    reload_recursive_ex(module_child)  # reload subpackages of this module
def _get_user_info(self):
    # getting user name
    try:
        selector = etree.HTML(self.html)
        self.user_info['userName'] = selector.xpath(
            '//table//div[@class="ut"]/span/text()')[0]
        logger.info('user name is %s' % self.user_info['userName'])
    except Exception as e:
        logger.error('getting user name failed for:{}'.format(str(e)))

    # getting user other info
    try:
        selector = etree.HTML(self.html)
        pattern = r"\d+\.?\d*"
        str_wb = selector.xpath('//span[@class="tc"]/text()')[0]
        guid = re.findall(pattern, str_wb, re.S | re.M)
        for value in guid:
            num_wb = int(value)
            break
        self.user_info['weiboNum'] = num_wb
        str_gz = selector.xpath("//div[@class='tip2']/a/text()")[0]
        guid = re.findall(pattern, str_gz, re.M)
        self.user_info['following'] = int(guid[0])
        str_fs = selector.xpath("//div[@class='tip2']/a/text()")[1]
        guid = re.findall(pattern, str_fs, re.M)
        self.user_info['followers'] = int(guid[0])
        logger.info(
            'current user all weibo num {}, following {}, followers {}'.format(
                self.user_info['weiboNum'], self.user_info['following'],
                self.user_info['followers']))
    except Exception as e:
        logger.error('getting user info failed for:{}'.format(str(e)))
def reload_functions(module, functions: Union[list, tuple, Callable]):
    """Reloads a function or a list/tuple of functions."""
    if not isinstance(functions, (list, tuple)):
        functions = [functions]
    for func in functions:
        if isinstance(func, (list, tuple)):  # format [(_, func1, _), (_, func2, _), ...]
            fn = func[1] if len(func) > 1 else None
        else:  # format [func1, func2, ...]
            fn = func
        if '__name__' in dir(fn):  # case of functions and classes
            name = fn.__name__
            if name == '<lambda>':
                continue
        elif isinstance(fn, str) and fn in dir(module):  # particular case of __all__ list
            name = fn
        else:
            logger.error("object '{}' not reloaded, wrong type or name.".format(fn))
            return
        getattr(module, name)  # accessing the function seems sufficient to reload it
        logger.debug("object '{}' reloaded".format(name))
def _handle_duplicates(dico, key, value, flag='first', inplace=False):
    """Handle duplicates in dico.

    :param dico: dico to update
    :param key: key to check
    :param value: value to set
    :param flag: 'first', 'last', 'rename' or 'error' (any other value also means 'error')
    :param inplace: modification of dico inplace if True
    :return: None if inplace is True, else dico updated
    """
    n_dico = type(dico)()
    if key in dico:
        logger.debug("Key '{}' is duplicated.".format(key))
        if flag == 'first':
            pass
        elif flag == 'last':
            n_dico[key] = value
        elif flag == 'rename':
            i = 0
            exists = True
            while exists:
                i += 1
                n_key = "{}_{}".format(key, i)
                exists = n_key in dico
            n_dico[n_key] = value
        else:
            err_msg = "Duplicate keys '{}' found! Conversion process aborting.".format(key)
            logger.error(err_msg)
            raise ValueError(err_msg)
    else:
        n_dico[key] = value
    if inplace:
        dico.update(n_dico)
        return
    return n_dico
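# Illustrative usage sketch (not part of the original source, assumes the module's
# logger is configured): how the 'flag' argument of _handle_duplicates resolves a
# duplicated key when inplace is False.
existing = {'name': 'alpha'}

# 'first' keeps the existing value, so the returned dict stays empty.
print(_handle_duplicates(existing, 'name', 'beta', flag='first'))   # {}
# 'last' overwrites with the new value.
print(_handle_duplicates(existing, 'name', 'beta', flag='last'))    # {'name': 'beta'}
# 'rename' stores the new value under a suffixed key.
print(_handle_duplicates(existing, 'name', 'beta', flag='rename'))  # {'name_1': 'beta'}
# Any other flag value raises ValueError.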
def scrape_a_user_profile(self, username):
    # Todo: proxy stats
    log_scraping_profile(self.session_id, 'begin', 'profile', username)
    time.sleep(random() * 5)  # Todo: DOESN'T WORK. Otherwise other objects in other processes will also start checking proxies populated, filling the queue with the same proxies
    self._check_proxy_queue()
    fail_counter = 0
    while fail_counter < self.max_fails:
        proxy = self.proxy_queue.get()
        profile_scraper = ProfileScraper(self.c)
        profile_scraper.proxy_server = proxy
        logger.info(
            f'Start scraping profiles | {username}, {proxy["ip"]}:{proxy["port"]}, queue={self.proxy_queue.qsize()}, fail={fail_counter}'
        )
        # Todo: When I don't add raise to the get.py / def User(...) / line 197, then it fails silently.
        # No distinction between an existing user with proxy failure and a canceled account.
        # When I add raise, twint / asyncio show the error traceback in the terminal.
        # ? What happens with proxies when the username is canceled? Sometimes TimeoutError or TypeError
        try:
            # Todo: Refactor: make a method and use it also in scrape_a_user_tweets
            profile_df = profile_scraper.execute_scraping(username)
        except:
            print('x' * 100)
            print(sys.exc_info()[0])
            print(sys.exc_info())
            print('x' * 100)
            raise
        else:
            if profile_df.empty:  # ProfileScrapingError
                logger.error(
                    f'Empty profile | {username}, {proxy["ip"]}:{proxy["port"]}, queue={self.proxy_queue.qsize()}, fail={fail_counter}'
                )
                update_proxy_stats('ProfileScrapingError', proxy)
                fail_counter += 1
                time.sleep(random() * 5)
            else:  # ok
                logger.info(
                    f'Saving profile | {username}, {proxy["ip"]}:{proxy["port"]}, queue={self.proxy_queue.qsize()}, fail={fail_counter}'
                )
                log_scraping_profile(self.session_id, 'ok', 'profile', username, proxy=proxy)
                save_a_profile(profile_df)
                update_proxy_stats('ok', proxy)
                self._release_proxy_server(proxy)
                break
        finally:
            if fail_counter >= self.max_fails:  # Dead
                txt = f'dead | {username}, {proxy["ip"]}:{proxy["port"]}, queue={self.proxy_queue.qsize()}, fail={fail_counter}'
                logger.error(txt)
                log_scraping_profile(self.session_id, 'dead', 'profile', username, proxy=proxy)
    log_scraping_profile(self.session_id, 'end', 'profile', username)
def firefox():
    try:
        driver = webdriver.Firefox(executable_path=FIREFOX_DRIVER)
        driver.maximize_window()
        driver.close()
        logger.info('firefox driver ok')
    except Exception as e:
        logger.error(f'firefox driver failed; {e}')
def chrome():
    try:
        driver = webdriver.Chrome(executable_path=CHROME_DRIVER)
        driver.maximize_window()
        driver.close()
        logger.info('chrome driver ok')
    except Exception as e:
        logger.error(f'chrome driver failed; error code: {e}')
def request(self):
    try:
        while True:
            data = self.queue.get()
            logger.info("get a data")
            self.route(data)
    except:
        logger.error(traceback.format_exc())
def save_error_graph(plot_path, err_msg=None, **save_file_kwargs):
    if err_msg is None:
        err_msg = 'Unknown error.'
    logger.error("Plot of {} failed. Error: '{}'".format(plot_path, err_msg))
    _fig = plt.figure(figsize=(21.28, 12))
    plt.text(0.35, 0.5, 'Error on this graph', dict(size=30))
    output_path = save_plot(plot_path, **save_file_kwargs)
    return output_path
def ie():
    try:
        driver = webdriver.Ie(executable_path=IE_DRIVER)
        driver.maximize_window()
        driver.close()
        logger.info('ie driver ok')
    except Exception as e:
        logger.error(f'ie driver failed; {e}')
def get_api_data(url: str, data: dict = {}):
    try:
        answer = requests.get(url=FUNCTIONS_API_URL + url, json=data)
        json_answer = answer.json()
    except Exception as e:
        logger.error(e)
        error = APIError(error_msg=e)
        return error
    return json_answer
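# Illustrative usage sketch (the '/health' path is hypothetical, not from the original
# source): get_api_data returns the decoded JSON on success and an APIError instance on
# failure, so callers are expected to type-check the result before using it.
result = get_api_data('/health')
if isinstance(result, APIError):
    logger.error(f'API call failed: {result}')
else:
    logger.info(f'API answered: {result}')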
def q_save_a_tweet(tweet):
    collection = get_collection()
    try:
        result = collection.insert_one(tweet)
    except DuplicateKeyError as e:
        logger.debug(f"Duplicate: {tweet['tweet_id']} - {tweet['date']} - {tweet['name']}")
    except:
        logger.error(f'Unknown error: {sys.exc_info()[0]}')
        raise
def connect():
    # Connect to the database
    try:
        db = pymysql.connect(host=db_host, port=db_port, user=db_user,
                             password=db_password, db=db_name)
        return db
    except Exception as e:
        logger.error(str(e))
        logger.error("Database connect failed, please check the database configuration.")
        exit()
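# Illustrative sketch (assumption, not from the original source): connect() relies on
# module-level connection settings being defined elsewhere; they might look like this.
db_host = "127.0.0.1"        # hypothetical value for local development
db_port = 3306               # default MySQL port
db_user = "root"             # hypothetical credentials
db_password = "secret"
db_name = "wechat_spider"    # hypothetical schema name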
async def intercept_response(self, res):
    if re.search(r'https://item.taobao.com/item.htm', res.url):
        try:
            content = await res.text()
        except errors.NetworkError:
            logger.error("网络出错了,没有解析内容,重新请求")
            await self._goto_the_next()
        else:
            await self.parse(content)
def scrape_a_user_tweets(self, username, session_begin_date, session_end_date):
    log_scraping_tweets(self.session_id, 'begin', 'session', username,
                        self.session_begin_date, self.session_end_date)
    self._check_proxy_queue()
    periods_to_scrape = self._calculate_scrape_periods(username, session_begin_date, session_end_date)
    for period_begin_date, period_end_date in periods_to_scrape:
        fail_counter = 0
        while fail_counter < self.max_fails:
            proxy = self.proxy_queue.get()
            logger.info(
                f'Start scraping tweets | {username}, {period_begin_date} | {period_end_date}, {proxy["ip"]}:{proxy["port"]}, queue={self.proxy_queue.qsize()}, fail={fail_counter}')
            tweet_scraper = TweetScraper(username, period_begin_date, period_end_date)
            tweet_scraper.proxy_server = proxy
            try:
                tweets_df = tweet_scraper.execute_scraping()
            except ValueError as e:
                fail_counter += 1
                self.handle_error('ValueError', e, username, period_begin_date, period_end_date, proxy, fail_counter)
            except ServerDisconnectedError as e:
                fail_counter += 1
                self.handle_error('ServerDisconnectedError', e, username, period_begin_date, period_end_date, proxy, fail_counter)
            except ClientOSError as e:
                fail_counter += 1
                self.handle_error('ClientOSError', e, username, period_begin_date, period_end_date, proxy, fail_counter)
            except TimeoutError as e:
                fail_counter += 1
                self.handle_error('TimeoutError', e, username, period_begin_date, period_end_date, proxy, fail_counter)
            except ClientHttpProxyError as e:
                fail_counter += 1
                self.handle_error('ClientHttpProxyError', e, username, period_begin_date, period_end_date, proxy, fail_counter)
            except IndexError as e:
                fail_counter += 1
                self.handle_error('IndexError', e, username, period_begin_date, period_end_date, proxy, fail_counter)
            except Empty as e:  # Queue empty
                logger.error(
                    f'Empty Error | {username}, {period_begin_date} | {period_end_date}, {proxy["ip"]}:{proxy["port"]}, queue={self.proxy_queue.qsize()}, fail={fail_counter}')
                self._populate_proxy_queue()
            except:
                print('x' * 3000)
                print(sys.exc_info()[0])
                print(sys.exc_info())
            else:
                logger.info(
                    f'Saving {len(tweets_df)} tweets | {username}, {period_begin_date} | {period_end_date}, {proxy["ip"]}:{proxy["port"]}, queue={self.proxy_queue.qsize()}')
                if not tweets_df.empty:
                    save_tweets(tweets_df)
                log_scraping_tweets(self.session_id, 'ok', 'period', username, period_begin_date,
                                    end_date=period_end_date, n_tweets=len(tweets_df), proxy=proxy)
                update_proxy_stats('ok', proxy)
                break  # the while-loop
            finally:
                self._release_proxy_server(proxy)
        if fail_counter >= self.max_fails:
            txt = f'FAIL | {username}, {period_begin_date} | {period_end_date}, {proxy["ip"]}:{proxy["port"]}, queue={self.proxy_queue.qsize()}, fail={fail_counter}'
            logger.error(txt)
            log_scraping_tweets(self.session_id, 'fail', 'period', username,
                                period_begin_date, period_end_date, proxy=proxy)
    # All periods scraped.
    log_scraping_tweets(self.session_id, 'end', 'session', username,
                        self.session_begin_date, self.session_end_date)
def killServer():
    try:
        with open(opt_config["pid-file"]) as f:
            buf = f.read()
        pid = daemon.to_str(buf)
        pid = int(pid)
        os.kill(pid, signal.SIGTERM)
    except:
        logger.error("kill server error")
        logger.error(traceback.format_exc())
def find_docs(query, col_name):
    col_session = build_db_col_session(DB_NAME, col_name, DB_URL)
    try:
        find_doc = col_session.find(query)
        return find_doc
    except Exception as e:
        logger.error(str(e))
        logger.error("查询失败。")
        print(str(e))
        print("查询失败。")
def insert_docs(docs_list, passage_link, col_name=COL_NAME):
    _, col_session = build_db_col_session(DB_NAME, col_name, DB_URL)
    try:
        filter_name = {"passage_link": passage_link}
        col_session.update_many(filter=filter_name, update=docs_list, upsert=True)
    except Exception as e:
        logger.error(str(e))
        logger.error("Failed to update passages.")
        print(str(e))
        print("Failed to update passages.")
def get_download_url(self, asset_list, asset_type):
    download_url = None
    try:
        download_url = asset_list[asset_type]['location']
    except KeyError:
        logger.error('analytic not available')
    except Exception as exc:
        logger.error(exc)
    status = (download_url is not None)
    return download_url, status
def find_docs(query, col_name=COL_NAME):
    _, col_session = build_db_col_session(DB_NAME, col_name, DB_URL)
    try:
        find_doc = col_session.find(query).sort([("passage_create_time", -1)])
        return find_doc
    except Exception as e:
        logger.error(str(e))
        logger.error("Failed to search passages.")
        print(str(e))
        print("Failed to search passages.")