def _get_cms_ver(self, host: str, path: str, rgx: re.Pattern):
    ver: str = None
    try:
        ha = HttpAccess()
        # access home page to get cookie
        url = host
        if not url.startswith("http"):
            url = "http://" + host.strip("/")
        self._logger.debug("Get CMS ver home: {}".format(url))
        ha.getstring(
            url,
            headers="""
            Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9
            Accept-Encoding: gzip, deflate
            Accept-Language: en-US,en;q=0.9
            Cache-Control: no-cache
            Pragma: no-cache
            Proxy-Connection: keep-alive
            Upgrade-Insecure-Requests: 1
            User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36""",
            timeout=10,
        )

        # access version page
        url = host.strip("/") + "/" + path.lstrip("/")
        if not url.startswith("http"):
            url = "http://" + host.strip("/") + "/" + path.lstrip("/")
        self._logger.debug("Get CMS ver subpath: {}".format(url))
        html = ha.getstring(
            url,
            headers="""
            Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9
            Accept-Encoding: gzip, deflate
            Accept-Language: en-US,en;q=0.9
            Cache-Control: no-cache
            Pragma: no-cache
            Proxy-Connection: keep-alive
            Upgrade-Insecure-Requests: 1
            User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36""",
            timeout=10,
        )
        if html is None or html == "":
            return ver

        # <version>(.+)</version>
        m: re.Match = re.search(rgx, html, re.S)
        if m is None:
            return ver
        ver = m.group(1)
    except Exception as e:
        self._logger.error("Get joomla version failed: {} {}".format(host, e.args))
    return ver
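# Illustrative sketch (not part of the original module): shows the regex step
# _get_cms_ver relies on, i.e. pulling a version string such as
# "<version>3.9.1</version>" out of fetched markup. The sample markup below is
# an assumption for demonstration only; the pattern mirrors the comment above.
import re


def extract_version(html: str, pattern: str = r"<version>(.+?)</version>"):
    """Return the first captured version group, or None when nothing matches."""
    m = re.search(pattern, html, re.S)
    return None if m is None else m.group(1)


if __name__ == "__main__":
    sample = "<metadata><version>3.9.1</version></metadata>"
    assert extract_version(sample) == "3.9.1"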
def __init__(self):
    ProxySpiderbase.__init__(self, False)
    self._logger: MsLogger = MsLogManager.get_logger(
        self.__class__.__name__)
    # slow down: interval=1 means at most one request per second
    self._ha: HttpAccess = HttpAccess(interval=1)
def __init__(self): ProxySpiderbase.__init__(self, False) self._logger: MsLogger = MsLogManager.get_logger( self.__class__.__name__) self._reproxy = re.compile(r"([\d.]+?):(\d+)", re.S) # 用于验证HTTP代理的,http访问器;interval两个HTTP请求之间的间隔时间(秒) self._ha: HttpAccess = HttpAccess(interval=1)
def __init__(self):
    ProxySpiderbase.__init__(self, False)
    self._logger: MsLogger = MsLogManager.get_logger(self.__class__.__name__)
    self._reproxy = re.compile(r'"([\d.]+?):(\d+)"', re.S)
    # HTTP accessor used to validate HTTP proxies; interval is the pause
    # (in seconds) between two HTTP requests
    self._ha: HttpAccess = HttpAccess(interval=1)
    # API key for the paid Qiyun proxy service
    self.key = 'dd0b192e8199af0b47faf005aac4483b1efff860'
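# Illustrative sketch (not part of the original spiders): shows what the two
# _reproxy patterns above are built to capture, using only the standard
# library. The sample response text is an assumption for demonstration only.
import re

_plain = re.compile(r"([\d.]+?):(\d+)", re.S)     # bare "ip:port" occurrences
_quoted = re.compile(r'"([\d.]+?):(\d+)"', re.S)  # JSON-style quoted "ip:port" entries

if __name__ == "__main__":
    sample = '["1.2.3.4:8080", "5.6.7.8:3128"]'
    print(_plain.findall(sample))   # [('1.2.3.4', '8080'), ('5.6.7.8', '3128')]
    print(_quoted.findall(sample))  # [('1.2.3.4', '8080'), ('5.6.7.8', '3128')]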
def _get_accesstoken(cls):
    """
    Log in with the configured account first to obtain an access token.
    :return:
    """
    # res = False
    _user = zoomeyeconf.get('username')
    _password = zoomeyeconf.get('password')
    try:
        url = "https://api.zoomeye.org/user/login"
        postdata = f'{{"username": "******","password": "******"}}'
        ha = HttpAccess()
        html = ha.getstring(url, postdata)
        if html is None:
            raise Exception("Get accesstoken failed")
        js = json.loads(html)
        accesstoken = js["access_token"]
        cls._accesstoken = accesstoken
        res = True
    except Exception as ex:
        res = False
    return res
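# Illustrative sketch (not the project's HttpAccess-based code): a minimal
# standard-library version of the login step above, assuming the same
# "https://api.zoomeye.org/user/login" endpoint, a JSON body with
# "username"/"password", and an "access_token" field in the reply as used
# above. The Content-Type header and placeholder credentials are assumptions.
import json
import urllib.request


def zoomeye_login(username: str, password: str) -> str:
    """POST the credentials and return the access_token field of the reply."""
    body = json.dumps({"username": username, "password": password}).encode("utf-8")
    req = urllib.request.Request(
        "https://api.zoomeye.org/user/login",
        data=body,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(req, timeout=10) as resp:
        return json.loads(resp.read().decode("utf-8"))["access_token"]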
def __init__(self, token: str):
    assert isinstance(token, str)
    self._token: str = token
    self._header = self._header % self._token
    self._is_logined: bool = False
    self._login_locker = threading.RLock()
    self._logger: MsLogger = MsLogManager.get_logger("GitAPIv4")
    self._ha: HttpAccess = HttpAccess()
    self._user_name: str = None
    self._user_login: str = None
    self._user_id: str = None
def __init__(self):
    ScoutPlugBase.__init__(self)
    self._dbip = DbipMmdb()
    self._rdap_apis = {
        # arin: Canada, the United States and some Caribbean islands
        "arin": "https://rdap.arin.net/registry/ip/",
        # apnic: Asia/Pacific region
        "apnic": "https://rdap.apnic.net/history/ip/",
        # afrinic: Africa
        "afrinic": "https://rdap.afrinic.net/rdap/ip/",
        # ripe: Europe, the Middle East and Central Asia
        "ripe": "https://stat.ripe.net/data/whois/data.json?resource=192.0.20/23",
    }
    self._ha: HttpAccess = HttpAccess()
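# Illustrative sketch (not part of the original plugin): queries one of the
# RDAP endpoints listed above with the standard library only. The example IP,
# the choice of the ARIN endpoint, and the printed fields are assumptions for
# demonstration.
import json
import urllib.request


def rdap_lookup(ip: str, base: str = "https://rdap.arin.net/registry/ip/") -> dict:
    """Fetch the RDAP record for an IP and return the parsed JSON as a dict."""
    with urllib.request.urlopen(base + ip, timeout=10) as resp:
        return json.loads(resp.read().decode("utf-8"))


if __name__ == "__main__":
    record = rdap_lookup("8.8.8.8")
    print(record.get("handle"), record.get("name"))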
def __init__(self, task: IscoutTask):
    self.task = task
    self.tmppath = clienttaskconfig.tmppath
    self.outpath = clienttaskconfig.outputpath
    self._ha = HttpAccess()
    # plugin name
    self._name = type(self).__name__
    self._logger: MsLogger = MsLogManager.get_logger(
        f"{self._name}_{self.task.taskid}")
    self._sqlfunc = DbManager
    # maximum number of output records
    self.max_output = 10000
    # new "reason" field, which must correspond to the strike weapon
    self.dtools = dtools
    # data statistics counter, added by judy 2020/08/10
    self.output_count = 0
    # log file suffix, created by judy 2020/08/12
    self._log_suffix = 'prg_log'
def __init__(self):
    ProxySpiderbase.__init__(self, False)
    self._logger: MsLogger = MsLogManager.get_logger(
        self.__class__.__name__)
    self._reproxy = re.compile(r'([\d.]+?):(\d+)<br>', re.S)
    # slow down: one request per second
    self._ha: HttpAccess = HttpAccess(interval=1)
    self._ha.getstring("http://www.89ip.cn",
                       headers='''
        Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
        Accept-Encoding: gzip, deflate
        Accept-Language: en,zh-CN;q=0.9,zh;q=0.8
        Cache-Control: no-cache
        Connection: keep-alive
        Host: www.89ip.cn
        Pragma: no-cache
        Upgrade-Insecure-Requests: 1
        User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36''')
def __init__(self):
    ScoutPlugBase.__init__(self)
    self._ha: HttpAccess = HttpAccess()
    self._host: str = 'securitytrails.com'
class WebTechRecognizer(ScoutPlugBase):
    """Recognizer for web technologies"""

    __inst = None
    __initialed: bool = False
    __initlocker = threading.RLock()
    # current folder
    file = Path(__file__).parents[0]
    # __appfi = os.path.abspath(os.path.join("./resource/tools/apps.json"))
    __appfi: Path = file / 'apps.json'
    __ha: HttpAccess = HttpAccess()
    _webtechs: dict = {}

    def __init__(self, task: IscoutTask):
        ScoutPlugBase.__init__(self)
        self.task = task
        self.__init()

    def __init(self):
        """Check whether the resource apps.json exists; otherwise download it."""
        if WebTechRecognizer.__initialed:
            return
        with WebTechRecognizer.__initlocker:
            if WebTechRecognizer.__initialed:
                return
            # download the file if it does not exist yet
            if not WebTechRecognizer.__appfi.exists():
                self.__download_appfi()
            # if the file exists, check for updates; re-download when older than 7 days
            self.__update_appfi()
            # initialize the json file and load it into memory
            if not self.__init_json():
                raise Exception("Init web_tech_recognizer json failed.")
            WebTechRecognizer.__initialed = True

    def __download_appfi(self):
        """Download apps.json."""
        url: str = 'https://raw.githubusercontent.com/AliasIO/Wappalyzer/master/src/apps.json'
        respio: ResponseIO = WebTechRecognizer.__ha.get_response_stream(url)
        with WebTechRecognizer.__appfi.open(mode='wb') as fs:
            respio.readinto(fs)

    def __update_appfi(self):
        """
        If the file exists, check whether it is older than 7 days.
        If so, delete it and download it again.
        Uses unix timestamps directly.
        :return:
        """
        file_time = int(WebTechRecognizer.__appfi.stat().st_mtime)
        now_time = int(datetime.now(pytz.timezone('Asia/Shanghai')).timestamp())
        if now_time - file_time > 7 * 24 * 60 * 60:
            # delete the stale file first
            WebTechRecognizer.__appfi.unlink()
            # download it again
            self.__download_appfi()

    def __init_json(self) -> bool:
        """Load apps.json into the in-memory fingerprint table."""
        sj = None
        with open(WebTechRecognizer.__appfi, mode='r', encoding='utf-8') as fs:
            sj = json.load(fs)
        if not sj.__contains__("apps"):
            raise Exception('Key "apps" not found in apps.json')
        if not sj.__contains__("categories"):
            raise Exception('Key "categories" not found in apps.json')
        for c, v in sj["categories"].items():
            WebTech.set_cats(c, v["name"])
        for name, source in sj["apps"].items():
            webtec: WebTech = WebTech(name, source)
            WebTechRecognizer._webtechs[name] = webtec
        return True

    #######################################
    # match
    def __judge_two_str(self, sstr, dstr):
        """
        Judge whether two strings are similar (one contains the other or they are equal).
        :param sstr:
        :param dstr:
        :return:
        """
        res = False
        if sstr in dstr or dstr in sstr or sstr == dstr:
            res = True
        return res

    def __url_match(self, url: str):
        """
        Match components by url. Many checks were done earlier, so the url is available here.
        Currently the lookup requires a full loop; the url in the json appears to be
        the component's own url.
        :param url:
        :return:
        """
        try:
            for k, v in WebTechRecognizer._webtechs.items():
                webtech: WebTech = v
                if webtech._url is not None and self.__judge_two_str(url, webtech._url):
                    for cat in webtech._cats:
                        name = WebTech.get_cat_name(cat)
                        if name is not None:
                            self._logger.debug(f"Url match a component, name:{name}")
                            yield name
                    # only match once for now; the situation is special, so do it this way
                    break
        except:
            self._logger.error(f"Url match error, err:{traceback.format_exc()}")
        finally:
            self._logger.info("Complete use url to match component.")

    def __rheader_match(self, rheader: dict):
        """
        Match components by the returned response headers.
        :param rheader:
        :return:
        """
        try:
            get = False
            for k, v in WebTechRecognizer._webtechs.items():
                webtech: WebTech = v
                # if the fingerprint's header section is not empty, compare against it
                if len(webtech._headers) != 0:
                    # iterate over the passed-in headers and compare
                    for sk, sv in rheader.items():
                        # check whether this header is covered by the fingerprint
                        if webtech._headers.__contains__(sk):
                            # the regexes here usually carry a version part, but the actual
                            # value may not, so check first
                            match = False
                            wh = webtech._headers[sk]
                            if 'version' in sv:
                                re_header = re.compile(wh)
                                match = re_header.search(sv)
                            else:
                                # two cases are possible here, e.g. '.+?\;version:\d+'
                                if 'version' in wh:
                                    try:
                                        re_header = re.compile(wh.split('version')[0][:-2])
                                    except:
                                        self._logger.debug(f'Cant split version, server:{wh}')
                                else:
                                    re_header = re.compile(wh)
                                match = re_header.search(sv)
                            if match:
                                self._logger.debug(f"Header match component, name:{k}")
                                for cat in webtech._cats:
                                    category = WebTech.get_cat_name(cat)
                                    if category is not None:
                                        self._logger.info(f"Rheader match a component, name:{category}")
                                        yield (k, v, category)
                                        get = True
                                break
                # once something matched there is no need to keep going; one site
                # should not use that many technologies. If more data is ever needed,
                # remove this and keep collecting.
                if get:
                    break
        except:
            self._logger.error(f"Rheader match error, err:{traceback.format_exc()}")
        finally:
            self._logger.info("Complete use rheader to match component.")

    def __html_match(self, html: str):
        """
        Match components by the returned html.
        :param html:
        :return:
        """
        try:
            for k, v in WebTechRecognizer._webtechs.items():
                webtech: WebTech = v
                if webtech._html is not None:
                    match = False
                    if isinstance(webtech._html, str):
                        # a single html fingerprint: the value is the regex itself
                        re_html = re.compile(webtech._html)
                        match = re_html.search(html)
                    else:
                        # multiple html fingerprints: several regexes
                        for he in webtech._html:
                            re_html = re.compile(he)
                            match = re_html.search(html)
                            if match:
                                break
                    if match:
                        self._logger.debug(f"Html match component, name:{k}")
                        for cat in webtech._cats:
                            category = WebTech.get_cat_name(cat)
                            if category is not None:
                                self._logger.info(f"Html match a component, name:{category}")
                                yield (k, v, category)
                        # only match once for now
                        break
        except:
            self._logger.error(f"Html match error, err:{traceback.format_exc()}")
        finally:
            self._logger.info("Complete use html to match component.")

    def _match(self, url: str, respheader: dict, html: str):
        """
        Match the corresponding components against the given data.
        return (name, category, version)
        :param url:
        :param respheader:
        :param html:
        :return:
        """
        # many checks are done here to make sure the inputs are valid before matching.
        # currently only the component's English name is available here, so the data is
        # returned directly and wrapped at the upper layer.
        # if url is not None and url != '':
        #     for data in self.__url_match(url):
        #         yield data
        self._logger.info(f"Start to find component, url:{url}")
        if respheader is not None and len(respheader) > 0:
            for data in self.__rheader_match(respheader):
                yield data
        if html is not None and html != '':
            for data in self.__html_match(html):
                yield data

    def get_match_res(self, level, url):
        """
        Fetch the url and yield the matched components for the given level.
        :param level:
        :param url:
        :return:
        """
        try:
            res = requests.get(url)
            # url = res.url
            rheaders = dict(res.headers)
            html = res.text
            match_iter = self._match(url, rheaders, html)
            for k, v, category in match_iter:
                com = Component(self.task, level, k)
                com.category = category
                com.url = v._website
                yield com
        except:
            self._logger.error(f"Match component error, err:{traceback.format_exc()}")
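# Illustrative sketch (separate from WebTechRecognizer): shows the core idea of
# the header matching above -- a Wappalyzer-style fingerprint maps a header name
# to a regex with an optional version capture group. The tiny fingerprint table
# and the sample headers below are assumptions for demonstration only.
import re

FINGERPRINTS = {
    "Nginx": {"Server": r"nginx(?:/([\d.]+))?"},
    "PHP": {"X-Powered-By": r"php(?:/([\d.]+))?"},
}


def match_headers(resp_headers: dict) -> list:
    """Return (technology, version_or_None) pairs found in the response headers."""
    found = []
    for tech, header_patterns in FINGERPRINTS.items():
        for header, pattern in header_patterns.items():
            value = resp_headers.get(header)
            if value is None:
                continue
            m = re.search(pattern, value, re.I)
            if m:
                found.append((tech, m.group(1)))
    return found


if __name__ == "__main__":
    sample = {"Server": "nginx/1.18.0", "X-Powered-By": "PHP/7.4.3"}
    print(match_headers(sample))  # [('Nginx', '1.18.0'), ('PHP', '7.4.3')]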
def __init__(self, servicetype: str) -> None:
    LogicalGrabberBase.__init__(self, servicetype)
    self._ha: HttpAccess = HttpAccess()
class BaiDuTieBa(object):

    def __init__(self):
        # self.s = requests.session()
        self.s = HttpAccess()
        self.tieba_keyword = '四川'
        self.tiezi_keyword = ['四川', '德阳']
        start_cookie = 'TIEBA_USERTYPE=8f42a94301cb125114b88e7c; wise_device=0; BAIDUID=CB7173B0D9165F60AF77E8ACE3C20897:FG=1; bdshare_firstime=1551248833930; Hm_lvt_98b9d8c2fd6608d564bf2ac2ae642948=1551248834; BDUSS=BBdHZRVnhYfnB3aGRKdUViVW9-QXFCUkVJVFUyNWdyUVRMUUpOeWxaU1oyWjFjQUFBQUFBJCQAAAAAAAAAAAEAAAA23WE5yq7UwnNlcHRlbWJlcgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAJlMdlyZTHZcQV; STOKEN=621f6ba3aa1e26cbad20ecfe531ea78659a0ec1878489146ad833b226ce9e2fa; TIEBAUID=f986682cc736e76dfd7f2ee8; Hm_lpvt_98b9d8c2fd6608d564bf2ac2ae642948=1551258762'
        self.s._managedCookie.add_cookies('tieba.baidu.com', start_cookie)
        self.content_url_queue = queue.Queue()
        self.dealing_queue = []
        # current folder
        self.filepath = Path(__file__).parents[0]
        self.tiebahost = 'https://tieba.baidu.com'

    def out_formate(self, s: str) -> str:
        try:
            return base64.b64encode(s.encode()).decode('utf-8')
        except Exception as ex:
            s = repr(s)
            return base64.b64encode(s.encode()).decode('utf-8')

    # def update_cookie(self, res: requests.Response, headers):
    #     """
    #     Update the cookie and refer_url
    #     (this effectively refreshes the cookie)
    #     :return:
    #     """
    #     if res is not None:
    #         cookiedict = res.cookies.get_dict()
    #         cookie_string = ';'.join([str(x) + '=' + str(y) for x, y in cookiedict.items()])
    #         self.start_cookie += cookie_string
    #     headers['Cookie'] += self.start_cookie
    #     return headers

    def get_start_url(self):
        return f'http://tieba.baidu.com/f?kw={self.tieba_keyword}&ie=utf-8&pn=0'

    def judge_key_world_in_title(self, title):
        res = False
        try:
            for el in self.tiezi_keyword:
                if el in title:
                    res = True
        except:
            res = False
        return res

    def get_download_links(self):
        """
        Collect the links that need to be downloaded.
        :return:
        """
        # http://tieba.baidu.com/f?kw=%E5%9B%9B%E5%B7%9D&ie=utf-8&pn=0
        # the url offset grows by 50 for each page
        # start from the first page
        next_page = True
        nextpagenum = 0
        # the last page
        last_page = None
        next_url = self.get_start_url()
        re_title = re.compile(
            '<a rel="noreferrer" href="(.+?)" title="(.+?)" target="_blank" class="j_th_tit ">.+?</a>')
        re_next_page = re.compile('pn=(\d+)')
        while next_page:
            try:
                response = self.s.getstring(next_url, headers='''
                Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
                Accept-Encoding: gzip, deflate
                Accept-Language: zh-CN,zh;q=0.9,en;q=0.8
                Cache-Control: no-cache
                Host: tieba.baidu.com
                Pragma: no-cache
                Proxy-Connection: keep-alive
                Upgrade-Insecure-Requests: 1
                User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36''')
                # refresh the cookie
                # headers = self.update_cookie(response, headers)
                all_find = re_title.findall(response)
                if len(all_find) > 0:
                    for el_title in all_find:
                        is_key_in = self.judge_key_world_in_title(el_title[1])
                        if is_key_in:
                            if not el_title[0].startswith('http://'):
                                content_url = el_title[0]
                                self.content_url_queue.put((content_url, el_title[1]))
                else:
                    print(f"Could not fetch this page: {next_url}")
                    nextpagenum += 50
                    next_url = f'http://tieba.baidu.com/f?kw={self.tieba_keyword}&ie=utf-8&pn={nextpagenum}'
                    print(f'now page num:{nextpagenum-50}, next_url:{next_url}')
                    continue
                next_page_all = re_next_page.findall(response)
                # only used once
                if last_page is None:
                    last_page = next_page_all[-1]
                # if int(next_page_all[-2]) < int(last_page):
                if nextpagenum < int(last_page):
                    # nextpagenum = next_page_all[-2]
                    nextpagenum += 50
                    next_url = f'http://tieba.baidu.com/f?kw={self.tieba_keyword}&ie=utf-8&pn={nextpagenum}'
                    print(f'next url:{next_url}')
                else:
                    next_page = False
                    break
            except Exception as err:
                nextpagenum += 50
                next_url = f'http://tieba.baidu.com/f?kw={self.tieba_keyword}&ie=utf-8&pn={nextpagenum}'
                print(f'error page:{nextpagenum - 50}, now go next_url:{next_url}, error:{err}')
                continue

    def get_reply(self, tid, pn, refer, title) -> iter:
        url = "https://tieba.baidu.com/p/totalComment"
        querystring = {"tid": tid, "fid": "1", "pn": pn}
        response = self.s.getstring(url, headers='''
        Accept: application/json, text/javascript, */*; q=0.01
        Accept-Encoding: gzip, deflate, br
        Accept-Language: zh-CN,zh;q=0.9,en;q=0.8
        Cache-Control: no-cache
        Connection: keep-alive
        Host: tieba.baidu.com
        Pragma: no-cache
        Referer: {}
        User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36
        X-Requested-With: XMLHttpRequest'''.format(refer), params=querystring)
        # refresh the cookie (not actually used)
        # headers = self.update_cookie(response, headers)
        data = json.loads(response)
        if data.get('errno') != 0 or data.get('errmsg') != 'success':
            return []
        c_list = data.get('data').get('comment_list')
        if c_list is None:
            return []
        if len(c_list) == 0:
            return []
        for key, value in c_list.items():
            for el in value.get('comment_info'):
                try:
                    write_line = {}
                    s_id = el.get('comment_id')
                    post_id = el.get('post_id')
                    author = el.get('username')
                    content = el.get('content')
                    if content is None or content == '':
                        continue
                    g_time = el.get('now_time')
                    # unixtime
                    g_d_time = str(datetime.datetime.fromtimestamp(g_time))
                    write_line['id'] = s_id
                    write_line['replyid'] = post_id
                    write_line['author'] = self.out_formate(author)
                    write_line['title'] = self.out_formate(title)
                    write_line['stars'] = None
                    write_line['content'] = self.out_formate(content)
                    write_line['resources'] = None
                    write_line['createtime'] = g_d_time
                    write_line['updatetime'] = None
                    write_line['likes'] = None
                    yield write_line
                except Exception as err:
                    print(f"Error while getting the comments of this page, err:{err}")
                    continue

    def get_content_info(self):
        re_next_page = re.compile('<a href="(.+?)">下一页</a>')
        no_data_times = 0
        with threading.Lock():
            csvfile = open(self.tieba_keyword + '.csv', 'a', newline='')
            fieldnames = ['id', 'replyid', 'author', 'title', 'stars', 'content',
                          'resources', 'createtime', 'updatetime', 'likes']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            # writer.writeheader()
        while True:
            if no_data_times > 50:
                print("no data to crawl")
                break
            if self.content_url_queue.empty():
                time.sleep(3)
                no_data_times += 1
                continue
            url_info = self.content_url_queue.get()
            url = url_info[0]
            title = url_info[1]
            has_next = True
            next_url = None
            # info needed to fetch the comments
            pn = 1
            tid = re.search('\d+', url).group()
            while has_next:
                # first visit to the url
                try:
                    if next_url is None:
                        next_url = self.tiebahost + url
                    response = self.s.getstring(next_url, headers='''
                    Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
                    Accept-Encoding: gzip, deflate
                    Accept-Language: zh-CN,zh;q=0.9,en;q=0.8
                    Cache-Control: no-cache
                    Host: tieba.baidu.com
                    Pragma: no-cache
                    Proxy-Connection: keep-alive
                    Upgrade-Insecure-Requests: 1
                    User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36''')
                    # refresh the cookie
                    # headers = self.update_cookie(response, headers)
                    soup = BeautifulSoup(response, 'lxml')
                    get_div_info = soup.find('div', attrs={'class': 'p_postlist'})
                    all_content_divs = get_div_info.contents
                    for content_div in all_content_divs:
                        try:
                            write_line = {}
                            data_info = content_div.get('data-field')
                            if data_info is None:
                                continue
                            data = json.loads(data_info)
                            floorid = data.get('content').get('post_id')
                            author = data.get('author').get('user_name')
                            re_get_time = re.search('\d{4}-\d{2}-\d{2} \d{2}:\d{2}', str(content_div))
                            if re_get_time:
                                get_time = re_get_time.group() + ':00'
                                write_line['createtime'] = get_time
                            # text content and images
                            content_info = content_div.find('div', attrs={'class': re.compile('.+?j_d_post_content.+?')})
                            content = content_info.text.strip()
                            imgs_info = content_info.find_all('img', attrs={'class': 'BDE_Image'})
                            if content is None and len(imgs_info) == 0:
                                continue
                            if len(imgs_info) != 0:
                                resources = []
                                for img_info in imgs_info:
                                    img = img_info.get('src')
                                    resources.append(img)
                                write_line['resources'] = self.out_formate(json.dumps(resources))
                            # write the data
                            write_line['id'] = floorid
                            write_line['replyid'] = None
                            write_line['author'] = self.out_formate(author)
                            write_line['title'] = self.out_formate(title)
                            write_line['stars'] = None
                            write_line['content'] = self.out_formate(content)
                            write_line['updatetime'] = None
                            write_line['likes'] = None
                            with threading.Lock():
                                writer.writerow(write_line)
                            print(f'Write a line:{write_line}')
                        except Exception as err:
                            print(f"Error while parsing a floor, err:{err}")
                            continue
                    # fetch the comments
                    for comm in self.get_reply(tid, pn, url, title):
                        with threading.Lock():
                            writer.writerow(comm)
                        print(f'Write a comment:{comm}')
                    # fetch the next page
                    nextpage = re_next_page.search(response)
                    if nextpage:
                        next_url = self.tiebahost + nextpage.group(1)
                        pn += 1
                    else:
                        has_next = False
                except Exception as err:
                    print(f"Error while fetching this page url: {url}, err:{err}")
                    has_next = False
            self.content_url_queue.task_done()
        print("complete")
        csvfile.close()

    def start(self):
        csvfile = open(self.tieba_keyword + '.csv', 'a', newline='')
        fieldnames = ['id', 'replyid', 'author', 'title', 'stars', 'content',
                      'resources', 'createtime', 'updatetime', 'likes']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        csvfile.close()
        thread1 = threading.Thread(target=self.get_download_links, name="get_download_link")
        thread1.start()
        for i in range(3):
            threads = threading.Thread(target=self.get_content_info, name="get_comments")
            threads.start()
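# Illustrative sketch (not part of BaiDuTieBa): shows what the re_title pattern
# in get_download_links extracts from a Tieba list page -- (href, title) pairs
# that are then filtered by keyword. The sample HTML snippet and thread id are
# assumptions for demonstration only.
import re

RE_TITLE = re.compile(
    '<a rel="noreferrer" href="(.+?)" title="(.+?)" target="_blank" class="j_th_tit ">.+?</a>')

if __name__ == "__main__":
    sample = ('<a rel="noreferrer" href="/p/6054102420" title="德阳一日游" '
              'target="_blank" class="j_th_tit ">德阳一日游</a>')
    print(RE_TITLE.findall(sample))  # [('/p/6054102420', '德阳一日游')]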
class DeYang(object):
    imagesnumbel = 0

    def __init__(self):
        self._ha = HttpAccess()
        cookies = '_RF1=101.204.79.78; _RSG=7t4K9DysapAAy3T6IzZvP9; _RDG=28c3a46e16bd9527e206056f639f93f12d; _RGUID=ace4dbc3-4950-4dc7-9679-8fd486743f0a; ASP.NET_SessionSvc=MTAuOC4xODkuNTV8OTA5MHxqaW5xaWFvfGRlZmF1bHR8MTU0NzYzNTY5NDYxNA; bdshare_firstime=1550397871920; MKT_Pagesource=PC; _ga=GA1.2.1090470229.1550397875; _gid=GA1.2.111071048.1550397875; _bfa=1.1550397832747.3uetxn.1.1550397832747.1550397832747.1.4; _bfs=1.4; gad_city=be2e953e1ae09d16d9cc90a550611388; __zpspc=9.1.1550397884.1550397884.1%234%7C%7C%7C%7C%7C%23; _jzqco=%7C%7C%7C%7C1550397884384%7C1.1018365145.1550397884256.1550397884256.1550397884256.1550397884256.1550397884256.0.0.0.1.1; _bfi=p1%3D290510%26p2%3D290546%26v1%3D4%26v2%3D3; appFloatCnt=3'
        self._ha._managedCookie.add_cookies('ctrip.com', cookies)
        self.page_url = queue.Queue()
        self.que_dealing = []
        # current folder
        self.filepath = Path(__file__).parents[0]

    def out_formate(self, s: str) -> str:
        try:
            return base64.b64encode(s.encode()).decode('utf-8')
        except Exception as ex:
            s = repr(s)
            return base64.b64encode(s.encode()).decode('utf-8')

    def get_ctrip_link(self):
        re_name = re.compile(
            '<a target="_blank" href="(/.+?)" title="(.+?)">.+?</a>')
        for n in range(5):
            url = f"http://you.ctrip.com/sight/deyang462/s0-p{n + 1}.html"
            html_2 = self._ha.getstring(url, headers='''
            Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
            Accept-Encoding: gzip, deflate
            Accept-Language: zh-CN,zh;q=0.9,en;q=0.8
            Cache-Control: no-cache
            Host: you.ctrip.com
            Pragma: no-cache
            Proxy-Connection: keep-alive
            Referer: http://you.ctrip.com/
            Upgrade-Insecure-Requests: 1
            User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36''')
            name_info = re_name.findall(html_2)
            for name_one in name_info:
                self.page_url.put(name_one)
        print("All links to download have been collected")
        return

    def get_content_info(self, poid, did, dname, pageall, rid, dirloc: Path):
        url = "http://you.ctrip.com/destinationsite/TTDSecond/SharedView/AsynCommentView"
        for page in range(int(pageall)):
            payload = "poiID={}&districtId={}&districtEName={}&" \
                      "pagenow={}&order=3.0&star=0.0&tourist=0.0" \
                      "&resourceId={}&resourcetype=2".format(poid, did, dname, page + 1, rid)
            page_html = self._ha.getstring(url, payload, headers='''
            Accept: */*
            Accept-Encoding: gzip, deflate
            Accept-Language: zh-CN,zh;q=0.9,en;q=0.8
            Cache-Control: no-cache
            Content-Length: 125
            Content-Type: application/x-www-form-urlencoded
            Host: you.ctrip.com
            Origin: http://you.ctrip.com
            Pragma: no-cache
            Proxy-Connection: keep-alive
            Referer: http://you.ctrip.com/
            User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36
            X-Requested-With: XMLHttpRequest''')
            soup = BeautifulSoup(page_html, 'lxml')
            all_username = soup.find_all('div', attrs={"class": "userimg"})
            comments_divs = soup.find_all('div', attrs={"class": "comment_ctrip"})
            all_ul = comments_divs[0].find_all('ul')
            for i in range(len(all_ul)):
                try:
                    line = {}
                    name = all_username[i].get_text(strip=True)
                    line['author'] = self.out_formate(name)
                    all_lis = all_ul[i].find_all('li')
                    stars_info = all_lis[0].get_text()
                    stars = re.findall('\d', stars_info)
                    line['title'] = None
                    if len(stars) > 0:
                        get_starts = stars[-1]
                        line['stars'] = get_starts + '/5'
                    else:
                        line['stars'] = None
                    des = all_lis[1].get_text(strip=True)
                    line['content'] = self.out_formate(des)
                    if len(all_lis) == 4:
                        all_pics = []
                        # there are pictures
                        all_a = all_lis[2].find_all('a')
                        for a_one in all_a:
                            with threading.Lock():
                                jpg_url = a_one.get('href')
                                # download the picture
                                jpg_locname = str(DeYang.imagesnumbel) + '.jpg'
                                img = requests.get(jpg_url)
                                jpg_loc: Path = dirloc / jpg_locname
                                with jpg_loc.open('ab') as f:
                                    f.write(img.content)
                                print(f"download complete:{jpg_locname}")
                                all_pics.append(jpg_locname)
                                DeYang.imagesnumbel += 1
                        line['pictures'] = json.dumps(all_pics)
                    else:
                        line['pictures'] = None
                    others_info = all_lis[-1]
                    useful_info = others_info.get_text(strip=True)
                    useful = re.findall('\((\d+)\)', useful_info)
                    if len(useful) > 0:
                        useful_res = useful[-1]
                    else:
                        useful_res = None
                    time = others_info.find(
                        'span', attrs={
                            "class": "time_line"
                        }).get_text(strip=True) + ' 00:00:00'
                    line['createtime'] = None
                    line['updatetime'] = None
                    line['time'] = time
                    line['replyto'] = None
                    line['likes'] = useful_res
                    yield line
                except Exception as ex:
                    print(f"Error while parsing a row: {ex}")
                    continue

    def get_content(self):
        re_content_pages = re.compile('<b class="numpage">(\d+)</b>')
        re_poid = re.compile(
            '<a href="/dianping/edit/(\d+).html" class="b_orange_m">')
        while True:
            # take one ('url', 'name') item from the queue
            try:
                url_info = self.page_url.get()
                if url_info in self.que_dealing:
                    print("This url is being downloaded or is already finished, skip it")
                    continue
                # put it into the in-progress list
                self.que_dealing.append(url_info)
                url = url_info[0]
                dirname = url_info[1]
                dir_loc = self.filepath / dirname
                dir_loc.mkdir(exist_ok=True)
                csvfilename = dir_loc / (dirname + '.csv')
                csvfile = open(str(csvfilename), 'w', newline='')
                fieldnames = [
                    'author', 'title', 'stars', 'content', 'pictures',
                    'createtime', 'updatetime', 'time', 'replyto', 'likes'
                ]
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                writer.writeheader()
                infos = url.split('/')
                dname = re.findall('[a-zA-Z]+', infos[2])[0]
                did = re.findall('\d+', infos[2])[0]
                rid = re.findall('\d+', infos[-1])[0]
                url = "http://you.ctrip.com" + url
                # proxy = self.get_proxy()
                # response = requests.get(url, headers=self.get_headers(url))
                response = self._ha.getstring(url, headers='''
                Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
                Accept-Encoding: gzip, deflate
                Accept-Language: zh-CN,zh;q=0.9,en;q=0.8
                Cache-Control: no-cache
                Host: you.ctrip.com
                Pragma: no-cache
                Proxy-Connection: keep-alive
                Referer: http://you.ctrip.com/
                Upgrade-Insecure-Requests: 1
                User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36''')
                pages = re_content_pages.findall(response)
                if len(pages) == 0:
                    pages = 1
                else:
                    pages = pages[0]
                poid = re_poid.findall(response)[0]
                getline = self.get_content_info(poid, did, dname, pages, rid, dir_loc)
                for a_line in getline:
                    print(f"Got a row of data: {a_line}")
                    try:
                        writer.writerow(a_line)
                    except Exception as err:
                        print(f"Write line error:{err}\nline:{a_line}")
                        continue
                self.page_url.task_done()
                # break once all tasks are done
                if self.page_url.empty():
                    break
            except Exception as err:
                print(f"Error while processing a url, url:{url}, name:{dirname}, error:{err}")
                continue
        # self.delete_proxy(proxy)
        print("complete")
        return

    def start(self):
        thread1 = threading.Thread(target=self.get_ctrip_link, name="get_start_links")
        thread1.start()
        for i in range(10):
            threads = threading.Thread(target=self.get_content, name="writeinfo")
            threads.start()
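# Illustrative sketch (not part of DeYang): the numpage/poid regexes in
# get_content pull the comment page count and the poiID out of a sight page.
# The HTML fragment below is an assumption for demonstration only.
import re

RE_PAGES = re.compile(r'<b class="numpage">(\d+)</b>')
RE_POID = re.compile(r'<a href="/dianping/edit/(\d+).html" class="b_orange_m">')

if __name__ == "__main__":
    fragment = ('<b class="numpage">12</b>'
                '<a href="/dianping/edit/77805.html" class="b_orange_m">...</a>')
    print(RE_PAGES.findall(fragment))  # ['12']
    print(RE_POID.findall(fragment))   # ['77805']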