# -*- coding: utf-8 -*-
# Reconstructed imports. The standard-library and third-party imports below are
# what the code visibly uses; the last two lines pull in project-local names
# (config constants, the PyMysqlPoolBase connection pool, the logger) whose real
# module paths are not shown in the source -- treat them as placeholders.
import datetime
import json
import math
import random
import re
import string
import time
import traceback
from urllib.parse import urlencode

import demjson
import pymysql
import requests
import requests as req  # assumption: `req` used further down is an alias for requests
from gne import GeneralNewsExtractor
from lxml import html
from requests.exceptions import (ChunkedEncodingError, ConnectionError,
                                 ProxyError, Timeout)

from base import PyMysqlPoolBase, logger  # placeholder module path
from settings import (LOCAL, LOCAL_MYSQL_HOST, LOCAL_MYSQL_PORT,  # placeholder module path
                      LOCAL_MYSQL_USER, LOCAL_MYSQL_PASSWORD, LOCAL_MYSQL_DB,
                      MYSQL_HOST, MYSQL_PORT, MYSQL_USER, MYSQL_PASSWORD,
                      MYSQL_DB, LOCAL_PROXY_URL, PROXY_URL)


class Base(object):
    def __init__(self):
        self.local = LOCAL
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, '
                          'like Gecko) Chrome/79.0.3945.117 Safari/537.36'
        }

    def _init_pool(self):
        if self.local:
            conf = {
                "host": LOCAL_MYSQL_HOST,
                "port": LOCAL_MYSQL_PORT,
                "user": LOCAL_MYSQL_USER,
                "password": LOCAL_MYSQL_PASSWORD,
                "db": LOCAL_MYSQL_DB,
            }
        else:
            conf = {
                "host": MYSQL_HOST,
                "port": MYSQL_PORT,
                "user": MYSQL_USER,
                "password": MYSQL_PASSWORD,
                "db": MYSQL_DB,
            }
        self.sql_pool = PyMysqlPoolBase(**conf)

    def _get_proxy(self):
        if self.local:
            return requests.get(LOCAL_PROXY_URL).text.strip()
        else:
            random_num = random.randint(0, 10)
            if random_num % 2:
                time.sleep(1)
                return requests.get(PROXY_URL).text.strip()
            else:
                return requests.get(LOCAL_PROXY_URL).text.strip()

    def get(self, url):
        count = 0
        while True:
            count += 1
            if count > 10:
                return None
            try:
                # requests expects the proxies dict to be keyed by scheme;
                # the original used {"proxy": ...}, which requests ignores.
                proxy = {"http": self._get_proxy()}
                print("proxy is >> {}".format(proxy))
                resp = requests.get(url, headers=self.headers, proxies=proxy)
            except:
                traceback.print_exc()
                time.sleep(0.5)
            else:
                if resp.status_code == 200:
                    return resp
                elif resp.status_code == 404:
                    return None
                else:
                    print("status_code: >> {}".format(resp.status_code))
                    time.sleep(1)

    def convert_dt(self, time_stamp):
        d = str(datetime.datetime.fromtimestamp(time_stamp))
        return d

    def _contract_sql(self, to_insert):
        ks = []
        vs = []
        for k in to_insert:
            ks.append(k)
            vs.append(to_insert.get(k))
        ks = sorted(ks)
        fields_str = "(" + ",".join(ks) + ")"
        values_str = "(" + "%s," * (len(vs) - 1) + "%s" + ")"
        base_sql = '''INSERT INTO `{}` '''.format(
            self.table) + fields_str + ''' values ''' + values_str + ''';'''
        return base_sql

    def _filter_char(self, test_str):
        # Strip special whitespace characters
        # '\u200b' is \xe2\x80\x8b
        for cha in ['\n', '\r', '\t',
                    '\u200a', '\u200b', '\u200c', '\u200d', '\u200e',
                    '\u202a', '\u202b', '\u202c', '\u202d', '\u202e']:
            test_str = test_str.replace(cha, '')
        test_str = test_str.replace(u'\xa0', u' ')  # replace \xa0 with a plain space
        return test_str

    def _process_content(self, vs):
        # Remove 4-byte UTF-8 characters, otherwise the MySQL insert can fail
        try:
            # UCS-4 build of Python
            highpoints = re.compile(u'[\U00010000-\U0010ffff]')
        except re.error:
            # UCS-2 build of Python
            highpoints = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
        params = list()
        for v in vs:
            # clean each value before insertion
            nv = highpoints.sub(u'', v)
            nv = self._filter_char(nv)
            params.append(nv)
        content = "".join(params).strip()
        return content

    def _get_values(self, item: dict):
        # self.fields: the list of columns to insert; sorted so the order
        # matches `ks = sorted(ks)` in _contract_sql above
        value = tuple(item.get(field) for field in sorted(self.fields))
        return value

    def _save(self, item):
        insert_sql = self._contract_sql(item)
        value = self._get_values(item)
        try:
            ret = self.sql_pool.insert(insert_sql, value)
        except pymysql.err.IntegrityError:
            print("duplicate row")
            return 1
        except:
            traceback.print_exc()
        else:
            return ret

    def _save_many(self, items):
        values = [self._get_values(item) for item in items]  # list of tuples
        insert_many_sql = self._contract_sql(items[0])
        try:
            ret = self.sql_pool.insert_many(insert_many_sql, values)
        except pymysql.err.IntegrityError:
            print("duplicate rows in the batch")
        except:
            traceback.print_exc()
        else:
            return ret
        finally:
            self.sql_pool.end()

    def save_one(self, item):
        self._save(item)
        self.sql_pool.end()

    def save(self, items):
        ret = self._save_many(items)
        if not ret:
            print("batch save failed, falling back to saving one by one ..")
            count = 0
            for item in items:
                print(item)
                self._save(item)
                count += 1
                if count > 9:
                    self.sql_pool.end()
                    count = 0
            # self.sql_pool.dispose()
            self.sql_pool.end()
        else:
            print("batch save ok ..")
            print(items)
            print(len(items))

    def __del__(self):
        try:
            self.sql_pool.dispose()
        except:
            pass

    def start(self):
        try:
            self._init_pool()
            self._start()
        except:
            traceback.print_exc()
class Money163(object):
    def __init__(self):
        self.list_url = "http://money.163.com/special/00251G8F/news_json.js"
        self.extractor = GeneralNewsExtractor()
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36",
        }
        self.local = LOCAL
        if self.local:
            conf = {
                "host": LOCAL_MYSQL_HOST,
                "port": LOCAL_MYSQL_PORT,
                "user": LOCAL_MYSQL_USER,
                "password": LOCAL_MYSQL_PASSWORD,
                "db": LOCAL_MYSQL_DB,
            }
            self.db = LOCAL_MYSQL_DB
        else:
            conf = {
                "host": MYSQL_HOST,
                "port": MYSQL_PORT,
                "user": MYSQL_USER,
                "password": MYSQL_PASSWORD,
                "db": MYSQL_DB,
            }
            self.db = MYSQL_DB
        self.sql_pool = PyMysqlPoolBase(**conf)
        self.table = "netease_money"
        self.error_detail = []

    def _parse_list(self, body):
        js_obj = re.findall(r"news:(.*)\};", body)[0]
        py_obj = demjson.decode(js_obj)
        for topic in py_obj:  # each sub-topic
            for data in topic:
                yield data

    def _parse_detail(self, detail_url):
        try:
            page = requests.get(detail_url, headers=self.headers).text
            result = self.extractor.extract(page)
            content = result.get("content")
        except:
            return
        return content

    def contract_sql(self, to_insert):
        ks = []
        vs = []
        for k in to_insert:
            ks.append(k)
            vs.append(to_insert.get(k))
        fields_str = "(" + ",".join(ks) + ")"
        values_str = "(" + "%s," * (len(vs) - 1) + "%s" + ")"
        base_sql = '''INSERT INTO `{}`.`{}` '''.format(
            self.db, self.table) + fields_str + ''' values ''' + values_str + ''';'''
        return base_sql, tuple(vs)

    def _save(self, to_insert):
        try:
            insert_sql, values = self.contract_sql(to_insert)
            count = self.sql_pool.insert(insert_sql, values)
        except pymysql.err.IntegrityError:
            # print("duplicate")
            return 1
        except:
            print("insert failed")
        else:
            return count

    def get_proxy(self):
        if self.local:
            return requests.get(LOCAL_PROXY_URL).text.strip()
        else:
            return requests.get(PROXY_URL).text.strip()

    def get_list_resp(self):
        count = 0
        while True:
            proxy = self.get_proxy()
            print(">> ", proxy)
            try:
                list_resp = requests.get(self.list_url,
                                          proxies={"http": proxy},
                                          timeout=3)
            except:
                count += 1
                if count > 10:
                    return
                time.sleep(1)
            else:
                if list_resp.status_code != 200:
                    count += 1
                    if count > 10:
                        return
                    time.sleep(1)
                else:
                    break
        return list_resp

    def __del__(self):
        try:
            self.sql_pool.dispose()
        except:
            pass

    def close(self):
        try:
            self.sql_pool.dispose()
        except:
            pass

    def start(self):
        try:
            self._start()
        except:
            traceback.print_exc()
        finally:
            self.close()

    def _start(self):
        list_resp = self.get_list_resp()
        print(">>>", list_resp)
        if list_resp and list_resp.status_code == 200:
            body = list_resp.text
            # TODO: inserting straight from the generator fails, so materialise it first ..
            ret = list(self._parse_list(body))
            count = 0
            for one in ret:
                # print(one)
                item = dict()
                link = one.get("l")
                item['link'] = link
                item['title'] = one.get("t")
                # In the returned JSON the newest entries come first. With a scheduled,
                # incremental run we would only need news newer than one day ago (two
                # days to be safe), but the feed interleaves several columns, so
                # stopping at the first old timestamp would drop data -- hence the
                # cut-off check below stays commented out (bug fixed).
                # dt = datetime.datetime.today() - datetime.timedelta(days=1)
                pub_date = one.get("p")
                # pt = datetime.datetime.strptime(pub_date, "%Y-%m-%d %H:%M:%S")
                # if pt < dt:
                #     print(pt)
                #     print(dt)
                #     print('netease money incremental crawl finished')
                #     return
                item['pub_date'] = pub_date
                article = self._parse_detail(one.get("l"))
                if article:
                    item['article'] = article
                    # print(item.get("title"))
                    save_ret = self._save(item)
                    if not save_ret:
                        print("save failed")
                        self.error_detail.append(link)
                    else:
                        count += 1
                else:
                    self.error_detail.append(link)
                if count > 9:
                    print("commit ..")
                    self.sql_pool.end()
                    count = 0
            self.sql_pool.dispose()
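# Worked example (synthetic data, not from the original source): news_json.js
# returns a JavaScript assignment rather than plain JSON, which is why
# _parse_list() grabs everything after "news:" with a regex and hands it to
# demjson, whose relaxed parser also accepts unquoted JS-style keys. A minimal
# illustration with a made-up body; the real feed uses the same t/l/p keys.
#
# body = 'var obj = {news:[[{t:"title-1", l:"http://example.com/1", p:"2020-02-20 10:00:00"}]]};'
# spider = Money163.__new__(Money163)          # skip __init__/DB setup for the demo
# for data in spider._parse_list(body):
#     print(data["t"], data["l"], data["p"])   # -> title-1 http://example.com/1 2020-02-20 10:00:00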
class STCN_Base(object):
    def __init__(self):
        self.table = "stcn_info"
        self.local = LOCAL
        self.check_dt = datetime.datetime.today() - datetime.timedelta(days=2)
        self.dt_fmt = '%Y-%m-%d'
        # NOTE: the connection-pool setup that used to be duplicated here (same
        # conf dict as below) is now done lazily in _init_pool().
        # By default the list page does not need pagination
        self.pages = False
        self.extractor = GeneralNewsExtractor()

    def _init_pool(self):
        if self.local:
            conf = {
                "host": LOCAL_MYSQL_HOST,
                "port": LOCAL_MYSQL_PORT,
                "user": LOCAL_MYSQL_USER,
                "password": LOCAL_MYSQL_PASSWORD,
                "db": LOCAL_MYSQL_DB,
            }
        else:
            conf = {
                "host": MYSQL_HOST,
                "port": MYSQL_PORT,
                "user": MYSQL_USER,
                "password": MYSQL_PASSWORD,
                "db": MYSQL_DB,
            }
        self.sql_pool = PyMysqlPoolBase(**conf)

    def _get(self, url):
        resp = requests.get(url)
        if resp.status_code == 200:
            return resp.text

    def _extract_content(self, body):
        result = self.extractor.extract(body)
        content = result.get("content")
        return content

    def _parse_detail(self, body):
        try:
            doc = html.fromstring(body)
            node = doc.xpath("//div[@class='txt_con']")[0]
            content = node.text_content()
        except:
            content = None
        else:
            return content
        if not content:
            # fall back to the generic extractor
            content = self._extract_content(body)
        return content

    def _filter_char(self, test_str):
        # Strip special whitespace characters
        # '\u200b' is \xe2\x80\x8b
        for cha in ['\n', '\r', '\t',
                    '\u200a', '\u200b', '\u200c', '\u200d', '\u200e',
                    '\u202a', '\u202b', '\u202c', '\u202d', '\u202e']:
            test_str = test_str.replace(cha, '')
        test_str = test_str.replace(u'\xa0', u' ')  # replace \xa0 with a plain space
        return test_str

    def _process_content(self, vs):
        # Remove 4-byte UTF-8 characters, otherwise the MySQL insert can fail
        try:
            # UCS-4 build of Python
            highpoints = re.compile(u'[\U00010000-\U0010ffff]')
        except re.error:
            # UCS-2 build of Python
            highpoints = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
        params = list()
        for v in vs:
            nv = highpoints.sub(u'', v)
            nv = self._filter_char(nv)
            params.append(nv)
        content = "".join(params).strip()
        return content

    def _contract_sql(self, to_insert):
        ks = []
        vs = []
        for k in to_insert:
            ks.append(k)
            vs.append(to_insert.get(k))
        ks = sorted(ks)  # article,link,pub_date,title
        fields_str = "(" + ",".join(ks) + ")"
        values_str = "(" + "%s," * (len(vs) - 1) + "%s" + ")"
        base_sql = '''INSERT INTO `{}` '''.format(
            self.table) + fields_str + ''' values ''' + values_str + ''';'''
        # return base_sql, tuple(vs)
        return base_sql

    def _save(self, item):
        insert_sql = self._contract_sql(item)
        # print(insert_sql)
        value = (item.get("article"), item.get("link"),
                 item.get("pub_date"), item.get("title"))
        # print(value)
        try:
            ret = self.sql_pool.insert(insert_sql, value)
        except pymysql.err.IntegrityError:
            # print("duplicate row")
            return 1
        except:
            traceback.print_exc()
        else:
            return ret

    def _save_many(self, items):
        values = [(item.get("article"), item.get("link"),
                   item.get("pub_date"), item.get("title")) for item in items]
        insert_many_sql = self._contract_sql(items[0])
        try:
            ret = self.sql_pool.insert_many(insert_many_sql, values)
        except pymysql.err.IntegrityError:
            print("duplicate rows in the batch")
        except:
            traceback.print_exc()
        else:
            return ret
        finally:
            self.sql_pool.end()

    def _add_article(self, item: dict):
        link = item.get("link")
        if link:
            detail_page = self._get(link)
            if detail_page:
                article = self._parse_detail(detail_page)
                if article:
                    item['article'] = article
                    return True
        return False

    def _check_dt(self, pub_dt):
        if not pub_dt:
            return False
        try:
            pub_dt = datetime.datetime.strptime(pub_dt[:10], self.dt_fmt)
        except:
            print("failed to parse the incremental cut-off date .. recrawling ..")
            # traceback.print_exc()
            return False
        if pub_dt < self.check_dt:
            print("current day: ", pub_dt)
            print("cut-off: ", self.check_dt)
            print("incremental crawl finished ..")
            return True
        else:
            return False

    def _start(self):
        self._init_pool()
        if not self.pages:
            list_body = self._get(self.list_url)
            if list_body:
                items = self._parse_list_body(list_body)
                count = 0
                for item in items:
                    if self._check_dt(item.get("pub_date")):
                        self.sql_pool.end()
                        return
                    ret = self._save(item)
                    if ret:
                        count += 1
                        # print("saved: {}".format(item))
                    else:
                        # print("save failed: {}".format(item))
                        pass
                    if count > 9:
                        self.sql_pool.end()
                        print("commit .. ")
                        count = 0
                self.sql_pool.dispose()
        else:
            count = 0
            for page in range(1, self.page_num + 1):
                print("\nThe page is {}".format(page))
                list_url = self.format_url.format(page)
                print(list_url)
                list_body = self._get(list_url)
                if list_body:
                    items = self._parse_list_body(list_body)
                    for item in items:
                        if self._check_dt(item.get("pub_date")):
                            self.sql_pool.end()
                            return
                        ret = self._save(item)
                        if ret:
                            count += 1
                            # print("saved: {}".format(item))
                        else:
                            # print("save failed: {}".format(item))
                            pass
                        if count > 9:
                            self.sql_pool.end()
                            print("commit .. ")
                            count = 0

    def __del__(self):
        try:
            self.sql_pool.dispose()
        except:
            pass

    def start(self):
        print("{} crawl starting".format(self.name))
        try:
            self._start()
        except:
            traceback.print_exc()
            print("{} crawl failed".format(self.name))
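# Usage sketch (assumption, not part of the original source): STCN_Base is a
# template -- a concrete column spider supplies `name`, a `list_url` (or sets
# pages=True together with `format_url`/`page_num`) and a `_parse_list_body()`
# that yields dicts carrying the four columns article/link/pub_date/title.
# The URL and XPath expressions below are hypothetical placeholders.

class STCN_DemoColumn(STCN_Base):
    def __init__(self):
        super(STCN_DemoColumn, self).__init__()
        self.name = "stcn_demo_column"
        self.list_url = "http://news.stcn.com/demo/index.html"  # placeholder URL

    def _parse_list_body(self, body):
        doc = html.fromstring(body)
        for node in doc.xpath("//ul[@class='news_list']/li"):   # placeholder XPath
            item = dict()
            item["link"] = node.xpath("./a/@href")[0]
            item["title"] = node.xpath("./a/text()")[0]
            item["pub_date"] = node.xpath("./span/text()")[0]
            if self._add_article(item):  # fetch + parse the detail page
                yield item

# STCN_DemoColumn().start()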
class CArticleBase(object):
    # Base class for Eastmoney "caifuhao" (财富号) articles
    def __init__(self, key):
        self.local = LOCAL
        self.key = key
        print(self.key, "\n\n\n")
        self.start_url = 'http://api.so.eastmoney.com/bussiness/Web/GetSearchList?'
        self.page_size = 10
        self.headers = {
            "Referer": "http://so.eastmoney.com/CArticle/s?keyword={}".format(self.key.encode()),
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36",
        }
        self.table = "eastmoney_carticle"
        self.error_detail = []
        self.error_list = []
        self.proxy = self._get_proxy()
        self.dt_format = '%Y-%m-%d %H:%M:%S'
        self.limit_time = datetime.datetime(2020, 2, 1)
        self.use_proxy = 1

    def _init_pool(self):
        if self.local:
            conf = {
                "host": LOCAL_MYSQL_HOST,
                "port": LOCAL_MYSQL_PORT,
                "user": LOCAL_MYSQL_USER,
                "password": LOCAL_MYSQL_PASSWORD,
                "db": LOCAL_MYSQL_DB,
            }
            self.db = LOCAL_MYSQL_DB
        else:
            conf = {
                "host": MYSQL_HOST,
                "port": MYSQL_PORT,
                "user": MYSQL_USER,
                "password": MYSQL_PASSWORD,
                "db": MYSQL_DB,
            }
            self.db = MYSQL_DB
        self.sql_pool = PyMysqlPoolBase(**conf)

    def make_query_params(self, msg, page):
        query_params = {
            'type': '8224',  # this parameter makes the API sort by time
            'pageindex': str(page),
            'pagesize': str(self.page_size),
            'keyword': msg,
            'name': 'caifuhaowenzhang',
            'cb': 'jQuery{}_{}'.format(
                ''.join(random.choice(string.digits) for i in range(0, 21)),
                str(int(time.time() * 1000))
            ),
            '_': str(int(time.time() * 1000)),
        }
        return query_params

    def contract_sql(self, to_insert):
        ks = []
        vs = []
        for k in to_insert:
            ks.append(k)
            vs.append(to_insert.get(k))
        fields_str = "(" + ",".join(ks) + ")"
        values_str = "(" + "%s," * (len(vs) - 1) + "%s" + ")"
        base_sql = '''INSERT INTO `{}`.`{}` '''.format(
            self.db, self.table) + fields_str + ''' values ''' + values_str + ''';'''
        return base_sql, tuple(vs)

    def _save(self, to_insert):
        try:
            insert_sql, values = self.contract_sql(to_insert)
            count = self.sql_pool.insert(insert_sql, values)
        except pymysql.err.IntegrityError:
            # print("duplicate")
            return 1
        except:
            print("insert failed")
            traceback.print_exc()
            return
        else:
            return count

    def _get_proxy(self):
        if self.local:
            return requests.get(LOCAL_PROXY_URL).text.strip()
        else:
            # Mix local and remote proxies so the production proxy pool
            # does not get hammered ..
            random_num = random.randint(0, 10)
            if random_num % 2:
                time.sleep(1)
                return requests.get(PROXY_URL).text.strip()
            else:
                return requests.get(LOCAL_PROXY_URL).text.strip()

    def _delete_detail_404(self, url):
        # NOTE: quote the URL; the original interpolated it without quotes,
        # which is not valid SQL for a string column.
        delete_sql = f"delete from `{self.table}` where link = '{url}';"
        ret = self.sql_pool.delete(delete_sql)
        self.sql_pool.end()
        if ret:
            print(f"deleted invalid url: {url}")

    def _crawl(self, url, proxy):
        proxies = {'http': proxy}
        r = requests.get(url, proxies=proxies, headers=self.headers, timeout=3)
        return r

    def _get(self, url):
        if self.use_proxy:
            count = 0
            while True:
                count = count + 1
                try:
                    resp = self._crawl(url, self.proxy)
                    if resp.status_code == 200:
                        return resp
                    elif resp.status_code == 404:
                        self._delete_detail_404(url)
                        return None
                    elif count > 2:
                        print(f'fetching {url} failed for good')
                        break
                    else:
                        self.proxy = self._get_proxy()
                        print(f"bad status code {resp.status_code}, switching proxy to {self.proxy}\n")
                except:
                    self.proxy = self._get_proxy()
                    print(f'proxy failed, switching proxy to {self.proxy} \n')
        else:
            try:
                resp = requests.get(url)
            except:
                return
            return resp

    def _parse_detail(self, detail_page):
        doc = html.fromstring(detail_page)
        article_body = doc.xpath('//div[@class="article-body"]/*')
        contents = []
        for p_node in article_body:
            children = p_node.getchildren()
            children_tags = [child.tag for child in children]
            if children_tags and "img" in children_tags:
                img_links = p_node.xpath("./img/@src")  # list
                contents.append(",".join(img_links))
            else:
                contents.append(p_node.text_content())
        contents = "\r\n".join(contents)
        return contents

    def transferContent(self, content):
        if content is None:
            return None
        else:
            # NOTE: the local name shadows the `string` module, but only inside
            # this method
            string = ""
            for c in content:
                if c == '"':
                    string += '\\\"'
                elif c == "'":
                    string += "\\\'"
                elif c == "\\":
                    string += "\\\\"
                else:
                    string += c
            return string

    def _filter_char(self, test_str):
        # Strip special whitespace characters
        # '\u200b' is \xe2\x80\x8b
        for cha in ['\n', '\r', '\t',
                    '\u200a', '\u200b', '\u200c', '\u200d', '\u200e',
                    '\u202a', '\u202b', '\u202c', '\u202d', '\u202e']:
            test_str = test_str.replace(cha, '')
        test_str = test_str.replace(u'\xa0', u' ')  # replace \xa0 with a plain space
        return test_str

    def _process_content(self, vs):
        # Remove 4-byte UTF-8 characters, otherwise the MySQL insert can fail
        try:
            # UCS-4 build of Python
            highpoints = re.compile(u'[\U00010000-\U0010ffff]')
        except re.error:
            # UCS-2 build of Python
            highpoints = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
        params = list()
        for v in vs:
            nv = highpoints.sub(u'', v)
            nv = self._filter_char(nv)
            params.append(nv)
        return "".join(params)

    def _get_list(self, list_url):
        resp = self._get(list_url)
        if resp:
            return resp.text
        else:
            self.error_list.append(list_url)

    def _get_detail(self, detail_url):
        resp = self._get(detail_url)
        if resp:
            return resp.text
        else:
            self.error_detail.append(detail_url)

    def _parse_list(self, list_page):
        try:
            json_data = re.findall(r'jQuery\d{21}_\d{13}\((\{.*?\})\)', list_page)[0]
            list_data = json.loads(json_data).get("Data")
        except:
            return None
        else:
            if list_data:
                return list_data
            else:
                return []

    def __del__(self):
        try:
            self.sql_pool.dispose()
        except:
            pass

    def close(self):
        try:
            self.sql_pool.dispose()
        except:
            pass

    def start(self):
        try:
            self._start()
        except:
            traceback.print_exc()
        finally:
            self.close()

    def _start(self):
        # This class crawls a single stock keyword, so a separate "scheduler"
        # outside _start() drives all the keywords.
        self._init_pool()
        # (1) build the list url
        for page in range(1, 2):
            # print(page)
            list_url = self.start_url + urlencode(self.make_query_params(self.key, page))
            # print(list_url)
            # (2) fetch the list page
            list_page = self._get_list(list_url)
            # print(list_page)
            # (3) parse the list page into a list of records
            list_infos = self._parse_list(list_page)
            # print(pprint.pformat(list_infos))
            if list_infos:
                # # Stop once the incremental window is covered
                # show_times = [datetime.datetime.strptime(info.get("ShowTime"), self.dt_format)
                #               for info in list_infos]
                # # print(show_times)
                # if max(show_times) < self.limit_time:
                #     print("incremental crawl finished")
                #     return
                count = 0
                # (4) parse the detail pages and save
                for data in list_infos:
                    item = dict()
                    item['code'] = self.key
                    link = data.get("ArticleUrl")
                    item['link'] = link
                    item['title'] = data.get("Title")
                    item['pub_date'] = data.get("ShowTime")
                    detail_page = self._get_detail(link)
                    if detail_page:
                        article = self._parse_detail(detail_page)
                        # clean the article text so the insert does not fail ..
                        article = self._process_content(article)
                        item['article'] = article
                        print("item", item)
                        ret = self._save(item)
                        if not ret:
                            print(f"insert failed {item.get('link')}")
                        else:
                            count += 1
                            if count > 10:
                                self.sql_pool.end()
                                count = 0
                self.sql_pool.end()
                # self.sql_pool.connection.commit()
                print(f"page {page} saved")
        self.close()
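# Scheduler sketch (assumption, not in the original source): the comment in
# _start() says each instance crawls one stock keyword, so a small driver loops
# over the keyword list and runs one CArticleBase per code. The keywords below
# are hypothetical placeholders.

def run_carticle_spiders(keys=("贵州茅台", "平安银行")):  # hypothetical keywords
    for key in keys:
        CArticleBase(key).start()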
class CArticleLoder(object):
    def __init__(self, key):
        # run locally or on the server
        self.local = LOCAL
        # whether to use the Abuyun proxy tunnel
        self.abu = False
        # the Chinese short name of the stock code
        self.key = key
        self.start_url = 'http://api.so.eastmoney.com/bussiness/Web/GetSearchList?'
        self.page_size = 10
        self.headers = {
            "Referer": "http://so.eastmoney.com/CArticle/s?keyword={}".format(self.key.encode()),
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36",
        }
        self.db = MYSQL_DB
        self.table = "eastmoney_carticle"
        if self.local:
            conf = {
                "host": LOCAL_MYSQL_HOST,
                "port": LOCAL_MYSQL_PORT,
                "user": LOCAL_MYSQL_USER,
                "password": LOCAL_MYSQL_PASSWORD,
                "db": LOCAL_MYSQL_DB,
            }
        else:
            conf = {
                "host": MYSQL_HOST,
                "port": MYSQL_PORT,
                "user": MYSQL_USER,
                "password": MYSQL_PASSWORD,
                "db": MYSQL_DB,
            }
        self.sql_pool = PyMysqlPoolBase(**conf)
        # when not going through Abuyun, initialise a proxy up front
        if not self.abu:
            self.proxy = self._get_proxy()
        # list pages and detail pages that failed
        self.error_detail = []
        self.error_list = []

    def make_query_params(self, msg, page):
        query_params = {
            'type': '8224',  # this parameter makes the API sort by time
            'pageindex': str(page),
            'pagesize': str(self.page_size),
            'keyword': msg,
            'name': 'caifuhaowenzhang',
            'cb': 'jQuery{}_{}'.format(
                ''.join(random.choice(string.digits) for i in range(0, 21)),
                str(int(time.time() * 1000))),
            '_': str(int(time.time() * 1000)),
        }
        return query_params

    def contract_sql(self, to_insert):
        ks = []
        vs = []
        for k in to_insert:
            ks.append(k)
            vs.append(to_insert.get(k))
        fields_str = "(" + ",".join(ks) + ")"
        values_str = "(" + "%s," * (len(vs) - 1) + "%s" + ")"
        base_sql = '''INSERT INTO `{}`.`{}` '''.format(
            self.db, self.table) + fields_str + ''' values ''' + values_str + ''';'''
        return base_sql, tuple(vs)

    def _save(self, to_insert):
        try:
            insert_sql, values = self.contract_sql(to_insert)
            count = self.sql_pool.insert(insert_sql, values)
        except pymysql.err.IntegrityError:
            logger.warning("duplicate")
        except:
            logger.warning("insert failed")
        else:
            return count

    def _abu_get(self, url):
        """Fetch through the Abuyun proxy tunnel, retrying a failed request."""
        proxy_host = "http-cla.abuyun.com"
        proxy_port = 9030
        # proxy tunnel credentials
        proxy_user = "******"
        proxy_pass = "******"
        proxy_meta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
            "host": proxy_host,
            "port": proxy_port,
            "user": proxy_user,
            "pass": proxy_pass,
        }
        proxies = {
            "http": proxy_meta,
            "https": proxy_meta,
        }
        retry = 2  # three tries in total
        while True:
            try:
                resp = requests.get(
                    url,
                    proxies=proxies,
                    headers=self.headers,
                    timeout=3,
                )
                if resp.status_code == 200:
                    return resp
                else:
                    print(resp.status_code, "retry")
                    retry -= 1
                    if retry <= 0:
                        return None
                    time.sleep(3)
            except:
                print("error retry")
                retry -= 1
                if retry <= 0:
                    return None
                time.sleep(3)

    # Earlier, commented-out variants of _get_proxy(): one hit a hard-coded host
    # (http://192.168.0.102:8888/get locally, http://172.17.0.4:8888/get on the
    # server); another polled the pool's "count" endpoint and slept 3 seconds
    # whenever no proxy was available before calling "get".
    def _get_proxy(self):
        if self.local:
            return requests.get(LOCAL_PROXY_URL).text.strip()
        else:
            return requests.get(PROXY_URL).text.strip()

    def _delete_detail_404(self, url):
        # NOTE: quote the URL; the original interpolated it without quotes.
        delete_sql = f"delete from `{self.table}` where link = '{url}';"
        ret = self.sql_pool.delete(delete_sql)
        self.sql_pool.end()
        if ret:
            print(f"deleted invalid url: {url}")

    def _crawl(self, url, proxy):
        proxies = {'http': proxy}
        r = requests.get(url, proxies=proxies, headers=self.headers, timeout=3)
        return r

    def _get(self, url):
        if self.abu:
            return self._abu_get(url)
        count = 0
        while True:
            count = count + 1
            try:
                resp = self._crawl(url, self.proxy)
                if resp.status_code == 200:
                    return resp
                elif resp.status_code == 404:
                    self._delete_detail_404(url)
                    return None
                elif count > 2:
                    logger.warning(f'fetching {url} failed for good')
                    break
                else:
                    self.proxy = self._get_proxy()
                    logger.warning(
                        f"bad status code {resp.status_code}, switching proxy to {self.proxy}\n")
            except:
                self.proxy = self._get_proxy()
                logger.warning(f'proxy failed, switching proxy to {self.proxy} \n')

    def _parse_detail(self, detail_page):
        doc = html.fromstring(detail_page)
        article_body = doc.xpath('//div[@class="article-body"]/*')
        contents = []
        for p_node in article_body:
            children = p_node.getchildren()
            children_tags = [child.tag for child in children]
            if children_tags and "img" in children_tags:
                img_links = p_node.xpath("./img/@src")  # list
                contents.append(",".join(img_links))
            else:
                contents.append(p_node.text_content())
        contents = "\r\n".join(contents)
        return contents

    def _select_key_links(self):
        select_all_sql = f"select link from {self.table} where code = '{self.key}' and article is NULL;"
        # links = self.sql_pool.select_many(select_all_sql, size=10)
        links = self.sql_pool.select_all(select_all_sql)
        return links

    def _select_rest_all_links(self):
        select_all_sql = f"select id, link from {self.table} where article is NULL;"
        # links = self.sql_pool.select_many(select_all_sql, size=20)
        links = self.sql_pool.select_all(select_all_sql)
        return links

    def transferContent(self, content):
        if content is None:
            return None
        else:
            string = ""
            for c in content:
                if c == '"':
                    string += '\\\"'
                elif c == "'":
                    string += "\\\'"
                elif c == "\\":
                    string += "\\\\"
                else:
                    string += c
            return string

    def _filter_char(self, test_str):
        # Strip special whitespace characters
        # '\u200b' is \xe2\x80\x8b
        for cha in ['\n', '\r', '\t',
                    '\u200a', '\u200b', '\u200c', '\u200d', '\u200e',
                    '\u202a', '\u202b', '\u202c', '\u202d', '\u202e']:
            test_str = test_str.replace(cha, '')
        test_str = test_str.replace(u'\xa0', u' ')  # replace \xa0 with a plain space
        return test_str

    def _process_content(self, vs):
        # Remove 4-byte UTF-8 characters, otherwise the MySQL insert can fail
        try:
            # UCS-4 build of Python
            highpoints = re.compile(u'[\U00010000-\U0010ffff]')
        except re.error:
            # UCS-2 build of Python
            highpoints = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
        params = list()
        for v in vs:
            nv = highpoints.sub(u'', v)
            nv = self._filter_char(nv)
            params.append(nv)
        return "".join(params)

    def _update_detail(self, link, article):
        # Inserting the raw text directly may fail, so clean it first
        # article = self.transferContent(article)
        article = self._process_content(article)
        print("article content:\n", article)
        update_sql = f"update {self.table} set article =%s where link =%s;"
        try:
            ret = self.sql_pool.update(update_sql, [(article), (link)])
            # ret = self.sql_pool.update(update_sql)
        except:
            traceback.print_exc()
            print("update failed")
            return None
        else:
            return ret

    def _get_list(self, list_url):
        resp = self._get(list_url)
        if resp:
            return resp.text
        else:
            self.error_list.append(list_url)

    def _get_detail(self, detail_url):
        resp = self._get(detail_url)
        if resp:
            return resp.text
        else:
            self.error_detail.append(detail_url)

    def _parse_list(self, list_page):
        try:
            json_data = re.findall(r'jQuery\d{21}_\d{13}\((\{.*?\})\)', list_page)[0]
            list_data = json.loads(json_data).get("Data")
        except:
            return None
        else:
            if list_data:
                return list_data
            else:
                return []

    def _save_one_page_list(self, page):
        list_url = self.start_url + urlencode(self.make_query_params(self.key, page))
        list_page = self._get_list(list_url)
        if list_page:
            list_infos = self._parse_list(list_page)  # list
            if not list_infos:
                logger.info(f"{self.key} crawl finished")
                return
            for data in list_infos:
                item = dict()
                item['code'] = self.key
                link = data.get("ArticleUrl")
                item['link'] = link
                item['title'] = data.get("Title")
                item['pub_date'] = data.get("ShowTime")
                print("item", item)
                ret = self._save(item)
                if not ret:
                    logger.warning(f"insert failed {item}")
            self.sql_pool.end()
            # self.sql_pool.connection.commit()
            print(f"page {page} saved")
            return page

    def __del__(self):
        try:
            self.sql_pool.dispose()
        except:
            pass
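# Usage sketch (assumption, not in the original source): CArticleLoder has no
# _start() of its own -- it exposes _save_one_page_list() to collect list rows
# and _update_detail() to backfill the article column where it is still NULL.
# A driver might look like the following; the keyword and page range are
# hypothetical, and the row-unpacking assumes the pool's select_all() returns
# plain tuples.

def run_carticle_loader(key="贵州茅台", pages=3):  # hypothetical keyword
    loader = CArticleLoder(key)
    for page in range(1, pages + 1):
        loader._save_one_page_list(page)
    for (link,) in loader._select_key_links():  # assumes tuple rows
        detail_page = loader._get_detail(link)
        if detail_page:
            loader._update_detail(link, loader._parse_detail(detail_page))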
class qqStock(object):
    def __init__(self):
        self.local = LOCAL
        self.token = "8f6b50e1667f130c10f981309e1d8200"
        self.headers = {
            'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36"
        }
        self.list_url = "https://pacaio.match.qq.com/irs/rcd?cid=52&token={}" \
                        "&ext=3911,3922,3923,3914,3913,3930,3915,3918,3908&callback=__jp1".format(self.token)
        # self.proxy = None
        self.extractor = GeneralNewsExtractor()
        self.local = LOCAL
        if self.local:
            conf = {
                "host": LOCAL_MYSQL_HOST,
                "port": LOCAL_MYSQL_PORT,
                "user": LOCAL_MYSQL_USER,
                "password": LOCAL_MYSQL_PASSWORD,
                "db": LOCAL_MYSQL_DB,
            }
            self.db = LOCAL_MYSQL_DB
        else:
            conf = {
                "host": MYSQL_HOST,
                "port": MYSQL_PORT,
                "user": MYSQL_USER,
                "password": MYSQL_PASSWORD,
                "db": MYSQL_DB,
            }
            self.db = MYSQL_DB
        self.sql_pool = PyMysqlPoolBase(**conf)
        self.table = "qq_Astock_news"
        self.error_detail = []

    # Earlier, commented-out proxy support: _get_proxy() fetched a proxy from a
    # hard-coded host (http://192.168.0.102:8888/get locally,
    # http://172.17.0.5:8888/get on the server) and _crawl() passed it to
    # requests via the proxies argument.
    def _crawl(self, url):
        r = requests.get(url, headers=self.headers, timeout=3)
        return r

    def _get(self, url):
        count = 0
        while True:
            count = count + 1
            try:
                resp = self._crawl(url)
                if resp.status_code == 200:
                    return resp
                elif count > 2:
                    break
                # else:
                #     self.proxy = self._get_proxy()
                #     print(f"bad status code {resp.status_code}, switching proxy to {self.proxy}\n")
            except (ChunkedEncodingError, ConnectionError, Timeout,
                    UnboundLocalError, UnicodeError, ProxyError):
                # self.proxy = self._get_proxy()
                # print(f'proxy connection failed, switching proxy to {self.proxy}\n')
                if count > 2:
                    break
        print(f'fetching {url} failed for good')

    def _parse_article(self, vurl):
        detail_page = self._get(vurl)
        if detail_page:
            result = self.extractor.extract(detail_page.text)
            return result.get("content")

    def _parse_list(self):
        # Always return the two lists so _start() can unpack them even when the
        # list request fails (the original left them unbound in that case).
        specials = []
        articles = []
        list_resp = self._get(self.list_url)
        if list_resp:
            print("fetched the main list page")
            body = list_resp.text
            # strip the JSONP wrapper __jp1( ... )
            body = body.lstrip("__jp1(")
            body = body.rstrip(")")
            body = json.loads(body)
            datas = body.get("data")
            for data in datas:
                if data.get("article_type") == 120:
                    specials.append(data)
                elif data.get("article_type") == 0:
                    articles.append(data)
                else:
                    print("unexpected record {}".format(data))
                    print("unexpected article_type {}".format(data.get("article_type")))
                    # article_type 56 is the video type, which is not crawled
        return specials, articles

    def _contract_sql(self, to_insert):
        ks = []
        vs = []
        for k in to_insert:
            ks.append(k)
            vs.append(to_insert.get(k))
        fields_str = "(" + ",".join(ks) + ")"
        values_str = "(" + "%s," * (len(vs) - 1) + "%s" + ")"
        base_sql = '''INSERT INTO `{}`.`{}` '''.format(
            self.db, self.table) + fields_str + ''' values ''' + values_str + ''';'''
        return base_sql, tuple(vs)

    def _save(self, to_insert):
        try:
            insert_sql, values = self._contract_sql(to_insert)
            count = self.sql_pool.insert(insert_sql, values)
        except pymysql.err.IntegrityError:
            # logger.warning("duplicate")
            return 1
        except:
            logger.warning("insert failed")
        else:
            return count

    def _start(self):
        specials, articles = self._parse_list()
        for article in articles:
            item = {}
            vurl = article.get("vurl")
            item['link'] = vurl
            item['pub_date'] = article.get("publish_time")
            item['title'] = article.get("title")
            article = self._parse_article(vurl)
            if article:
                item['article'] = article
                # print(item)
                ret = self._save(item)
                if not ret:
                    print('save failed')
                    self.error_detail.append(vurl)
            else:
                self.error_detail.append(vurl)

        print("processing the special-topic pages")
        for special in specials:
            special_id = special.get("app_id")
            special_url = ("https://pacaio.match.qq.com/openapi/getQQNewsSpecialListItems"
                           "?id={}&callback=getSpecialNews".format(special_id))
            ret = self._get(special_url).text
            # strip the JSONP wrapper ('getSpecialNews( ... )')
            ret = ret.lstrip("""('getSpecialNews(""")
            ret = ret.rstrip(""")')""")
            jsonobj = json.loads(ret)
            # print(jsonobj)
            data = jsonobj.get("data")
            id_list = data.get("idlist")
            for one in id_list:
                new_list = one.get('newslist')
                for new in new_list:
                    # The detail link is rebuilt from the id:
                    # "https://new.qq.com/omn/{}/{}.html".format(id[:8], id)
                    item = {}
                    id = new.get("id")
                    link = "https://new.qq.com/omn/{}/{}.html".format(id[:8], id)
                    title = new.get("longtitle")
                    pub_date = new.get("time")
                    if link and title and pub_date:
                        article = self._parse_article(link)
                        if article:
                            item['link'] = link
                            item['pub_date'] = pub_date
                            item['title'] = title
                            item['article'] = article
                            # print(item)
                            ret = self._save(item)
                            if not ret:
                                print("save failed")
                                self.error_detail.append(link)
                        else:
                            self.error_detail.append(link)

    def __del__(self):
        self.sql_pool.dispose()
class CNStock(object):
    def __init__(self, *args, **kwargs):
        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36",
            "Referer": "http://news.cnstock.com/news/sns_yw/index.html",
        }
        self.headers = headers
        self.list_url = "http://app.cnstock.com/api/waterfall?"
        self.extractor = GeneralNewsExtractor()
        self.local = LOCAL
        if self.local:
            conf = {
                "host": LOCAL_MYSQL_HOST,
                "port": LOCAL_MYSQL_PORT,
                "user": LOCAL_MYSQL_USER,
                "password": LOCAL_MYSQL_PASSWORD,
                "db": LOCAL_MYSQL_DB,
            }
            self.db = LOCAL_MYSQL_DB
        else:
            conf = {
                "host": MYSQL_HOST,
                "port": MYSQL_PORT,
                "user": MYSQL_USER,
                "password": MYSQL_PASSWORD,
                "db": MYSQL_DB,
            }
            self.db = MYSQL_DB
        self.sql_pool = PyMysqlPoolBase(**conf)
        self.table = "cn_stock"
        self.error_list = []
        self.error_detail = []
        self.topic = kwargs.get("topic")
        self.check_date = datetime.datetime.today() - datetime.timedelta(days=1)

    def make_query_params(self, page):
        """Build the dynamic query parameters."""
        query_params = {
            # 'colunm': 'qmt-sns_yw',
            'colunm': self.topic,  # sic: the API spells the parameter "colunm"
            'page': str(page),     # at most 50 pages
            'num': str(10),
            'showstock': str(0),
            'callback': 'jQuery{}_{}'.format(
                ''.join(random.choice(string.digits) for i in range(0, 20)),
                str(int(time.time() * 1000))),
            '_': str(int(time.time() * 1000)),
        }
        return query_params

    def get_list(self):
        for page in range(0, 1000):
            print(page)
            params = self.make_query_params(page)
            url = self.list_url + urlencode(params)
            # print(url)
            ret = req.get(url, headers=self.headers).text
            # print(ret)
            json_data = re.findall(r'jQuery\d{20}_\d{13}\((\{.*?\})\)', ret)[0]
            # print(json_data)
            py_data = json.loads(json_data)
            # print(py_data)
            datas = py_data.get("data", {}).get("item")
            if not datas:
                break
            for one in datas:
                item = dict()
                pub_date = datetime.datetime.strptime(one.get("time"),
                                                      "%Y-%m-%d %H:%M:%S")
                # print(pub_date)
                if pub_date < self.check_date:
                    print("incremental crawl finished\n")
                    return
                item['pub_date'] = pub_date
                item['title'] = one.get("title")
                item['link'] = one.get("link")
                yield item

    def get_detail(self, detail_url):
        page = req.get(detail_url, headers=self.headers).text
        result = self.extractor.extract(page)
        content = result.get("content")
        return content

    def contract_sql(self, to_insert):
        """Build the insert statement from the dict to be inserted."""
        ks = []
        vs = []
        for k in to_insert:
            ks.append(k)
            vs.append(to_insert.get(k))
        fields_str = "(" + ",".join(ks) + ")"
        values_str = "(" + "%s," * (len(vs) - 1) + "%s" + ")"
        base_sql = '''INSERT INTO `{}`.`{}` '''.format(
            self.db, self.table) + fields_str + ''' values ''' + values_str + ''';'''
        return base_sql, tuple(vs)

    def _save(self, to_insert):
        try:
            insert_sql, values = self.contract_sql(to_insert)
            count = self.sql_pool.insert(insert_sql, values)
        except pymysql.err.IntegrityError:
            # logger.warning("duplicate")
            return 1
        except:
            traceback.print_exc()
            logger.warning("insert failed")
        else:
            return count

    def __del__(self):
        print("releasing resources ...")
        try:
            self.sql_pool.dispose()
        except:
            pass

    def start(self):
        count = 0
        for item in self.get_list():
            # print(item, type(item))
            if item:
                link = item.get('link')
                if not link or link == "null":
                    continue
                item['article'] = self.get_detail(link)
                # print(item)
                ret = self._save(item)
                count += 1
                if ret:
                    # print("insert ok")
                    pass
                else:
                    self.error_detail.append(item.get("link"))
                    # print("insert fail")
                if count > 10:
                    self.sql_pool.connection.commit()
                    # print("commit")
                    count = 0
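# Usage sketch (assumption, not in the original source): CNStock is
# parameterised by the column id passed as the `topic` keyword -- 'qmt-sns_yw'
# is the value shown in the commented-out default above (presumably the same
# sns_yw feed as the Referer); other column ids would be passed the same way.

def run_cnstock(topic="qmt-sns_yw"):
    CNStock(topic=topic).start()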
class JuChaoInfo(object):
    def __init__(self):
        self.zuixin_url = "http://webapi.cninfo.com.cn//api/sysapi/p_sysapi1128"
        self.stock_url = "http://webapi.cninfo.com.cn//api/sysapi/p_sysapi1078"
        self.fund_url = "http://webapi.cninfo.com.cn//api/sysapi/p_sysapi1126"
        self.datas_url = "http://webapi.cninfo.com.cn//api/sysapi/p_sysapi1127"
        self.mcode = self._generate_mcode()
        self.headers = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Content-Length': '0',
            'Cookie': '__qc_wId=726; pgv_pvid=6020356972; Hm_lvt_489bd07e99fbfc5f12cbb4145adb0a9b=1581945588; codeKey=ce7a9a719b; Hm_lpvt_489bd07e99fbfc5f12cbb4145adb0a9b=1582016401',
            'Host': 'webapi.cninfo.com.cn',
            'mcode': '{}'.format(self.mcode),
            'Origin': 'http://webapi.cninfo.com.cn',
            'Pragma': 'no-cache',
            'Referer': 'http://webapi.cninfo.com.cn/',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36',
            'X-Requested-With': 'XMLHttpRequest',
        }
        self.local = LOCAL
        if self.local:
            conf = {
                "host": LOCAL_MYSQL_HOST,
                "port": LOCAL_MYSQL_PORT,
                "user": LOCAL_MYSQL_USER,
                "password": LOCAL_MYSQL_PASSWORD,
                "db": LOCAL_MYSQL_DB,
            }
            self.db = LOCAL_MYSQL_DB
        else:
            conf = {
                "host": MYSQL_HOST,
                "port": MYSQL_PORT,
                "user": MYSQL_USER,
                "password": MYSQL_PASSWORD,
                "db": MYSQL_DB,
            }
            self.db = MYSQL_DB
        self.sql_pool = PyMysqlPoolBase(**conf)
        self.table = "juchao_info"
        self.error_detail = []

    def _generate_mcode(self):
        dt = str(math.floor(time.time()))
        keyStr = ("ABCDEFGHIJKLMNOP" + "QRSTUVWXYZabcdef"
                  + "ghijklmnopqrstuv" + "wxyz0123456789+/" + "=")
        output = ""
        i = 0
        while i < len(dt):
            try:
                chr1 = ord(dt[i])
            except IndexError:
                chr1 = 0
            i += 1
            try:
                chr2 = ord(dt[i])
            except IndexError:
                chr2 = 0
            i += 1
            try:
                chr3 = ord(dt[i])
            except IndexError:
                chr3 = 0
            i += 1
            enc1 = chr1 >> 2
            enc2 = ((chr1 & 3) << 4) | (chr2 >> 4)
            enc3 = ((chr2 & 15) << 2) | (chr3 >> 6)
            enc4 = chr3 & 63
            if not chr2:
                enc3 = enc4 = 64
            elif not chr3:
                enc4 = 64
            output = output + keyStr[enc1] + keyStr[enc2] + keyStr[enc3] + keyStr[enc4]
        return output

    def _get(self, url):
        # the cninfo endpoints expect a POST
        resp = req.post(url, headers=self.headers)
        # print(resp)
        if resp.status_code == 200:
            return resp.text

    def _contract_sql(self, to_insert):
        ks = []
        vs = []
        for k in to_insert:
            ks.append(k)
            vs.append(to_insert.get(k))
        fields_str = "(" + ",".join(ks) + ")"
        values_str = "(" + "%s," * (len(vs) - 1) + "%s" + ")"
        base_sql = '''INSERT INTO `{}`.`{}` '''.format(
            self.db, self.table) + fields_str + ''' values ''' + values_str + ''';'''
        return base_sql, tuple(vs)

    def _save(self, to_insert):
        try:
            insert_sql, values = self._contract_sql(to_insert)
            count = self.sql_pool.insert(insert_sql, values)
        except pymysql.err.IntegrityError:
            # logger.warning("duplicate")
            return 1
        except:
            logger.warning("insert failed")
        else:
            return count

    def get_list(self, url):
        body = self._get(url)
        # print(body)
        py_data = json.loads(body)
        result_code = py_data.get("resultcode")
        if result_code == 200:
            records = py_data.get("records")  # list
            for record in records:
                yield record

    def __del__(self):
        self.sql_pool.dispose()

    def start(self):
        records = list(self.get_list(self.zuixin_url))
        records += list(self.get_list(self.stock_url))
        records += list(self.get_list(self.fund_url))
        records += list(self.get_list(self.datas_url))
        # print(len(records))
        num = 0
        for record in records:
            item = dict()
            pub_date = record.get("DECLAREDATE")
            if not pub_date:
                pub_date = record.get("RECTIME")
            item['pub_date'] = pub_date             # publication time
            item['code'] = record.get("SECCODE")    # security code
            item['title'] = record.get("F001V")     # headline
            item['category'] = record.get("F003V")  # category
            item['summary'] = record.get("F002V")   # summary
            # print(item)
            count = self._save(item)
            self.sql_pool.connection.commit()
            if not count:
                self.error_detail.append(item)
            num += 1
            if num >= 10:
                # print("commit")
                num = 0
                self.sql_pool.connection.commit()
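# Note (an observation, not from the original source): _generate_mcode() walks
# the timestamp string three characters at a time and maps 6-bit groups through
# the standard Base64 alphabet, so for the all-digit Unix timestamp it appears
# to be equivalent to plain Base64 encoding. A minimal cross-check helper,
# assuming that reading is correct:

def _mcode_via_base64(timestamp=None):
    """Hypothetical helper: Base64-encode the same floored Unix timestamp."""
    import base64
    dt = str(math.floor(timestamp if timestamp is not None else time.time()))
    return base64.b64encode(dt.encode()).decode()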