Code example #1
File: taoguba.py  Project: xx2life/JustSimpleSpider
class Base(object):
    def __init__(self):
        self.local = LOCAL
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, '
            'like Gecko) Chrome/79.0.3945.117 Safari/537.36'
        }

    def _init_pool(self):
        if self.local:
            conf = {
                "host": LOCAL_MYSQL_HOST,
                "port": LOCAL_MYSQL_PORT,
                "user": LOCAL_MYSQL_USER,
                "password": LOCAL_MYSQL_PASSWORD,
                "db": LOCAL_MYSQL_DB,
            }
        else:
            conf = {
                "host": MYSQL_HOST,
                "port": MYSQL_PORT,
                "user": MYSQL_USER,
                "password": MYSQL_PASSWORD,
                "db": MYSQL_DB,
            }
        self.sql_pool = PyMysqlPoolBase(**conf)

    def _get_proxy(self):
        if self.local:
            return requests.get(LOCAL_PROXY_URL).text.strip()
        else:
            random_num = random.randint(0, 10)
            if random_num % 2:
                time.sleep(1)
                return requests.get(PROXY_URL).text.strip()
            else:
                return requests.get(LOCAL_PROXY_URL).text.strip()

    def get(self, url):
        count = 0
        while True:
            count += 1
            if count > 10:
                return None
            try:
                proxy = {"proxy": self._get_proxy()}
                print("proxy is >> {}".format(proxy))
                resp = requests.get(url, headers=self.headers, proxies=proxy)
            except:
                traceback.print_exc()
                time.sleep(0.5)
            else:
                if resp.status_code == 200:
                    return resp
                elif resp.status_code == 404:
                    return None
                else:
                    print("status_code: >> {}".format(resp.status_code))
                    time.sleep(1)
                    pass

    def convert_dt(self, time_stamp):
        d = str(datetime.datetime.fromtimestamp(time_stamp))
        return d

    def _contract_sql(self, to_insert):
        ks = []
        vs = []
        for k in to_insert:
            ks.append(k)
            vs.append(to_insert.get(k))
        ks = sorted(ks)
        fields_str = "(" + ",".join(ks) + ")"
        values_str = "(" + "%s," * (len(vs) - 1) + "%s" + ")"
        base_sql = '''INSERT INTO `{}` '''.format(
            self.table) + fields_str + ''' values ''' + values_str + ''';'''
        return base_sql

    def _filter_char(self, test_str):
        # strip special whitespace characters
        # '\u200b' is \xe2\x80\x8b in UTF-8
        for cha in [
                '\n',
                '\r',
                '\t',
                '\u200a',
                '\u200b',
                '\u200c',
                '\u200d',
                '\u200e',
                '\u202a',
                '\u202b',
                '\u202c',
                '\u202d',
                '\u202e',
        ]:
            test_str = test_str.replace(cha, '')
        test_str = test_str.replace(u'\xa0', u' ')  # replace \xa0 with a regular space
        return test_str

    def _process_content(self, vs):
        # strip 4-byte UTF-8 characters, otherwise the MySQL insert can fail
        try:
            # pattern for UCS-4 (wide) Python builds
            highpoints = re.compile(u'[\U00010000-\U0010ffff]')
        except re.error:
            # pattern for UCS-2 (narrow) Python builds
            highpoints = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')

        params = list()
        for v in vs:
            # clean each value before insertion
            nv = highpoints.sub(u'', v)
            nv = self._filter_char(nv)
            params.append(nv)
        content = "".join(params).strip()
        return content

    def _get_values(self, item: dict):
        # self.fields: list of columns to insert; sorted() here must match ks = sorted(ks) above
        value = tuple(item.get(field) for field in sorted(self.fields))
        return value

    def _save(self, item):
        insert_sql = self._contract_sql(item)
        value = self._get_values(item)
        try:
            ret = self.sql_pool.insert(insert_sql, value)
        except pymysql.err.IntegrityError:
            print("重复数据 ")
            return 1
        except:
            traceback.print_exc()
        else:
            return ret

    def _save_many(self, items):
        values = [self._get_values(item) for item in items]  # list of tuple
        insert_many_sql = self._contract_sql(items[0])
        try:
            ret = self.sql_pool.insert_many(insert_many_sql, values)
        except pymysql.err.IntegrityError:
            print("批量中有重复数据")
        except:
            traceback.print_exc()
        else:
            return ret
        finally:
            self.sql_pool.end()

    def save_one(self, item):
        self._save(item)
        self.sql_pool.end()

    def save(self, items):
        ret = self._save_many(items)
        if not ret:
            print("批量保存失败 开始单独保存 .. ")
            count = 0
            for item in items:
                print(item)
                self._save(item)
                count += 1
                if count > 9:
                    self.sql_pool.end()
                    count = 0
            # self.sql_pool.dispose()
            self.sql_pool.end()
        else:
            print("批量成功..")
            print(items)
            print(len(items))

    def __del__(self):
        try:
            self.sql_pool.dispose()
        except:
            pass

    def start(self):
        try:
            self._init_pool()
            self._start()
        except:
            traceback.print_exc()
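
The Base class above leaves self.table, self.fields, and _start to its subclasses. A minimal sketch of such a subclass is shown below; the table name, field list, and list URL are hypothetical and not taken from the project.

# Hypothetical subclass sketch: table name, fields and URL are illustrative only.
class DemoSpider(Base):
    def __init__(self):
        super().__init__()
        self.table = "demo_news"                     # assumed target MySQL table
        self.fields = ["link", "pub_date", "title"]  # columns read by _get_values (sorted before insert)

    def _start(self):
        resp = self.get("http://example.com/news/list")  # placeholder list URL
        if resp is None:
            return
        # ... parse resp.text into dicts whose keys are self.fields ...
        items = [{"link": "http://example.com/news/1",
                  "title": "demo title",
                  "pub_date": "2020-01-01 00:00:00"}]
        self.save(items)

# DemoSpider().start() initializes the pool, runs _start(), and commits through save().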
Code example #2
File: base_stcn.py  Project: xx2life/JustSimpleSpider
class STCN_Base(object):
    def __init__(self):
        self.table = "stcn_info"
        self.local = LOCAL
        self.check_dt = datetime.datetime.today() - datetime.timedelta(days=2)
        self.dt_fmt = '%Y-%m-%d'

        # paging is disabled by default
        self.pages = False
        self.extractor = GeneralNewsExtractor()

    def _init_pool(self):
        if self.local:
            conf = {
                "host": LOCAL_MYSQL_HOST,
                "port": LOCAL_MYSQL_PORT,
                "user": LOCAL_MYSQL_USER,
                "password": LOCAL_MYSQL_PASSWORD,
                "db": LOCAL_MYSQL_DB,
            }
        else:
            conf = {
                "host": MYSQL_HOST,
                "port": MYSQL_PORT,
                "user": MYSQL_USER,
                "password": MYSQL_PASSWORD,
                "db": MYSQL_DB,
            }
        self.sql_pool = PyMysqlPoolBase(**conf)

    def _get(self, url):
        resp = requests.get(url)
        if resp.status_code == 200:
            return resp.text

    def _extract_content(self, body):
        result = self.extractor.extract(body)
        content = result.get("content")
        return content

    def _parse_detail(self, body):
        try:
            doc = html.fromstring(body)
            node = doc.xpath("//div[@class='txt_con']")[0]
            content = node.text_content()
        except:
            content = None
        # fall back to the general extractor when the fixed XPath yields nothing
        if not content:
            content = self._extract_content(body)
        return content

    def _filter_char(self, test_str):
        # strip special whitespace characters
        # '\u200b' is \xe2\x80\x8b in UTF-8
        for cha in [
                '\n',
                '\r',
                '\t',
                '\u200a',
                '\u200b',
                '\u200c',
                '\u200d',
                '\u200e',
                '\u202a',
                '\u202b',
                '\u202c',
                '\u202d',
                '\u202e',
        ]:
            test_str = test_str.replace(cha, '')
        test_str = test_str.replace(u'\xa0', u' ')  # replace \xa0 with a regular space
        return test_str

    def _process_content(self, vs):
        # strip 4-byte UTF-8 characters, otherwise the MySQL insert can fail
        try:
            # pattern for UCS-4 (wide) Python builds
            highpoints = re.compile(u'[\U00010000-\U0010ffff]')
        except re.error:
            # pattern for UCS-2 (narrow) Python builds
            highpoints = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')

        params = list()
        for v in vs:
            # clean each value before insertion
            nv = highpoints.sub(u'', v)
            nv = self._filter_char(nv)
            params.append(nv)
        content = "".join(params).strip()
        return content

    def _contract_sql(self, to_insert):
        ks = []
        vs = []
        for k in to_insert:
            ks.append(k)
            vs.append(to_insert.get(k))
        ks = sorted(ks)  # article,link,pub_date,title
        fields_str = "(" + ",".join(ks) + ")"
        values_str = "(" + "%s," * (len(vs) - 1) + "%s" + ")"
        base_sql = '''INSERT INTO `{}` '''.format(
            self.table) + fields_str + ''' values ''' + values_str + ''';'''
        # return base_sql, tuple(vs)
        return base_sql

    def _save(self, item):
        insert_sql = self._contract_sql(item)
        # print(insert_sql)
        value = (item.get("article"), item.get("link"), item.get("pub_date"),
                 item.get("title"))
        # print(value)
        try:
            ret = self.sql_pool.insert(insert_sql, value)
        except pymysql.err.IntegrityError:
            # print("重复数据 ")
            return 1
        except:
            traceback.print_exc()
        else:
            return ret

    def _save_many(self, items):
        values = [(item.get("article"), item.get("link"), item.get("pub_date"),
                   item.get("title")) for item in items]
        insert_many_sql = self._contract_sql(items[0])
        try:
            ret = self.sql_pool.insert_many(insert_many_sql, values)
        except pymysql.err.IntegrityError:
            print("批量中有重复数据")
        except:
            traceback.print_exc()
        else:
            return ret
        finally:
            self.sql_pool.end()

    def _add_article(self, item: dict):
        link = item.get("link")
        if link:
            detail_page = self._get(link)
            if detail_page:
                article = self._parse_detail(detail_page)
                if article:
                    item['article'] = article
                    return True
        return False

    def _check_dt(self, pub_dt):
        if not pub_dt:
            return False

        try:
            pub_dt = datetime.datetime.strptime(pub_dt[:10], self.dt_fmt)
        except:
            print("截取增量时间点失败.. 重新爬取.. ")
            # traceback.print_exc()
            return False

        if pub_dt < self.check_dt:
            print("当前天: ", pub_dt)
            print("检查时刻: ", self.check_dt)
            print("增量结束 .. ")
            return True
        else:
            return False

    def _start(self):
        self._init_pool()
        if not self.pages:
            list_body = self._get(self.list_url)
            if list_body:
                items = self._parse_list_body(list_body)
                count = 0
                for item in items:
                    if self._check_dt(item.get("pub_date")):
                        self.sql_pool.end()
                        return
                    ret = self._save(item)
                    if ret:
                        count += 1
                        # print("保存成功: {}".format(item))
                    else:
                        # print("保存失败: {}".format(item))
                        pass
                    if count > 9:
                        self.sql_pool.end()
                        print("提交 .. ")
                        count = 0
                self.sql_pool.dispose()
        else:
            count = 0
            for page in range(1, self.page_num + 1):
                print("\nThe page is {}".format(page))
                list_url = self.format_url.format(page)
                print(list_url)
                list_body = self._get(list_url)
                if list_body:
                    items = self._parse_list_body(list_body)
                    for item in items:
                        if self._check_dt(item.get("pub_date")):
                            self.sql_pool.end()
                            return
                        ret = self._save(item)
                        if ret:
                            count += 1
                            # print("保存成功: {}".format(item))
                        else:
                            # print("保存失败: {}".format(item))
                            pass
                        if count > 9:
                            self.sql_pool.end()
                            print("提交 .. ")
                            count = 0

    def __del__(self):
        try:
            self.sql_pool.dispose()
        except:
            pass

    def start(self):
        print("{} 开始爬取".format(self.name))
        try:
            self._start()
        except:
            traceback.print_exc()
            print("{} 爬取失败".format(self.name))
Code example #3
File: cbase.py  Project: xx2life/JustSimpleSpider
class CArticleBase(object):
    # Base class for Eastmoney Caifuhao (wealth account) articles
    def __init__(self, key):
        self.local = LOCAL
        self.key = key
        print(self.key, "\n\n\n")
        self.start_url = 'http://api.so.eastmoney.com/bussiness/Web/GetSearchList?'
        self.page_size = 10
        self.headers = {
            "Referer": "http://so.eastmoney.com/CArticle/s?keyword={}".format(self.key.encode()),
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36",
        }
        self.table = "eastmoney_carticle"
        self.error_detail = []
        self.error_list = []
        self.proxy = self._get_proxy()
        self.dt_format = '%Y-%m-%d %H:%M:%S'
        self.limit_time = datetime.datetime(2020, 2, 1)
        self.use_proxy = 1

    def _init_pool(self):
        if self.local:
            conf = {
                "host": LOCAL_MYSQL_HOST,
                "port": LOCAL_MYSQL_PORT,
                "user": LOCAL_MYSQL_USER,
                "password": LOCAL_MYSQL_PASSWORD,
                "db": LOCAL_MYSQL_DB,
            }
            self.db = LOCAL_MYSQL_DB
        else:
            conf = {
                "host": MYSQL_HOST,
                "port": MYSQL_PORT,
                "user": MYSQL_USER,
                "password": MYSQL_PASSWORD,
                "db": MYSQL_DB,
            }
            self.db = MYSQL_DB
        self.sql_pool = PyMysqlPoolBase(**conf)

    def make_query_params(self, msg, page):
        query_params = {
            'type': '8224',  # this value requests sorting by time
            'pageindex': str(page),
            'pagesize': str(self.page_size),
            'keyword': msg,
            'name': 'caifuhaowenzhang',
            'cb': 'jQuery{}_{}'.format(
                ''.join(random.choice(string.digits) for i in range(0, 21)),
                str(int(time.time() * 1000))
            ),
            '_': str(int(time.time() * 1000)),
        }
        return query_params

    def contract_sql(self, to_insert):
        ks = []
        vs = []
        for k in to_insert:
            ks.append(k)
            vs.append(to_insert.get(k))
        fields_str = "(" + ",".join(ks) + ")"
        values_str = "(" + "%s," * (len(vs) - 1) + "%s" + ")"
        base_sql = '''INSERT INTO `{}`.`{}` '''.format(
            self.db, self.table) + fields_str + ''' values ''' + values_str + ''';'''
        return base_sql, tuple(vs)

    def _save(self, to_insert):
        try:
            insert_sql, values = self.contract_sql(to_insert)
            count = self.sql_pool.insert(insert_sql, values)
        except pymysql.err.IntegrityError:
            # print("重复 ")
            return 1
        except:
            print("失败")
            traceback.print_exc()
            return
        else:
            return count

    def _get_proxy(self):
        if self.local:
            return requests.get(LOCAL_PROXY_URL).text.strip()
        else:
            # mix proxy sources so the production proxy IPs are not burned too quickly ..
            random_num = random.randint(0, 10)
            if random_num % 2:
                time.sleep(1)
                return requests.get(PROXY_URL).text.strip()
            else:
                return requests.get(LOCAL_PROXY_URL).text.strip()

    def _delete_detail_404(self, url):
        delete_sql = f"delete from `{self.table}` where link = {url};"
        ret = self.sql_pool.delete(delete_sql)
        self.sql_pool.end()
        if ret:
            print(f"删除无效的 url: {url}")

    def _crawl(self, url, proxy):
        proxies = {'http': proxy}
        r = requests.get(url, proxies=proxies, headers=self.headers, timeout=3)
        return r

    def _get(self, url):
        if self.use_proxy:
            count = 0
            while True:
                count = count + 1
                try:
                    resp = self._crawl(url, self.proxy)
                    if resp.status_code == 200:
                        return resp
                    elif resp.status_code == 404:
                        self._delete_detail_404(url)
                        return None
                    elif count > 2:
                        print(f'抓取网页{url}最终失败')
                        break
                    else:
                        self.proxy = self._get_proxy()
                        print(f"无效状态码{resp.status_code}, 更换代理{self.proxy}\n")
                except:
                    self.proxy = self._get_proxy()
                    print(f'代理失败,更换代理{self.proxy} \n')
        else:
            try:
                resp = requests.get(url)
            except:
                return
            return resp

    def _parse_detail(self, detail_page):
        doc = html.fromstring(detail_page)
        article_body = doc.xpath('//div[@class="article-body"]/*')
        contents = []
        for p_node in article_body:
            children = p_node.getchildren()
            children_tags = [child.tag for child in children]
            if children_tags and "img" in children_tags:
                img_links = p_node.xpath("./img/@src")  # list
                contents.append(",".join(img_links))
            else:
                contents.append(p_node.text_content())
        contents = "\r\n".join(contents)
        return contents

    def transferContent(self, content):
        if content is None:
            return None
        else:
            string = ""
            for c in content:
                if c == '"':
                    string += '\\\"'
                elif c == "'":
                    string += "\\\'"
                elif c == "\\":
                    string += "\\\\"
                else:
                    string += c
            return string

    def _filter_char(self, test_str):
        # strip special whitespace characters
        # '\u200b' is \xe2\x80\x8b in UTF-8
        for cha in ['\n', '\r', '\t',
                    '\u200a', '\u200b', '\u200c', '\u200d', '\u200e',
                    '\u202a', '\u202b', '\u202c', '\u202d', '\u202e',
                    ]:
            test_str = test_str.replace(cha, '')
        test_str = test_str.replace(u'\xa0', u' ')  # replace \xa0 with a regular space
        return test_str

    def _process_content(self, vs):
        # strip 4-byte UTF-8 characters, otherwise the MySQL insert can fail
        try:
            # pattern for UCS-4 (wide) Python builds
            highpoints = re.compile(u'[\U00010000-\U0010ffff]')
        except re.error:
            # pattern for UCS-2 (narrow) Python builds
            highpoints = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')

        params = list()
        for v in vs:
            # clean each value before insertion
            nv = highpoints.sub(u'', v)
            nv = self._filter_char(nv)
            params.append(nv)
        return "".join(params)

    def _get_list(self, list_url):
        resp = self._get(list_url)
        if resp:
            return resp.text
        else:
            self.error_list.append(list_url)

    def _get_detail(self, detail_url):
        resp = self._get(detail_url)
        if resp:
            return resp.text
        else:
            self.error_detail.append(detail_url)

    def _parse_list(self, list_page):
        try:
            json_data = re.findall(r'jQuery\d{21}_\d{13}\((\{.*?\})\)', list_page)[0]
            list_data = json.loads(json_data).get("Data")
        except:
            return None
        else:
            if list_data:
                return list_data
            else:
                return []

    def __del__(self):
        try:
            self.sql_pool.dispose()
        except:
            pass

    def close(self):
        try:
            self.sql_pool.dispose()
        except:
            pass

    def start(self):
        try:
            self._start()
        except:
            traceback.print_exc()
        finally:
            self.close()

    def _start(self):
        # This class crawls the articles for one specific stock code,
        # so an overall "scheduler" function is expected to drive start() from outside.

        self._init_pool()

        # (1) build the list_url
        for page in range(1, 2):
            # print(page)
            list_url = self.start_url + urlencode(self.make_query_params(self.key, page))
            # print(list_url)

            # (2) fetch the list page
            list_page = self._get_list(list_url)
            # print(list_page)

            # (3) parse the list page into a list of records
            list_infos = self._parse_list(list_page)
            # print(pprint.pformat(list_infos))

            if list_infos:
                # # stop crawling once past the incremental cutoff
                # show_times = [datetime.datetime.strptime(info.get("ShowTime"), self.dt_format) for info in list_infos]
                # # print(show_times)
                # if max(show_times) < self.limit_time:
                #     print("增量完毕")
                #     return

                count = 0
                # (4) parse detail pages and save the items
                for data in list_infos:
                    item = dict()
                    item['code'] = self.key
                    link = data.get("ArticleUrl")
                    item['link'] = link
                    item['title'] = data.get("Title")
                    item['pub_date'] = data.get("ShowTime")
                    detail_page = self._get_detail(link)
                    if detail_page:
                        article = self._parse_detail(detail_page)
                        # clean the article text so the insert does not fail..
                        article = self._process_content(article)
                        item['article'] = article
                        print("item", item)
                        ret = self._save(item)
                        if not ret:
                            print(f"插入失败 {item.get('link')}")
                        else:
                            count += 1
                            if count > 10:
                                self.sql_pool.end()
                                count = 0
                self.sql_pool.end()  # self.sql_pool.connection.commit()
                print(f"第{page}页保存成功")
        self.close()
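
The comments in _start note that this class crawls a single stock code and relies on an outer "scheduler" to drive it. A minimal sketch of such a driver, with a hypothetical key list:

# Minimal scheduler sketch; the key list is hypothetical.
def crawl_carticles(keys):
    for key in keys:
        spider = CArticleBase(key)
        spider.start()  # start() already catches exceptions and closes the pool

# crawl_carticles(["贵州茅台", "平安银行"])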
Code example #4
class Money163(object):
    def __init__(self):
        self.list_url = "http://money.163.com/special/00251G8F/news_json.js"
        self.extractor = GeneralNewsExtractor()
        self.headers = {
            "User-Agent":
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36",
        }
        self.local = LOCAL
        if self.local:
            conf = {
                "host": LOCAL_MYSQL_HOST,
                "port": LOCAL_MYSQL_PORT,
                "user": LOCAL_MYSQL_USER,
                "password": LOCAL_MYSQL_PASSWORD,
                "db": LOCAL_MYSQL_DB,
            }
            self.db = LOCAL_MYSQL_DB
        else:
            conf = {
                "host": MYSQL_HOST,
                "port": MYSQL_PORT,
                "user": MYSQL_USER,
                "password": MYSQL_PASSWORD,
                "db": MYSQL_DB,
            }
            self.db = MYSQL_DB

        self.sql_pool = PyMysqlPoolBase(**conf)
        self.table = "netease_money"
        self.error_detail = []

    def _parse_list(self, body):
        js_obj = re.findall(r"news:(.*)\};", body)[0]
        py_obj = demjson.decode(js_obj)
        for topic in py_obj:  # iterate over each sub-topic block
            for data in topic:
                yield data

    def _parse_detail(self, detail_url):
        try:
            page = requests.get(detail_url, headers=self.headers).text
            result = self.extractor.extract(page)
            content = result.get("content")
        except:
            return
        return content

    def contract_sql(self, to_insert):
        ks = []
        vs = []
        for k in to_insert:
            ks.append(k)
            vs.append(to_insert.get(k))
        fields_str = "(" + ",".join(ks) + ")"
        values_str = "(" + "%s," * (len(vs) - 1) + "%s" + ")"
        base_sql = '''INSERT INTO `{}`.`{}` '''.format(
            self.db,
            self.table) + fields_str + ''' values ''' + values_str + ''';'''
        return base_sql, tuple(vs)

    def _save(self, to_insert):
        try:
            insert_sql, values = self.contract_sql(to_insert)
            count = self.sql_pool.insert(insert_sql, values)
        except pymysql.err.IntegrityError:
            # print("重复")
            return 1
        except:
            print("失败")
        else:
            return count

    def get_proxy(self):
        if self.local:
            return requests.get(LOCAL_PROXY_URL).text.strip()
        else:
            return requests.get(PROXY_URL).text.strip()

    def get_list_resp(self):
        count = 0
        while True:
            proxy = self.get_proxy()
            print(">> ", proxy)
            try:
                list_resp = requests.get(self.list_url,
                                         proxies={"http": proxy},
                                         timeout=3)
            except:
                count += 1
                if count > 10:
                    return
                time.sleep(1)
            else:
                if list_resp.status_code != 200:
                    count += 1
                    if count > 10:
                        return
                    time.sleep(1)
                else:
                    break
        return list_resp

    def __del__(self):
        try:
            self.sql_pool.dispose()
        except:
            pass

    def close(self):
        try:
            self.sql_pool.dispose()
        except:
            pass

    def start(self):
        try:
            self._start()
        except:
            traceback.print_exc()
        finally:
            self.close()

    def _start(self):
        list_resp = self.get_list_resp()
        print(">>>", list_resp)
        if list_resp and list_resp.status_code == 200:
            body = list_resp.text
            # TODO: without converting to a list, inserting into the database straight from the generator fails..
            ret = list(self._parse_list(body))
            count = 0
            for one in ret:
                # print(one)
                item = dict()
                link = one.get("l")
                item['link'] = link
                item['title'] = one.get("t")

                # In the returned JSON the newest items come first; the scheduled incremental
                # crawl only needs news from the past day (2 days is used to be safe).
                # dt = datetime.datetime.today() - datetime.timedelta(days=1)
                pub_date = one.get("p")
                # pt = datetime.datetime.strptime(pub_date, "%Y-%m-%d %H:%M:%S")

                # bug fixed: different columns are interleaved here, so this date check would drop data
                # if pt < dt:
                #     print(pt)
                #     print(dt)
                #     print('网易财经增量完毕 ')
                #     return

                item['pub_date'] = pub_date
                article = self._parse_detail(one.get("l"))

                if article:
                    item['article'] = article
                    # print(item.get("title"))
                    ret = self._save(item)
                    if not ret:
                        print("保存失败 ")
                        self.error_detail.append(link)
                    else:
                        count += 1
                else:
                    self.error_detail.append(link)

                if count > 9:
                    print("提交.. ")
                    self.sql_pool.end()
                    count = 0
        self.sql_pool.dispose()
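
For reference, contract_sql simply joins the dict keys into a column list and emits one %s placeholder per value. The standalone sketch below mirrors that logic and shows the statement produced for a typical item; the database name "spider" is an assumption.

# Standalone sketch mirroring contract_sql(); the db name "spider" is assumed.
def demo_contract_sql(to_insert, db="spider", table="netease_money"):
    ks = list(to_insert)
    fields_str = "(" + ",".join(ks) + ")"
    values_str = "(" + ",".join(["%s"] * len(ks)) + ")"
    sql = "INSERT INTO `{}`.`{}` {} values {};".format(db, table, fields_str, values_str)
    return sql, tuple(to_insert.values())

sql, values = demo_contract_sql({
    "link": "http://money.163.com/demo.html",
    "title": "demo",
    "pub_date": "2020-01-01 00:00:00",
    "article": "...",
})
# sql    == "INSERT INTO `spider`.`netease_money` (link,title,pub_date,article) values (%s,%s,%s,%s);"
# values == ("http://money.163.com/demo.html", "demo", "2020-01-01 00:00:00", "...")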
Code example #5
class CArticleLoder(object):
    def __init__(self, key):
        # run locally or on the server
        self.local = LOCAL
        # whether to use the Abuyun proxy service
        self.abu = False
        # Chinese short name of the stock code
        self.key = key
        self.start_url = 'http://api.so.eastmoney.com/bussiness/Web/GetSearchList?'
        self.page_size = 10
        self.headers = {
            "Referer":
            "http://so.eastmoney.com/CArticle/s?keyword={}".format(
                self.key.encode()),
            "User-Agent":
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36",
        }
        self.db = MYSQL_DB
        self.table = "eastmoney_carticle"
        if self.local:
            conf = {
                "host": LOCAL_MYSQL_HOST,
                "port": LOCAL_MYSQL_PORT,
                "user": LOCAL_MYSQL_USER,
                "password": LOCAL_MYSQL_PASSWORD,
                "db": LOCAL_MYSQL_DB,
            }
        else:
            conf = {
                "host": MYSQL_HOST,
                "port": MYSQL_PORT,
                "user": MYSQL_USER,
                "password": MYSQL_PASSWORD,
                "db": MYSQL_DB,
            }
        self.sql_pool = PyMysqlPoolBase(**conf)
        # initialize a proxy when Abuyun is not used
        if not self.abu:
            self.proxy = self._get_proxy()
        # record the list-page and detail-page URLs that failed
        self.error_detail = []
        self.error_list = []

    def make_query_params(self, msg, page):
        query_params = {
            'type': '8224',  # this value requests sorting by time
            'pageindex': str(page),
            'pagesize': str(self.page_size),
            'keyword': msg,
            'name': 'caifuhaowenzhang',
            'cb': 'jQuery{}_{}'.format(
                ''.join(random.choice(string.digits) for i in range(0, 21)),
                str(int(time.time() * 1000))),
            '_': str(int(time.time() * 1000)),
        }
        return query_params

    def contract_sql(self, to_insert):
        ks = []
        vs = []
        for k in to_insert:
            ks.append(k)
            vs.append(to_insert.get(k))
        fields_str = "(" + ",".join(ks) + ")"
        values_str = "(" + "%s," * (len(vs) - 1) + "%s" + ")"
        base_sql = '''INSERT INTO `{}`.`{}` '''.format(
            self.db,
            self.table) + fields_str + ''' values ''' + values_str + ''';'''
        return base_sql, tuple(vs)

    def _save(self, to_insert):
        try:
            insert_sql, values = self.contract_sql(to_insert)
            count = self.sql_pool.insert(insert_sql, values)
        except pymysql.err.IntegrityError:
            logger.warning("重复 ")
        except:
            logger.warning("失败")
        else:
            return count

    def _abu_get(self, url):
        """使用阿布云代理 默认失败后重新发起请求"""
        proxy_host = "http-cla.abuyun.com"
        proxy_port = 9030
        # proxy tunnel credentials
        proxy_user = "******"
        proxy_pass = "******"
        proxy_meta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
            "host": proxy_host,
            "port": proxy_port,
            "user": proxy_user,
            "pass": proxy_pass,
        }
        proxies = {
            "http": proxy_meta,
            "https": proxy_meta,
        }
        retry = 2  # retry budget: at most two attempts before giving up
        while True:
            try:
                resp = requests.get(
                    url,
                    proxies=proxies,
                    headers=self.headers,
                    timeout=3,
                )
                if resp.status_code == 200:
                    return resp
                else:
                    print(resp.status_code, "retry")
                    retry -= 1
                    if retry <= 0:
                        return None
                    time.sleep(3)
            except:
                print("error retry")
                retry -= 1
                if retry <= 0:
                    return None
                time.sleep(3)

    # def _get_proxy(self):
    #     if self.local:
    #         r = requests.get('http://192.168.0.102:8888/get')
    #     else:
    #         r = requests.get('http://172.17.0.4:8888/get')
    #     proxy = r.text
    #     return proxy

    def _get_proxy(self):
        if self.local:
            return requests.get(LOCAL_PROXY_URL).text.strip()
        else:
            return requests.get(PROXY_URL).text.strip()

    # def _get_proxy(self):
    #     # get an available proxy; if none is available, sleep 3 seconds and try again
    #     if self.local:
    #         while True:
    #             count = requests.get(LOCAL_PROXY_URL.format("count"))
    #             if count:
    #                 resp = requests.get(LOCAL_PROXY_URL.format("get"))
    #                 break
    #             else:
    #                 print("当前无可用代理, 等一会儿 ")
    #                 time.sleep(3)
    #         return resp.text
    #     else:
    #         while True:
    #             count = requests.get(PROXY_URL.format("count"))
    #             if count:
    #                 resp = requests.get(PROXY_URL.format("get"))
    #                 break
    #             else:
    #                 print("当前无可用代理, 等一会儿 ")
    #                 time.sleep(3)
    #         return resp.text

    def _delete_detail_404(self, url):
        delete_sql = f"delete from `{self.table}` where link = {url};"
        ret = self.sql_pool.delete(delete_sql)
        self.sql_pool.end()
        if ret:
            print(f"删除无效的 url: {url}")

    def _crawl(self, url, proxy):
        proxies = {'http': proxy}
        r = requests.get(url, proxies=proxies, headers=self.headers, timeout=3)
        return r

    def _get(self, url):
        if self.abu:
            return self._abu_get(url)

        count = 0
        while True:
            count = count + 1
            try:
                resp = self._crawl(url, self.proxy)
                if resp.status_code == 200:
                    return resp
                elif resp.status_code == 404:
                    self._delete_detail_404(url)
                    return None
                elif count > 2:
                    logger.warning(f'抓取网页{url}最终失败')
                    break
                else:
                    self.proxy = self._get_proxy()
                    logger.warning(
                        f"无效状态码{resp.status_code}, 更换代理{self.proxy}\n")
            except:
                self.proxy = self._get_proxy()
                logger.warning(f'代理失败,更换代理{self.proxy} \n')

    def _parse_detail(self, detail_page):
        doc = html.fromstring(detail_page)
        article_body = doc.xpath('//div[@class="article-body"]/*')
        contents = []
        for p_node in article_body:
            children = p_node.getchildren()
            children_tags = [child.tag for child in children]
            if children_tags and "img" in children_tags:
                img_links = p_node.xpath("./img/@src")  # list
                contents.append(",".join(img_links))
            else:
                contents.append(p_node.text_content())
        contents = "\r\n".join(contents)
        return contents

    def _select_key_links(self):
        select_all_sql = f"select link from {self.table} where code = '{self.key}' and article is NULL;"
        # links = self.sql_pool.select_many(select_all_sql, size=10)
        links = self.sql_pool.select_all(select_all_sql)
        return links

    def _select_rest_all_links(self):
        select_all_sql = f"select id, link from {self.table} where article is NULL;"
        # links = self.sql_pool.select_many(select_all_sql, size=20)
        links = self.sql_pool.select_all(select_all_sql)
        return links

    def transferContent(self, content):
        if content is None:
            return None
        else:
            string = ""
            for c in content:
                if c == '"':
                    string += '\\\"'
                elif c == "'":
                    string += "\\\'"
                elif c == "\\":
                    string += "\\\\"
                else:
                    string += c
            return string

    def _filter_char(self, test_str):
        # strip special whitespace characters
        # '\u200b' is \xe2\x80\x8b in UTF-8
        for cha in [
                '\n',
                '\r',
                '\t',
                '\u200a',
                '\u200b',
                '\u200c',
                '\u200d',
                '\u200e',
                '\u202a',
                '\u202b',
                '\u202c',
                '\u202d',
                '\u202e',
        ]:
            test_str = test_str.replace(cha, '')
        test_str = test_str.replace(u'\xa0', u' ')  # replace \xa0 with a regular space
        return test_str

    def _process_content(self, vs):
        # strip 4-byte UTF-8 characters, otherwise the MySQL insert can fail
        try:
            # pattern for UCS-4 (wide) Python builds
            highpoints = re.compile(u'[\U00010000-\U0010ffff]')
        except re.error:
            # pattern for UCS-2 (narrow) Python builds
            highpoints = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')

        params = list()
        for v in vs:
            # clean each value before insertion
            nv = highpoints.sub(u'', v)
            nv = self._filter_char(nv)
            params.append(nv)
        return "".join(params)

    def _update_detail(self, link, article):
        # inserting the raw text directly may fail, so clean it first
        # article = self.transferContent(article)
        article = self._process_content(article)
        print("文章内容是: \n", article)
        update_sql = f"update {self.table} set article =%s where link =%s;"
        try:
            ret = self.sql_pool.update(update_sql, [(article), (link)])
            # ret = self.sql_pool.update(update_sql)
        except:
            traceback.print_exc()
            print("插入失败")
            return None
        else:
            return ret

    def _get_list(self, list_url):
        resp = self._get(list_url)
        if resp:
            return resp.text
        else:
            self.error_list.append(list_url)

    def _get_detail(self, detail_url):
        resp = self._get(detail_url)
        if resp:
            return resp.text
        else:
            self.error_detail.append(detail_url)

    def _parse_list(self, list_page):
        try:
            json_data = re.findall(r'jQuery\d{21}_\d{13}\((\{.*?\})\)',
                                   list_page)[0]
            list_data = json.loads(json_data).get("Data")
        except:
            return None
        else:
            if list_data:
                return list_data
            else:
                return []

    def _save_one_page_list(self, page):
        list_url = self.start_url + urlencode(
            self.make_query_params(self.key, page))
        list_page = self._get_list(list_url)
        if list_page:
            list_infos = self._parse_list(list_page)  # list
            if not list_infos:
                logger.info(f"{self.key} 爬取完毕 ")
                return

            for data in list_infos:
                item = dict()
                item['code'] = self.key
                link = data.get("ArticleUrl")
                item['link'] = link
                item['title'] = data.get("Title")
                item['pub_date'] = data.get("ShowTime")
                print("item", item)
                ret = self._save(item)
                if not ret:
                    logger.warning(f"插入失败 {item}")
            self.sql_pool.end()  # self.sql_pool.connection.commit()
            print(f"第{page}页保存成功")
            return page

    def __del__(self):
        try:
            self.sql_pool.dispose()
        except:
            pass
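
CArticleLoder splits the work into two phases: _save_one_page_list stores the list rows, and _update_detail later backfills the article text for rows whose article column is still NULL. The driver below is a hypothetical sketch of that flow; it assumes sql_pool.select_all returns one-column tuples, which the snippet above does not show.

# Hypothetical driver sketch; assumes sql_pool.select_all returns (link,) tuples.
def load_carticle(key, max_pages=5):
    loader = CArticleLoder(key)
    # phase 1: store the list rows page by page;
    # _save_one_page_list returns None once the list is exhausted
    for page in range(1, max_pages + 1):
        if not loader._save_one_page_list(page):
            break
    # phase 2: fetch and store the article bodies that are still missing
    for row in (loader._select_key_links() or []):
        link = row[0]
        detail_page = loader._get_detail(link)
        if detail_page:
            article = loader._parse_detail(detail_page)
            loader._update_detail(link, article)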