Esempio n. 1
0
 def clean_title(self, data, url):
     """Return the article title with CR, LF and tab characters removed.

     Args:
         data: Mapping that holds the raw title under the ``'title'`` key.
         url: Detail-page URL; used only in the error log message.

     Returns:
         The cleaned title with surrounding whitespace stripped, or ``''``
         (after logging an error) when the raw title is falsy.
     """
     raw_title = data['title']
     if not raw_title:
         log.error(f"获取该-{url}的详情页的标题为空")
         return ''
     # Drop the control characters first, then trim what is left.
     without_breaks = re.sub('\\r|\\n|\\t', '', raw_title)
     return without_breaks.strip()
Esempio n. 2
0
 def get_conthtml(self, data, url):
     """Return the detail-page HTML source with every backslash removed.

     Args:
         data: Mapping that holds the raw HTML under the ``'conthtml'`` key.
         url: Detail-page URL; used only in the error log message.

     Returns:
         The HTML without backslashes, or ``''`` (after logging an error)
         when the raw HTML is falsy.
     """
     raw_html = data['conthtml']
     if not raw_html:
         log.error(f"获取该-{url}的详情源码为空")
         return ''
     return re.sub(r'\\', '', raw_html)
Esempio n. 3
0
 def run(self):
     """Fetch the article behind ``self.args['url']`` and rebuild its data.

     Returns:
         A ``(payload, error_count)`` tuple. ``payload`` is the result of
         ``self.new_data`` when the detail fetch returned something truthy;
         otherwise it is the falsy fetch result itself, and an error is
         logged. ``error_count`` is ``self.error_count`` read after the
         processing finished.
     """
     url = self.args['url']
     data = self.get_articel_detail(url)
     if not data:
         log.error('没有正确获得data')
         return data, self.error_count
     # new_data runs first so the counter reflects any errors it recorded.
     return self.new_data(data, url), self.error_count
Esempio n. 4
0
 def clean_content(self, data, url, conthtml):
     """Strip newlines from the article body and pass it to ``clear_source``.

     Args:
         data: Mapping that holds the raw body text under ``'content'``.
         url: Detail-page URL; used in the error log and forwarded to
             ``clear_source``.
         conthtml: Page HTML source, forwarded to ``clear_source``.

     Returns:
         Whatever ``clear_source(conthtml, url, content)`` returns, where
         ``content`` is the body without ``\\n`` characters, or ``''`` when
         the raw body is falsy (an error is logged in that case).
     """
     raw_content = data['content']
     if not raw_content:
         log.error(f"获取该-{url}的详情页的内容为空")
     flattened = re.sub('\\n', '', raw_content) if raw_content else ''
     return clear_source(conthtml, url, flattened)
Esempio n. 5
0
 def clean_time(self, data, url):
     """Turn the raw publish time into a date string and a timestamp.

     Args:
         data: Mapping that holds the raw time under ``'publish_time'``.
         url: Detail-page URL; used only in the error log message.

     Returns:
         The ``(time_date, timestamp)`` pair produced by
         ``Clean_time().clean_time``, or ``('', '')`` (after logging an
         error) when the raw time is falsy.
     """
     raw_time = data['publish_time']
     if not raw_time:
         log.error(f"获取该-{url}的详情页的时间为空")
         return '', ''
     time_date, timestamp = Clean_time().clean_time(raw_time)
     return time_date, timestamp
Esempio n. 6
0
 def get_article_html(self, url):
     """Download *url* and return its body decoded with the declared charset.

     The charset is taken from the first ``<meta ... charset...>`` tag found
     in the response text and normalised against ``html_encode_list``. When
     no such tag exists the body is decoded with the default codec, falling
     back to ``res.text``.

     Args:
         url: Address of the article detail page.

     Returns:
         The decoded page source. ``""`` when the request raises or the
         status code is not 200 (``self.error_count`` is incremented in both
         cases). ``None`` when the body parses as JSON.
     """
     headers = {"User-Agent": self.UA}
     try:
         # NOTE(review): verify=False turns off TLS certificate validation.
         res = requests.get(url=url,
                            headers=headers,
                            verify=False,
                            timeout=60)
         if res.status_code != 200:
             self.error_count += 1
             log.error(f"请求该url-{url}的详情页出错,状态码-{res.status_code}")
             return ""

         # Response.json() signals a non-JSON body with a ValueError subclass.
         try:
             res.json()
         except ValueError:
             is_json = False
         else:
             is_json = True
         if is_json:
             log.info(f"该url-{url}源码为json格式")
             return None

         charset = re.search('<meta.*?charset(.*?)>', res.text)
         if not charset:
             try:
                 return res.content.decode()
             except UnicodeDecodeError:
                 return res.text

         charset = re.sub('"|=|/', '', charset.group(1)).lower()
         # Collapse the raw attribute text onto a known encoding name; the
         # last matching entry of html_encode_list wins.
         for html_encode in html_encode_list:
             if html_encode in charset:
                 charset = html_encode
         if charset == 'unicode':
             charset = 'unicode_escape'
         if 'huaxia' in url:
             # URLs containing 'huaxia' are always decoded as gb2312.
             charset = 'gb2312'
         return res.content.decode(charset, 'ignore')
     except Exception as e:
         self.error_count += 1
         log.error(f'访问该url-{url}失败-原因-{str(e)}')
         return ""
Esempio n. 7
0
 def _test_token(self, new_token):
     """
     Probe the token-test endpoint and record the outcome in ``self.flag``.

     ``self.flag`` becomes ``True`` when the endpoint answers 401 or when
     the check raises; any other status code sets it to ``False``.

     :param new_token: value sent as the ``Authorization`` header
     :return: None
     """
     request_headers = {'Connection': 'close', 'Authorization': new_token}
     try:
         resp = requests.get(url=self.token_test_api, headers=request_headers, timeout=5)
         # A 401 means the token is no longer valid; anything else is usable.
         self.flag = resp.status_code == 401
     except Exception as e:
         # A failure during the check is treated as an unusable token.
         log.error('测试token失败...{}'.format(e))
         self.flag = True
Esempio n. 8
0
    def new_data(self, data, url):
        """Assemble the cleaned article fields into the outgoing payload.

        Args:
            data: Raw article mapping passed on to the ``clean_*`` and
                ``get_conthtml`` helpers.
            url: Article URL; stored in the payload and used in messages.

        Returns:
            The result of ``self.post_data_dict_format`` when title, date
            and content are all truthy. Otherwise a ``{"errors": {...}}``
            dict with one entry per field (the value obtained, or a
            message saying it is empty); ``self.error_count`` is
            incremented and an error is logged in that case.
        """
        real_title = self.clean_title(data, url)
        time_date, timestamp = self.clean_time(data, url)
        conthtml = self.get_conthtml(data, url)
        real_content = self.clean_content(data, url, conthtml)
        add_timestamp = int(time.time()) * 1000

        if not (real_title and time_date and real_content):
            field_errors = {
                'title': real_title or f'没有获得文章-{url}-标题,为空',
                'time': time_date or f'没有获得文章-{url}-时间,为空',
                # Round-trip through gbk to drop characters it cannot encode.
                'content': real_content.encode("gbk", "ignore").decode(
                    "gbk", "ignore") or f'没有获得文章-{url}-内容,为空',
            }
            self.error_count += 1
            log.error(f"拼接新的data错误为空")
            return {"errors": field_errors}

        page_data = {
            "ID": get_md5(url),
            'Title': real_title,
            'Content': real_content,
            'AddOn': add_timestamp,
            'Time': timestamp,
            "Url": url,
            'Language': 2052,
            'ContentSource': conthtml,
        }
        return self.post_data_dict_format(page_data_dict=page_data)
Esempio n. 9
0
def get_ADSL_proxy():
    """Ask the ADSL API for a proxy endpoint and build a ``proxies`` mapping.

    Returns:
        ``{"http": proxy, "https": proxy}`` where ``proxy`` is
        ``http://USER:PASSWORD@IP:PORT`` when the API response carries both
        ``IP`` and ``Port``. An empty dict when either is missing or when
        the request fails (the failure is logged).
    """
    token_gen = TokenGen(client_id=TOKEN_CLIENT_ID, client_secret=TOKEN_CLIENT_SECRET)
    try:
        auth_headers = {'Connection': 'close', 'Authorization': token_gen.get_token()}
        res = requests.get(url=ADSL_APIURL, headers=auth_headers, timeout=15)
        ip = res.json().get('IP')
        port = res.json().get('Port')
        if not (ip and port):
            return {}
        proxy = 'http://{}:{}@{}:{}'.format(ADSL_USERNAME, ADSL_PASSWORD, ip, port)
        # The same endpoint serves both schemes.
        return dict.fromkeys(("http", "https"), proxy)
    except Exception as e:
        log.error('获取代理失败...{}'.format(e))
        return {}