Exemple #1
0
    def _get_tags(self, article_info):
        '''
        Build the tag list of an article.

        Tag names are read from ``article_info['noteInfo']['relatedTags']``
        (each entry's ``name``; missing keys fall back to empty defaults).

        :param article_info: dict-like article payload
        :return: list of ``{'keyword': <tag name>}`` dicts; may be empty
        '''
        related_tags = article_info.get('noteInfo', {}).get('relatedTags', [])

        raw_names = []
        for tag in related_tags:
            raw_names.append(str(tag.get('name', '')))
        unique_names = list_duplicate_remove(raw_names)

        # Join the list into one string, strip the sensitive words, split it
        # back into a list and drop the '' elements to get the final names.
        washed = self.wash_sensitive_info('|'.join(unique_names))
        clean_names = delete_list_null_str(washed.split('|'))

        # The resulting list is allowed to be empty.
        return [{'keyword': name} for name in clean_names]
Exemple #2
0
def _get_66_ip_list(timeout=15):
    '''
    Fetch the high-anonymity proxy list from 66ip first.

    Requests http://www.66ip.cn/nmtq.php, decodes the response as GBK, runs
    it through ``Requests._wash_html`` and takes the text captured between
    ``</script>`` and ``</div>``.  That fragment is split on ``<br />`` and
    filtered through ``delete_list_null_str``.  The result is also stored in
    the module-level ``a_66_ip``.

    :param timeout: seconds passed to ``requests.get`` as its timeout
        (optional; the request used to be sent without any timeout)
    :return: list of the entries found; empty when the page has no match
    :raises requests.RequestException: on network failure or timeout
    '''
    global a_66_ip
    headers = {
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Referer': 'http://www.66ip.cn/nm.html',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
    }

    params = (
        ('getnum', ''),
        ('isp', '0'),
        ('anonymoustype', '3'),
        ('start', ''),
        ('ports', ''),
        ('export', ''),
        ('ipaddress', ''),
        ('area', '0'),
        ('proxytype', '2'),
        ('api', '66ip'),
    )

    # requests never times out by default, so a stalled server would block
    # the caller indefinitely without an explicit timeout.
    response = requests.get('http://www.66ip.cn/nmtq.php',
                            headers=headers,
                            params=params,
                            cookies=None,
                            timeout=timeout)
    body = Requests._wash_html(response.content.decode('gbk'))

    # Keep the first captured fragment, or nothing when the page layout
    # does not match.
    matches = re.compile(r'</script>(.*)</div>').findall(body)
    part = matches[0] if matches else ''
    # Remove inline scripts and trailing markup left inside the fragment.
    part = re.compile('<script>.*?</script>|</div>.*</div>').sub('', part)

    ip_list = delete_list_null_str(part.split('<br />'))
    a_66_ip = ip_list

    return ip_list
Exemple #3
0
def _get_66_ip_list():
    '''
    Fetch the high-anonymity proxy list from 66ip first.

    The page at http://www.66ip.cn/nmtq.php is requested through a session;
    any exception raised by the request makes the function return ``[]``
    straight away.  Otherwise the GBK-decoded, washed body is reduced to the
    text captured between ``</script>`` and ``</div>``, split on ``<br />``
    and filtered through ``delete_list_null_str``.  The result is also
    stored in the module-level ``ori_ip_list``.

    :return: list of the entries found (possibly empty)
    '''
    global ori_ip_list

    query = (
        ('getnum', ''), ('isp', '0'), ('anonymoustype', '3'),
        ('start', ''), ('ports', ''), ('export', ''),
        ('ipaddress', ''), ('area', '0'), ('proxytype', '2'),
        ('api', '66ip'),
    )

    with session() as http:
        try:
            response = http.get(
                'http://www.66ip.cn/nmtq.php',
                headers=_get_base_headers(),
                params=query,
                cookies=None,
            )
        except Exception:
            return []

        body = Requests._wash_html(response.content.decode('gbk'))

    # First captured fragment, or an empty string when nothing matches.
    captured = re.findall(r'</script>(.*)</div>', body)
    fragment = captured[0] if captured else ''
    fragment = re.sub('<script>.*?</script>|</div>.*</div>', '', fragment)

    ip_list = delete_list_null_str(fragment.split('<br />'))

    if ip_list != []:
        ori_ip_list = ip_list
    else:
        ori_ip_list = []

    return ip_list
Exemple #4
0
def get_start_up_ip_list(url):
    '''
    Called for the initial crawl: download ``url`` and extract the proxies.

    :param url: address of the page to download
    :return: list of ``ip:port`` strings, one per line that contains a
        match (the first match of that line), in page order; empty when the
        body is empty or nothing matches
    '''
    with session() as s:
        body = s.get(url, headers=_get_base_headers()).text

    if body == '':
        return []

    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    # Compiled once here rather than once per line.
    ip_port_pattern = re.compile(r'\d+\.\d+\.\d+\.\d+:\d+')

    # splitlines() handles '\n'-only bodies as well as '\r\n'; splitting on
    # '\r\n' alone would treat an LF-separated page as one single line and
    # yield at most one proxy.
    lines = delete_list_null_str(body.splitlines())

    ip_list = []
    for line in lines:
        match = ip_port_pattern.search(line)
        if match:
            ip_list.append(match.group())

    return ip_list