Example #1
def crawl_qy_dai_li(page_count=2):
    """
    获取旗云代理http://www.qydaili.com/free/?action=china&page=1
    :param page_count:
    :return:
    """
    start_url = "http://www.qydaili.com/free/?action=china&page={}"
    urls = [start_url.format(page) for page in range(1, page_count + 1)]
    qy_ip_list = []
    for url in urls:
        headers = {
            "Host": "www.qydaili.com",
            "Upgrade-Insecure-Requests": "1",
        }
        response_html = get_text(url, options=headers)
        if response_html:
            tree = etree.HTML(response_html)
            tr_list = tree.xpath('//table[@class="table table-bordered table-striped"]/tbody/tr')
            for tr in tr_list:
                try:
                    # proxy_type = tr.xpath("./td[4]/text()")[0].lower()
                    ip = tr.xpath("./td[1]/text()")[0]
                    port = tr.xpath("./td[2]/text()")[0]
                    q.put((ip, port))
                    qy_ip_list.append((ip, port))
                except IndexError:
                    # Skip rows that are missing an IP or port cell.
                    continue
    return qy_ip_list
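The crawler examples here (#1-#4, #7, #8) depend on a shared `get_text(url, options=...)` helper and a module-level queue `q`, neither of which is shown; note that Examples #5, #6, and #9 use a different `get_text` that reads local files. The following is only a minimal sketch of what the crawler-side helpers might look like, assuming `requests` as the HTTP client (the signature and defaults are guesses, not the original implementation):

import queue
import requests

q = queue.Queue()  # shared queue that collects (ip, port) tuples

def get_text(url, options=None):
    """Fetch a URL and return its body as text, or None on failure."""
    headers = {"User-Agent": "Mozilla/5.0"}
    if options:
        headers.update(options)
    try:
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
    except requests.RequestException:
        pass
    return None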
Example #2
def crawl_3366_dai_li(page_count=5, stype='1'):
    """
    获取云代理http://www.ip3366.net/free/?stype=1&page=3
    :param page_count:
    :return:
    """
    start_url = "http://www.ip3366.net/free/?stype%s=1&page={}" % stype
    urls = [start_url.format(page) for page in range(1, page_count + 1)]
    ip_3366_list = []
    for url in urls:
        headers = {
            "Host": "www.ip3366.net",
            "Upgrade-Insecure-Requests": "1",
        }
        response_html = get_text(url, options=headers)
        if response_html:
            tree = etree.HTML(response_html)
            tr_list = tree.xpath('//table[@class="table table-bordered table-striped"]/tbody/tr')
            for tr in tr_list:
                try:
                    # proxy_type = tr.xpath("./td[4]/text()")[0].lower()
                    ip = tr.xpath("./td[1]/text()")[0]
                    port = tr.xpath("./td[2]/text()")[0]
                except IndexError:
                    # Skip rows that are missing an IP or port cell.
                    continue
                q.put((ip, port))
                ip_3366_list.append((ip, port))
    return ip_3366_list
Example #3
def crawl_66ip(page_count=2):
    """
    爬取http://www.66ip.cn/{}.html
    """
    print(str(datetime.datetime.now()) + ' 开始爬取66ip......')
    start_url = 'http://www.66ip.cn/{}.html'
    urls = [start_url.format(page) for page in range(1, page_count + 1)]
    sixsix_ip_list = []
    for url in urls:
        headers = {
            "Referer": url,
            "Host": "www.66ip.cn",
            "Upgrade-Insecure-Requests": "1",
        }
        response_html = get_text(url, options=headers)
        if response_html:
            # Escape the dots so they match literal separators, not any character.
            pattern = re.compile(
                r"<tr><td>(\d+\.\d+\.\d+\.\d+)</td><td>(\d+)</td>", re.S)
            proxy_list = re.findall(pattern, response_html)
            for ip, port in proxy_list:
                ip = ip.strip()
                port = port.strip()
                q.put((ip, port))
                sixsix_ip_list.append((ip, port))
    return sixsix_ip_list
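To sanity-check the corrected pattern (the dots in the IP group must be escaped, otherwise `.` matches any character), a quick standalone test:

import re

pattern = re.compile(r"<tr><td>(\d+\.\d+\.\d+\.\d+)</td><td>(\d+)</td>", re.S)
sample = "<tr><td>1.2.3.4</td><td>8080</td><td>high-anon</td></tr>"
print(re.findall(pattern, sample))  # [('1.2.3.4', '8080')]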
Example #4
def crawl_89ip(page_count=2):
    """
    获取 http://www.89ip.cn/index.html 免费代理
    :param page_count:
    :return:
    """
    start_url = 'http://www.89ip.cn/index_{}.html'
    urls = [start_url.format(page) for page in range(1, page_count + 1)]
    p89_ip_list = []
    for url in urls:
        headers = {
            "Referer": url,
            "Host": "www.89ip.cn",
            "Upgrade-Insecure-Requests": "1",
        }
        response_html = get_text(url, options=headers)
        if response_html:
            tree = etree.HTML(response_html)
            tr_list = tree.xpath('//table[@class="layui-table"]/tbody/tr')
            for tr in tr_list:
                ip = tr.xpath("./td[1]/text()")[0].replace('\n', '').replace('\t', '')
                port = tr.xpath("./td[2]/text()")[0].replace('\n', '').replace('\t', '')
                q.put((ip, port))
                p89_ip_list.append((ip, port))
    return p89_ip_list
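The (ip, port) tuples these crawlers push onto `q` presumably feed a validator elsewhere in the project. A minimal consumer sketch, assuming `requests` and using httpbin.org as a cheap test endpoint (both are assumptions, not part of the original code):

import requests

def check_proxies(q, timeout=5):
    """Drain the shared queue and return only the proxies that respond."""
    working = []
    while not q.empty():
        ip, port = q.get()
        proxies = {"http": "http://%s:%s" % (ip, port)}
        try:
            # httpbin.org/ip echoes the caller's address; any cheap URL works.
            response = requests.get("http://httpbin.org/ip",
                                    proxies=proxies, timeout=timeout)
            if response.status_code == 200:
                working.append((ip, port))
        except requests.RequestException:
            continue
    return working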
Example #5
def get_word_frequency(project=""):
    """Count word frequencies across a project's files and dump them to JSON."""
    wfrq = defaultdict(int)
    for file in walk_dir(project):
        raw_text = get_text(file, get_all=True)
        clear_text = clean_text(raw_text)
        words = parse_words(clear_text)
        for word in words:
            wfrq[word.lower()] += 1
    proj_name = get_project_name(project)
    # Sort dumped entries by descending count, then alphabetically.
    data = simplejson.dumps(wfrq,
                            indent=4,
                            item_sort_key=lambda i: (-i[1], i[0]))
    with open(freq_get_file_name(proj_name), "w") as f:
        f.write(data)
    return wfrq
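For reference, simplejson's `item_sort_key` controls the order of the dumped dict; with the key above, entries come out by descending count, ties broken alphabetically. A standalone illustration:

import simplejson

counts = {"the": 3, "a": 3, "zebra": 1}
print(simplejson.dumps(counts, indent=4,
                       item_sort_key=lambda i: (-i[1], i[0])))
# "a" and "the" (count 3) print before "zebra" (count 1).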
Example #6
def get_all_words(file, level=2):
    """
    :param file: the target file to read
    :param level: return word level,
    2: code logging and error message
    3: 2 and code comment
    4: 3 and project document, markdown file etc.
    :return: list of words
    """
    raw_text = get_text(file, level=level)
    if not raw_text:
        return
    # print(file)
    # print("raw text:", raw_text)
    clear_text = clean_text(raw_text)
    # print("clean text:", clear_text)
    words = parse_words(clear_text)
    # print("words:", words)
    return words
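A brief usage sketch for the levels described above (the file path is hypothetical, and `get_all_words` returns None when the file yields no text):

# Hypothetical path, for illustration only.
words = get_all_words("src/server.py", level=3)  # log/error strings plus comments
if words:
    print("extracted %d words" % len(words))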
Example #7
def crawl_highanon():
    """
    http://www.proxylists.net/http_highanon.txt
    :return:
    """
    url = 'http://www.proxylists.net/http_highanon.txt'
    response_html = get_text(url=url)
    hig_ip_list = []
    if response_html:
        tem = response_html.split('\n')
        for i in tem:
            if i == '':
                continue
            try:
                ip_port = i.split(':')
                hig_ip_list.append((ip_port[0], ip_port[1].replace('\r', '')))
                q.put((ip_port[0], ip_port[1].replace('\r', '')))
            except IndexError:
                # Skip lines that are not in ip:port form.
                continue
    return hig_ip_list
Example #8
def craw_rmccurdy():
    """
    https://www.rmccurdy.com/scripts/proxy/good.txt
    :return:
    """
    url = 'https://www.rmccurdy.com/scripts/proxy/good.txt'
    response_html = get_text(url=url)
    rmccurdy_ip_list = []
    if response_html:
        tem = response_html.split('\n')
        for i in tem:
            if i == '' or i == ':':
                continue
            try:
                ip_port = i.split(':')
                proxy_type = 'http'
                port = ip_port[1].replace('\r', '')
                rmccurdy_ip_list.append((proxy_type, ip_port[0], port))
                q.put((ip_port[0], port))
            except IndexError:
                # Skip lines that are not in ip:port form.
                continue
    return rmccurdy_ip_list
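Examples #7 and #8 both walk a plain-text ip:port listing line by line; that per-line handling could be factored into one helper. A sketch (the helper name is invented for illustration):

def parse_ip_port_lines(text):
    """Yield (ip, port) tuples from a plain-text ip:port listing."""
    for line in text.splitlines():
        line = line.strip()  # also drops a trailing '\r'
        if not line or ':' not in line:
            continue
        ip, _, port = line.partition(':')
        if ip and port:
            yield (ip, port)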
Example #9
def parse_text(file, level=2):
    """Read a file at the given word level and print its raw text for inspection."""
    print(file, level)
    raw_text = get_text(file, level=level)
    # print("raw text", "-"*80)
    print(raw_text)