import datetime
import re
from collections import defaultdict

import simplejson
from lxml import etree

# NOTE: `q` (a shared queue of (ip, port) tuples) and the project helpers
# `get_text`, `walk_dir`, `clean_text`, `parse_words`, `get_project_name`
# and `freq_get_file_name` are assumed to be defined elsewhere in this package.


def crawl_qy_dai_li(page_count=2):
    """
    Fetch free proxies from Qiyun proxy: http://www.qydaili.com/free/?action=china&page=1
    :param page_count: number of listing pages to crawl
    :return: list of (ip, port) tuples
    """
    start_url = "http://www.qydaili.com/free/?action=china&page={}"
    urls = [start_url.format(page) for page in range(1, page_count + 1)]
    qy_ip_list = []
    for url in urls:
        headers = {
            "Host": "www.qydaili.com",
            "Upgrade-Insecure-Requests": "1",
        }
        response_html = get_text(url, options=headers)
        if response_html:
            tree = etree.HTML(response_html)
            tr_list = tree.xpath('//table[@class="table table-bordered table-striped"]/tbody/tr')
            for tr in tr_list:
                try:
                    # proxy_type = tr.xpath("./td[4]/text()")[0].lower()
                    ip = tr.xpath("./td[1]/text()")[0]
                    port = tr.xpath("./td[2]/text()")[0]
                    q.put((ip, port))
                    qy_ip_list.append((ip, port))
                except IndexError:
                    # Row without the expected cells; skip it.
                    continue
    return qy_ip_list
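# A minimal sketch of the HTTP helper these crawlers assume. The project's
# real `get_text` is not shown here (elsewhere in this file it is also called
# with file paths and `level=`/`get_all=` arguments, so the real helper is
# richer); everything below except the `requests` API is an assumption.
def _get_text_sketch(url, options=None, timeout=10):
    import requests
    try:
        resp = requests.get(url, headers=options or {}, timeout=timeout)
        resp.raise_for_status()
        return resp.text
    except requests.RequestException:
        return None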
def crawl_3366_dai_li(page_count=5, stype='1'):
    """
    Fetch free proxies from Yun proxy (ip3366): http://www.ip3366.net/free/?stype=1&page=3
    :param page_count: number of listing pages to crawl
    :param stype: listing type query parameter
    :return: list of (ip, port) tuples
    """
    start_url = "http://www.ip3366.net/free/?stype=%s&page={}" % stype
    urls = [start_url.format(page) for page in range(1, page_count + 1)]
    ip_3366_list = []
    for url in urls:
        headers = {
            "Host": "www.ip3366.net",
            "Upgrade-Insecure-Requests": "1",
        }
        response_html = get_text(url, options=headers)
        if response_html:
            tree = etree.HTML(response_html)
            tr_list = tree.xpath('//table[@class="table table-bordered table-striped"]/tbody/tr')
            for tr in tr_list:
                # proxy_type = tr.xpath("./td[4]/text()")[0].lower()
                ip = tr.xpath("./td[1]/text()")[0]
                port = tr.xpath("./td[2]/text()")[0]
                q.put((ip, port))
                ip_3366_list.append((ip, port))
    return ip_3366_list
def crawl_66ip(page_count=2):
    """
    Crawl http://www.66ip.cn/{}.html
    """
    print(str(datetime.datetime.now()) + ' start crawling 66ip......')
    start_url = 'http://www.66ip.cn/{}.html'
    urls = [start_url.format(page) for page in range(1, page_count + 1)]
    sixsix_ip_list = []
    for url in urls:
        headers = {
            "Referer": url,
            "Host": "www.66ip.cn",
            "Upgrade-Insecure-Requests": "1",
        }
        response_html = get_text(url, options=headers)
        if response_html:
            pattern = re.compile(
                r"<tr><td>(\d+\.\d+\.\d+\.\d+)</td><td>(\d+)</td>", re.S)
            proxy_list = re.findall(pattern, response_html)
            for proxy in proxy_list:
                ip = proxy[0].strip()
                port = proxy[1].strip()
                q.put((ip, port))
                sixsix_ip_list.append((ip, port))
    return sixsix_ip_list
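# Sanity check for the pattern above (dots escaped so `.` cannot match
# arbitrary characters inside the IP); the sample row is illustrative only:
# >>> re.findall(r"<tr><td>(\d+\.\d+\.\d+\.\d+)</td><td>(\d+)</td>",
# ...            "<tr><td>1.2.3.4</td><td>8080</td>")
# [('1.2.3.4', '8080')]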
def crawl_89ip(page_count=2):
    """
    Fetch free proxies from http://www.89ip.cn/index.html
    :param page_count: number of listing pages to crawl
    :return: list of (ip, port) tuples
    """
    start_url = 'http://www.89ip.cn/index_{}.html'
    urls = [start_url.format(page) for page in range(1, page_count + 1)]
    p89_ip_list = []
    for url in urls:
        headers = {
            "Referer": url,
            "Host": "www.89ip.cn",
            "Upgrade-Insecure-Requests": "1",
        }
        response_html = get_text(url, options=headers)
        if response_html:
            tree = etree.HTML(response_html)
            tr_list = tree.xpath('//table[@class="layui-table"]/tbody/tr')
            for tr in tr_list:
                ip = tr.xpath("./td[1]/text()")[0].replace('\n', '').replace('\t', '')
                port = tr.xpath("./td[2]/text()")[0].replace('\n', '').replace('\t', '')
                q.put((ip, port))
                p89_ip_list.append((ip, port))
    return p89_ip_list
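# Hedged usage sketch: run the four page-based crawlers above and collect
# their results. The wrapper name is hypothetical; the crawlers also push
# every (ip, port) pair onto the shared queue `q` as a side effect.
def _collect_all_proxies_sketch():
    proxies = []
    proxies.extend(crawl_qy_dai_li(page_count=2))
    proxies.extend(crawl_3366_dai_li(page_count=5))
    proxies.extend(crawl_66ip(page_count=2))
    proxies.extend(crawl_89ip(page_count=2))
    return proxies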
def get_word_frequency(project=""): wfrq = defaultdict(int) for file in walk_dir(project): raw_text = get_text(file, get_all=True) clear_text = clean_text(raw_text) words = parse_words(clear_text) for word in words: wfrq[word.lower()] += 1 proj_name = get_project_name(project) data = simplejson.dumps(wfrq, indent=4, item_sort_key=lambda i: (-i[1], i[0])) with open(freq_get_file_name(proj_name), "w") as f: f.write(data) return wfrq
def get_all_words(file, level=2):
    """
    :param file: the target file to read
    :param level: word level to return,
                  2: code logging and error messages
                  3: 2 plus code comments
                  4: 3 plus project documents, markdown files etc.
    :return: list of words
    """
    raw_text = get_text(file, level=level)
    if not raw_text:
        return
    # print(file)
    # print("raw text:", raw_text)
    clear_text = clean_text(raw_text)
    # print("clean text:", clear_text)
    words = parse_words(clear_text)
    # print("words:", words)
    return words
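# Example (hypothetical file path): raising the level widens what is
# extracted, so level=3 also yields words found in code comments:
# words = get_all_words("proxy_crawler.py", level=3)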
def crawl_highanon():
    """
    Fetch high-anonymity proxies from http://www.proxylists.net/http_highanon.txt
    :return: list of (ip, port) tuples
    """
    url = 'http://www.proxylists.net/http_highanon.txt'
    response_html = get_text(url=url)
    hig_ip_list = []
    if response_html:
        for line in response_html.split('\n'):
            if line == '':
                continue
            try:
                ip, port = line.split(':')
                port = port.replace('\r', '')
                hig_ip_list.append((ip, port))
                q.put((ip, port))
            except ValueError:
                # Malformed line; skip it.
                continue
    return hig_ip_list
def craw_rmccurdy():
    """
    Fetch proxies from https://www.rmccurdy.com/scripts/proxy/good.txt
    :return: list of (proxy_type, ip, port) tuples
    """
    url = 'https://www.rmccurdy.com/scripts/proxy/good.txt'
    response_html = get_text(url=url)
    rmccurdy_ip_list = []
    if response_html:
        for line in response_html.split('\n'):
            if line == '' or line == ':':
                continue
            try:
                ip, port = line.split(':')
                port = port.replace('\r', '')
                proxy_type = 'http'
                rmccurdy_ip_list.append((proxy_type, ip, port))
                q.put((ip, port))
            except ValueError:
                # Malformed line; skip it.
                continue
    return rmccurdy_ip_list
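# crawl_highanon and craw_rmccurdy parse the same "ip:port" per-line format;
# a possible shared helper (a sketch, not part of the original code):
def _parse_ip_port_lines_sketch(text):
    pairs = []
    for line in text.split('\n'):
        line = line.strip()  # also drops the trailing '\r'
        if not line or ':' not in line:
            continue
        ip, _, port = line.partition(':')
        if ip and port:
            pairs.append((ip, port))
    return pairs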
def parse_text(file, level=2):
    """Debug helper: print the raw text extracted from a file at the given level."""
    print(file, level)
    raw_text = get_text(file, level=level)
    # print("raw text", "-"*80)
    print(raw_text)