def craw_ips_by_page(self, page):
    url = "https://www.xicidaili.com/nn/%s" % page
    downloader = download.Downloader()
    html_content = downloader.requests_get(url, "html")
    # If "0" is returned, the local IP is probably banned; fall back to the backup proxies.
    if html_content == "0":
        with open("ips_copy.json", "r") as f:
            ips = json.loads(f.read())
        for ip in ips:
            proxy_temp = {"http": "http://%s:%s" % (ip['ip'], ip['port'])}
            print("Local IP unavailable, trying http://%s:%s" % (ip['ip'], ip['port']))
            try:
                res = requests.get(url, timeout=1, proxies=proxy_temp)
                if res.status_code == 200:
                    html_content = res.content.decode("utf-8", "ignore")
                    break
            except requests.RequestException:
                continue
    soup = BeautifulSoup(html_content, 'html.parser')
    all_trs = soup.find("table", id="ip_list").find_all('tr')
    # Skip the header row, then read ip / port / type from each table row.
    for tr in all_trs[1:]:
        tds = tr.find_all("td")
        ip = {
            'ip': tds[1].get_text(),
            'port': tds[2].get_text(),
            'type': tds[5].get_text()
        }
        # ip = tds[1].get_text()
        # Check whether the IP is usable before adding it to the pool.
        if self.check_ip(ip):
            self.ip_pool.append(ip)
        if len(self.ip_pool) >= self.max_ip_num:
            break

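The snippet above relies on a check_ip helper that is not shown. A minimal sketch of what such a check might look like, assuming the same ip dict shape and an arbitrary probe URL (both are assumptions, not the original code):

def check_ip(self, ip):
    # Hypothetical helper: probe a test URL through the candidate proxy.
    proxy = {"http": "http://%s:%s" % (ip['ip'], ip['port'])}
    try:
        res = requests.get("http://httpbin.org/ip", timeout=2, proxies=proxy)
        return res.status_code == 200
    except requests.RequestException:
        return False
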
def getData(year, file, toDir):
    urls = []
    file = open(file)
    line = file.readline()
    while line:
        if year in line:
            print(line)
            urls.append(line)
        line = file.readline()
    file.close()
    # Iterate over the collected urls and download each one.
    for url in urls:
        print("——————————————————————————————————————————————")
        download.Downloader(toDir).donwnloading(url)

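A hypothetical invocation, with placeholder values for the year filter, the URL list file, and the output directory:

getData("2018", "url_list.txt", "./downloads/")
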
def __init__(self, root, data, threadNum):
    self.root = root
    self.threadNum = threadNum
    self.downloader = download.Downloader()
    self.headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20'
    }
    self.s_list = []
    self.links = []
    # self.total_count = len(data)
    # self.start_time = time.time()
    # sizex, sizey = getTerminalSize()
    # self.width = sizex
    # self.height = sizey
    for line in data:
        # print line
        self.links.append(line.strip())
    self.work()

def download_task(q):
    # Create a new downloader instance
    dl = download.Downloader()
    last_update = None
    while True:
        if last_update == hash(q):
            time.sleep(5)
            continue
        last_update = hash(q)
        # Get the next song that hasn't been downloaded
        song = q.get_next_song(only_undownloaded=True)
        if song is None:
            time.sleep(5)
            continue
        # Download the song and store the filename
        filename = dl.download(song["youtube_id"])
        # Update the database with the filename
        q.update_song(song["id"], filename=filename)

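Because download_task is an endless polling loop, it would normally run in a background thread. A minimal sketch, assuming a queue object named song_queue that exposes the get_next_song/update_song methods used above (the name is a placeholder):

import threading

# Start the download worker as a daemon thread so it exits with the main process.
worker = threading.Thread(target=download_task, args=(song_queue,), daemon=True)
worker.start()
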
def __init__(self, root, data, threadNum):
    self.root = root
    if not self.root:
        print('not url')
    self.threadNum = threadNum
    self.headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0'
    }
    self.task = Queue.Queue()
    self.s_list = []
    self.downloader = download.Downloader()
    self.total_count = len(data)
    self.start_time = time.time()
    sizex, sizey = getTerminalSize()
    self.width = sizex
    self.height = sizey
    for line in data:
        # print line
        self.task.put(line.strip())
    self.remaining_count = self.task.qsize()
    self.work()

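Both constructors above end by calling self.work(), which is not shown. A minimal sketch of a threaded consumer of self.task, written against Python 3's queue/threading modules and assuming a hypothetical Downloader.download(url) method; this is an illustration, not the original implementation:

import queue
import threading

def work(self):
    def worker():
        # Pull URLs until the task queue is drained.
        while True:
            try:
                url = self.task.get_nowait()
            except queue.Empty:
                return
            try:
                self.downloader.download(url)  # hypothetical download method
            finally:
                self.remaining_count = self.task.qsize()
                self.task.task_done()

    threads = [threading.Thread(target=worker) for _ in range(self.threadNum)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
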
def main():
    template_url = 'http://example.webscraping.com/ajax/search.json?page={}&page_size=10&search_term={}'
    countries = set()
    downloader = download.Downloader(mongo_cache.MongoCache())
    for letter in string.ascii_lowercase:
        page = 0
        while True:
            html = downloader(template_url.format(page, letter))
            print(html)
            try:
                ajax = json.loads(html)
            except ValueError as e:
                print(e)
                ajax = None
            else:
                for record in ajax['records']:
                    countries.add(record['country'])
            page += 1
            if ajax is None or page >= ajax['num_pages']:
                break
    with open('countries.txt', 'w') as fp:
        fp.write('\n'.join(sorted(countries)))

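The snippet above calls the downloader instance like a function and hands it a cache object at construction time. A minimal sketch of that callable-with-cache pattern, assuming dict-style cache access; this is an illustration, not the original download.Downloader:

import requests

class Downloader:
    def __init__(self, cache=None):
        self.cache = cache

    def __call__(self, url):
        # Serve from the cache when possible; otherwise fetch and store the result.
        if self.cache is not None:
            try:
                return self.cache[url]
            except KeyError:
                pass
        html = requests.get(url).text
        if self.cache is not None:
            self.cache[url] = html
        return html
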
def direct_download(url):
    downloader = download.Downloader()
    return downloader(url)

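A hypothetical call, with a placeholder URL:

html = direct_download('http://example.com')
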
# Fix includes.
dst.write(re.sub(r'# *include "(format.h|posix.h)"', r'#include "mp/\1"', line))

def extract(archive, filenames, dest, archive_dir, **kwargs):
    dest = os.path.join(project_dir, dest)
    if kwargs.get('clean'):
        fileutil.rmtree_if_exists(dest)
        os.mkdir(dest)
    for filename in filenames:
        dest_path = os.path.join(dest, filename)
        if filename.endswith('/'):
            if not os.path.exists(dest_path):
                os.mkdir(dest_path)
            continue
        with archive.open(archive_dir + filename) as src:
            with open(dest_path, 'w') as dst:
                copyfile(src, dst)

d = download.Downloader()
with d.download('https://github.com/cppformat/cppformat/archive/master.zip') as f:
    with zipfile.ZipFile(f, 'r') as zf:
        root = 'cppformat-master/'
        extract(zf, include_files, 'include/mp', root)
        extract(zf, src_files, 'src', root)
        extract(zf, test_files, 'test', root + 'test/')

def main(bucket, filter, error, no_dl):
    if not no_dl:
        dl = download.Downloader(bucket, filter)
        dl.run()
    process.Processor(error).run()