def download(self, unfinished_item, need_mark_err=True):
    """Download the PDF for a single pending item and record the outcome.

    Args:
        unfinished_item: Indexable record; element 0 is the URL to fetch and
            element 1 is the id used as the file name stem (``<id>.pdf``).
        need_mark_err: When true, a failure is reported through
            ``self.mark(id, ok=False, err=...)``.

    Returns:
        True when the file was already listed in
        ``self.download_folder_files`` or was fetched and passed the size
        check; False when any step raised.
    """
    min_size_kb = 3
    # Bound before the try block so the error handler can tell whether the
    # id was ever extracted (a malformed item fails before it is assigned).
    google_id = None
    try:
        url = unfinished_item[0]
        google_id = unfinished_item[1]
        save_name = google_id + '.pdf'
        # Only fetch when the file name is not already known as downloaded.
        if save_name not in self.download_folder_files:
            # NOTE(review): no timeout is passed, so a stalled server can
            # block this call indefinitely -- consider adding one.
            resp = requests.get(
                url=url,
                verify=False,
                headers={'User-agent': get_one_random_ua()}
            )
            save_path = os.path.join(self.save_folder, save_name)
            with open(save_path, 'wb') as pdf_file:
                pdf_file.write(resp.content)
            # Measure exactly the path that was written; plain string
            # concatenation breaks when save_folder lacks a trailing separator.
            file_kb = os.path.getsize(save_path) * 1.0 / 1024
            if file_kb <= min_size_kb:
                # Very small responses are treated as failed downloads.
                raise Exception(
                    'File size = {}k, expected more than {}k'.format(file_kb, min_size_kb)
                )
            self.download_folder_files.append(save_name)
            print('Downloader:\n\t' + save_name + '( {} Kb ) wrote ok...'.format(file_kb))
        # Record the item as downloaded (the original comment says this goes
        # to the database; the storage behind self.mark is not visible here).
        self.mark(google_id, ok=True)
        return True
    except Exception as e:
        # Without an id there is nothing to mark; still report the failure.
        if need_mark_err and google_id is not None:
            self.mark(google_id, ok=False, err=e)
        print(str(e))
        return False
def request_with_random_ua(url, timeout=3):
    """GET ``url`` with a random User-Agent, retrying on any exception.

    Up to six attempts are made; each attempt asks ``get_one_random_ua()``
    for a fresh User-Agent header. Every failure is printed.

    Args:
        url: Address to fetch.
        timeout: Timeout forwarded to ``requests.get``.

    Returns:
        The response of the first attempt that does not raise, or None when
        all six attempts raised.
    """
    attempts_left = 6
    while attempts_left > 0:
        attempts_left -= 1
        try:
            response = requests.get(
                url=url,
                timeout=timeout,
                headers={'User-Agent': get_one_random_ua()},
            )
        except Exception as exc:
            print('[Error]request_with_random_ua :%s' % str(exc))
        else:
            return response
    return None
def test_port(port_num):
    """Probe a local SOCKS5 proxy port by fetching a page through it.

    Args:
        port_num: Port on 127.0.0.1 to use as the SOCKS5 proxy for both
            http and https traffic.

    Returns:
        The body text returned by ``https://api.ipify.org/`` when the request
        completes, or None when the request raised.
    """
    proxy_url = "socks5://127.0.0.1:{}".format(port_num)
    proxies = {"http": proxy_url, "https": proxy_url}
    try:
        r = requests.get(
            url="https://api.ipify.org/",
            proxies=proxies,
            timeout=10,
            headers={'User-Agent': get_one_random_ua()},
        )
        return r.text
    except Exception:
        # Any request failure means the port is unusable. A bare ``except``
        # would also swallow KeyboardInterrupt/SystemExit, making a port scan
        # impossible to interrupt.
        return None
def request_with_proxy(url, timeout=14, use_ss=False, sleep=15, no_proxy_test=False):
    """GET ``url`` through a randomly chosen local SOCKS5 proxy port.

    Args:
        url: Address to fetch.
        timeout: Timeout forwarded to ``requests.get``.
        use_ss: Selects the port pool: False draws from 9054-9155, True draws
            from 1080-1108 with 1094 and 1098 passed to ``rand_port`` as the
            third argument (presumably ports to avoid -- confirm in
            ``rand_port``).
        sleep: Seconds to wait before a proxied request.
        no_proxy_test: When true, fetch directly with no proxy, no delay and
            default certificate verification.

    Returns:
        The ``requests`` response object. Exceptions are not caught here.
    """
    ua_headers = {'User-Agent': get_one_random_ua()}

    # Direct mode bypasses both the delay and the proxy.
    if no_proxy_test:
        return requests.get(url, headers=ua_headers, timeout=timeout)

    time.sleep(sleep)

    # A pre-flight check of the chosen port via test_port() used to sit in
    # the non-ss branch; it was disabled in the original and is not run.
    if use_ss:
        chosen_port = rand_port(1080, 1108, [1094, 1098])
    else:
        chosen_port = rand_port(9054, 9155, [])

    socks_url = "socks5://127.0.0.1:{}".format(chosen_port)
    return requests.get(
        url,
        proxies={"http": socks_url, "https": socks_url},
        headers=ua_headers,
        timeout=timeout,
        verify=False,
    )
def request_with_proxy(url, gap_time=15, timeout=14, use_ss=False, no_proxy_test=False, use_self_pool=False):
    """GET ``url`` via a proxy pool, directly, or through a local SOCKS5 port.

    The modes are checked in this order: ``use_self_pool``, then
    ``no_proxy_test``, then the local SOCKS5 port pools.

    Args:
        url: Address to fetch.
        gap_time: Seconds to wait before a request through a local port.
        timeout: Timeout forwarded to ``requests.get``.
        use_ss: Selects the local port pool: False draws from 9054-10055,
            True draws from 1080-1108 with 1094 and 1098 passed to
            ``rand_port`` as the third argument (presumably ports to avoid --
            confirm in ``rand_port``).
        no_proxy_test: When true, fetch directly with no proxy and no delay.
        use_self_pool: When true, delegate to ``req_with_proxy_pool`` with
            the generated headers; ``timeout`` is not forwarded on this path.

    Returns:
        Whatever ``req_with_proxy_pool`` returns in pool mode, otherwise the
        ``requests`` response object. Exceptions are not caught here.
    """
    ua_headers = {'User-Agent': get_one_random_ua()}

    if use_self_pool:
        return req_with_proxy_pool(url, ua_headers)

    # Direct mode bypasses both the delay and the proxy.
    if no_proxy_test:
        return requests.get(url, headers=ua_headers, timeout=timeout)

    time.sleep(gap_time)

    if use_ss:
        chosen_port = rand_port(1080, 1108, [1094, 1098])
    else:
        chosen_port = rand_port(9054, 10055, [])

    socks_url = "socks5://127.0.0.1:{}".format(chosen_port)
    return requests.get(
        url,
        proxies={"http": socks_url, "https": socks_url},
        headers=ua_headers,
        timeout=timeout,
        verify=False,
    )
def request_with_random_ua(url, timeout=3):
    """GET ``url`` once, sending a User-Agent from ``get_one_random_ua()``.

    Args:
        url: Address to fetch.
        timeout: Timeout forwarded to ``requests.get``.

    Returns:
        The ``requests`` response object. Exceptions are not caught here.
    """
    request_kwargs = {
        'url': url,
        'timeout': timeout,
        'headers': {'User-Agent': get_one_random_ua()},
    }
    return requests.get(**request_kwargs)