class TranslatePO(object):

    def __init__(self, dest: str = 'en', src: str = 'auto', proxy_size=20, th_size=0):
        self.dest = dest
        self.src = src
        self.proxies = Proxies(proxy_len=proxy_size)
        self.proxies.verify_proxies()
        self.loading_index = 0
        self.loading_len = 0
        self.th_size = th_size

    def __progress(self):
        self.loading_index += 1
        progress = self.loading_index * 100 // self.loading_len
        print('Loading: %s%%%s' % (progress, " " * 10), end='\r')

    def __translate(self, line):
        self.__progress()
        curr_proxy = self.proxies.done_proxies.get()
        tr = Translator(proxies=curr_proxy, timeout=10)
        res = None
        try:
            res = tr.translate(line.msgid, dest=self.dest, src=self.src)
        except Exception:
            # Retry once with a fresh proxy if the first attempt fails
            curr_proxy = self.proxies.done_proxies.get()
            tr = Translator(proxies=curr_proxy, timeout=10)
            res = tr.translate(line.msgid, dest=self.dest, src=self.src)
        finally:
            # Return the proxy to the pool for other workers
            self.proxies.done_proxies.put(curr_proxy)
        line.msgstr = res.text if res else ''

    def po_translate(self, file: str, out='res.po'):
        po_file = pofile(file)
        self.loading_len = len(po_file)
        if self.th_size:
            th = ThreadPool(self.th_size, self.__translate, data=po_file)
            th.start()
            th.join()
        else:
            for el in po_file:
                self.__translate(el)
        print('\nFINISH')
        with open(out, 'w') as out_file:
            for po_line in po_file:
                out_file.write('%s\n' % str(po_line))
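# Usage sketch (an assumption, not part of the snippet above): translating a
# .po file into French with eight worker threads. Proxies, ThreadPool,
# Translator and pofile are the project-local/third-party helpers the class
# already relies on; the file names here are hypothetical.
if __name__ == '__main__':
    translator = TranslatePO(dest='fr', src='en', proxy_size=20, th_size=8)
    translator.po_translate('messages.po', out='messages_fr.po')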
def start(key, sort_type):
    createfile(key)
    proxies = Proxies().main()
    # proxies = {}
    url_list = []
    for page_num in range(1, 15):
        try:
            url_list = search(key, sort_type, page_num, proxies)
            # print('****', url_list)
            for url in url_list:
                link(key, url, proxies)
                time.sleep(random.uniform(1, 3))
        except Exception as e:
            print('* error *', e)
        if len(url_list) == 0:
            # Refresh the proxy pool when a page yields no results
            proxies = Proxies().main()
def get_source_code_submissions(proxies,
                                proxy_index,
                                contest_id,
                                submission_ids,
                                pbar,
                                submission_index=0,
                                iterations=1000):
    if iterations > 0:
        # Specify a random user agent
        headers = {'User-Agent': 'anything'}

        # If all proxies have been exhausted, get a new list of proxies
        if proxy_index >= len(proxies):
            proxies = Proxies.get_proxies()
            proxy_index = 0

        # Set the proxy for the requests
        proxy_dict = {
            'http': f'{proxies[proxy_index]}',
            'https': f'{proxies[proxy_index]}',
        }

        try:
            # Go over all submissions and make a request to the submission page
            for submission in submission_ids[submission_index:]:
                response = requests.get(
                    f'https://codeforces.com/contest/{contest_id}/submission/{submission}',
                    timeout=10,
                    headers=headers,
                    proxies=proxy_dict,
                    allow_redirects=False)

                # Transform to soup object with html parser
                soup = BeautifulSoup(response.content, 'html.parser')

                # Find the actual program code on the page
                solution = soup.find("pre", {
                    "id": "program-source-text"
                }).text.strip()

                os.makedirs(f'data/contests_solutions_code/{contest_id}/',
                            exist_ok=True)
                text_file = open(
                    f'data/contests_solutions_code/{contest_id}/{submission}.txt',
                    "w")
                text_file.write(solution)
                text_file.close()

                # Update where we are for if we need to update the proxy, and update the progress bar
                submission_index += 1
                pbar.update()
        except Exception:
            proxy_index += 1
            iterations -= 1
            Scraper.get_source_code_submissions(proxies, proxy_index,
                                                contest_id, submission_ids,
                                                pbar, submission_index,
                                                iterations)
class HttpProxyMiddleware(object):

    def __init__(self, proxy_file, proxy_bypass_percent, **kwargs):
        self.bypass_percent = int(proxy_bypass_percent)
        self.proxies = Proxies(proxy_file, **kwargs)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            crawler.settings.get(
                'PROXY_SERVER_LIST_CACHE_FILE', 'proxies.txt'
            ),
            crawler.settings.get(
                'PROXY_BYPASS_PERCENT', 0
            ),
            logger=lambda message: log.msg(message),
        )

    def process_request(self, request, spider):
        n = random.randint(0, 99)
        if n >= self.bypass_percent:
            proxy = self.proxies.get_proxy()
            log.msg('Using proxy ' + proxy, spider=spider)
            request.meta['proxy'] = 'http://' + proxy
        else:
            if 'proxy' in request.meta:
                del request.meta['proxy']
            log.msg('No proxy used', spider=spider)
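# A minimal settings.py sketch for wiring this middleware into a Scrapy
# project. Only the two setting names come from from_crawler() above; the
# module path and priority value are assumptions for illustration.
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.HttpProxyMiddleware': 750,  # hypothetical module path
}
PROXY_SERVER_LIST_CACHE_FILE = 'proxies.txt'
PROXY_BYPASS_PERCENT = 20  # roughly one in five requests goes out without a proxy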
def main():
    loop = asyncio.get_event_loop()
    with aiohttp.ClientSession(loop=loop) as session:
        proxies = Proxies(
            session, 30,
            'http://gimmeproxy.com/api/getProxy?protocol=http&supportsHttps=true&maxCheckPeriod=3600'
        )
        area = Area('https://www.freecycle.org/browse/UK/London')
        groups = loop.run_until_complete(
            area.get_groups(session, proxies, SEARCH_TERMS, FROM, TO))
        display(groups)
class HttpProxyMiddleware(object):

    def __init__(self, cache_file):
        if cache_file is None:
            cache_file = '/tmp/__proxy_list_cache.json'
        self.proxies = Proxies(cache_file)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings['PROXY_SERVER_LIST_CACHE_FILE'])

    def process_request(self, request, spider):
        proxy = self.proxies.getProxy()
        print("Using proxy %s" % proxy)
        request.meta['proxy'] = "http://%s" % proxy
class Requests:

    def __init__(self):
        self.proxies = Proxies()

    def request(self, method, url, **kwargs):
        exception = None
        TRIES = 10
        for i in range(TRIES):
            try:
                with self.proxies.borrow() as proxy:
                    kwargs['proxies'] = dict(
                        http='http://%s:%s' % (proxy.host, proxy.port),
                        https='https://%s:%s' % (proxy.host, proxy.port))
                    kwargs['timeout'] = (10, 20)
                    response = requests.request(method=method, url=url, **kwargs)
                    if response.headers['content-type'] == 'application/json':
                        response.json()  # avoiding incorrect json even if 200
                    return response
            except Exception as e:
                log.warning('Got exception %s try(%s/%s)' % (e, i + 1, TRIES))
                exception = e
        raise exception

    def get(self, url, params=None, **kwargs):
        kwargs.setdefault('allow_redirects', True)
        return self.request('get', url, params=params, **kwargs)

    def options(self, url, **kwargs):
        kwargs.setdefault('allow_redirects', True)
        return self.request('options', url, **kwargs)

    def head(self, url, **kwargs):
        kwargs.setdefault('allow_redirects', False)
        return self.request('head', url, **kwargs)

    def post(self, url, data=None, json=None, **kwargs):
        return self.request('post', url, data=data, json=json, **kwargs)

    def put(self, url, data=None, **kwargs):
        return self.request('put', url, data=data, **kwargs)

    def patch(self, url, data=None, **kwargs):
        return self.request('patch', url, data=data, **kwargs)

    def delete(self, url, **kwargs):
        return self.request('delete', url, **kwargs)
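# Usage sketch (assumed, not part of the snippet above): the wrapper mirrors
# the requests API and retries each call up to 10 times, borrowing a fresh
# proxy per attempt. The URL is illustrative only.
client = Requests()
resp = client.get('https://httpbin.org/ip')
print(resp.status_code, resp.json())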
def baiduzhidaosearch(keyword, page):
    ret = {
        'code': 1002,
        'msg': 'failure',
        'data': []
    }
    try:
        page = int(page) * 10
        url = u'http://zhidao.baidu.com/search?ct=17&pn=%s&tn=ikaslist&rn=10&word=%s' % (page, keyword)
        headers = Headers.getHeaders()
        proxies = Proxies.get_proxies()
        req = requests.get(url, headers=headers, timeout=60, proxies=proxies)
        if req.status_code == 200:
            ret['code'] = 1001
            ret['msg'] = 'success'
            id = []
            title = []
            req.encoding = 'gbk'
            html = req.text.encode(encoding="utf-8", errors="ignore").decode("utf-8", errors="ignore")
            selector = etree.HTML(html)
            # Collect the question ids from the result links
            urls = selector.xpath('//div[@class="list"]/dl/dt[1]/a/@href')
            for u in urls:
                match_obj = re.search(r'question/(.*?).html', u, re.M | re.I)
                id.append(match_obj.group(1))
            # Collect the raw <a> tags so the titles can be extracted below
            titles = selector.xpath('//div[@class="list"]/dl/dt[1]/a')
            for t in titles:
                title.append(etree.tostring(t, encoding='utf8', method="html"))
            for n in range(len(id)):
                ret['data'].append({
                    'cid': id[n],
                    'title': re.search(r'"ti">(.*?)</a>', title[n], re.M | re.I).group(1)
                })
    except Exception as e:
        print(e)
    return simplejson.dumps(ret)
def main():
    if len(sys.argv) < 2:
        print("Usage: " + sys.argv[0] + " <board>")
        return

    board = sys.argv[1]

    posts = {}
    try:
        posts = json.loads(open("data/posts_%s.json" % board).read())
    except Exception:
        pass

    regexps_like = [
        regex.split("\n")[0]
        for regex in open("data/regexps_like").readlines()
    ]
    regexps_dislike = [
        regex.split("\n")[0]
        for regex in open("data/regexps_dislike").readlines()
    ]
    comparator_dislike = Comparator(
        open("data/comparator.wasm", "rb").read(), [
            base64.b64decode(image.split("\n")[0])
            for image in open("data/images").readlines()
        ])
    checker = Checker(regexps_like, regexps_dislike, comparator_dislike)

    proxies = Proxies(
        [proxy.split("\n")[0] for proxy in open("data/proxies").readlines()])
    network = Network(proxies, 10)
    liker = Liker(board, checker, posts, network)

    network.start()
    network.join()
import time

import requests
from lxml import etree

from headers import Headers
from config import Config
from mysqldao import MysqlDao
from proxies import Proxies

url_main = 'http://www.bttiantang.com/movie.php?/order/id/'
page_count = 10  # the full total is 1367 pages; update this value periodically

for one in xrange(1, page_count):
    url = url_main + str(one)
    print(url)
    headers = Headers.getHeaders()
    try:
        proxies = Proxies.get_proxies()
        req = requests.get(url, headers=headers, timeout=30, proxies=proxies)
        code = req.status_code
        print(code)
        if code == 200:
            html = req.content
            selector = etree.HTML(html)
            content_urls = selector.xpath('//*[contains(@class,"litpic")]/a[1]/@href')
            for content_url in content_urls:
                content_url = Config.url_main + content_url
                created_at = time.strftime('%Y-%m-%d %H:%M:%S')
                insert_value = '"' + content_url + '",0,"' + created_at + '"'
                mysqlDao = MysqlDao()
                sql = 'select url from bttiantang_url ORDER BY id desc limit 0,1 '
                ret = mysqlDao.execute(sql)
                for r in ret:
#!/usr/bin/env python3
""""""

from logger import Logger
from proxies import Proxies
from scraper import Scraper
from spoofing import Headers

if __name__ == "__main__":
    headers = Headers()
    logger = Logger()
    proxies = Proxies(headers, logger)
    scraper = Scraper(headers, proxies, logger)
    scraper.scrape()
run = True
task = 0
while run:
    task_n = show_task()
    try:
        task = int(task_n)
    except ValueError:
        print('Please enter a valid option number')
        continue
    if task == 0:
        run = False
    elif task == 1:
        create_db()
    elif task == 2:
        Proxies().run()
    elif task == 3:
        # Remove the cached data directory before re-crawling
        data_dir = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'data')
        while os.path.exists(data_dir):
            shutil.rmtree(data_dir)
        WeChatSogou().run_hy()
    elif task == 4:
        names = input('''
            ** Enter the official account(s) to crawl (multiple allowed, comma-separated):
        ''').strip('"')
        names = names.split(',')
        proxy = input('''
            ** Optionally enter a proxy manually (ip:port, only one accepted; press Enter for automatic):
        ''')
        WeChatSogou().run_gzh(names=names, iproxy=proxy)
import datetime
import time

from combiner import CSVCombiner
from downloader import CanWeatherDataDownloader
from proxies import Proxies

# Timestamped download directory so repeated runs do not collide
st = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d-%H%M%S')

dldr = CanWeatherDataDownloader()
dldr.set_download_dir('./download/' + st + '/')
dldr.set_proxies(Proxies.get_random_proxy())
dldr.all_station_id = dldr.read_station_id_from_file('./station_id.txt')
dldr.download_daily_data(1990, 2020, thread=20)

csv_combiner = CSVCombiner(dldr.download_dir, 'weather_data.csv').run()
def get_solutions(amt_threads=40):
    # Get all contests that have already been scraped
    scraped_contests = []
    for _, dirs, _ in os.walk('data/contests_solutions_code'):
        scraped_contests += [int(dir) for dir in dirs]

    # Get the metadata files to find submission ids to scrape solutions for
    metadata_files = [
        file for file in os.listdir('data/contests_solutions_metadata')
        if os.path.isfile(f'data/contests_solutions_metadata/{file}')
    ]
    if len(metadata_files) == 0:
        print('Run "get_solutions_metadata" first to fetch the solutions to '
              'scrape before running this function')

    for metadata_file in metadata_files:
        # Open the solutions metadata file where solutions will be coupled to
        solutions_df = pd.read_csv(
            f'data/contests_solutions_metadata/{metadata_file}')

        # Get all unique contests in this dataset that are not scraped yet
        contests = list(
            solutions_df[~solutions_df['contestId'].isin(scraped_contests)]
            ['contestId'].unique())

        for contest in contests:
            print(f'starting with contest {contest}...')
            contest_submissions = solutions_df[solutions_df['contestId'] ==
                                               contest]['solutionId']

            # Work with amt of threads to parallelize the requests
            threads = []
            submissions_per_thread = math.ceil(
                len(contest_submissions) / amt_threads)
            proxies = Proxies.get_proxies()

            # Progress bar to indicate current progress and speed
            pbar = tqdm(total=len(contest_submissions))

            for index in range(0, len(contest_submissions),
                               submissions_per_thread):
                # Let every thread start with a random proxy to spread the search space
                proxy_index = random.randrange(0, len(proxies))

                # Get solutions for the contests from the scraper
                threads.append(
                    threading.Thread(
                        target=Scraper.get_source_code_submissions,
                        args=(
                            proxies,
                            proxy_index,
                            contest,
                            contest_submissions[index:index +
                                                submissions_per_thread],
                            pbar,
                        )))

            for t in threads:
                t.start()
            for t in threads:
                t.join()