import proxyscrape as ps


def genproxy(number: int):
    """Collect `number` anonymous proxies, removing each from the collector."""
    collector = ps.get_collector(name='ProxyGen')
    collector.refresh_proxies()
    proxy_list = []
    for _ in range(number):
        proxy = collector.get_proxy({'anonymous': True})
        proxy_list.append(proxy)
        # Remove the proxy so the next call returns a different one
        collector.remove_proxy(proxy)
    return proxy_list
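# Example usage -- a minimal sketch; it assumes a 'ProxyGen' collector was
# created elsewhere (the 'http' resource type below is illustrative, not from
# the source):
#
#     ps.create_collector('ProxyGen', 'http')
#     for proxy in genproxy(3):
#         print(f'{proxy.host}:{proxy.port}')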
from proxyscrape import create_collector, get_collector
from proxyscrape.errors import CollectorNotFoundError


def _get_free_proxies_collector():
    """Retrieve or create a Collector of free proxies.

    :return: Collector object
    """
    try:
        collector = get_collector('scraping-proxies')
    except CollectorNotFoundError:
        # First call: no collector registered under this name yet
        collector = create_collector('scraping-proxies', ['socks4', 'socks5'])
    return collector
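# Usage sketch -- collectors are registered globally by name, so repeated
# calls return the same 'scraping-proxies' collector:
#
#     collector = _get_free_proxies_collector()
#     proxy = collector.get_proxy()  # a random SOCKS4/SOCKS5 proxy
#     assert _get_free_proxies_collector() is collector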
import logging
from typing import Set

from proxyscrape import create_collector, get_collector, scrapers
from proxyscrape.errors import CollectorAlreadyDefinedError

logger = logging.getLogger(__name__)


def proxyscrape_lib(self) -> Set[str]:
    """Parse proxies from the proxyscrape library."""
    free_proxies = scrapers.get_free_proxy_list_proxies()
    ssl_proxies = scrapers.get_ssl_proxies()
    try:
        collector = create_collector("default", "http")
    except CollectorAlreadyDefinedError:
        collector = get_collector("default")
    collector_proxies = set(collector.get_proxies())
    # Merge all three sources; self.proxy_set deduplicates host:port strings
    proxies = free_proxies | ssl_proxies | collector_proxies
    for proxy in proxies:
        self.proxy_set.add(f"{proxy.host}:{proxy.port}")
    logger.info(
        f"Parsed {len(self.proxy_set)} proxies from proxyscrape_lib")
    return self.proxy_set
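# Usage sketch -- proxyscrape_lib is written as a method (note `self`), so it
# assumes a host class exposing a `proxy_set` set attribute; the class name
# below is illustrative:
#
#     class ProxyFetcher:
#         def __init__(self):
#             self.proxy_set = set()
#         proxyscrape_lib = proxyscrape_lib  # bind the function as a method
#
#     fetcher = ProxyFetcher()
#     print(len(fetcher.proxyscrape_lib()))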
import time

import proxyscrape


def get_proxies(desired_amount: int = 1, proxy_timeout=0.5):
    """Collect HTTP proxies, keep the first `desired_amount` that respond
    within `proxy_timeout` seconds, and append them to proxy_list.txt.

    `bs` (console helpers) and `bsl` (localized strings) are project-local
    modules assumed to be imported at module level.
    """
    proxies = []
    # https://stackoverflow.com/a/59531141
    try:
        collector_1 = proxyscrape.get_collector('collector-http')
    except proxyscrape.errors.CollectorNotFoundError:
        collector_1 = proxyscrape.create_collector('collector-http', 'http')
    full_list = list(collector_1.get_proxies())
    for item in full_list:
        proxies.append(item.host + ':' + item.port)
    print(bs.warning_o(bsl["PROXY"]["FOUND"]),
          bs.red_o(str(len(proxies))),
          bs.warning_o(bsl["PROXY"]["HTTP_PROXIES"]))
    print(bs.warning_o(bsl["PROXY"]["PERFORMANCE_CHECK"]))
    print(bs.warning_o(bsl["GENERAL"]["CTRL_Z_EXIT"]))
    time.sleep(1)
    bs.clear()
    start_time = time.time()
    cnt = 0
    print(bs.warning_o(bsl["PROXY"]["CHECKED"]) + bs.red_o(' 0 ') +
          bs.warning_o(bsl["PROXY"]["OUT_OF"]),
          bs.red_o(str(len(proxies))),
          bs.warning_o(bsl["PROXY"]["PROXIES_WITH_TIMEOUT"]),
          bs.red_o(str(proxy_timeout)),
          bs.warning_o(bsl["PROXY"]["SECONDS_3"]))
    print(bs.warning_o(bsl["PROXY"]["CHOSEN"]),
          bs.red_o(str(cnt)),
          bs.warning_o(bsl["PROXY"]["OUT_OF"]),
          bs.red_o(str(desired_amount)),
          bs.warning_o(bsl["PROXY"]["PROXIES_WITH_TIMEOUT"]),
          bs.red_o(str(proxy_timeout)),
          bs.warning_o(bsl["PROXY"]["SECONDS_3"]))
    print(bs.warning_o(bsl["GENERAL"]["CTRL_Z_EXIT"]))
    checked_proxy = []
    ind = 0  # guard: enumerate never runs if the proxy list is empty
    for ind, item in enumerate(proxies, start=1):
        if cnt < desired_amount:
            if bs.is_bad_proxy(item, proxy_timeout):
                print('[BAD PROXY]')
            else:
                checked_proxy.append(item)
                cnt += 1
        else:
            break
    bs.clear()
    print(bs.warning_o(bsl["PROXY"]["CHECKED"]),
          bs.red_o(str(ind)),
          bs.warning_o(bsl["PROXY"]["OUT_OF"]),
          bs.red_o(str(len(proxies))),
          bs.warning_o(bsl["PROXY"]["PROXIES_WITH_TIMEOUT"]),
          bs.red_o(str(proxy_timeout)),
          bs.warning_o(bsl["PROXY"]["SECONDS_3"]))
    print(bs.warning_o(bsl["PROXY"]["CHOSEN"]),
          bs.okgreen_o(str(cnt)),
          bs.warning_o(bsl["PROXY"]["OUT_OF"]),
          bs.red_o(str(desired_amount)),
          bs.warning_o(bsl["PROXY"]["PROXIES_WITH_TIMEOUT"]),
          bs.red_o(str(proxy_timeout)),
          bs.warning_o(bsl["PROXY"]["SECONDS_3"]))
    print(bs.warning_o(bsl["PROXY"]["EXIT_WARN"]))
    print(bs.warning_o(bsl["GENERAL"]["CTRL_Z_EXIT"]))
    end_time = time.time()
    extra_message = (bsl["PROXY"]["APPENDED"], str(cnt),
                     bsl["PROXY"]["PROXIES_WITH_TIMEOUT"], str(proxy_timeout),
                     bsl["PROXY"]["SECONDS_1"],
                     bsl["PROXY"]["TO_THE_PROXY_LIST"], bsl["PROXY"]["IN"],
                     str(round(end_time - start_time, 2)),
                     bsl["PROXY"]["SECONDS_1"] + ']')
    extra_message = bs.success_o(' '.join(extra_message))
    with open('proxy_list.txt', 'a') as infl:
        for item in checked_proxy:
            infl.write(item + '\n')
    bs.print_full_main_screen(extra_message)
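# Example usage -- a sketch; checks proxies with the project-local
# bs.is_bad_proxy helper and appends the working ones to proxy_list.txt:
#
#     get_proxies(desired_amount=10, proxy_timeout=0.5)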
def process_page(url, path='', find_pages=False, page_size=50):
    import os
    import re
    import csv
    import math
    import time

    import requests
    from bs4 import BeautifulSoup
    from proxyscrape import create_collector, get_collector
    from proxyscrape.errors import CollectorNotFoundError

    # Processes in multiprocessing have separate contexts, so everything
    # (including imports) lives inside this function.

    class Page():
        status_code = 0

    # Returns the number of pages in the full search
    def how_many_pages(url, http, http_collector, page_size):
        page = get_page(url, http, http_collector)
        content = BeautifulSoup(page.content, 'lxml')
        result_count = content.find('span', class_='result__count').text
        result_count = result_count.split(' ')[0].replace(',', '')
        pages = math.ceil(int(result_count) / page_size)
        return pages

    # Despite the 'http-collector' name, the collector gathers https-capable
    # proxies so both schemes in setup_new_proxies can be served
    def make_collector(page_i=''):
        http_collector = create_collector(f'http-collector-{page_i}', 'https')
        return http_collector

    # Returns a requests session with proxies set (http, https)
    def setup_new_proxies(http_collector, http):
        proxy_http = http_collector.get_proxy()
        proxy_https = http_collector.get_proxy({'type': 'https'})
        http.proxies = {
            'http': f'http://{proxy_http.host}:{proxy_http.port}',
            'https': f'https://{proxy_https.host}:{proxy_https.port}'
        }
        return http

    def create_new_session(http_collector):
        http = requests.Session()
        http = setup_new_proxies(http_collector, http)
        return http

    # Returns the index of the page from the url, zero-padded below 10
    def get_page_i(url):
        if 'startPage' in url:
            page_i = next(re.finditer(r'\d+$', url)).group(0)
            page_i = int(page_i) + 1
        else:
            page_i = 1
        if page_i < 10:
            page_i = '0' + str(page_i)
        return page_i

    # Returns the page at url, rotating proxies until a 200 response
    def get_page(url, http, http_collector):
        page = Page()
        start = time.time()
        while True:
            try:
                page = http.get(url, timeout=6)
                if page.status_code == 200:
                    break
            except BaseException:
                pass
            finally:
                print('CHANGE PROXY PAGE: ', page.status_code,
                      http.proxies['https'])
                if page.status_code != 200:
                    http = setup_new_proxies(http_collector, http)
        end = time.time()
        page_i = get_page_i(url)
        print(f'R Page {page_i}: ', end - start)
        return page

    def process_result(result, http, http_collector):

        def get_full_result(DOI, http, http_collector):
            full_page = Page()
            while True:
                try:
                    full_page = http.get('https://dl.acm.org' + DOI,
                                         timeout=10)
                    if full_page.status_code == 200:
                        break
                except BaseException:
                    pass
                finally:
                    print('FULL PAGE: ', full_page.status_code,
                          http.proxies['https'])
                    if full_page.status_code != 200:
                        http = setup_new_proxies(http_collector, http)
            return full_page

        def parse_full_result(full_result):
            full_result_parsed = BeautifulSoup(full_result.content, 'lxml')
            authors = full_result_parsed.find_all('span',
                                                  class_='loa__author-name')
            authors_processed = set()
            for author in authors:
                authors_processed.add(
                    author.text.encode('ascii', 'namereplace').decode())
            authors_string = ', '.join(list(authors_processed))
            abstract = full_result_parsed.find(
                'div', class_='abstractSection abstractInFull')
            if abstract is not None:
                abstract_processed = abstract.text.encode(
                    'ascii', 'namereplace').decode()
            else:
                abstract_processed = 'No abstract available.'
            full_result_parsed = {
                'Type': type_result,
                'Title': title_processed,
                'DOI': DOI,
                'Authors': authors_string,
                'Abstract': abstract_processed
            }
            return full_result_parsed

        type_result = result.find('div', class_='issue-heading').text.lower()
        box = result.find('h5', class_='issue-item__title')
        title_processed = box.text.encode('ascii', 'namereplace').decode()
        DOI = box.find('a')['href']
        full_result = get_full_result(DOI, http, http_collector)
        full_result_processed = parse_full_result(full_result)
        return full_result_processed

    def get_results_list(page, http, http_collector):
        start = time.time()
        content = BeautifulSoup(page.content, 'lxml')
        results = content.find_all('li',
                                   class_='search__item issue-item-container')
        results_list = [
            process_result(result, http, http_collector)
            for result in results
        ]
        end = time.time()
        print(f'Result Processing {page_i}: ', end - start)
        return results_list

    def write_csv(results_list, path):
        if not os.path.exists(f'{path}\\results'):
            os.mkdir(f'{path}\\results')
        if len(results_list) > 0:
            with open(f'{path}\\results\\Page_{page_i}.csv', 'w',
                      newline='') as csvfile:
                w = csv.DictWriter(csvfile, results_list[0].keys(),
                                   extrasaction='ignore')
                w.writeheader()
                w.writerows(results_list)

    page_i = get_page_i(url)
    http_collector = None
    try:
        http_collector = get_collector('http-collector-01')
    except CollectorNotFoundError:
        pass
    if http_collector is None:
        http_collector = make_collector(page_i)
    http = create_new_session(http_collector)
    if find_pages:
        return how_many_pages(url, http, http_collector, page_size)
    page = get_page(url, http, http_collector)
    results_list = get_results_list(page, http, http_collector)
    write_csv(results_list, path)
    return
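# Example usage -- a sketch for ACM Digital Library search results; the query
# URL below is illustrative, not from the source. Because process_page
# imports everything inside its own body, it can be dispatched to a
# multiprocessing.Pool directly (pass `path` to control where the results
# folder is created):
#
#     from multiprocessing import Pool
#
#     base = ('https://dl.acm.org/action/doSearch'
#             '?AllField=web+scraping&pageSize=50')
#     total = process_page(base, find_pages=True)
#     urls = [f'{base}&startPage={i}' for i in range(total)]
#     with Pool(4) as pool:
#         pool.map(process_page, urls)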
from proxyscrape import get_collector


def get_proxy_list():
    collector = get_collector('list-collector')
    proxy_collection = collector.get_proxies({'anonymous': True})
    return [f'{p.host}:{p.port}' for p in proxy_collection]
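# Example usage -- a sketch; assumes a 'list-collector' collector was created
# beforehand (the 'http' resource type below is illustrative):
#
#     from proxyscrape import create_collector
#     create_collector('list-collector', 'http')
#     print(get_proxy_list()[:5])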