Example #1
import proxyscrape as ps


def genproxy(number: int):
    # Assumes a collector named 'ProxyGen' was registered earlier via
    # ps.create_collector(); get_collector() raises CollectorNotFoundError otherwise.
    collector = ps.get_collector(name='ProxyGen')
    collector.refresh_proxies()
    proxy_list = []
    for _ in range(number):
        # get_proxy() returns None when no proxy matches the filter.
        proxy = collector.get_proxy({'anonymous': True})
        proxy_list.append(proxy)
        # Drop the proxy from the pool so it is not handed out twice.
        collector.remove_proxy(proxy)
    return proxy_list
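The function above only reads from an existing collector; below is a minimal sketch of how it might be driven, reusing the `ps` alias imported above (the 'http' resource type and the count of 5 are illustrative choices, not from the original):

# Hypothetical setup and call; proxy resources are fetched on first use.
ps.create_collector('ProxyGen', 'http')
for proxy in genproxy(5):
    if proxy is not None:  # get_proxy() yields None when nothing matches
        print(f'{proxy.host}:{proxy.port}')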
Example #2
    def _get_free_proxies_collector():
        """Retrieve or create a Collector of free proxies.

        :return: Collector object
        """
        # Assumes: from proxyscrape import create_collector, get_collector
        # and: from proxyscrape.errors import CollectorNotFoundError
        try:
            collector = get_collector('scraping-proxies')
        except CollectorNotFoundError:
            collector = create_collector('scraping-proxies',
                                         ['socks4', 'socks5'])

        return collector
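A hedged sketch of how the returned collector might be consumed, e.g. routing a requests call through one of the SOCKS proxies (the test URL and proxy scheme are assumptions; SOCKS support requires requests[socks]):

import requests

# Hypothetical consumer of the helper above.
collector = _get_free_proxies_collector()
proxy = collector.get_proxy({'type': 'socks5'})
if proxy is not None:
    proxy_url = f'socks5://{proxy.host}:{proxy.port}'
    resp = requests.get('https://httpbin.org/ip',
                        proxies={'http': proxy_url, 'https': proxy_url},
                        timeout=10)
    print(resp.status_code)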
Example #3
    def proxyscrape_lib(self) -> Set[str]:
        """Parse proxies obtained through the proxyscrape library."""
        # Assumes: from proxyscrape import create_collector, get_collector, scrapers
        # and: from proxyscrape.errors import CollectorAlreadyDefinedError
        free_proxies = scrapers.get_free_proxy_list_proxies()
        ssl_proxies = scrapers.get_ssl_proxies()
        try:
            collector = create_collector("default", "http")
        except CollectorAlreadyDefinedError:
            # Reuse the collector if this method has already run once.
            collector = get_collector("default")
        collector_proxies = set(collector.get_proxies())
        proxies = free_proxies | ssl_proxies | collector_proxies

        for proxy in proxies:
            # set.add() already ignores duplicates, so no membership check is needed.
            self.proxy_set.add(f"{proxy.host}:{proxy.port}")
        logger.info(
            f"proxy_set holds {len(self.proxy_set)} proxies after parsing the proxyscrape library")
        return self.proxy_set
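proxyscrape_lib() is a method of a larger class; below is a minimal skeleton of the owner class it assumes (the class name is a guess, only the proxy_set attribute and a module-level logger are actually required):

import logging
from typing import Set

logger = logging.getLogger(__name__)


class ProxyParser:
    """Hypothetical owner of proxyscrape_lib()."""

    def __init__(self) -> None:
        self.proxy_set: Set[str] = set()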
Example #4
    def get_proxies(desired_amount: int = 1, proxy_timeout=0.5):
        # Assumes: import time, import proxyscrape, plus the project helpers
        # `bs` (console/colour utilities) and `bsl` (localized UI strings).

        proxies = []

        # https://stackoverflow.com/a/59531141
        try:
            collector_1 = proxyscrape.get_collector('collector-http')

        except proxyscrape.errors.CollectorNotFoundError:
            collector_1 = proxyscrape.create_collector('collector-http',
                                                       'http')

        full_list = list(collector_1.get_proxies())

        for item in full_list:
            proxies.append(item.host + ':' + item.port)

        print(bs.warning_o(bsl["PROXY"]["FOUND"]), bs.red_o(str(len(proxies))),
              bs.warning_o(bsl["PROXY"]["HTTP_PROXIES"]))

        print(bs.warning_o(bsl["PROXY"]["PERFORMANCE_CHECK"]))
        print(bs.warning_o(bsl["GENERAL"]["CTRL_Z_EXIT"]))

        time.sleep(1)
        bs.clear()
        start_time = time.time()

        cnt = 0

        print(
            bs.warning_o(bsl["PROXY"]["CHECKED"]) + bs.red_o(' 0 ') +
            bs.warning_o(bsl["PROXY"]["OUT_OF"]), bs.red_o(str(len(proxies))),
            bs.warning_o(bsl["PROXY"]["PROXIES_WITH_TIMEOUT"]),
            bs.red_o(str(proxy_timeout)),
            bs.warning_o(bsl["PROXY"]["SECONDS_3"]))

        print(bs.warning_o(bsl["PROXY"]["CHOSEN"]), bs.red_o(str(cnt)),
              bs.warning_o(bsl["PROXY"]["OUT_OF"]),
              bs.red_o(str(desired_amount)),
              bs.warning_o(bsl["PROXY"]["PROXIES_WITH_TIMEOUT"]),
              bs.red_o(str(proxy_timeout)),
              bs.warning_o(bsl["PROXY"]["SECONDS_3"]))

        print(bs.warning_o(bsl["GENERAL"]["CTRL_Z_EXIT"]))

        checked_proxy = []

        for ind, item in enumerate(proxies, start=1):

            if cnt < desired_amount:

                if bs.is_bad_proxy(item, proxy_timeout):
                    print('[BAD PROXY]')
                else:
                    checked_proxy.append(item)
                    cnt += 1
            else:
                break

            bs.clear()

            print(bs.warning_o(bsl["PROXY"]["CHECKED"]), bs.red_o(str(ind)),
                  bs.warning_o(bsl["PROXY"]["OUT_OF"]),
                  bs.red_o(str(len(proxies))),
                  bs.warning_o(bsl["PROXY"]["PROXIES_WITH_TIMEOUT"]),
                  bs.red_o(str(proxy_timeout)),
                  bs.warning_o(bsl["PROXY"]["SECONDS_3"]))

            print(bs.warning_o(bsl["PROXY"]["CHOSEN"]), bs.okgreen_o(str(cnt)),
                  bs.warning_o(bsl["PROXY"]["OUT_OF"]),
                  bs.red_o(str(desired_amount)),
                  bs.warning_o(bsl["PROXY"]["PROXIES_WITH_TIMEOUT"]),
                  bs.red_o(str(proxy_timeout)),
                  bs.warning_o(bsl["PROXY"]["SECONDS_3"]))

            print(bs.warning_o(bsl["PROXY"]["EXIT_WARN"]))
            print(bs.warning_o(bsl["GENERAL"]["CTRL_Z_EXIT"]))

        end_time = time.time()

        extra_message = (bsl["PROXY"]["APPENDED"], str(cnt),
                         bsl["PROXY"]["PROXIES_WITH_TIMEOUT"],
                         str(proxy_timeout), bsl["PROXY"]["SECONDS_1"],
                         bsl["PROXY"]["TO_THE_PROXY_LIST"], bsl["PROXY"]["IN"],
                         str(round(end_time - start_time,
                                   2)), bsl["PROXY"]["SECONDS_1"] + ']')

        extra_message = bs.success_o(' '.join(extra_message))

        with open('proxy_list.txt', 'a') as outfile:
            for item in checked_proxy:
                outfile.write(item + '\n')

        bs.print_full_main_screen(extra_message)
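The liveness check above relies on the project-specific helper bs.is_bad_proxy(); a minimal sketch of what such a check might look like, assuming a plain HTTP proxy and a throwaway test URL (both are illustrative, not from the original project):

import requests


def is_bad_proxy(proxy: str, timeout: float) -> bool:
    """Hypothetical stand-in for bs.is_bad_proxy(): True when the proxy fails or times out."""
    try:
        requests.get('https://httpbin.org/ip',
                     proxies={'http': f'http://{proxy}', 'https': f'http://{proxy}'},
                     timeout=timeout)
        return False
    except requests.RequestException:
        return True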
Example #5
def process_page(url, path='', find_pages=False, page_size=50):
    import os
    import re
    import csv
    import math
    import time
    import requests
    from bs4 import BeautifulSoup
    from proxyscrape import create_collector, get_collector
    from proxyscrape.errors import CollectorNotFoundError

    # Worker processes under multiprocessing get separate contexts, so all
    # imports and helpers live inside this function (see the driver sketch
    # after this function).

    # Placeholder with a failing status code, used before the first response arrives.
    class Page:
        status_code = 0

    # Returns the number of pages in the full search
    def how_many_pages(url, http, http_collector, page_size):
        page = get_page(url, http, http_collector)
        content = BeautifulSoup(page.content, 'lxml')

        result_count = content.find('span', class_='result__count').text
        result_count = result_count.split(' ')[0].replace(',', '')

        pages = math.ceil(int(result_count) / page_size)
        return pages

    def make_collector(page_i=''):
        http_collector = create_collector(f'http-collector-{page_i}', 'https')
        return http_collector

    # Returns requests session with proxies (http, https)
    def setup_new_proxies(http_collector, http):
        proxy_http = http_collector.get_proxy()
        proxy_https = http_collector.get_proxy({'type': 'https'})
        http.proxies = {
            'http': f'http://{proxy_http.host}:{proxy_http.port}',
            'https': f'https://{proxy_https.host}:{proxy_https.port}'
        }
        return http

    def create_new_session(http_collector):
        http = requests.Session()
        http = setup_new_proxies(http_collector, http)
        return http

    # Returns index of page from url
    def get_page_i(url):
        if 'startPage' in url:
            page_i = next(re.finditer(r'\d+$', url)).group(0)
            page_i = int(page_i) + 1
        else:
            page_i = 1

        if page_i < 10:
            page_i = '0' + str(page_i)
        return page_i

    # Returns page from url
    def get_page(url, http, http_collector):
        page = Page()
        start = time.time()
        while True:
            try:
                page = http.get(url, timeout=6)
                if page.status_code == 200:
                    break
            except Exception:
                # Request failed (dead proxy, timeout, ...); a new proxy is set below.
                pass
            finally:
                print('CHANGE PROXY PAGE: ', page.status_code,
                      http.proxies['https'])
                if page.status_code != 200:
                    http = setup_new_proxies(http_collector, http)
        end = time.time()
        page_i = get_page_i(url)
        print(f'R Page {page_i}: ', end - start)
        return page

    def process_result(result, http, http_collector):
        def get_full_result(DOI, http, http_collector):
            full_page = Page()
            while True:
                try:
                    full_page = http.get('https://dl.acm.org' + DOI,
                                         timeout=10)
                    if full_page.status_code == 200:
                        break
                except Exception:
                    # Request failed (dead proxy, timeout, ...); a new proxy is set below.
                    pass
                finally:
                    print('FULL PAGE: ', full_page.status_code,
                          http.proxies['https'])
                    if full_page.status_code != 200:
                        http = setup_new_proxies(http_collector, http)
            return full_page

        def parse_full_result(full_result):
            full_result_parsed = BeautifulSoup(full_result.content, 'lxml')

            authors = full_result_parsed.find_all('span',
                                                  class_='loa__author-name')
            authors_processed = set()
            for author in authors:
                authors_processed.add(
                    author.text.encode('ascii', 'namereplace').decode())
            authors_string = ', '.join(list(authors_processed))

            abstract = full_result_parsed.find(
                'div', class_='abstractSection abstractInFull')
            if abstract is not None:
                abstract_processed = abstract.text.encode(
                    'ascii', 'namereplace').decode()
            else:
                abstract_processed = 'No abstract available.'

            full_result_parsed = {
                'Type': type_result,
                'Title': title_processed,
                'DOI': DOI,
                'Authors': authors_string,
                'Abstract': abstract_processed
            }
            return full_result_parsed

        type_result = result.find('div', class_='issue-heading').text.lower()

        box = result.find('h5', class_='issue-item__title')
        title_processed = box.text.encode('ascii', 'namereplace').decode()
        DOI = box.find('a')['href']

        full_result = get_full_result(DOI, http, http_collector)
        full_result_processed = parse_full_result(full_result)
        return full_result_processed

    def get_results_list(page, http, http_collector):
        start = time.time()
        content = BeautifulSoup(page.content, 'lxml')
        results = content.find_all('li',
                                   class_='search__item issue-item-container')
        results_list = [
            process_result(result, http, http_collector) for result in results
        ]
        end = time.time()
        print(f'Result Processing {page_i}: ', end - start)
        return results_list

    def write_csv(results_list, path):
        if not os.path.exists(f'{path}\\results'):
            os.mkdir(f'{path}\\results')

        if len(results_list) > 0:
            with open(f'{path}\\results\\Page_{page_i}.csv',
                      'w',
                      newline='') as csvfile:
                w = csv.DictWriter(csvfile,
                                   results_list[0].keys(),
                                   extrasaction='ignore')
                w.writeheader()
                w.writerows(results_list)

    page_i = get_page_i(url)
    try:
        http_collector = get_collector('http-collector-01')
    except CollectorNotFoundError:
        # No collector registered yet in this process; create one for this page.
        http_collector = make_collector(page_i)
    http = create_new_session(http_collector)

    if find_pages:
        return how_many_pages(url, http, http_collector, page_size)

    page = get_page(url, http, http_collector)
    results_list = get_results_list(page, http, http_collector)
    write_csv(results_list, path)
    return
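process_page() imports everything it needs internally so that it can run in worker processes; a minimal driver sketch under that assumption (the ACM search URL, query, and pool size are illustrative, not from the original):

from multiprocessing import Pool

if __name__ == '__main__':
    # Hypothetical full-text search URL; startPage must stay at the end so
    # get_page_i() can read the trailing digits.
    base_url = 'https://dl.acm.org/action/doSearch?AllField=proxies&pageSize=50'
    total_pages = process_page(base_url, find_pages=True)
    urls = [f'{base_url}&startPage={i}' for i in range(total_pages)]
    with Pool(4) as pool:
        pool.starmap(process_page, [(url, '.') for url in urls])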
Example #6
from proxyscrape import get_collector

def get_proxy_list():
    # Assumes a collector named 'list-collector' was registered beforehand.
    collector = get_collector('list-collector')
    proxy_collection = collector.get_proxies({'anonymous': True})
    return [f'{proxy.host}:{proxy.port}' for proxy in proxy_collection]
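Since get_proxy_list() only reads an existing collector, registration has to happen once per process beforehand; a minimal sketch (the resource types are an illustrative choice, not from the original):

from proxyscrape import create_collector
from proxyscrape.errors import CollectorAlreadyDefinedError

# Hypothetical one-time setup for the 'list-collector' used above.
try:
    create_collector('list-collector', ['socks4', 'socks5'])
except CollectorAlreadyDefinedError:
    pass  # already registered in this process

print(get_proxy_list()[:5])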