def parse_proxyList(self):
        curr_proxy_list = []
        try:
            # Parse all proxy pages -> format: /list/{num}.htm
            # Get the pageRange from the 'pagination' table
            page_set = self.get_pagination_set()
            logger.debug("Pages: {}".format(page_set))
            # One JS unpacker per provider (not per page)
            self.js_unpacker = self.init_js_unpacker()

            for page in page_set:
                response = requests.get("{0}{1}".format(self.get_url(), page),
                                        timeout=self.timeout)
                if not response.ok:
                    # Could not parse ANY page - Let user know
                    if not curr_proxy_list:
                        logger.warning("Proxy Provider url failed: {}".format(
                            self.get_url()))
                    # Return proxies parsed so far
                    return curr_proxy_list
                content = response.content
                soup = BeautifulSoup(content,
                                     "html.parser",
                                     from_encoding="iso-8859-1")

                table = soup.find("div", attrs={"id": "proxylist"})
                # The first tr contains the field names.
                headings = [
                    th.get_text() for th in table.find("tr").find_all("th")
                ]
                # skip last 'Select All' row
                for row in table.find_all("tr")[1:-1]:
                    td_row = row.find("td")
                    portKey = td_row.find('span', attrs={
                        'class': True
                    }).get('class')[0]
                    port = self.js_unpacker.get_port(portKey)
                    proxy_obj = self.create_proxy_object(row, port)
                    # Make sure it is a Valid Proxy Address
                    if proxy_obj is not None and UrlParser.valid_ip(
                            proxy_obj.ip) and UrlParser.valid_port(port):
                        curr_proxy_list.append(proxy_obj)
                    else:
                        # proxy_obj may be None here, so avoid calling to_str() on it
                        logger.debug("Proxy Invalid: {}".format(
                            proxy_obj.to_str() if proxy_obj else row.text))
        except AttributeError as e:
            logger.error(
                "Provider {0} failed with Attribute error: {1}".format(
                    self.id, e))
        except KeyError as e:
            logger.error("Provider {0} failed with Key error: {1}".format(
                self.id, e))
        except Exception as e:
            logger.error("Provider {0} failed with Unknown error: {1}".format(
                self.id, e))
        finally:
            return curr_proxy_list
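
The pagination helper used above is not shown in this listing. Below is a minimal sketch of what get_pagination_set might look like, assuming the provider exposes its page links in a pagination element and the same requests/BeautifulSoup imports as the surrounding examples; the selector and the empty-string entry for the first page are assumptions, not the library's actual implementation.

    def get_pagination_set(self):
        # Hypothetical sketch: collect the relative page links ("1.htm", "2.htm", ...)
        # from the provider's pagination element; "" stands for the first page.
        page_set = {""}
        response = requests.get(self.get_url(), timeout=self.timeout)
        if not response.ok:
            return page_set
        soup = BeautifulSoup(response.content, "html.parser")
        pagination = soup.find("div", attrs={"id": "pagination"})  # selector is an assumption
        if pagination is not None:
            for anchor in pagination.find_all("a"):
                href = anchor.get("href", "")
                if href.endswith(".htm"):
                    page_set.add(href.split("/")[-1])
        return page_set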
Example 2
class TestBaseProxyParsers(unittest.TestCase):
    def setUp(self):
        self.normal_parser = UrlParser("proxy-test", "http://proxy-test.com", bandwidth_KBs=50)
        self.no_bdwidthParser = UrlParser("slow-proxy", "http://slow-proxy.com")

    def test_normal_parser(self):
        self.assertEqual(self.normal_parser.get_url(), "http://proxy-test.com", "incorrect parser URL")
        self.assertEqual(self.normal_parser.get_min_bandwidth(), 50, "incorrect parser bandwidth")

    def test_no_bandwidth_parser(self):
        self.assertEqual(self.no_bdwidthParser.get_url(), "http://slow-proxy.com", "incorrect parser URL")
        self.assertEqual(self.no_bdwidthParser.get_min_bandwidth(), 150, "incorrect parser bandwidth")
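
These assertions imply a base class that stores an id and URL and falls back to a default minimum bandwidth of 150 KB/s when none is given. A minimal sketch consistent with the tests above follows; the attribute names are assumptions, not the library's actual definition.

class UrlParser(object):
    # Default minimum bandwidth (KB/s) implied by test_no_bandwidth_parser
    DEFAULT_MIN_BANDWIDTH_KBS = 150

    def __init__(self, id, web_url, bandwidth_KBs=None, timeout=None):
        self.id = id
        self.url = web_url
        self.timeout = timeout
        if bandwidth_KBs:
            self.minimum_bandwidth_in_KBs = bandwidth_KBs
        else:
            self.minimum_bandwidth_in_KBs = self.DEFAULT_MIN_BANDWIDTH_KBS

    def get_url(self):
        return self.url

    def get_min_bandwidth(self):
        return self.minimum_bandwidth_in_KBs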
Example 3
    def parse_proxyList(self):
        curr_proxy_list = []
        try:
            response = requests.get(self.get_URl(), timeout=self.timeout)

            if not response.ok:
                logger.warning("Proxy Provider url failed: {}".format(
                    self.get_URl()))
                return []

            content = response.content
            soup = BeautifulSoup(content, "html.parser")
            table = soup.find("table", attrs={"class": "proxy_list"})

            # The first tr contains the field names.
            headings = [
                th.get_text() for th in table.find("tr").find_all("th")
            ]

            datasets = []
            for row in table.find_all("tr")[1:]:
                dataset = zip(headings,
                              (td.get_text() for td in row.find_all("td")))
                datasets.append(dataset)

            for dataset in datasets:
                # Check Field[0] for tags and field[1] for values!
                address = ""
                proxy_straggler = False
                for field in dataset:
                    # Discard slow proxies! Speed is in KB/s
                    if field[0] == 'Speed':
                        if float(field[1]) < self.get_min_bandwidth():
                            proxy_straggler = True
                    if field[0] == 'IP':
                        # Make sure it is a Valid IP
                        if not UrlParser.valid_ip(field[1]):
                            logger.debug("IP with Invalid format: {}".format(
                                field[1]))
                            break
                        else:
                            address += field[1] + ':'
                    elif field[0] == 'Port':
                        address += field[1]
                # Avoid Straggler proxies and make sure it is a Valid Proxy Address
                if not proxy_straggler and UrlParser.valid_ip_port(address):
                    proxy = "http://" + address
                    curr_proxy_list.append(proxy)
                    # print "{0:<10}: {1}".format(field[0], field[1])
            # print "ALL: ", curr_proxy_list
        except Exception as e:
            # Keep parsing failures non-fatal; return whatever was parsed so far
            logger.error("Proxy Provider failed with error: {}".format(e))
        return curr_proxy_list
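
The headings/zip pattern above pairs each column header with the matching cell text, so every row becomes a sequence of (field, value) tuples. A small standalone illustration with made-up data:

headings = ['IP', 'Port', 'Speed']
cells = ['10.0.0.1', '8080', '73.5']
dataset = list(zip(headings, cells))
# dataset == [('IP', '10.0.0.1'), ('Port', '8080'), ('Speed', '73.5')]
for field in dataset:
    print("{0:<10}: {1}".format(field[0], field[1]))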
Example 4
    def parse_proxyList(self, use_top15k=False):
        curr_proxy_list = []
        try:
            response = requests.get(self.get_URl() + "/" + self.top_proxy_path,
                                    timeout=self.timeout)

            if not response.ok:
                logger.warning("Proxy Provider url failed: {}".format(
                    self.get_URl()))
                return []

            content = response.content
            soup = BeautifulSoup(content, "html.parser")
            table = soup.find("div",
                              attrs={
                                  "class": "paragraph",
                                  'style': "text-align:left;"
                              }).find('font', attrs={'color': '#33a27f'})
            # Parse Top Proxy List page
            for row in [
                    x for x in table.contents
                    if getattr(x, 'name', None) != 'br'
            ]:
                # Make sure it is a Valid Proxy Address
                if UrlParser.valid_ip_port(row):
                    proxy = "http://" + row
                    curr_proxy_list.append(proxy)
                else:
                    logger.debug("Address with Invalid format: {}".format(row))
            # Usually these proxies are stale
            if use_top15k:
                # Parse 15k Nodes Text file (named *-all-*.txt)
                content = requests.get(self.get_URl() + "/" +
                                       self.txt_proxy_path).content
                soup = BeautifulSoup(content, "html.parser")
                table = soup.find("div",
                                  attrs={"class": "wsite-multicol-table-wrap"})
                for link in table.findAll('a'):
                    current_link = link.get('href')
                    if current_link is not None and "all" in current_link:
                        self.txt_proxy_path = current_link
                more_content = requests.get(self.get_URl() +
                                            self.txt_proxy_path).text
                for proxy_address in more_content.split():
                    if UrlParser.valid_ip_port(proxy_address):
                        curr_proxy_list.append(proxy_address)
        except Exception as e:
            # Keep parsing failures non-fatal; return whatever was parsed so far
            logger.error("Proxy Provider failed with error: {}".format(e))
        return curr_proxy_list
Example 5
    def parse_proxyList(self):
        curr_proxy_list = []
        response = requests.get(self.get_URl(), timeout=self.timeout)

        if not response.ok:
            logger.warning("Proxy Provider url failed: {}".format(self.get_URl()))
            return []

        content = response.content
        soup = BeautifulSoup(content, "html.parser")
        table = soup.find("table", attrs={"class": "display fpltable"})

        if table is None:
            return curr_proxy_list

        # The first tr contains the field names.
        headings = [th.get_text() for th in table.find("tr").find_all("th")]

        datasets = []
        for row in table.find_all("tr")[1:]:
            # zip() is lazy in Python 3, so materialize it for the emptiness check below
            dataset = list(zip(headings,
                               (td.get_text() for td in row.find_all("td"))))
            if dataset:
                datasets.append(dataset)

        for dataset in datasets:
            # Check Field[0] for tags and field[1] for values!
            address = ""
            for field in dataset:
                if field[0] == 'IP Address':
                    # Make sure it is a Valid IP
                    if not UrlParser.valid_ip(field[1]):
                        logger.debug("IP with Invalid format: {}".format(
                            field[1]))
                        break
                    else:
                        address += field[1] + ':'
                elif field[0] == 'Port':
                    address += field[1]
            # Make sure it is a Valid Proxy Address
            if UrlParser.valid_ip_port(address):
                proxy = "http://" + address
                curr_proxy_list.append(proxy)
            else:
                logger.debug("Address with Invalid format: {}".format(address))
            # print "{0:<10}: {1}".format(field[0], field[1])
        # print "ALL: ", curr_proxy_list
        return curr_proxy_list
Example 6
 def create_proxy_object(self, dataset):
     ip = ""
     port = None
     anonymity = AnonymityLevel.UNKNOWN
     country = None
     # Check Field[0] for tags and field[1] for values!
     for field in dataset:
         # Discard slow proxies! Speed is in KB/s
         if field[0] == 'Speed':
             if float(field[1]) < self.get_min_bandwidth():
                 logger.debug("Proxy with low bandwidth: {}".format(float(field[1])))
                 return None
         if field[0] == 'IP':
             ip = field[1].strip()  # String strip()
             # Make sure it is a Valid IP
             if not UrlParser.valid_ip(ip):
                 logger.debug("IP with Invalid format: {}".format(ip))
                 return None
         elif field[0] == 'Port':
             port = field[1].strip()  # String strip()
         elif field[0] == 'Anon':
             anonymity = AnonymityLevel.get(field[1].strip())  # String strip()
         elif field[0] == 'Country':
             country = field[1].strip()  # String strip()
     return ProxyObject(source=self.id, ip=ip, port=port, anonymity_level=anonymity, country=country)
Example 7
 def create_proxy_object(self, dataset):
     # Check Field[0] for tags and field[1] for values!
     ip = ""
     port = None
     anonymity = AnonymityLevel.UNKNOWN
     country = None
     for field in dataset:
         if field[0] == 'IP Address':
             # Make sure it is a Valid IP
             ip = field[1].strip()  # String strip()
             # Make sure it is a Valid IP
             if not UrlParser.valid_ip(ip):
                 logger.debug("IP with Invalid format: {}".format(ip))
                 return None
         elif field[0] == 'Port':
             port = field[1].strip()  # String strip()
         elif field[0] == 'Anonymity':
             anonymity = AnonymityLevel.get(
                 field[1].strip())  # String strip()
         elif field[0] == 'Country':
             country = field[1].strip()  # String strip()
     return ProxyObject(source=self.id,
                        ip=ip,
                        port=port,
                        anonymity_level=anonymity,
                        country=country)
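
Both variants above rely on AnonymityLevel.get() to map the scraped label to a level, falling back to UNKNOWN. A minimal sketch of such an enum follows; the member names and the accepted labels are assumptions, not the library's actual definition.

from enum import Enum

class AnonymityLevel(Enum):
    UNKNOWN = 0
    TRANSPARENT = 1
    ANONYMOUS = 2
    ELITE = 3

    @classmethod
    def get(cls, label):
        # Map a scraped label to a level; fall back to UNKNOWN for anything unrecognized
        aliases = {
            'transparent': cls.TRANSPARENT,
            'anonymous': cls.ANONYMOUS,
            'elite': cls.ELITE,
            'elite proxy': cls.ELITE,
        }
        return aliases.get(label.strip().lower(), cls.UNKNOWN)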
Example 8
    def parse_proxyList(self):
        curr_proxy_list = []
        try:
            # Parse all proxy pages -> format: /list/{num}.htm
            # Get the pageRange from the 'pagination' table
            page_set = self.get_pagination_set()
            logger.debug("Pages: {}".format(page_set))
            for page in page_set:
                response = requests.get("{0}{1}".format(self.get_url(), page),
                                        timeout=self.timeout)
                if not response.ok:
                    # Could not parse ANY page - Let user know
                    if not curr_proxy_list:
                        logger.warning("Proxy Provider url failed: {}".format(
                            self.get_url()))
                    # Return proxies parsed so far
                    return curr_proxy_list
                content = response.content
                soup = BeautifulSoup(content, "html.parser")
                # css provides the port number so we reverse it
                # for href in soup.findAll('link'):
                #     if '/styles/' in href.get('href'):
                #         style = "http://www.samair.ru" + href.get('href')
                #         break
                # css = requests.get(style).content.split('\n')
                # css.pop()
                # ports = {}
                # for l in css:
                #     p = l.split(' ')
                #     key = p[0].split(':')[0][1:]
                #     value = p[1].split('\"')[1]
                #     ports[key] = value

                table = soup.find("div", attrs={"id": "proxylist"})
                # The first tr contains the field names.
                headings = [
                    th.get_text() for th in table.find("tr").find_all("th")
                ]
                for row in table.find_all("tr")[1:]:
                    td_row = row.find("td")
                    # curr_proxy_list.append('http://' + row.text + ports[row['class'][0]])
                    proxy_obj = self.create_proxy_object(row)
                    # Make sure it is a Valid Proxy Address
                    if proxy_obj is not None and UrlParser.valid_ip_port(
                            td_row.text):
                        curr_proxy_list.append(proxy_obj)
                    else:
                        logger.debug("Proxy Invalid: {}".format(td_row.text))
        except AttributeError as e:
            logger.error(
                "Provider {0} failed with Attribute error: {1}".format(
                    self.id, e))
        except KeyError as e:
            logger.error("Provider {0} failed with Key error: {1}".format(
                self.id, e))
        except Exception as e:
            logger.error("Provider {0} failed with Unknown error: {1}".format(
                self.id, e))
        finally:
            return curr_proxy_list
Example 9
    def create_proxy_object(self, address, country, anonymity):
        # Make sure it is a Valid IP
        ip = address.strip().split(":")[0]
        if not UrlParser.valid_ip(ip):
            logger.debug("IP with Invalid format: {}".format(ip))
            return None
        port = address.strip().split(":")[1]
        country = country.strip()
        anonymity = AnonymityLevel.get(anonymity.strip())

        return ProxyObject(source=self.id, ip=ip, port=port, anonymity_level=anonymity, country=country)
Example 10

 def create_proxy_object(self, row):
     # Defaults guard against rows that are missing one of the expected labels
     ip = ""
     port = None
     anonymity = AnonymityLevel.UNKNOWN
     country = None
     for td_row in row.findAll("td"):
         if td_row.attrs['data-label'] == 'IP:port ':
             text = td_row.text.strip()
             ip = text.split(":")[0]
             # Make sure it is a Valid IP
             if not UrlParser.valid_ip(ip):
                 logger.debug("IP with Invalid format: {}".format(ip))
                 return None
             port = text.split(":")[1]
         elif td_row.attrs['data-label'] == 'Anonymity Type: ':
             anonymity = AnonymityLevel.get(td_row.text.strip())
         elif td_row.attrs['data-label'] == 'Country: ':
             country = td_row.text.strip()
     return ProxyObject(source=self.id, ip=ip, port=port, anonymity_level=anonymity, country=country)
Example 11
    def parse_proxyList(self):
        curr_proxy_list = []
        # Parse all proxy pages -> format: /list/{num}.htm
        # TODO: get the pageRange from the 'pagination' table
        for page in range(1, 21):
            response = requests.get("{0}{num:02d}.htm".format(self.get_URl(),
                                                              num=page),
                                    timeout=self.timeout)
            if not response.ok:
                # Could not parse ANY page - Let user know
                if not curr_proxy_list:
                    logger.warning("Proxy Provider url failed: {}".format(
                        self.get_URl()))
                # Return proxies parsed so far
                return curr_proxy_list
            content = response.content
            soup = BeautifulSoup(content, "html.parser")
            # css provides the port number so we reverse it
            # for href in soup.findAll('link'):
            #     if '/styles/' in href.get('href'):
            #         style = "http://www.samair.ru" + href.get('href')
            #         break
            # css = requests.get(style).content.split('\n')
            # css.pop()
            # ports = {}
            # for l in css:
            #     p = l.split(' ')
            #     key = p[0].split(':')[0][1:]
            #     value = p[1].split('\"')[1]
            #     ports[key] = value

            table = soup.find("div", attrs={"id": "proxylist"})
            # The first tr contains the field names.
            headings = [
                th.get_text() for th in table.find("tr").find_all("th")
            ]
            for row in table.find_all("tr")[1:]:
                td_row = row.find("td")
                # curr_proxy_list.append('http://' + row.text + ports[row['class'][0]])
                # Make sure it is a Valid Proxy Address
                if UrlParser.valid_ip_port(td_row.text):
                    curr_proxy_list.append('http://' + td_row.text)
                else:
                    logger.debug("Address with Invalid format: {}".format(
                        td_row.text))
        return curr_proxy_list
Example 12
    def parse_proxyList(self):
        curr_proxy_list = []
        try:
            response = requests.get(self.get_url(), timeout=self.timeout)
            if not response.ok:
                logger.warning("Proxy Provider url failed: {}".format(
                    self.get_url()))
                return []

            content = response.content
            soup = BeautifulSoup(content, "html.parser")
            table = soup.find("table", attrs={"id": "proxylisttable"})

            # The first tr contains the field names.
            headings = [
                th.get_text() for th in table.find("tr").find_all("th")
            ]

            datasets = []
            for row in table.find_all("tr")[1:]:
                # zip() is lazy in Python 3, so materialize it for the emptiness check below
                dataset = list(zip(headings,
                                   (td.get_text() for td in row.find_all("td"))))
                if dataset:
                    datasets.append(dataset)

            for dataset in datasets:
                proxy_obj = self.create_proxy_object(dataset)
                # Make sure it is a Valid Proxy Address
                if proxy_obj is not None and UrlParser.valid_ip_port(
                        proxy_obj.get_address()):
                    curr_proxy_list.append(proxy_obj)
                else:
                    logger.debug("Proxy Invalid: {}".format(dataset))
        except AttributeError as e:
            logger.error(
                "Provider {0} failed with Attribute error: {1}".format(
                    self.id, e))
        except KeyError as e:
            logger.error("Provider {0} failed with Key error: {1}".format(
                self.id, e))
        except Exception as e:
            logger.error("Provider {0} failed with Unknown error: {1}".format(
                self.id, e))
        finally:
            return curr_proxy_list
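
The parsers above only need a ProxyObject that records where a proxy came from and can render its address. A minimal sketch consistent with the get_address() and to_str() calls in these examples; any field or method beyond those calls is an assumption.

class ProxyObject(object):
    def __init__(self, source, ip, port, anonymity_level, country=None):
        self.source = source
        self.ip = ip
        self.port = port
        self.anonymity_level = anonymity_level
        self.country = country

    def get_address(self):
        # "ip:port" form expected by UrlParser.valid_ip_port()
        return "{0}:{1}".format(self.ip, self.port)

    def to_str(self):
        return "{0} | {1} | {2} | {3}".format(
            self.get_address(), self.anonymity_level, self.country, self.source)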
Example 13
    def parse_proxyList(self):
        curr_proxy_list = []
        try:
            response = requests.get(self.get_URl(), timeout=self.timeout)

            if not response.ok:
                logger.warning("Proxy Provider url failed: {}".format(
                    self.get_URl()))
                return []

            content = response.content
            soup = BeautifulSoup(content, "html.parser")
            # css provides the port number so we reverse it
            # for href in soup.findAll('link'):
            #     if '/styles/' in href.get('href'):
            #         style = "http://www.samair.ru" + href.get('href')
            #         break
            # css = requests.get(style).content.split('\n')
            # css.pop()
            # ports = {}
            # for l in css:
            #     p = l.split(' ')
            #     key = p[0].split(':')[0][1:]
            #     value = p[1].split('\"')[1]
            #     ports[key] = value

            table = soup.find("table", attrs={"id": "proxylist"})
            # The first tr contains the field names.
            headings = [
                th.get_text() for th in table.find("tr").find_all("th")
            ]
            for row in table.find_all("tr")[1:]:
                td_row = row.find("td")
                # curr_proxy_list.append('http://' + row.text + ports[row['class'][0]])
                # Make sure it is a Valid Proxy Address
                if UrlParser.valid_ip_port(td_row.text):
                    curr_proxy_list.append('http://' + td_row.text)
                else:
                    logger.debug("Address with Invalid format: {}".format(
                        td_row.text))
        except Exception as e:
            # Keep parsing failures non-fatal; return whatever was parsed so far
            logger.error("Proxy Provider failed with error: {}".format(e))
        return curr_proxy_list
Example 14

 def __init__(self, web_url, bandwidth=None, timeout=None):
     UrlParser.__init__(self, web_url, bandwidth, timeout)
Example 15
 def __init__(self, web_url, timeout=None):
     UrlParser.__init__(self, web_url, timeout)
Example 16

 def __init__(self, id, web_url, timeout=None):
     self.top_proxy_path = "proxy-list.html"
     self.txt_proxy_path = "txt-lists.html"
     UrlParser.__init__(self, id=id, web_url=web_url, timeout=timeout)
Example 17

    def parse_proxyList(self, use_top15k=False):
        curr_proxy_list = []
        try:
            response = requests.get(self.get_url() + "/" + self.top_proxy_path,
                                    timeout=self.timeout)

            if not response.ok:
                logger.warning("Proxy Provider url failed: {}".format(
                    self.get_url()))
                return []

            content = response.content
            soup = BeautifulSoup(content, "html.parser")
            all_divs = soup.findAll("div",
                                    attrs={
                                        "class": "paragraph",
                                        'style': "text-align:left;"
                                    })
            # address_table = soup.find("div", attrs={"class": "paragraph", 'style': "text-align:left;"})
            # .find('font', attrs={'color': '#33a27f'})
            # Parse Top Proxy List page
            address_list = []
            country_list = []
            anonymity_list = []
            for div in all_divs:
                address_div = div.find('font', attrs={'color': '#33a27f'})
                if address_div is not None:
                    for row in [
                            x for x in address_div.contents
                            if getattr(x, 'name', None) != 'br'
                    ]:
                        address_list.append(str(row))
                curr_div = div.findAll('font', attrs={'size': '2'})
                if curr_div:
                    row_data = []
                    # font -> strong -> font
                    title = curr_div[0].contents[0].contents[0].contents[0]
                    for row in [
                            x for x in curr_div[-1].contents
                            if getattr(x, 'name', None) != 'br'
                    ]:
                        row_data.append(str(row))
                    if 'Country' in str(title):
                        country_list.extend(row_data)
                    if 'Status' in str(title):
                        anonymity_list.extend(row_data)
            for address, country, anonymity in zip(address_list, country_list,
                                                   anonymity_list):
                # Make sure it is a Valid Proxy Address
                proxy_obj = self.create_proxy_object(address, country,
                                                     anonymity)
                if proxy_obj is not None and UrlParser.valid_ip_port(
                        proxy_obj.get_address()):
                    curr_proxy_list.append(proxy_obj)
                else:
                    logger.debug("Proxy Invalid: {}".format(row))
            # Usually these proxies are stale
            if use_top15k:
                # Parse 15k Nodes Text file (named *-all-*.txt)
                content = requests.get(self.get_url() + "/" +
                                       self.txt_proxy_path).content
                soup = BeautifulSoup(content, "html.parser")
                table = soup.find("div",
                                  attrs={"class": "wsite-multicol-table-wrap"})
                for link in table.findAll('a'):
                    current_link = link.get('href')
                    if current_link is not None and "all" in current_link:
                        self.txt_proxy_path = current_link
                more_content = requests.get(self.get_url() +
                                            self.txt_proxy_path).text
                for proxy_address in more_content.split():
                    if UrlParser.valid_ip_port(proxy_address):
                        # The txt dump lists bare ip:port entries, so country and
                        # anonymity are unknown for these proxies
                        proxy_obj = self.create_proxy_object(proxy_address, "", "")
                        curr_proxy_list.append(proxy_obj)
        except AttributeError as e:
            logger.error(
                "Provider {0} failed with Attribute error: {1}".format(
                    self.id, e))
        except KeyError as e:
            logger.error("Provider {0} failed with Key error: {1}".format(
                self.id, e))
        except Exception as e:
            logger.error("Provider {0} failed with Unknown error: {1}".format(
                self.id, e))
        finally:
            return curr_proxy_list
Example 18
 def setUp(self):
     self.normal_parser = UrlParser("proxy-test", "http://proxy-test.com", bandwidth_KBs=50)
     self.no_bdwidthParser = UrlParser("slow-proxy", "http://slow-proxy.com")
Example 19

 def __init__(self, id, web_url, timeout=None):
     self.base_url = web_url
     web_url += "/list/"
     # Ports decoded by the JS unpacker
     self.js_unpacker = None
     UrlParser.__init__(self, id=id, web_url=web_url, timeout=timeout)
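
The unpacker itself is not part of this listing. For unit-testing parse_proxyList without fetching and evaluating the provider's packed JavaScript, a stand-in object with the same get_port() interface is enough; this is entirely hypothetical and only mirrors the get_port(portKey) call shown in Example 1.

class FakeJsUnpacker(object):
    # Hypothetical test double: maps the CSS class key found in each row's
    # <span> to a port string, the way parse_proxyList expects.
    def __init__(self, class_to_port):
        self.class_to_port = class_to_port

    def get_port(self, port_key):
        return self.class_to_port.get(port_key)

# Usage sketch: parser.js_unpacker = FakeJsUnpacker({'pp8080': '8080'})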
Example 20
 def __init__(self, id, web_url, bandwidth=None, timeout=None):
     UrlParser.__init__(self, id=id, web_url=web_url, bandwidth_KBs=bandwidth, timeout=timeout)
Example 21
 def __init__(self, web_url, timeout=None):
     web_url += "/list/"
     UrlParser.__init__(self, web_url, timeout)
Example 22
 def __init__(self, id, web_url, timeout=None):
     UrlParser.__init__(self, id=id, web_url=web_url, timeout=timeout)