Example 1
def save_new_proxy_record(one_proxy_dict_data):
    '''Examine the given proxy dict and store it into the "new" collection if its ip is unseen
    '''
    db = client["proxypool"]

    new_storage = db["new"]
    all_storage = db["all"]

    # If the historical collection already contains this proxy (keyed by ip), skip it
    if all_storage.find_one({"ip": one_proxy_dict_data["ip"]}) is not None:
        log_print("Found duplicate proxy for " + one_proxy_dict_data["ip"] +
                  ", ignore...")
    else:
        log_print("Store " + one_proxy_dict_data["ip"])
        new_storage.insert_one(one_proxy_dict_data)
        all_storage.insert_one({"ip": one_proxy_dict_data["ip"]})
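The check above queries the "all" collection before every insert. Not part of the original code: one way to make this duplicate detection atomic is a unique index on "ip", so that inserting an already-known proxy fails instead of creating a duplicate. A minimal sketch, assuming the same client and collection names as above; remember_ip is a hypothetical helper, not a function from this project:

from pymongo.errors import DuplicateKeyError

db = client["proxypool"]
all_storage = db["all"]

# One-time setup: enforce uniqueness on the "ip" field (the call is idempotent)
all_storage.create_index("ip", unique=True)


def remember_ip(ip):
    '''Hypothetical helper: return True if ip was new, False if it was already recorded.'''
    try:
        all_storage.insert_one({"ip": ip})
        return True
    except DuplicateKeyError:
        return False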
Example 2
# url_pages is a list of strings/integers substituted into the url template's {NUM} placeholder
url_pages = [""] + list(range(1, 3))

# xpath rules to locate data fields
xpath = {
    "ip": "/html/body/table[2]/tbody/tr[4]/td/table/tbody/tr[*]/td[1]/font[2]",
    "port":
    "/html/body/table[2]/tbody/tr[4]/td/table/tbody/tr[*]/td[1]/font[2]",
    "protocol":
    "/html/body/table[2]/tbody/tr[4]/td/table/tbody/tr[*]/td[2]/a/font[1]",
    "country": "/html/body/table[2]/tbody/tr[4]/td/table/tbody/tr[*]/td[4]/a",
}

# Lambda functions to further extract the information
extractor = {
    "ip": lambda text: text.split(":")[0],
    "port": lambda text: text.split(":")[1],
    "protocol": lambda text: text,
    "country": lambda text: text,
}
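
# Note (not part of the original code): the "ip" and "port" xpath rules point at the
# same element, whose text is assumed to look like "host:port"; the two lambdas simply
# split that string. With an invented sample value:
#   extractor["ip"]("1.2.3.4:8080")   -> "1.2.3.4"
#   extractor["port"]("1.2.3.4:8080") -> "8080"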

while True:
    proxy_list_of_dicts = scrape(url_template, url_pages, xpath, extractor)

    for proxy_dict in proxy_list_of_dicts:
        save_new_proxy_record(proxy_dict)

    log_print("Finished one round of scraping, sleep for " +
              str(time_interval) + " seconds")
    sleep(time_interval)
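log_print comes from the project's loglib module (it is imported in Example 4) and its implementation is not shown in these excerpts. A minimal sketch of such a helper, assuming it simply prefixes each message with a timestamp:

from datetime import datetime


def log_print(message):
    # Sketch only: the real loglib.log_print may format or persist log output differently
    print(datetime.now().isoformat(sep=" ", timespec="seconds") + " " + message)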
Example 3
from time import sleep

from selenium import webdriver

# log_print, check_protocol and _make_dicts are project helpers not shown in this excerpt


def scrape(url_template, url_pages, xpath, extractor, sleep_before_scrape=0):
    '''Scrape the target website for proxies according to the given rules.
    Parameters
    ----------
    url_template : string
        Url base string, with {NUM} indicating the placeholder for different pages
    url_pages : list
        A list of strings or integers substituted for {NUM} in url_template
    xpath : dict
        Xpath rules for the ip, port, protocol and country fields
    extractor : dict
        Lambda functions that post-process the strings matched by the xpath rules
    sleep_before_scrape : integer
        Time in seconds to wait after each page load before starting the xpath extraction

    Returns
    -------
    Proxy information as a list of dict objects
    '''
    log_print("Scraping starts...")

    # Generate all urls to iterate through
    urls = [
        url_template.replace("{NUM}", str(page_num)) for page_num in url_pages
    ]
    ips, ports, protocols, countries = [], [], [], []

    # Init PhantomJS webdriver (headless browser)
    driver = webdriver.PhantomJS(service_args=["--webdriver-loglevel=NONE"])

    # Set Viewport
    driver.set_window_size(1920, 1080)

    for url in urls:
        log_print("Fetching " + url)
        driver.get(url)

        sleep(sleep_before_scrape)

        ips += [
            extractor["ip"](ip_element.text)
            for ip_element in driver.find_elements_by_xpath(xpath["ip"])
        ]
        ports += [
            extractor["port"](port_element.text)
            for port_element in driver.find_elements_by_xpath(xpath["port"])
        ]
        protocols += [
            extractor["protocol"](protocol_element.text) for protocol_element
            in driver.find_elements_by_xpath(xpath["protocol"])
        ]
        countries += [
            extractor["country"](country_element.text) for country_element in
            driver.find_elements_by_xpath(xpath["country"])
        ]

    html = driver.page_source

    # Close the selenium driver to prevent memory leaking
    driver.close()

    if not (len(ips) == len(ports) == len(protocols) == len(countries)):
        log_print("Error! Number of data fields collected mismatch: " +
                  str(len(ips)) + " " + str(len(ports)) + " " +
                  str(len(protocols)) + " " + str(len(countries)))
        exit()

    if len(ips) == 0:
        log_print("Something went wrong, there are no proxies fetched...")
        log_print(html)
        exit()

    index_to_be_deleted = []

    # Filter out the proxies that only support the SOCKS protocol
    for i in range(len(ips)):
        protocol = check_protocol(protocols[i])
        if not protocol:
            index_to_be_deleted.append(i)
        else:
            protocols[i] = protocol

    # Delete the collected indexes from highest to lowest, so earlier indexes stay valid
    for i in sorted(index_to_be_deleted, reverse=True):
        del ips[i]
        del ports[i]
        del protocols[i]
        del countries[i]

    log_print("Fetched total " + str(len(ips)) + " proxies")
    return _make_dicts(ips, ports, protocols, countries)
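_make_dicts is called above but not included in this excerpt. A minimal sketch of what it presumably does, assuming it simply zips the four parallel lists into one dict per proxy using the field names seen elsewhere in the project (ip, port, protocol, country):

def _make_dicts(ips, ports, protocols, countries):
    '''Sketch, not the original implementation: combine the parallel lists into dicts.'''
    return [
        {"ip": ip, "port": port, "protocol": protocol, "country": country}
        for ip, port, protocol, country in zip(ips, ports, protocols, countries)
    ]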
Example 4
from pymongo import MongoClient
from loglib import log_print

log_print("Initialize MongoDB Connection...")
# Host, username and password are left blank here; fill in the details of the target MongoDB instance
client = MongoClient("", 27017, authSource='admin', username="", password="")


def save_new_proxy_record(one_proxy_dict_data):
    '''Examine the given proxy dict and store it into the "new" collection if its ip is unseen
    '''
    db = client["proxypool"]

    new_storage = db["new"]
    all_storage = db["all"]

    # If the historical collection already contains this proxy (keyed by ip), skip it
    if all_storage.find_one({"ip": one_proxy_dict_data["ip"]}) is not None:
        log_print("Found duplicate proxy for " + one_proxy_dict_data["ip"] +
                  ", ignore...")
    else:
        log_print("Store " + one_proxy_dict_data["ip"])
        new_storage.insert_one(one_proxy_dict_data)
        all_storage.insert_one({"ip": one_proxy_dict_data["ip"]})
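For completeness, a minimal usage sketch of the function above, assuming a reachable MongoDB instance; the sample record and its values are invented for illustration and mirror the fields the scraper collects (ip, port, protocol, country):

if __name__ == "__main__":
    sample_proxy = {
        "ip": "203.0.113.10",  # documentation-range address, invented for this sketch
        "port": "8080",
        "protocol": "http",
        "country": "US",
    }
    # The first call stores the record; the second is ignored as a duplicate
    save_new_proxy_record(sample_proxy)
    save_new_proxy_record(sample_proxy)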