Example #1
def test_post_with_headers(henry_post_bucket):
    r = ProxyRequests(henry_post_bucket + '/post')
    r.set_headers({'name': 'rootVIII', 'secret_message': '7Yufs9KIfj33d'})
    r.post_with_headers({'key1': 'value1', 'key2': 'value2'})
    assert r.get_status_code() == 200
    assert 'Thank you' in r.__str__()
    print(r.get_proxy_used())
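The tests on this page rely on a henry_post_bucket pytest fixture that is not shown here. A minimal sketch, assuming the fixture simply returns a throwaway ptsv2.com test-bucket URL (that service's default dump response contains the 'Thank you' string the assertions check), could look like this:

# Hypothetical conftest.py fixture; the ptsv2.com bucket scheme is an assumption.
import uuid

import pytest


@pytest.fixture(scope='session')
def henry_post_bucket():
    # any unique path under /t/ acts as a disposable dump bucket
    return 'https://ptsv2.com/t/' + uuid.uuid4().hex[:10]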
Example #2
def gather_info(url):
    list_of_user_agents = [
        'Mozilla/5.0', 'AppleWebKit/537.36', 'Chrome/79.0.3945.88',
        'Safari/537.36'
    ]
    stat_code = 0
    tag_info = {'url': url}

    try_count = 0
    # continue attempting up to 4 proxies
    for user_agent in list_of_user_agents:
        if stat_code != 200:
            try_count += 1

            headers = {
                "User-Agent": user_agent,
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,"
                          "image/webp,image/apng,*/*;q=0.8",
                "Accept-Encoding": "gzip, deflate, br",
                "Accept-Language": "en-US,en;q=0.9"
            }

            r = ProxyRequests(url)
            r.set_headers(headers)
            r.get_with_headers()
            source = r.get_raw()
            stat_code = r.get_status_code()

    # every user agent was tried and none succeeded
    if try_count == len(list_of_user_agents) and stat_code != 200:
        tag_info['num_of_changed_files'] = -1
        tag_info['changed_paths'] = ['ERROR, CANNOT FULFILL REQUEST']
        tag_info['error_found'] = 'ERROR, TOO MANY PROXY ATTEMPTS'
        tag_info['metrics'] = {
            'num_of_changed_files': 0,
            'changes': 0,
            'additions': 0,
            'deletions': 0
        }
        return tag_info

    # proxy successful, continue reading the page
    if stat_code == 200:
        soup = BeautifulSoup(source, 'lxml')

        metrics = get_changed_files_metrics(soup)
        tag_info['metrics'] = metrics

        count, changed_files = get_changed_files(soup)
        if count == 0:
            tag_info['changed_paths'] = ['NONE FOUND']
        else:
            tag_info['changed_paths'] = changed_files

        if count != tag_info['metrics']['num_of_changed_files']:
            tag_info['error_found'] = 'ERROR, MISMATCH IN COUNT'
        else:
            tag_info['error_found'] = 'NONE'
    return tag_info
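gather_info depends on two helpers that are not shown here, get_changed_files_metrics and get_changed_files, which are assumed to parse the metrics block and the list of changed paths out of the fetched HTML. A hedged usage sketch (the URL is purely illustrative):

# Illustrative driver for gather_info; the URL is a placeholder.
info = gather_info('https://example.com/some/changelog/page')
print(info['url'], info['error_found'])
for path in info['changed_paths']:
    print(path)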
Example #3
def test_get_with_headers():
    h = {'User-Agent': 'NCSA Mosaic/3.0 (Windows 95)'}
    r = ProxyRequests('https://postman-echo.com/headers')
    r.set_headers(h)
    r.get_with_headers()
    assert r.get_status_code() == 200
    assert 'headers' in r.get_json()
    print(r.get_proxy_used())
Example #4
def test_get():
    r = ProxyRequests('https://api.ipify.org')
    r.get()
    assert r.get_status_code() == 200
    try:
        inet_aton(r.__str__())
    except Exception:
        pytest.fail('Invalid IP address in response')
    print(r.get_proxy_used())
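test_get validates the echoed body with socket.inet_aton, so besides ProxyRequests the snippet assumes these imports:

# Imports assumed by test_get (not shown in the snippet above).
import pytest
from socket import inet_aton

from proxy_requests import ProxyRequests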
Example #5
def test_post_file(henry_post_bucket):
    with open('/var/tmp/proxy_requests_testing.txt', 'w') as f_out:
        f_out.write('testing')
    r = ProxyRequests(henry_post_bucket + '/post')
    r.set_file('/var/tmp/proxy_requests_testing.txt')
    r.post_file()
    assert r.get_status_code() == 200
    assert 'Thank you' in r.__str__()
    print(henry_post_bucket)
    print(r.get_proxy_used())
Example #6
def test_post_file_with_headers(henry_post_bucket):
    with open('/var/tmp/proxy_requests_testing.txt', 'w') as f_out:
        f_out.write('testing')
    h = {'User-Agent': 'NCSA Mosaic/3.0 (Windows 95)'}
    r = ProxyRequests(henry_post_bucket + '/post')
    r.set_headers(h)
    r.set_file('/var/tmp/proxy_requests_testing.txt')
    r.post_file_with_headers()
    assert r.get_status_code() == 200
    assert 'Thank you' in r.__str__()
    print(henry_post_bucket)
    print(r.get_proxy_used())
Example #7
def fetch_with_proxy(url, headers):
    r = ProxyRequests(url)
    if headers:
        r.set_headers(headers)
        r.get_with_headers()
    else:
        r.get()

    status_code = r.get_status_code()
    if status_code != 200:
        print(f"{status_code}: {url}")

    return r.get_raw()
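A short usage example for fetch_with_proxy; the URL and header values are illustrative only:

# Hypothetical calls: with custom headers and with the plain GET fallback.
html = fetch_with_proxy('https://example.com', {'User-Agent': 'Mozilla/5.0'})
raw = fetch_with_proxy('https://example.com', None)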
Example #8
def thread_get_info(url):
    stat_code = 0
    this_tag_info = {}
    this_tag_info['url'] = url

    try_count = 0
    # continue collecting proxies for up to 10 tries
    while stat_code != 200:
        try_count += 1
        if try_count > 10:
            this_tag_info['num_changed_files'] = -1
            this_tag_info['changed_paths'] = ['NONE FOUND']
            this_tag_info['error_found'] = 'ERROR, TOO MANY PROXY ATTEMPTS'
            return this_tag_info

        headers = {
            "User-Agent": "Mozilla/5.0",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,"
                      "image/webp,image/apng,*/*;q=0.8,"
                      "application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "en-US,en;q=0.9"
        }

        r = ProxyRequests(url)
        r.set_headers(headers)
        r.get_with_headers()
        source = r.get_raw()
        stat_code = r.get_status_code()

    # proxy successful, continue reading the page
    if stat_code == 200:
        soup = bs.BeautifulSoup(source, 'lxml')

        # get changed files info
        read_count = get_num_changed_files(soup)
        this_tag_info['num_changed_files'] = read_count

        count, changed_files = get_changed_files(soup)
        if count == 0:
            this_tag_info['changed_paths'] = ['NONE FOUND']
        else:
            this_tag_info['changed_paths'] = changed_files

        if count != read_count:
            this_tag_info['error_found'] = 'ERROR, MISMATCH IN COUNT'
        else:
            this_tag_info['error_found'] = 'OK'

    return this_tag_info
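Despite its name, thread_get_info is an ordinary function; the threading presumably happens in a caller that is not shown. A hedged sketch of such a caller using concurrent.futures (the URLs are placeholders):

# Hypothetical caller: fan thread_get_info out over several URLs in a thread pool.
from concurrent.futures import ThreadPoolExecutor

urls = ['https://example.com/tag/1', 'https://example.com/tag/2']
with ThreadPoolExecutor(max_workers=4) as pool:
    for info in pool.map(thread_get_info, urls):
        print(info['url'], info['error_found'])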
Example #9
    def crawl_img(image_row):
        asin = image_row["asin"]
        url_image_hq = image_row["url_image_hq"]
        print(asin)
        r = ProxyRequests(url_image_hq)
        r.get()
        print("Proxy used: " + str(r.get_proxy_used()))
        if 200 == r.get_status_code():
            print(r.get_status_code())
            # save image locally
            with open("data/shirts/shirt.jpg", 'wb') as f:
                f.write(r.get_raw())

            #df_img = pd.DataFrame(data={"asin":[asin],"url":["https://storage.cloud.google.com/5c0ae2727a254b608a4ee55a15a05fb7/mba-shirts/"+marketplace+"/"+asin+".jpg"],"url_gs":["gs://5c0ae2727a254b608a4ee55a15a05fb7/mba-shirts/"+marketplace+"/"+asin+".jpg"],"url_mba_lowq":[url_image_lowq],"url_mba_hq":[url_image_hq], "timestamp":[datetime.datetime.now()]}, dtype=np.object)
            #df_imgs = df_imgs.append(df_img)
            #utils.upload_blob("5c0ae2727a254b608a4ee55a15a05fb7", "data/shirts/shirt.jpg", "mba-shirts/"+marketplace+"/" + asin + ".jpg")

            print("Successfully crawled image: %s" % (asin))
        else:
            print("Could not crawl image: %s" % (asin))
Example #10
def test_post(henry_post_bucket):
    r = ProxyRequests(henry_post_bucket + '/post')
    r.post({'key1': 'value1', 'key2': 'value2'})
    assert r.get_status_code() == 200
    assert 'Thank you' in r.__str__()
    print(r.get_proxy_used())
Example #11
def main(argv):
    parser = argparse.ArgumentParser(description='')
    parser.add_argument(
        'marketplace',
        help='Shortcut of the MBA marketplace, e.g. "com", "de", or "uk"',
        type=str)
    parser.add_argument(
        '--number_images',
        default=10,
        type=int,
        help='Number of images that should be crawled. If 0, every image '
             'that has not already been crawled will be crawled.'
    )

    # if python file path is in argv remove it
    if ".py" in argv[0]:
        argv = argv[1:len(argv)]

    # get all arguments
    args = parser.parse_args(argv)
    marketplace = args.marketplace
    number_images = args.number_images

    # get already crawled asin list
    #asin_crawled_list = get_asin_images_crawled("mba_de.products_images")

    df_images = get_images_urls_not_crawled(marketplace)

    # if number_images is equal to 0, every image should be crawled
    if number_images == 0:
        number_images = len(df_images)

    for j, image_row in df_images.iloc[0:number_images].iterrows():
        asin = image_row["asin"]
        url_image_hq = image_row["url_image_hq"]
        url_image_lowq = image_row["url_image_lowq"]

        #headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}
        #proxy_list = get_proxies("de", True)
        #proxy = next(iter(proxy_list))
        #proxies={"http": proxy, "https": proxy}

        r = ProxyRequests(url_image_hq)
        r.get()
        print("Proxy used: " + str(r.get_proxy_used()))
        if 200 == r.get_status_code():
            print(r.get_status_code())
            # save image locally
            with open("data/shirts/shirt.jpg", 'wb') as f:
                f.write(r.get_raw())

            utils.upload_blob(
                "5c0ae2727a254b608a4ee55a15a05fb7", "data/shirts/shirt.jpg",
                "mba-shirts/" + marketplace + "/" + asin + ".jpg")
            df_img = pd.DataFrame(data={
                "asin": [asin],
                "url": [
                    "https://storage.cloud.google.com/5c0ae2727a254b608a4ee55a15a05fb7/mba-shirts/"
                    + marketplace + "/" + asin + ".jpg"
                ],
                "url_gs": [
                    "gs://5c0ae2727a254b608a4ee55a15a05fb7/mba-shirts/" +
                    marketplace + "/" + asin + ".jpg"
                ],
                "url_mba_lowq": [url_image_lowq],
                "url_mba_hq": [url_image_hq],
                "timestamp": [datetime.datetime.now()]
            },
                                  dtype=object)
            df_img['timestamp'] = df_img['timestamp'].astype('datetime64')
            df_img.to_gbq("mba_" + marketplace + ".products_images",
                          project_id="mba-pipeline",
                          if_exists="append")
            print("Successfully crawled image: %s | %s of %s" %
                  (asin, j + 1, number_images))
        else:
            print("Could not crawl image: %s | %s of %s" (asin, j + 1,
                                                          number_images))

        #response = requests.get(quote_plus(url_image_hq),proxies=proxies,headers=headers, stream=True)
        test = 0

    bucket_name = "5c0ae2727a254b608a4ee55a15a05fb7"
    folder_name = "mba-shirts"
    file_path = "mba-pipeline/crawler/mba/data/test.jpg"
    #upload_blob("5c0ae2727a254b608a4ee55a15a05fb7", file_path , "mba-shirts/test.jpg")

    test = 0
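main() expects the raw argument vector and strips the script path itself, so a typical entry point (a sketch, assuming the usual command-line invocation) is:

import sys

if __name__ == '__main__':
    main(sys.argv)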
Example #12
import requests
import random
import string
import time
from proxy_requests import ProxyRequests

headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36 OPR/63.0.3368.71'
}

while True:
    cas = random.randint(0, 500)
    #from proxy_requests import ProxyRequests
    #register_data = {"POST":"/vote/479 HTTP/1.1"}
    #print (register_data)
    #url = 'https://souteze.rajce.net/vote/479'
    #r = s.post(url, json=register_data, headers=headers, proxies = proxies)
    #print(r.content)
    r = ProxyRequests("https://souteze.rajce.net/vote/504")
    r.post({"POST": "/vote/504 HTTP/1.1"})
    print(r)
    print(r.get_status_code())
    print(r.get_proxy_used())
    #time.sleep(cas)