def test_post_with_headers(henry_post_bucket):
    """POST a payload with custom headers via a proxy; expect 200 and a thank-you body."""
    req = ProxyRequests(henry_post_bucket + '/post')
    req.set_headers({'name': 'rootVIII', 'secret_message': '7Yufs9KIfj33d'})
    req.post_with_headers({'key1': 'value1', 'key2': 'value2'})
    assert req.get_status_code() == 200
    assert 'Thank you' in str(req)
    print(req.get_proxy_used())
def gather_info(url):
    """Fetch *url* through rotating proxies (one attempt per user agent) and
    collect changed-file information from the resulting page.

    Returns a dict with keys 'url', 'metrics', 'changed_paths' and
    'error_found'; when every proxy attempt fails, sentinel error values are
    returned instead.
    """
    list_of_user_agents = [
        'Mozilla/5.0',
        'AppleWebKit/537.36',
        'Chrome/79.0.3945.88',
        'Safari/537.36',
    ]
    stat_code = 0
    tag_info = {'url': url}
    source = None
    try_count = 0
    # Continue attempting with up to 4 proxies (one per user agent).
    for user_agent in list_of_user_agents:
        if stat_code != 200:
            try_count += 1
            # FIX: the Accept value was previously assembled with a string
            # line-continuation, which embedded raw source indentation into
            # the HTTP header; it is now a clean single-line value.
            headers = {
                "User-Agent": user_agent,
                "Accept": ("text/html, application/xhtml+xml, "
                           "application/xml; q = 0.9, image/webp,"
                           "image/apng, */*; q = 0.8"),
                "Accept-Encoding": "gzip, deflate, br",
                "Accept-Language": "en-US,en; q = 0.9",
            }
            r = ProxyRequests(url)
            r.set_headers(headers)
            r.get_with_headers()
            source = r.get_raw()
            stat_code = r.get_status_code()
            if try_count == len(list_of_user_agents):
                # All proxies exhausted without a 200: report sentinel values.
                tag_info['num_of_changed_files'] = -1
                tag_info['changed_paths'] = ['ERROR, CANNOT FULFILL REQUEST']
                tag_info['error_found'] = 'ERROR, TOO MANY PROXY ATTEMPTS'
                tag_info['metrics'] = {
                    'num_of_changed_files': 0,
                    'changes': 0,
                    'additions': 0,
                    'deletions': 0,
                }
                return tag_info
    # Proxy successful: continue reading the page.
    if stat_code == 200:
        soup = BeautifulSoup(source, 'lxml')
        metrics = get_changed_files_metrics(soup)
        tag_info['metrics'] = metrics
        count, changed_files = get_changed_files(soup)
        if count == 0:
            tag_info['changed_paths'] = ['NONE FOUND']
        else:
            tag_info['changed_paths'] = changed_files
        # Cross-check the parsed file list against the page's own count.
        if count != tag_info['metrics']['num_of_changed_files']:
            tag_info['error_found'] = 'ERROR, MISMATCH IN COUNT'
        else:
            tag_info['error_found'] = 'NONE'
    return tag_info
def test_get_with_headers():
    """GET with a custom User-Agent via a proxy; echo endpoint must report headers."""
    custom_headers = {'User-Agent': 'NCSA Mosaic/3.0 (Windows 95)'}
    req = ProxyRequests('https://postman-echo.com/headers')
    req.set_headers(custom_headers)
    req.get_with_headers()
    assert req.get_status_code() == 200
    assert 'headers' in req.get_json()
    print(req.get_proxy_used())
def test_get():
    """Plain proxied GET; response body must be a parseable IPv4 address."""
    req = ProxyRequests('https://api.ipify.org')
    req.get()
    assert req.get_status_code() == 200
    try:
        inet_aton(str(req))
    except Exception:
        pytest.fail('Invalid IP address in response')
    print(req.get_proxy_used())
def test_post_file(henry_post_bucket):
    """Upload a scratch file through a proxy POST; expect 200 and a thank-you body."""
    scratch_path = '/var/tmp/proxy_requests_testing.txt'
    with open(scratch_path, 'w') as f_out:
        f_out.write('testing')
    req = ProxyRequests(henry_post_bucket + '/post')
    req.set_file(scratch_path)
    req.post_file()
    assert req.get_status_code() == 200
    assert 'Thank you' in str(req)
    print(henry_post_bucket)
    print(req.get_proxy_used())
def test_post_file_with_headers(henry_post_bucket):
    """Upload a scratch file with custom headers through a proxy POST."""
    scratch_path = '/var/tmp/proxy_requests_testing.txt'
    with open(scratch_path, 'w') as f_out:
        f_out.write('testing')
    custom_headers = {'User-Agent': 'NCSA Mosaic/3.0 (Windows 95)'}
    req = ProxyRequests(henry_post_bucket + '/post')
    req.set_headers(custom_headers)
    req.set_file(scratch_path)
    req.post_file_with_headers()
    assert req.get_status_code() == 200
    assert 'Thank you' in str(req)
    print(henry_post_bucket)
    print(req.get_proxy_used())
def fetch_with_proxy(url, headers):
    """GET *url* through a random proxy, sending *headers* when provided.

    Logs non-200 responses to stdout and returns the raw response body.
    """
    req = ProxyRequests(url)
    if not headers:
        req.get()
    else:
        req.set_headers(headers)
        req.get_with_headers()
    status_code = req.get_status_code()
    if status_code != 200:
        print(f"{status_code}: {url}")
    return req.get_raw()
def thread_get_info(url):
    """Fetch *url* through random proxies (up to 10 attempts) and extract
    changed-file information from the page.

    Returns a dict with keys 'url', 'num_changed_files', 'changed_paths' and
    'error_found'; on too many failed attempts, sentinel error values are
    returned instead.
    """
    stat_code = 0
    this_tag_info = {'url': url}
    source = None
    try_count = 0
    # Continue collecting proxies for up to 10 tries.
    while stat_code != 200:
        try_count += 1
        if try_count > 10:
            this_tag_info['num_changed_files'] = -1
            this_tag_info['changed_paths'] = ['NONE FOUND']
            this_tag_info['error_found'] = 'ERROR, TOO MANY PROXY ATTEMPTS'
            return this_tag_info
        # FIX: the Accept value was previously assembled with a string
        # line-continuation, which embedded raw source indentation into the
        # HTTP header; it is now a clean single-line value.
        headers = {
            "User-Agent": "Mozilla/5.0",
            "Accept": ("text/html, application/xhtml+xml, "
                       "application/xml; q = 0.9, image/webp,"
                       "image/apng, */*; q = 0.8, "
                       "application/signed-exchange; v = b3"),
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "en-US,en; q = 0.9",
        }
        r = ProxyRequests(url)
        r.set_headers(headers)
        r.get_with_headers()
        source = r.get_raw()
        stat_code = r.get_status_code()
        # Proxy successful: continue reading the page.
        if stat_code == 200:
            soup = bs.BeautifulSoup(source, 'lxml')
            # Get changed-files info and cross-check list length vs. the
            # count reported on the page itself.
            read_count = get_num_changed_files(soup)
            this_tag_info['num_changed_files'] = read_count
            count, changed_files = get_changed_files(soup)
            if count == 0:
                this_tag_info['changed_paths'] = ['NONE FOUND']
            else:
                this_tag_info['changed_paths'] = changed_files
            if count != read_count:
                this_tag_info['error_found'] = 'ERROR, MISMATCH IN COUNT'
            else:
                this_tag_info['error_found'] = 'OK'
    return this_tag_info
def crawl_img(image_row):
    """Download one shirt image (by its HQ URL) through a proxy and save it locally.

    *image_row* is expected to provide 'asin' and 'url_image_hq' keys.
    """
    asin = image_row["asin"]
    url_image_hq = image_row["url_image_hq"]
    print(asin)
    req = ProxyRequests(url_image_hq)
    req.get()
    print("Proxy used: " + str(req.get_proxy_used()))
    if 200 == req.get_status_code():
        print(req.get_status_code())
        # Save image locally.
        with open("data/shirts/shirt.jpg", 'wb') as f:
            f.write(req.get_raw())
        print("Successfully crawled image: %s" % (asin))
    else:
        print("Could not crawl image: %s" % (asin))
def test_post(henry_post_bucket):
    """Proxied POST of a simple payload; expect 200 and a thank-you body."""
    req = ProxyRequests(henry_post_bucket + '/post')
    req.post({'key1': 'value1', 'key2': 'value2'})
    assert req.get_status_code() == 200
    assert 'Thank you' in str(req)
    print(req.get_proxy_used())
def main(argv):
    """Crawl not-yet-crawled MBA shirt images for a marketplace, upload each
    to GCS and record its metadata in BigQuery.

    *argv* is the argument vector (script path optionally included):
    positional ``marketplace`` plus optional ``--number_images``.
    """
    parser = argparse.ArgumentParser(description='')
    parser.add_argument(
        'marketplace',
        help='Shortcut of mba marketplace. I.e "com" or "de", "uk"',
        type=str)
    parser.add_argument(
        '--number_images',
        default=10,
        type=int,
        help=
        'Number of images that shoul be crawled. If 0, every image that is not already crawled will be crawled.'
    )

    # If the python file path is in argv, remove it.
    if ".py" in argv[0]:
        argv = argv[1:len(argv)]

    # Get all arguments.
    args = parser.parse_args(argv)
    marketplace = args.marketplace
    number_images = args.number_images
    # FIX: removed a redundant second parser.parse_args() call that re-parsed
    # sys.argv and clobbered the values parsed from the *argv* parameter.

    df_images = get_images_urls_not_crawled(marketplace)

    # If number_images equals 0, every image should be crawled.
    if number_images == 0:
        number_images = len(df_images)

    for j, image_row in df_images.iloc[0:number_images].iterrows():
        asin = image_row["asin"]
        url_image_hq = image_row["url_image_hq"]
        url_image_lowq = image_row["url_image_lowq"]
        r = ProxyRequests(url_image_hq)
        r.get()
        print("Proxy used: " + str(r.get_proxy_used()))
        if 200 == r.get_status_code():
            print(r.get_status_code())
            # Save image locally, then push it to cloud storage.
            with open("data/shirts/shirt.jpg", 'wb') as f:
                f.write(r.get_raw())
            utils.upload_blob(
                "5c0ae2727a254b608a4ee55a15a05fb7", "data/shirts/shirt.jpg",
                "mba-shirts/" + marketplace + "/" + asin + ".jpg")
            # FIX: dtype=np.object was removed in NumPy >= 1.24; use the
            # builtin object dtype instead.
            df_img = pd.DataFrame(data={
                "asin": [asin],
                "url": [
                    "https://storage.cloud.google.com/5c0ae2727a254b608a4ee55a15a05fb7/mba-shirts/"
                    + marketplace + "/" + asin + ".jpg"
                ],
                "url_gs": [
                    "gs://5c0ae2727a254b608a4ee55a15a05fb7/mba-shirts/"
                    + marketplace + "/" + asin + ".jpg"
                ],
                "url_mba_lowq": [url_image_lowq],
                "url_mba_hq": [url_image_hq],
                "timestamp": [datetime.datetime.now()]
            }, dtype=object)
            df_img['timestamp'] = df_img['timestamp'].astype('datetime64')
            df_img.to_gbq("mba_" + marketplace + ".products_images",
                          project_id="mba-pipeline", if_exists="append")
            print("Successfully crawled image: %s | %s of %s" %
                  (asin, j + 1, number_images))
        else:
            # FIX: the original was missing the % operator here, which raised
            # "TypeError: 'str' object is not callable" on any failed crawl.
            print("Could not crawl image: %s | %s of %s" %
                  (asin, j + 1, number_images))

    # NOTE(review): the assignments below look like leftover debug
    # scaffolding (values are never used) — consider deleting.
    test = 0
    bucket_name = "5c0ae2727a254b608a4ee55a15a05fb7"
    folder_name = "mba-shirts"
    file_path = "mba-pipeline/crawler/mba/data/test.jpg"
    test = 0
import requests
import random
import string
import time

from proxy_requests import ProxyRequests

# FIX: `random` was imported twice; the duplicate import was removed.

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36 OPR/63.0.3368.71'
}

# Endlessly submit votes through random proxies.
while True:
    # Candidate delay in seconds; unused while the sleep below stays commented out.
    cas = random.randint(0, 500)
    r = ProxyRequests("https://souteze.rajce.net/vote/504")
    r.post({"POST": "/vote/504 HTTP/1.1"})
    print(r)
    print(r.get_status_code())
    print(r.get_proxy_used())
    #time.sleep(cas)