# imports shared by the test functions below
from socket import inet_aton

import pytest

from proxy_requests import ProxyRequests


def test_post_with_headers(henry_post_bucket):
    r = ProxyRequests(henry_post_bucket + '/post')
    r.set_headers({'name': 'rootVIII', 'secret_message': '7Yufs9KIfj33d'})
    r.post_with_headers({'key1': 'value1', 'key2': 'value2'})
    assert r.get_status_code() == 200
    assert 'Thank you' in r.__str__()
    print(r.get_proxy_used())
def test_get_with_headers():
    h = {'User-Agent': 'NCSA Mosaic/3.0 (Windows 95)'}
    r = ProxyRequests('https://postman-echo.com/headers')
    r.set_headers(h)
    r.get_with_headers()
    assert r.get_status_code() == 200
    assert 'headers' in r.get_json()
    print(r.get_proxy_used())
def test_get():
    r = ProxyRequests('https://api.ipify.org')
    r.get()
    assert r.get_status_code() == 200
    try:
        inet_aton(r.__str__())
    except Exception:
        pytest.fail('Invalid IP address in response')
    print(r.get_proxy_used())
def test_post_file(henry_post_bucket):
    with open('/var/tmp/proxy_requests_testing.txt', 'w') as f_out:
        f_out.write('testing')
    r = ProxyRequests(henry_post_bucket + '/post')
    r.set_file('/var/tmp/proxy_requests_testing.txt')
    r.post_file()
    assert r.get_status_code() == 200
    assert 'Thank you' in r.__str__()
    print(henry_post_bucket)
    print(r.get_proxy_used())
def test_post_file_with_headers(henry_post_bucket):
    with open('/var/tmp/proxy_requests_testing.txt', 'w') as f_out:
        f_out.write('testing')
    h = {'User-Agent': 'NCSA Mosaic/3.0 (Windows 95)'}
    r = ProxyRequests(henry_post_bucket + '/post')
    r.set_headers(h)
    r.set_file('/var/tmp/proxy_requests_testing.txt')
    r.post_file_with_headers()
    assert r.get_status_code() == 200
    assert 'Thank you' in r.__str__()
    print(henry_post_bucket)
    print(r.get_proxy_used())
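# Note: the post tests above rely on a `henry_post_bucket` fixture defined outside
# this section. Below is a minimal conftest.py sketch, assuming the fixture simply
# builds a fresh bucket URL on a "post dump" test service that replies with
# "Thank you"; the service URL and bucket naming here are assumptions, not the
# suite's actual implementation.
import random
import string

import pytest


@pytest.fixture
def henry_post_bucket():
    # hypothetical bucket URL; the real fixture may create or register the bucket differently
    bucket_id = ''.join(random.choices(string.ascii_lowercase, k=10))
    return 'https://ptsv2.com/t/' + bucket_id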
def all_team_names(url_root):
    url = os.path.join(url_root, "teams") + "/"
    r = ProxyRequests(url)
    r.get()
    # print the proxy/IP used for this request
    print(r.get_proxy_used())
    soup = BeautifulSoup(r.get_raw(), "html.parser")
    tabs = soup.find_all("table")
    # Active franchises are in tabs[0] because the page has two tables;
    # pd.read_html returns a list, so take the first DataFrame.
    df_active = pd.read_html(tabs[0].prettify())[0]
    # TODO: filter df_active to max years (the main franchise) if needed; it is
    # currently unused.
    # Extract the hrefs for the active teams:
    team_a_links = tabs[0].find_all("a", href=True)
    team_names = {
        t["href"].replace("teams", "").replace("/", ""): t.text
        for t in team_a_links
        if "/teams/" in t["href"]
    }
    return team_names
def crawl_img(image_row):
    asin = image_row["asin"]
    url_image_hq = image_row["url_image_hq"]
    print(asin)
    r = ProxyRequests(url_image_hq)
    r.get()
    print("Proxy used: " + str(r.get_proxy_used()))
    if r.get_status_code() == 200:
        print(r.get_status_code())
        # save image locally
        with open("data/shirts/shirt.jpg", 'wb') as f:
            f.write(r.get_raw())
        # metadata/upload steps are disabled here; see main() below for the active version
        #df_img = pd.DataFrame(data={"asin": [asin], "url": ["https://storage.cloud.google.com/5c0ae2727a254b608a4ee55a15a05fb7/mba-shirts/" + marketplace + "/" + asin + ".jpg"], "url_gs": ["gs://5c0ae2727a254b608a4ee55a15a05fb7/mba-shirts/" + marketplace + "/" + asin + ".jpg"], "url_mba_lowq": [url_image_lowq], "url_mba_hq": [url_image_hq], "timestamp": [datetime.datetime.now()]}, dtype=np.object)
        #df_imgs = df_imgs.append(df_img)
        #utils.upload_blob("5c0ae2727a254b608a4ee55a15a05fb7", "data/shirts/shirt.jpg", "mba-shirts/" + marketplace + "/" + asin + ".jpg")
        print("Successfully crawled image: %s" % asin)
    else:
        print("Could not crawl image: %s" % asin)
def test_post(henry_post_bucket):
    r = ProxyRequests(henry_post_bucket + '/post')
    r.post({'key1': 'value1', 'key2': 'value2'})
    assert r.get_status_code() == 200
    assert 'Thank you' in r.__str__()
    print(r.get_proxy_used())
def main(argv):
    parser = argparse.ArgumentParser(description='')
    parser.add_argument(
        'marketplace',
        help='Shortcut of MBA marketplace, e.g. "com", "de" or "uk"',
        type=str)
    parser.add_argument(
        '--number_images',
        default=10,
        type=int,
        help='Number of images that should be crawled. If 0, every image that is '
             'not already crawled will be crawled.')

    # if the python file path is in argv, remove it
    if ".py" in argv[0]:
        argv = argv[1:]

    # parse all arguments
    args = parser.parse_args(argv)
    marketplace = args.marketplace
    number_images = args.number_images

    # get already crawled asin list
    #asin_crawled_list = get_asin_images_crawled("mba_de.products_images")

    df_images = get_images_urls_not_crawled(marketplace)

    # if number_images is 0, crawl every image that is not crawled yet
    if number_images == 0:
        number_images = len(df_images)

    for j, image_row in df_images.iloc[0:number_images].iterrows():
        asin = image_row["asin"]
        url_image_hq = image_row["url_image_hq"]
        url_image_lowq = image_row["url_image_lowq"]
        #headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}
        #proxy_list = get_proxies("de", True)
        #proxy = next(iter(proxy_list))
        #proxies = {"http": proxy, "https": proxy}
        r = ProxyRequests(url_image_hq)
        r.get()
        print("Proxy used: " + str(r.get_proxy_used()))
        if r.get_status_code() == 200:
            print(r.get_status_code())
            # save image locally
            with open("data/shirts/shirt.jpg", 'wb') as f:
                f.write(r.get_raw())
            utils.upload_blob(
                "5c0ae2727a254b608a4ee55a15a05fb7", "data/shirts/shirt.jpg",
                "mba-shirts/" + marketplace + "/" + asin + ".jpg")
            df_img = pd.DataFrame(
                data={
                    "asin": [asin],
                    "url": [
                        "https://storage.cloud.google.com/5c0ae2727a254b608a4ee55a15a05fb7/mba-shirts/"
                        + marketplace + "/" + asin + ".jpg"
                    ],
                    "url_gs": [
                        "gs://5c0ae2727a254b608a4ee55a15a05fb7/mba-shirts/"
                        + marketplace + "/" + asin + ".jpg"
                    ],
                    "url_mba_lowq": [url_image_lowq],
                    "url_mba_hq": [url_image_hq],
                    "timestamp": [datetime.datetime.now()]
                },
                dtype=object)  # np.object is deprecated; use the builtin object dtype
            df_img['timestamp'] = df_img['timestamp'].astype('datetime64')
            df_img.to_gbq("mba_" + marketplace + ".products_images",
                          project_id="mba-pipeline",
                          if_exists="append")
            print("Successfully crawled image: %s | %s of %s" %
                  (asin, j + 1, number_images))
        else:
            print("Could not crawl image: %s | %s of %s" %
                  (asin, j + 1, number_images))
        #response = requests.get(quote_plus(url_image_hq), proxies=proxies, headers=headers, stream=True)

    bucket_name = "5c0ae2727a254b608a4ee55a15a05fb7"
    folder_name = "mba-shirts"
    file_path = "mba-pipeline/crawler/mba/data/test.jpg"
    #upload_blob("5c0ae2727a254b608a4ee55a15a05fb7", file_path, "mba-shirts/test.jpg")
import requests
import pandas as pd
from bs4 import BeautifulSoup
from proxy_requests import ProxyRequests, ProxyRequestsBasicAuth

# fetch the seven-day forecast directly with requests
page = requests.get(
    'https://forecast.weather.gov/MapClick.php?lat=34.05349000000007&lon=-118.24531999999999#.XiXcNlNKhQI'
)
#print(page.status_code)
soup = BeautifulSoup(page.content, 'html.parser')
#print(soup)
week = soup.find(id='seven-day-forecast-body')
#print(week)
items = week.find_all(class_='tombstone-container')
print(items[0])

# fetch the same page again, this time through a proxy
url = 'https://forecast.weather.gov/MapClick.php?lat=34.05349000000007&lon=-118.24531999999999#.XiXcNlNKhQI'
r = ProxyRequests(url)
r.get()
print(r.get_json())
print(r.get_proxy_used())
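# ProxyRequestsBasicAuth is imported above but never used in this script. A minimal
# sketch of how it can be exercised, assuming the (url, username, password)
# constructor of the proxy_requests package; the echo URL and the 'user'/'pass'
# credentials are placeholders, not values taken from this script.
r_auth = ProxyRequestsBasicAuth('https://postman-echo.com/basic-auth', 'user', 'pass')
r_auth.get()
print(r_auth.get_status_code())
print(r_auth.get_proxy_used())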
import random
import string
import time

import requests
from proxy_requests import ProxyRequests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36 OPR/63.0.3368.71'
}

while True:
    # random pause length in seconds (currently unused; see time.sleep below)
    cas = random.randint(0, 500)
    #register_data = {"POST": "/vote/479 HTTP/1.1"}
    #print(register_data)
    #url = 'https://souteze.rajce.net/vote/479'
    #r = s.post(url, json=register_data, headers=headers, proxies=proxies)
    #print(r.content)
    r = ProxyRequests("https://souteze.rajce.net/vote/504")
    r.post({"POST": "/vote/504 HTTP/1.1"})
    print(r)
    print(r.get_status_code())
    print(r.get_proxy_used())
    #time.sleep(cas)