コード例 #1
0
def test_post_with_headers(henry_post_bucket):
    """POST a form payload through a proxy while sending custom headers."""
    req = ProxyRequests(henry_post_bucket + '/post')
    req.set_headers({'name': 'rootVIII', 'secret_message': '7Yufs9KIfj33d'})
    req.post_with_headers({'key1': 'value1', 'key2': 'value2'})
    assert req.get_status_code() == 200
    assert 'Thank you' in str(req)
    print(req.get_proxy_used())
コード例 #2
0
def test_get_with_headers():
    """GET an echo endpoint with a custom User-Agent and check the JSON body."""
    req = ProxyRequests('https://postman-echo.com/headers')
    req.set_headers({'User-Agent': 'NCSA Mosaic/3.0 (Windows 95)'})
    req.get_with_headers()
    assert req.get_status_code() == 200
    assert 'headers' in req.get_json()
    print(req.get_proxy_used())
コード例 #3
0
def test_get():
    """Plain proxied GET: the response body should be a valid IPv4 address."""
    req = ProxyRequests('https://api.ipify.org')
    req.get()
    assert req.get_status_code() == 200
    # inet_aton raises if the body is not a dotted-quad address.
    try:
        inet_aton(str(req))
    except Exception:
        pytest.fail('Invalid IP address in response')
    print(req.get_proxy_used())
コード例 #4
0
def test_post_file(henry_post_bucket):
    """Upload a small scratch file to the bucket via a proxied file POST."""
    scratch_path = '/var/tmp/proxy_requests_testing.txt'
    with open(scratch_path, 'w') as handle:
        handle.write('testing')
    req = ProxyRequests(henry_post_bucket + '/post')
    req.set_file(scratch_path)
    req.post_file()
    assert req.get_status_code() == 200
    assert 'Thank you' in str(req)
    print(henry_post_bucket)
    print(req.get_proxy_used())
コード例 #5
0
def test_post_file_with_headers(henry_post_bucket):
    """Upload a scratch file via proxied POST with a custom User-Agent."""
    scratch_path = '/var/tmp/proxy_requests_testing.txt'
    with open(scratch_path, 'w') as handle:
        handle.write('testing')
    req = ProxyRequests(henry_post_bucket + '/post')
    req.set_headers({'User-Agent': 'NCSA Mosaic/3.0 (Windows 95)'})
    req.set_file(scratch_path)
    req.post_file_with_headers()
    assert req.get_status_code() == 200
    assert 'Thank you' in str(req)
    print(henry_post_bucket)
    print(req.get_proxy_used())
コード例 #6
0
def all_team_names(url_root):
    """Scrape the site's teams index and map team URL slug -> link text.

    Args:
        url_root: Base URL of the site; "teams/" is appended to form the
            page URL.

    Returns:
        dict mapping each team's URL slug (the href with "teams" and all
        slashes stripped) to the anchor's display text.
    """
    url = os.path.join(url_root, "teams") + "/"
    r = ProxyRequests(url)
    r.get()
    # Log which proxy served the request.
    print(r.get_proxy_used())
    soup = BeautifulSoup(r.get_raw(), "html.parser")
    tabs = soup.find_all("table")
    # tabs[0] is the active-franchise table (the page has two tables).
    # Fix: the original also parsed tabs[0] into an unused DataFrame via
    # pd.read_html ("df_active"); that dead work is removed.
    team_a_links = tabs[0].find_all("a", href=True)
    team_names = {
        t["href"].replace("teams", "").replace("/", ""): t.text
        for t in team_a_links if "/teams/" in t["href"]
    }
    return team_names
コード例 #7
0
ファイル: wc_mba_images.py プロジェクト: Flo95x/mba-pipeline
    def crawl_img(image_row):
        """Fetch one shirt image through a proxy and save it locally.

        image_row is expected to expose "asin" and "url_image_hq" keys
        (presumably a pandas row — confirm against the caller).
        """
        asin = image_row["asin"]
        url_image_hq = image_row["url_image_hq"]
        print(asin)
        proxied = ProxyRequests(url_image_hq)
        proxied.get()
        print("Proxy used: " + str(proxied.get_proxy_used()))
        if proxied.get_status_code() == 200:
            print(proxied.get_status_code())
            # Persist the raw response bytes to the local scratch path.
            with open("data/shirts/shirt.jpg", 'wb') as f:
                f.write(proxied.get_raw())
            # NOTE(review): the GCS upload / BigQuery append steps are
            # disabled in the original and remain omitted here.
            print("Successfully crawled image: %s" % (asin))
        else:
            print("Could not crawl image: %s" % (asin))
コード例 #8
0
def test_post(henry_post_bucket):
    """POST a simple key/value payload through a proxy and verify receipt."""
    req = ProxyRequests(henry_post_bucket + '/post')
    req.post({'key1': 'value1', 'key2': 'value2'})
    assert req.get_status_code() == 200
    assert 'Thank you' in str(req)
    print(req.get_proxy_used())
コード例 #9
0
def main(argv):
    """Crawl MBA shirt images that have not been crawled yet.

    For each pending image URL the routine downloads the high-quality
    image through a rotating proxy, writes it to a local scratch file,
    uploads it to a GCS bucket, and appends a metadata row to the
    BigQuery products_images table.

    Args:
        argv: command-line argument list; a leading script path
            (any element containing ".py") is stripped before parsing.
    """
    parser = argparse.ArgumentParser(description='')
    parser.add_argument(
        'marketplace',
        help='Shortcut of mba marketplace. I.e "com" or "de", "uk"',
        type=str)
    parser.add_argument(
        '--number_images',
        default=10,
        type=int,
        help='Number of images that should be crawled. '
        'If 0, every image that is not already crawled will be crawled.')

    # If the python file path is in argv, remove it so the positional
    # argument lines up for argparse.
    if ".py" in argv[0]:
        argv = argv[1:]

    args = parser.parse_args(argv)
    marketplace = args.marketplace
    number_images = args.number_images
    # Bug fix: the original called parser.parse_args() a second time with
    # no arguments, which re-parsed sys.argv and defeated the argv
    # parameter; that call is removed.

    df_images = get_images_urls_not_crawled(marketplace)

    # number_images == 0 means "crawl every image that is still missing".
    if number_images == 0:
        number_images = len(df_images)

    for j, image_row in df_images.iloc[0:number_images].iterrows():
        asin = image_row["asin"]
        url_image_hq = image_row["url_image_hq"]
        url_image_lowq = image_row["url_image_lowq"]

        r = ProxyRequests(url_image_hq)
        r.get()
        print("Proxy used: " + str(r.get_proxy_used()))
        if r.get_status_code() == 200:
            print(r.get_status_code())
            # Save the image locally before uploading it.
            with open("data/shirts/shirt.jpg", 'wb') as f:
                f.write(r.get_raw())

            utils.upload_blob(
                "5c0ae2727a254b608a4ee55a15a05fb7", "data/shirts/shirt.jpg",
                "mba-shirts/" + marketplace + "/" + asin + ".jpg")
            df_img = pd.DataFrame(data={
                "asin": [asin],
                "url": [
                    "https://storage.cloud.google.com/5c0ae2727a254b608a4ee55a15a05fb7/mba-shirts/"
                    + marketplace + "/" + asin + ".jpg"
                ],
                "url_gs": [
                    "gs://5c0ae2727a254b608a4ee55a15a05fb7/mba-shirts/" +
                    marketplace + "/" + asin + ".jpg"
                ],
                "url_mba_lowq": [url_image_lowq],
                "url_mba_hq": [url_image_hq],
                "timestamp": [datetime.datetime.now()]
            },
                                  dtype=object)  # np.object was removed in NumPy 1.24
            df_img['timestamp'] = df_img['timestamp'].astype('datetime64')
            df_img.to_gbq("mba_" + marketplace + ".products_images",
                          project_id="mba-pipeline",
                          if_exists="append")
            print("Successfully crawled image: %s | %s of %s" %
                  (asin, j + 1, number_images))
        else:
            # Bug fix: the original omitted the "%" operator here, so this
            # branch raised TypeError instead of printing the message.
            print("Could not crawl image: %s | %s of %s" % (asin, j + 1,
                                                            number_images))
コード例 #10
0
# Fetch the NWS seven-day forecast page for downtown Los Angeles, print
# the first forecast tombstone, then re-fetch the page through a proxy.
# Fix: ProxyRequests was imported twice; the duplicate line is removed.
from proxy_requests import ProxyRequests, ProxyRequestsBasicAuth
import requests
import pandas as pd
from bs4 import BeautifulSoup

page = requests.get(
    'https://forecast.weather.gov/MapClick.php?lat=34.05349000000007&lon=-118.24531999999999#.XiXcNlNKhQI'
)
soup = BeautifulSoup(page.content, 'html.parser')
week = soup.find(id='seven-day-forecast-body')

items = week.find_all(class_='tombstone-container')
print(items[0])
url = 'https://forecast.weather.gov/MapClick.php?lat=34.05349000000007&lon=-118.24531999999999#.XiXcNlNKhQI'
r = ProxyRequests(url)
r.get()
# NOTE(review): the results of the next two calls are discarded; they
# appear to be example/demo calls only.
r.get_json()
r.get_proxy_used()
コード例 #11
0
# Endlessly submit votes to the rajce.net poll through fresh proxies.
# Fix: "random" was imported twice; the duplicate line is removed.
import requests
import random
import string
import time
from proxy_requests import ProxyRequests

headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36 OPR/63.0.3368.71'
}

while True:
    # NOTE(review): the randomized delay `cas` is computed but never used
    # because the time.sleep call below is commented out.
    cas = random.randint(0, 500)
    r = ProxyRequests("https://souteze.rajce.net/vote/504")
    r.post({"POST": "/vote/504 HTTP/1.1"})
    print(r)
    print(r.get_status_code())
    print(r.get_proxy_used())
    #time.sleep(cas)