def getRSS(curso):
    """
    Downloads an XML RSS feed from https://side.utad.pt
    and stores it in feeds/<curso>.xml.

    :param curso: string
    :return: None
    """
    if debug:
        print("getRSS", curso)
    feedRSS = "https://side.utad.pt/rss.pl?" + curso
    feedFile = "feeds/" + curso + ".xml"
    if path.exists(feedFile):
        remove(feedFile)
    try:
        r = ProxyRequests(feedRSS)
        r.get()
        with open(feedFile, 'wb') as f:
            f.write(r.get_raw())
        # feeds smaller than ~700 bytes are treated as failed downloads and retried
        if path.getsize(feedFile) < 700:
            getRSS(curso)
    except (requests.exceptions.ConnectionError,
            requests.exceptions.ReadTimeout,
            requests.exceptions.ProxyError,
            urllib3.exceptions.MaxRetryError):
        getRSS(curso)
def gather_info(url):
    list_of_user_agents = [
        'Mozilla/5.0', 'AppleWebKit/537.36', 'Chrome/79.0.3945.88',
        'Safari/537.36'
    ]
    stat_code = 0
    tag_info = {'url': url}
    try_count = 0
    # continue attempting up to 4 proxies, one attempt per user agent
    for user_agent in list_of_user_agents:
        if stat_code != 200:
            try_count += 1
            headers = {
                "User-Agent": user_agent,
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,"
                          "image/webp,image/apng,*/*;q=0.8",
                "Accept-Encoding": "gzip, deflate, br",
                "Accept-Language": "en-US,en;q=0.9"
            }
            r = ProxyRequests(url)
            r.set_headers(headers)
            r.get_with_headers()
            source = r.get_raw()
            stat_code = r.get_status_code()
            # give up only if the final attempt also failed
            if stat_code != 200 and try_count == len(list_of_user_agents):
                tag_info['num_of_changed_files'] = -1
                tag_info['changed_paths'] = ['ERROR, CANNOT FULFILL REQUEST']
                tag_info['error_found'] = 'ERROR, TOO MANY PROXY ATTEMPTS'
                tag_info['metrics'] = {
                    'num_of_changed_files': 0,
                    'changes': 0,
                    'additions': 0,
                    'deletions': 0
                }
                return tag_info

    # proxy successful, continue reading the page
    if stat_code == 200:
        soup = BeautifulSoup(source, 'lxml')
        metrics = get_changed_files_metrics(soup)
        tag_info['metrics'] = metrics
        count, changed_files = get_changed_files(soup)
        if count == 0:
            tag_info['changed_paths'] = ['NONE FOUND']
        else:
            tag_info['changed_paths'] = changed_files
        if count != tag_info['metrics']['num_of_changed_files']:
            tag_info['error_found'] = 'ERROR, MISMATCH IN COUNT'
        else:
            tag_info['error_found'] = 'NONE'
    return tag_info
def fetch_with_proxy(url, headers):
    r = ProxyRequests(url)
    if headers:
        r.set_headers(headers)
        r.get_with_headers()
    else:
        r.get()
    status_code = r.get_status_code()
    if status_code != 200:
        print(f"{status_code}: {url}")
    return r.get_raw()
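# Illustrative calls for fetch_with_proxy (a sketch only; the URL and headers
# below are placeholders, not values from the original project):
#
#     html = fetch_with_proxy("https://example.com", {"User-Agent": "Mozilla/5.0"})
#     raw = fetch_with_proxy("https://example.com", None)  # plain proxied GET, no custom headers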
def thread_get_info(url):
    stat_code = 0
    this_tag_info = {}
    this_tag_info['url'] = url
    try_count = 0
    # continue collecting proxies for up to 10 tries
    while stat_code != 200:
        try_count += 1
        if try_count > 10:
            this_tag_info['num_changed_files'] = -1
            this_tag_info['changed_paths'] = ['NONE FOUND']
            this_tag_info['error_found'] = 'ERROR, TOO MANY PROXY ATTEMPTS'
            return this_tag_info
        headers = {
            "User-Agent": "Mozilla/5.0",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,"
                      "image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "en-US,en;q=0.9"
        }
        r = ProxyRequests(url)
        r.set_headers(headers)
        r.get_with_headers()
        source = r.get_raw()
        stat_code = r.get_status_code()
        # proxy successful, continue reading the page
        if stat_code == 200:
            soup = bs.BeautifulSoup(source, 'lxml')
            # get changed files info
            read_count = get_num_changed_files(soup)
            this_tag_info['num_changed_files'] = read_count
            count, changed_files = get_changed_files(soup)
            if count == 0:
                this_tag_info['changed_paths'] = ['NONE FOUND']
            else:
                this_tag_info['changed_paths'] = changed_files
            if count != read_count:
                this_tag_info['error_found'] = 'ERROR, MISMATCH IN COUNT'
            else:
                this_tag_info['error_found'] = 'OK'
    return this_tag_info
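# thread_get_info is written to be called once per URL from worker threads. A
# possible driver (a sketch, not the original project's code) using the
# standard library's concurrent.futures:
#
#     from concurrent.futures import ThreadPoolExecutor
#
#     def gather_all_tags(urls, max_workers=8):
#         with ThreadPoolExecutor(max_workers=max_workers) as pool:
#             return list(pool.map(thread_get_info, urls))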
def __init__(self, query: str):
    """
    On initialization the request is made with the headers and its JSON
    response is parsed, which backs the remaining properties.

    :param query: search string sent to the RA_SEARCH endpoint
    """
    from urllib.parse import quote
    from proxy_requests import ProxyRequests
    import json

    headers = {"User-Agent": self.user_agent}
    req = ProxyRequests(self.RA_SEARCH.format(quote(query.encode("utf-8"))))
    req.set_headers(headers)
    req.get_with_headers()
    self.__response = json.loads(req.get_raw().decode())
def all_team_names(url_root):
    url = os.path.join(url_root, "teams") + "/"
    r = ProxyRequests(url)
    r.get()
    # print ip used
    print(r.get_proxy_used())
    soup = BeautifulSoup(r.get_raw(), "html.parser")
    tabs = soup.find_all("table")
    # active franchises are in tabs[0] because the page has two tables;
    # pd.read_html returns a list, so take the first element
    df_active = pd.read_html(tabs[0].prettify())[0]
    # TODO: filter to max years, which is the main franchise. Is this needed?
    # Extract all the hrefs for the active teams:
    team_a_links = tabs[0].find_all("a", href=True)
    team_names = {
        t["href"].replace("teams", "").replace("/", ""): t.text
        for t in team_a_links
        if "/teams/" in t["href"]
    }
    return team_names
def crawl_img(image_row):
    asin = image_row["asin"]
    url_image_hq = image_row["url_image_hq"]
    print(asin)
    r = ProxyRequests(url_image_hq)
    r.get()
    print("Proxy used: " + str(r.get_proxy_used()))
    if 200 == r.get_status_code():
        print(r.get_status_code())
        # save image locally
        with open("data/shirts/shirt.jpg", 'wb') as f:
            f.write(r.get_raw())
        #df_img = pd.DataFrame(data={"asin":[asin],"url":["https://storage.cloud.google.com/5c0ae2727a254b608a4ee55a15a05fb7/mba-shirts/"+marketplace+"/"+asin+".jpg"],"url_gs":["gs://5c0ae2727a254b608a4ee55a15a05fb7/mba-shirts/"+marketplace+"/"+asin+".jpg"],"url_mba_lowq":[url_image_lowq],"url_mba_hq":[url_image_hq], "timestamp":[datetime.datetime.now()]}, dtype=np.object)
        #df_imgs = df_imgs.append(df_img)
        #utils.upload_blob("5c0ae2727a254b608a4ee55a15a05fb7", "data/shirts/shirt.jpg", "mba-shirts/"+marketplace+"/" + asin + ".jpg")
        print("Successfully crawled image: %s" % (asin))
    else:
        print("Could not crawl image: %s" % (asin))
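# crawl_img expects a mapping (e.g. a pandas row) with "asin" and
# "url_image_hq" keys. Illustrative call with placeholder values only:
#
#     crawl_img({"asin": "B000000000", "url_image_hq": "https://example.com/shirt.jpg"})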
def main(argv):
    parser = argparse.ArgumentParser(description='')
    parser.add_argument(
        'marketplace',
        help='Shortcut of mba marketplace. I.e "com" or "de", "uk"',
        type=str)
    parser.add_argument(
        '--number_images',
        default=10,
        type=int,
        help='Number of images that should be crawled. If 0, every image that is not already crawled will be crawled.'
    )

    # if python file path is in argv remove it
    if ".py" in argv[0]:
        argv = argv[1:len(argv)]

    # get all arguments
    args = parser.parse_args(argv)
    marketplace = args.marketplace
    number_images = args.number_images

    # get already crawled asin list
    #asin_crawled_list = get_asin_images_crawled("mba_de.products_images")

    df_images = get_images_urls_not_crawled(marketplace)

    # if number_images is equal to 0, every image should be crawled
    if number_images == 0:
        number_images = len(df_images)

    for j, image_row in df_images.iloc[0:number_images].iterrows():
        asin = image_row["asin"]
        url_image_hq = image_row["url_image_hq"]
        url_image_lowq = image_row["url_image_lowq"]
        #headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}
        #proxy_list = get_proxies("de", True)
        #proxy = next(iter(proxy_list))
        #proxies={"http": proxy, "https": proxy}
        r = ProxyRequests(url_image_hq)
        r.get()
        print("Proxy used: " + str(r.get_proxy_used()))
        if 200 == r.get_status_code():
            print(r.get_status_code())
            # save image locally
            with open("data/shirts/shirt.jpg", 'wb') as f:
                f.write(r.get_raw())
            utils.upload_blob(
                "5c0ae2727a254b608a4ee55a15a05fb7", "data/shirts/shirt.jpg",
                "mba-shirts/" + marketplace + "/" + asin + ".jpg")
            df_img = pd.DataFrame(data={
                "asin": [asin],
                "url": [
                    "https://storage.cloud.google.com/5c0ae2727a254b608a4ee55a15a05fb7/mba-shirts/"
                    + marketplace + "/" + asin + ".jpg"
                ],
                "url_gs": [
                    "gs://5c0ae2727a254b608a4ee55a15a05fb7/mba-shirts/" +
                    marketplace + "/" + asin + ".jpg"
                ],
                "url_mba_lowq": [url_image_lowq],
                "url_mba_hq": [url_image_hq],
                "timestamp": [datetime.datetime.now()]
            },
                                  dtype=object)
            df_img['timestamp'] = df_img['timestamp'].astype('datetime64')
            df_img.to_gbq("mba_" + marketplace + ".products_images",
                          project_id="mba-pipeline",
                          if_exists="append")
            print("Successfully crawled image: %s | %s of %s" %
                  (asin, j + 1, number_images))
        else:
            print("Could not crawl image: %s | %s of %s" %
                  (asin, j + 1, number_images))
        #response = requests.get(quote_plus(url_image_hq),proxies=proxies,headers=headers, stream=True)

    test = 0
    bucket_name = "5c0ae2727a254b608a4ee55a15a05fb7"
    folder_name = "mba-shirts"
    file_path = "mba-pipeline/crawler/mba/data/test.jpg"
    #upload_blob("5c0ae2727a254b608a4ee55a15a05fb7", file_path , "mba-shirts/test.jpg")
    test = 0