def getRSS(curso):
    """
    Downloads an XML RSS feed from https://side.utad.pt
    and stores it in feeds/<curso>.xml.

    :param curso: string
    :return: True
    """
    if debug:
        print("getRSS", curso)
    feedRSS = "https://side.utad.pt/rss.pl?" + curso
    feedFile = "feeds/" + curso + ".xml"
    if path.exists(feedFile):
        remove(feedFile)
    try:
        r = ProxyRequests(feedRSS)
        r.get()
        with open(feedFile, 'wb') as f:
            f.write(r.get_raw())
        # Retry if the downloaded file is suspiciously small (likely a proxy error page).
        if path.getsize(feedFile) < 700:
            getRSS(curso)
    except (requests.exceptions.ConnectionError,
            requests.exceptions.ReadTimeout,
            requests.exceptions.ProxyError,
            urllib3.exceptions.MaxRetryError):
        getRSS(curso)
def test_get():
    r = ProxyRequests('https://api.ipify.org')
    r.get()
    assert r.get_status_code() == 200
    try:
        inet_aton(str(r))
    except Exception:
        pytest.fail('Invalid IP address in response')
    print(r.get_proxy_used())
def parse_person():
    data = {}
    for person in models.Person.query.all():  # .filter_by(name_original=None)
        while True:
            try:
                r = ProxyRequests(f'{URL}{person.links}')
            except Exception:
                break
            r.get()
            r.encoding = 'utf-8'
            text = r.request
            soup = BeautifulSoup(text, 'html.parser')
            if not soup.find('h1', {'itemprop': 'name'}):
                continue
            alternateName = soup.find('span', {'itemprop': 'alternateName'})
            if alternateName:
                person.name_original = alternateName.text
            else:
                person.name_original = person.name
            db.session.add(person)
            db.session.commit()
            # The director and actor blocks are identical apart from the anchor,
            # so handle both careers in one loop (create-or-reuse by name).
            list_career = []
            for anchor in ('#director', '#actor'):
                link = soup.find('a', {'href': anchor})
                if not link:
                    continue
                egge = link.text.replace(' ', '')
                career = models.Career.query.filter_by(name=egge).first()
                if not career:
                    career = models.Career(name=egge)
                    db.session.add(career)
                    db.session.commit()
                list_career.append(career)
            person.career.clear()
            for i in list_career:
                person.career.append(i)
            db.session.add(person)
            db.session.commit()
            break
def fetch_with_proxy(url, headers):
    r = ProxyRequests(url)
    if headers:
        r.set_headers(headers)
        r.get_with_headers()
    else:
        r.get()
    status_code = r.get_status_code()
    if status_code != 200:
        print(f"{status_code}: {url}")
    return r.get_raw()
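# Usage sketch for fetch_with_proxy: the URL and the User-Agent header below are
# illustrative assumptions, not values taken from the original project.
example_headers = {"User-Agent": "Mozilla/5.0"}
body = fetch_with_proxy("https://api.ipify.org", example_headers)
print(body)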
def get_current():
    url = "https://acrnm.com"
    site = ProxyRequests(url)
    failures = 0
    while True:
        print("Checking if new products are on ACRNM on proxy: {}".format(site.proxy_used))
        if not site.get().ok:
            print("Proxy or website is unresponsive. Trying again...")
            failures += 1
            site.proxy_used = site.sockets.pop(0)
            continue
        else:
            failures = 0
        tree = html.fromstring(str(site))
        tree.make_links_absolute(url)
        prod_names = tree.xpath("//div[@class='name']/text()")
        prod_urls = tree.xpath("//a[contains(concat(' ', normalize-space(@class), ' '), ' tile ')]/@href")
        new, restock = db.new_items(prod_names, prod_urls)
        if new:
            new = list(zip(*new))
            notify(new[1], restock)
            db.insert_products(new[0])
        else:
            notify(new, restock)
        db.insert_current(prod_names, prod_urls)
def cricket(mid):
    global stop_threads
    while True:
        try:
            r = ProxyRequests('http://mapps.cricbuzz.com/cbzios/match/' + mid + '/leanback.json')
            r.get()
            data = json.loads(str(r))
            bat = data['bat_team']['name']
            bow = data['bow_team']['name']
            score = int(data["comm_lines"][0]["score"])
            wicket = int(data["comm_lines"][0]["wkts"])
            over = float(data['bat_team']['innings'][0]['overs'])
            detailed_score = (data["comm_lines"][0]["score"] + "/" +
                              data["comm_lines"][0]["wkts"] + " " +
                              data['bat_team']['innings'][0]['overs'])
            try:
                bowler = data['bowler'][0]['name']
                batname0 = data['batsman'][0]['name']
                batname1 = data['batsman'][1]['name']
                bat0score = data['batsman'][0]['r']
                bat1score = data['batsman'][1]['r']
                bat0ball = data['batsman'][0]['b']
                bat1ball = data['batsman'][1]['b']
                bowler = bow + ":" + data['bowler'][0]['name']
                batters = (batname0 + "(" + bat0score + "-" + bat0ball + ")" +
                           batname1 + "(" + bat1score + "-" + bat1ball + ")")
                detailed_score = (bat + ":" + data["comm_lines"][0]["score"] + "/" +
                                  data["comm_lines"][0]["wkts"] + " " +
                                  data['bat_team']['innings'][0]['overs'])
            except Exception:
                print("An exception occurred fetching either batters or bowler")
            try:
                txt = bowler + " " + batters
                print(detailed_score + " " + txt)
                aio.send('message', detailed_score + " " + txt)
            except Exception:
                print("An exception occurred sending")
        except Exception:
            print("An exception occurred at start")
        time.sleep(10)
        if stop_threads:
            print('Stopped Cricket')
            break
def all_team_names(url_root):
    url = os.path.join(url_root, "teams") + "/"
    r = ProxyRequests(url)
    r.get()
    # print proxy/IP used
    print(r.get_proxy_used())
    soup = BeautifulSoup(r.get_raw(), "html.parser")
    tabs = soup.find_all("table")
    # Active franchises are in tabs[0] because the page has two tables; pd.read_html returns a list.
    df_active = pd.read_html(tabs[0].prettify())[0]
    # filter to max years, which is the main franchise. Do you need this?
    # Extract all the hrefs for the active teams:
    team_a_links = tabs[0].find_all("a", href=True)
    team_names = {
        t["href"].replace("teams", "").replace("/", ""): t.text
        for t in team_a_links
        if "/teams/" in t["href"]
    }
    return team_names
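# Usage sketch for all_team_names: the root URL below is only an illustrative
# assumption (any sports-reference-style site with a /teams/ index page), not a
# value taken from the original project.
teams = all_team_names("https://www.basketball-reference.com")
print(teams)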
def listofMatches():
    try:
        url = 'http://mapps.cricbuzz.com/cbzios/match/livematches'
        r = ProxyRequests(url)
        r.get()
        data = json.loads(str(r))
        matches = []
        match_id = []
        for i in data['matches']:
            matches.append(i)
        for i in matches:
            t = time.strftime('%Y-%m-%d %H:%M:%S',
                              time.localtime(int(i['header']['start_time'])))
            match_id.append(i['match_id'])
            print(i['match_id'] + ' ' + t + ' ' + i['series_name'])
        return match_id[0]
    except Exception:
        print("An exception occurred auto-updating mid")
        time.sleep(2)
        return listofMatches()
def prefetch():
    try:
        print("Pre-fetching")
        print(config.ur)
        r = ProxyRequests(config.ur)
        r.get()
        data = json.loads(str(r))
        config.series_name = data["series_name"]
        config.bat_team_name = data['bat_team']['name']
        config.twicket = int(data["comm_lines"][0]["wkts"]) + 1
        config.tover = int(float(data['bat_team']['innings'][0]['overs'])) + 1
        config.series_name = "--" + config.series_name + "--"
        print(config.series_name + '\n' + config.bat_team_name)
    except Exception:
        print("An exception occurred while pre-fetching")
        time.sleep(5)
        prefetch()
def crawl_img(image_row):
    asin = image_row["asin"]
    url_image_hq = image_row["url_image_hq"]
    print(asin)
    r = ProxyRequests(url_image_hq)
    r.get()
    print("Proxy used: " + str(r.get_proxy_used()))
    if r.get_status_code() == 200:
        print(r.get_status_code())
        # save image locally
        with open("data/shirts/shirt.jpg", 'wb') as f:
            f.write(r.get_raw())
        # df_img = pd.DataFrame(data={"asin": [asin], "url": ["https://storage.cloud.google.com/5c0ae2727a254b608a4ee55a15a05fb7/mba-shirts/" + marketplace + "/" + asin + ".jpg"], "url_gs": ["gs://5c0ae2727a254b608a4ee55a15a05fb7/mba-shirts/" + marketplace + "/" + asin + ".jpg"], "url_mba_lowq": [url_image_lowq], "url_mba_hq": [url_image_hq], "timestamp": [datetime.datetime.now()]}, dtype=np.object)
        # df_imgs = df_imgs.append(df_img)
        # utils.upload_blob("5c0ae2727a254b608a4ee55a15a05fb7", "data/shirts/shirt.jpg", "mba-shirts/" + marketplace + "/" + asin + ".jpg")
        print("Successfully crawled image: %s" % (asin))
    else:
        print("Could not crawl image: %s" % (asin))
def codechecker(code):
    try:
        r = ProxyRequests(
            "https://discordapp.com/api/v6/entitlements/gift-codes/%s"
            "?with_application=false&with_subscription_plan=true" % (code))
        r.get()
        JsonResponse = r.get_json()
        Response = JsonResponse["message"]
        if Response == "Unknown Gift Code":
            print(f"\x1b[31;1mInvalid Code {code}\n")
            return
        if Response == "You are being rate limited.":
            print("\x1b[31;1mYou Are Being Rate Limited.")
            return
        else:
            print(f"\x1b[31;1mFound Working Code {code} Site Response:{Response}\n")
            # Redeem attempt (sketch: proxy_requests takes no json/headers kwargs in its
            # constructor, so set the auth header and POST the payload explicitly;
            # `message` and `token` are expected to be defined elsewhere).
            redeem = ProxyRequests(
                f"https://discordapp.com/api/v6/entitlements/gift-codes/{code}/redeem")
            redeem.set_headers({'authorization': token})
            redeem.post_with_headers({"channel_id": str(message.channel.id)})
            redeemedcode = str(redeem)
            return
    except Exception as e:
        print(e)
        return
def parse_links():
    page = 1
    last_page = 1
    data = {}
    while page <= last_page:
        r = ProxyRequests(f'{URL}/top/navigator/m_act[rating]/1%3A/order/rating/page/{page}/#results')
        r.get()
        r.encoding = 'utf-8'
        text = r.request
        soup = BeautifulSoup(text, 'html.parser')
        # Determine the last page once, from the pagination links.
        if last_page == 1:
            try:
                last_link = soup.find_all('li', {'class': 'arr'})[-1].find('a').get('href')
                last_page = int(re.findall(r'\d{2,}', last_link)[0])
            except Exception:
                continue
        movie_link = soup.find_all('div', {'class': '_NO_HIGHLIGHT_'})
        if not movie_link:
            continue
        for i in movie_link:
            i_soup = BeautifulSoup(f'b{i}', 'html.parser').find('div', {'class': 'name'}).find('a')
            i_text = i_soup.text
            i_link = i_soup.get('href')
            id_film = int(re.findall(r'\d+', i_link)[1])
            if models.Film.query.filter_by(id_film=id_film).first() is None:
                film = models.Film(id_film=id_film, links=i_link, name=i_text)
                db.session.add(film)
                try:
                    db.session.commit()
                except Exception:
                    db.session.rollback()
                data[i_text] = {page: i_link}
                continue
        page += 1
    with open('data.txt', 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False)
def main(argv):
    parser = argparse.ArgumentParser(description='')
    parser.add_argument(
        'marketplace',
        help='Shortcut of mba marketplace. I.e "com" or "de", "uk"',
        type=str)
    parser.add_argument(
        '--number_images',
        default=10,
        type=int,
        help='Number of images that should be crawled. If 0, every image that is not already crawled will be crawled.')

    # if python file path is in argv remove it
    if ".py" in argv[0]:
        argv = argv[1:len(argv)]

    # get all arguments
    args = parser.parse_args(argv)
    marketplace = args.marketplace
    number_images = args.number_images

    # get already crawled asin list
    # asin_crawled_list = get_asin_images_crawled("mba_de.products_images")

    df_images = get_images_urls_not_crawled(marketplace)

    # if number_images is equal to 0, every image should be crawled
    if number_images == 0:
        number_images = len(df_images)

    for j, image_row in df_images.iloc[0:number_images].iterrows():
        asin = image_row["asin"]
        url_image_hq = image_row["url_image_hq"]
        url_image_lowq = image_row["url_image_lowq"]
        # headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}
        # proxy_list = get_proxies("de", True)
        # proxy = next(iter(proxy_list))
        # proxies = {"http": proxy, "https": proxy}
        r = ProxyRequests(url_image_hq)
        r.get()
        print("Proxy used: " + str(r.get_proxy_used()))
        if r.get_status_code() == 200:
            print(r.get_status_code())
            # save image locally
            with open("data/shirts/shirt.jpg", 'wb') as f:
                f.write(r.get_raw())
            utils.upload_blob(
                "5c0ae2727a254b608a4ee55a15a05fb7", "data/shirts/shirt.jpg",
                "mba-shirts/" + marketplace + "/" + asin + ".jpg")
            df_img = pd.DataFrame(data={
                "asin": [asin],
                "url": ["https://storage.cloud.google.com/5c0ae2727a254b608a4ee55a15a05fb7/mba-shirts/" + marketplace + "/" + asin + ".jpg"],
                "url_gs": ["gs://5c0ae2727a254b608a4ee55a15a05fb7/mba-shirts/" + marketplace + "/" + asin + ".jpg"],
                "url_mba_lowq": [url_image_lowq],
                "url_mba_hq": [url_image_hq],
                "timestamp": [datetime.datetime.now()]
            }, dtype=np.object)
            df_img['timestamp'] = df_img['timestamp'].astype('datetime64')
            df_img.to_gbq("mba_" + marketplace + ".products_images",
                          project_id="mba-pipeline", if_exists="append")
            print("Successfully crawled image: %s | %s of %s" %
                  (asin, j + 1, number_images))
        else:
            print("Could not crawl image: %s | %s of %s" %
                  (asin, j + 1, number_images))
        # response = requests.get(quote_plus(url_image_hq), proxies=proxies, headers=headers, stream=True)

    test = 0
    bucket_name = "5c0ae2727a254b608a4ee55a15a05fb7"
    folder_name = "mba-shirts"
    file_path = "mba-pipeline/crawler/mba/data/test.jpg"
    # upload_blob("5c0ae2727a254b608a4ee55a15a05fb7", file_path, "mba-shirts/test.jpg")

    test = 0
def parse_films():
    engine = create_engine('sqlite:///:memory:', echo=True)
    data = {}
    for film in models.Film.query.filter_by(rating_kp=None).all():
        # while True:
        try:
            r = ProxyRequests(f'{URL}{film.links}')
        except Exception:
            break
        r.get()
        r.encoding = 'utf-8'
        text = r.request
        soup = BeautifulSoup(text, 'html.parser')
        genres = soup.find('span', {'itemprop': 'genre'})
        if genres:
            genres = genres.find_all('a')
        countrys = soup.find_all(
            'div', {'style': 'position: relative'})[1].find_all('a')
        persons = soup.find_all('li', {'itemprop': 'actors'})
        for director in soup.find_all('td', {'itemprop': 'director'}):
            persons.append(director)
            break

        # Genres: create any missing Genre rows, retrying the commit on failure.
        list_genres = []
        for genre in genres:
            if not models.Genre.query.filter_by(name=genre.text).first():
                while True:
                    new_genre = models.Genre(name=genre.text)
                    db.session.add(new_genre)
                    try:
                        db.session.commit()
                        list_genres.append(new_genre)
                        break
                    except Exception:
                        db.session.rollback()
            else:
                list_genres.append(
                    models.Genre.query.filter_by(name=genre.text).first())

        # Countries: same create-or-reuse pattern as genres.
        list_countrys = []
        for country in countrys:
            if not models.Country.query.filter_by(name=country.text).first():
                while True:
                    new_country = models.Country(name=country.text)
                    db.session.add(new_country)
                    try:
                        db.session.commit()
                        list_countrys.append(new_country)
                        break
                    except Exception:
                        db.session.rollback()
            else:
                list_countrys.append(
                    models.Country.query.filter_by(name=country.text).first())

        # People (actors plus the first director): create-or-reuse by id_person_kp.
        list_person = []
        for person in persons:
            if person.find('a').text.replace(' ', '') == '...':
                break
            person_link = person.find('a').get('href')
            id_person_kp = int(re.findall(r'\d+', person_link)[0])
            if not models.Person.query.filter_by(id_person_kp=id_person_kp).first():
                while True:
                    # person_link = person.find('a').get('href')
                    if models.Person.query.filter_by(id_person_kp=id_person_kp).first():
                        break
                    new_person = models.Person(name=person.text,
                                               links=person_link,
                                               id_person_kp=id_person_kp)
                    db.session.add(new_person)
                    try:
                        db.session.commit()
                        list_person.append(new_person)
                        break
                    except Exception:
                        db.session.rollback()
            else:
                existing = models.Person.query.filter_by(id_person_kp=id_person_kp).first()
                if existing not in list_person:
                    list_person.append(existing)

        # Film details: keep retrying until the parsed fields commit successfully.
        # if not film.description:
        while True:
            try:
                film.name = soup.find('span', {'class': 'moviename-title-wrapper'}).text
                alternative = soup.find('span', {'class': 'alternativeHeadline'}).text
                film.name_original = alternative if alternative else film.name
                film.description = soup.find('div', {'itemprop': 'description'}).text.replace(chr(151), '-')
                film.rating_kp = float(soup.find('span', {'class': 'rating_ball'}).text)
                film.rating_imdb = float(re.findall(
                    r'[\d][^ ]+',
                    soup.find('div', {'style': 'color:#999;font:100 11px tahoma, verdana'}).text)[0])
                film.date_released = int(soup.find('div', {'style': 'position: relative'}).find('a').text)
                try:
                    db.session.commit()
                except Exception:
                    db.session.rollback()
                    continue
                film.genre.clear()
                film.country.clear()
                film.person.clear()
                while True:
                    for i in list_genres:
                        film.genre.append(i)
                    for i in list_countrys:
                        film.country.append(i)
                    for i in list_person:
                        film.person.append(i)
                    db.session.add(film)
                    try:
                        db.session.commit()
                        break
                    except Exception:
                        db.session.rollback()
                break
            except Exception:
                db.session.rollback()
def score():
    global bow
    try:
        r = ProxyRequests(config.ur)
        r.get()
        data = json.loads(str(r))
        score = int(data["comm_lines"][0]["score"])
        wicket = int(data["comm_lines"][0]["wkts"])
        over = float(data['bat_team']['innings'][0]['overs'])
        detailed_score = (config.bat_team_name + " " + data["comm_lines"][0]["score"] + "/" +
                          data["comm_lines"][0]["wkts"] + " " +
                          data['bat_team']['innings'][0]['overs'])
        print(detailed_score, end=" ")
        try:
            bowler = data['bowler'][0]['name']
            print("B:" + bowler)
            batname0 = data['batsman'][0]['name']
            batname1 = data['batsman'][1]['name']
            bat0score = data['batsman'][0]['r']
            bat1score = data['batsman'][1]['r']
            bat0ball = data['batsman'][0]['b']
            bat1ball = data['batsman'][1]['b']
            bowler = data['bowler'][0]['name']
            batters = (batname0 + "*(" + bat0score + "-" + bat0ball + ") " +
                       batname1 + "(" + bat1score + "-" + bat1ball + ")")
            print(batters)
            fputOnRdb(detailed_score + " B: " + bowler + "\n" + batters +
                      "\nRecent:\n" + data['prev_overs'])
        except Exception:
            print("An exception occurred fetching either batters or bowler")
        try:
            # Remember the bowler at the half-over mark so the end-of-over message
            # reports who bowled it.
            if over == (config.tover - 1.0 + 0.5):
                bow = bowler
            if over == config.tover:
                prev_overs = data['prev_overs']
                prev_over = prev_overs.split('|')
                msg = detailed_score + " B:" + bow + "\n" + batters + "\n" + prev_over[-1]
                print(msg)
                notify(msg)
                config.tover = config.tover + 1
                fbpush(msg)
                updateRegIds()
                time.sleep(10)
            if wicket == config.twicket:
                msg = ("wicket " + str(config.twicket) + " " + data['last_wkt_name'] + " " +
                       data['last_wkt_score'] + " B: " + bowler + "\n" + detailed_score)
                fbpush(msg)
                notify(msg)
                config.twicket = config.twicket + 1
                time.sleep(15)
            if int(over + 1) != config.tover:
                updateRegIds()
                prefetch()
            if (wicket + 1) != config.twicket:
                updateRegIds()
                prefetch()
        except Exception:
            print("An exception occurred while trying to notify")
    except Exception:
        print("An exception occurred fetching score")
def rotate_proxy(test_url=BASE_URL + '/version'):
    rotator = ProxyRequests(test_url)
    rotator.get()
    proxy = rotator.get_proxy_used()
    proxies = {'http': 'http://%s' % proxy, 'https': 'https://%s' % proxy}
    return proxies
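# Usage sketch, assuming rotate_proxy() returns the proxies mapping as above and
# that `requests` is imported; the timeout value is an illustrative choice.
proxies = rotate_proxy()
resp = requests.get(BASE_URL + '/version', proxies=proxies, timeout=10)
print(resp.status_code)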
from proxy_requests import ProxyRequests
from proxyValidator import ProxyValidator

# proxyInstance = ProxyValidator(['207.154.231.217:3128'])
# print(proxyInstance.validated_proxies)

r = ProxyRequests("https://api.ipify.org")
r.get()  # get() populates the response; print the object itself to see the body
print(r)
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.13+ (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.3 Safari/534.53.10',
    'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
    'Opera/9.80 (Macintosh; Intel Mac OS X 10.14.1) Presto/2.12.388 Version/12.16',
    'Mozilla/5.0 (Windows NT 6.0; rv:2.0) Gecko/20100101 Firefox/4.0 Opera 12.14',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0) Opera 12.14',
]
user_agent = random.choice(user_agent_list)
# proxy_list = get_proxies()
# proxy = random.choice(proxy_list)
# request = urllib.request.Request(url, headers={'User-Agent': user_agent})
# response = urllib.request.urlopen(request)
# response = requests.get(url, proxies={"http": proxy, "https": proxy}, headers={'User-Agent': user_agent})
r = ProxyRequests(url)
r.get()
html = str(r)  # response.content
soup = BeautifulSoup(html, 'html.parser')
# Walk the sibling chain down to the target <b> cell; the XPath below documents the location:
# /html/body/table[3]/tbody/tr[1]/td/table/tbody/tr[7]/td/table/tbody/tr[11]/td[6]/b
print(
    soup.contents[36].table.tr
    .next_sibling.next_sibling.next_sibling.next_sibling.next_sibling
    .next_sibling.next_sibling.next_sibling.next_sibling.next_sibling
    .next_sibling.next_sibling.next_sibling.next_sibling.next_sibling
    .next_sibling.next_sibling.next_sibling.next_sibling.next_sibling
    .td
    .next_sibling.next_sibling.next_sibling.next_sibling.next_sibling
    .next_sibling.next_sibling
    .b.string
)