def fetch_via_proxy(url):
    import time
    from http_request_randomizer.requests.proxy.requestProxy import RequestProxy

    req_proxy = RequestProxy()
    request = req_proxy.generate_proxied_request(url)
    if request is not None:
        # Pause briefly so back-to-back proxied requests are less likely to be rate limited.
        time.sleep(5)
    return request
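# A minimal usage sketch for fetch_via_proxy above. The target URL is only an
# illustrative placeholder; any page that echoes the caller's IP works for
# checking that the request really went through a proxy.
if __name__ == '__main__':
    resp = fetch_via_proxy('http://ipv4.icanhazip.com')  # placeholder test URL
    if resp is not None:
        print(resp.text.strip())
    else:
        print('No proxy produced a response.')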
import time
from http_request_randomizer.requests.proxy.requestProxy import RequestProxy


def req_split(url):
    start = time.time()
    req_proxy = RequestProxy()
    #print("Initialization took: {0} sec".format((time.time() - start)))
    #print("Size: {0}".format(len(req_proxy.get_proxy_list())))
    #print("ALL = {0} ".format(list(map(lambda x: x.get_address(), req_proxy.get_proxy_list()))))
    request = req_proxy.generate_proxied_request(url)
    if request is not None:
        print("\t Response: ip={0}".format(u''.join(request.text).encode('utf-8')))
    print("-> Going to sleep..")
import re

from bs4 import BeautifulSoup
from http_request_randomizer.requests.proxy.requestProxy import RequestProxy


def getInfo(url, artist, album):
    # Get the album page.
    # An alternate way of doing things: use the rym search engine
    # url = https://rateyourmusic.com/search?searchterm=ARTIST+ALBUM&type=l
    # then search the page for the artist and album, and go to that page.
    url = url + artist + "/" + album + "/"
    try:
        req_proxy = RequestProxy()
        # PROBLEM: gets detected after a few different requests, unless you
        # manually make it wait for multiple minutes between requests.
        page = req_proxy.generate_proxied_request(url)
        soup = BeautifulSoup(page.content, 'html.parser')
    except UnicodeDecodeError:
        print('UnicodeDecodeError! Skipping...')
        return

    # Get genres from page
    genre_text = str(soup.findAll("span", {"class": "release_pri_genres"}))
    # Get secondary genres from page
    sec_genre_text = str(soup.findAll("span", {"class": "release_sec_genres"}))

    # Clean up and compile all genres
    unclean_genres = re.findall(r']">.*?</a>', genre_text)
    unclean_sec_genres = re.findall(r']">.*?</a>', sec_genre_text)
    genres = []
    for genre in unclean_genres:
        genre = genre[3:-4]
        genres.append(genre)
    for genre in unclean_sec_genres:
        genre = genre[3:-4]
        genres.append(genre)

    # Get descriptors from page
    descriptor_text = str(soup.findAll("span", {"class": "release_pri_descriptors"}))
    descriptor_text = descriptor_text[37:-7]

    # Clean up and organize each descriptor
    unclean_descriptors = re.findall(r'.*?,', descriptor_text)
    descriptors = []
    for descriptor in unclean_descriptors:
        descriptor = descriptor[2:-1]
        descriptors.append(descriptor)

    # Print genres
    genres = ';'.join(genre for genre in genres)
    print(artist + '->' + album + ' genres:')
    print(genres)

    # Print descriptors
    descriptors = '; '.join(descriptor.title() for descriptor in descriptors)
    print(artist + '->' + album + ' descriptors:')
    print(descriptors)

    return genres, descriptors
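# A hedged example call for getInfo above. The base URL and the artist/album
# path segments are placeholders for illustration only.
if __name__ == '__main__':
    result = getInfo('https://rateyourmusic.com/release/album/', 'artist-name', 'album-name')
    if result is not None:
        genres, descriptors = result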
from http_request_randomizer.requests.proxy.requestProxy import RequestProxy


class RequestMaker:
    def __init__(self):
        self.req_proxy = RequestProxy()

    def _generate_proxied_request(self, url, params=None):
        if params is None:
            params = {}
        # Try at most one request per proxy in the list before giving up.
        for _ in range(0, len(self.req_proxy.get_proxy_list())):
            proxy_response = self.req_proxy.generate_proxied_request(url, params=params)
            if proxy_response is not None:
                return proxy_response
        return None

    def get(self, url, params=None):
        proxy_response = self._generate_proxied_request(url, params)
        if proxy_response is None:
            raise RuntimeError('Failed to generate proxied request for {}'.format(url))
        return proxy_response
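# A short usage sketch for the RequestMaker wrapper above. The URL and the
# empty params dict are placeholders, not part of the original code.
if __name__ == '__main__':
    maker = RequestMaker()
    try:
        resp = maker.get('http://ipv4.icanhazip.com', params={})  # placeholder URL
        print(resp.status_code, resp.text.strip())
    except RuntimeError as err:
        # get() raises RuntimeError once every proxy in the list has been tried.
        print(err)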
#!/usr/bin/python
# -*- coding: utf-8 -*-
# pip install http-request-randomizer
#####################################
##KILL THE NET##
##############[LIBS]###################
from http_request_randomizer.requests.proxy.requestProxy import RequestProxy
import sys

while 1:
    try:
        ip = sys.argv[1]
        api = 'http://api.hackertarget.com/reverseiplookup/?q=' + ip
        req_proxy = RequestProxy()
        try:
            request = req_proxy.generate_proxied_request(api)
            if request:
                if 'error' in request.text or 'No DNS' in request.text:
                    break
                if 'API count exceeded' in request.text or 'Bad Request' in request.text:
                    continue
                else:
                    open(ip + '.txt', 'a').write(request.text + '\n')
                    open('ALL-SITES.txt', 'a').write(request.text + '\n')
                    break
        except:
            pass
    except Exception as e:
        print(e)
        break
class YFHistoricalDataExtract(object):
    """
    Class for grabbing historical stock data from yahoo finance.
    Utilizes the HTTP_Request_Randomizer library to make proxied function
    calls so as to avoid IP bans from relevant sources.
    <More Info Here!!!>
    """

    def __init__(self, stock_file, data_storage_dir="./historical_stock_data",
                 threads=10, clear_existing=True):
        """
        Initializes the proxy server as well as the directories that all of
        the read-in historical data will be stored to.

        Note: The directory structure could already exist and the data could
        already be there. It does not always make sense to delete the old data
        and start again. If the clear_existing variable is set, clear the
        existing directories. The default is to clear the existing directories
        containing historical data and start over.
        """
        self.proxy_server = RequestProxy()
        self.output_dir = data_storage_dir
        self.ticker_file = stock_file
        self.thread_limit = threads

        # If the user asks for it, clear the existing directory structure
        if clear_existing is True:
            self.clear_directories()

        # Check to see if the file containing ticker symbols exists
        if not os.path.exists(stock_file):
            raise BadTickerFile()

        # Try to make the directory structure that the data will be stored in
        self.setup_directories()
        try:
            os.makedirs("%s/dividends" % self.output_dir)
        except OSError:
            print("[Error]: Could not create directory structure.")
            raise CannotCreateDirectory()

    def clear_directories(self):
        """ Wipe the existing directory structure if it exists. """
        os.system("rm -rf %s" % self.output_dir)

    def setup_directories(self):
        if not os.path.exists(self.output_dir):
            try:
                os.makedirs(self.output_dir)
            except OSError as e:
                print("[ERROR]: %s" % str(e))
                raise CannotCreateDirectory()
        if not os.path.exists(self.output_dir + "/dividend_history"):
            try:
                os.makedirs(self.output_dir + "/dividend_history")
            except OSError as e:
                print("[ERROR]: %s" % str(e))
                raise CannotCreateDirectory()

    def get_historical_data(self):
        stock_file = open(self.ticker_file, "r")
        candidates_to_test = []
        pool = ThreadPool(self.thread_limit)
        for ticker in stock_file.readlines():
            candidates_to_test.append(ticker.strip())
        pool.map(self.read_ticker_historical, candidates_to_test)

    def read_ticker_historical(self, ticker_symbol):
        URL = "https://finance.yahoo.com/quote/%s/history/" % ticker_symbol
        response = None
        # Loop until a proxy returns a valid response
        while True:
            try:
                response = self.proxy_server.generate_proxied_request(URL, req_timeout=5)
            except Exception as e:
                print("Exception: %s %s" % (ticker_symbol, str(e)))
                return
            if response is None:
                continue
            if response.status_code == 200:
                break
        response_soup = BeautifulSoup(response.text, 'html5lib')
        # Find all rows in the historical data.
        response_soup = response_soup.find_all("tr")
        response_soup = response_soup[2:]

        json_history_file = open("%s/%s.json" % (self.output_dir, ticker_symbol), "w")
        json_dividend_file = open("%s/%s_dividend.json" % (self.output_dir + "/dividend_history", ticker_symbol), "w")

        historical_data = {
            'Date': [],
            'Open': [],
            'High': [],
            'Low': [],
            'Close': [],
            'Adj Close': [],
            'Volume': []
        }
        dividend_data = {'Date': [], 'Amount': []}

        for response in response_soup:
            filtered_response = response.find_all("td")
            if len(filtered_response) == 7:
                # Date
                historical_data["Date"].append(filtered_response[0].text)
                # Open
                historical_data["Open"].append(filtered_response[1].text)
                # High
                historical_data["High"].append(filtered_response[2].text)
                # Low
                historical_data["Low"].append(filtered_response[3].text)
                # Close
                historical_data["Close"].append(filtered_response[4].text)
                # Adj Close
                historical_data["Adj Close"].append(filtered_response[5].text)
                # Volume
                historical_data["Volume"].append(filtered_response[6].text)
            elif len(filtered_response) == 2:
                # Date
                dividend_data["Date"].append(filtered_response[0].text)
                # Dividend Amount
                amount = filtered_response[1].text.replace(" Dividend", "")
                dividend_data["Amount"].append(amount)
            else:
                continue

        json_history_file.write(json.dumps(historical_data))
        json_dividend_file.write(json.dumps(dividend_data))
        json_history_file.close()
        json_dividend_file.close()
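# A hedged usage sketch for YFHistoricalDataExtract above. The ticker file name
# and output directory are assumptions for illustration; the class also relies
# on imports (os, json, BeautifulSoup, ThreadPool) and custom exceptions
# (BadTickerFile, CannotCreateDirectory) defined elsewhere in its module.
if __name__ == '__main__':
    extractor = YFHistoricalDataExtract('tickers.txt',  # one symbol per line (placeholder name)
                                        data_storage_dir='./historical_stock_data',
                                        threads=4,
                                        clear_existing=False)
    extractor.get_historical_data()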
def get_search_links(self, prodname):
    url = 'https://www.google.fr/search?q=reference%20"' + "%20".join(prodname.split(" ")) + '"'
    urllist = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    }
    try:
        page = requests.get(url, headers=headers)
        soup = BeautifulSoup(page.content, 'html.parser')
        for link in soup.find_all("a"):
            if link.has_attr('href'):
                if ("https://" in link['href'] and "webcache" not in link['href']
                        and "google." not in link['href'] and "youtube." not in link['href']):
                    templink = link['href'].split("&")[0]
                    if "https:" in templink:
                        urllist.append("http" + templink.split("http")[1])

        # Fall back to proxied requests if the direct request returned no links.
        if len(urllist) == 0:
            itr = 0
            while itr < 5:
                try:
                    req_proxy = RequestProxy()
                    request = req_proxy.generate_proxied_request(url)
                    if request is not None and request.status_code == 200:
                        soup = BeautifulSoup(request.content, 'html.parser')
                        for link in soup.find_all("a"):
                            if link.has_attr('href'):
                                if ("https://" in link['href'] and "webcache" not in link['href']
                                        and "google." not in link['href'] and "youtube." not in link['href']):
                                    templink = link['href'].split("&")[0]
                                    if "https:" in templink:
                                        urllist.append("http" + templink.split("http")[1])
                        if len(urllist) > 0:
                            break
                        else:
                            itr = itr + 1
                    else:
                        itr = itr + 1
                except:
                    itr = itr + 1
                    continue

        # Last resort: fall back to the googlesearch library.
        if len(urllist) == 0:
            urllist = list(search(query="%20".join(prodname.split(" ")), tld="fr",
                                  lang="fr", num=10, start=1, stop=20))
        self.logger.info("Number of sites found:" + str(len(urllist)))
    except Exception as e:
        self.logger.info("Error:" + str(e))
        self.logger.info("Failed prod:" + prodname)
    return urllist
text = (codeOpen + artistName + spaceInput + hyphenInput + spaceInput +
        songName + '(' + mixName + ' Remix' + ')' + codeClose)
newName = ('/storage/emulated/0/temp/' + artistName + hyphenInput +
           songName + hyphenInput + mixName + '.mp3')

print('\nWorking on Request: ' + query)

# baseURL
url = 'https://mp3cc.biz/search/f/' + query + '/'

# proxy_headersRequest
req_proxy = RequestProxy()
while not req_proxy.generate_proxied_request(url):
    print('\nNext proxy for "Base URL"')
else:
    print('\nConnected to "Base URL!"')

# saveToFile
with open('parse.txt', 'wb') as f:
    response = requests.get(url)
    f.write(response.content)

# parseFromFile
with open('parse.txt', 'r', encoding='UTF-8') as p:
    s = BeautifulSoup(p, 'html.parser')
from http_request_randomizer.requests.proxy.requestProxy import RequestProxy


def read_one_pg(pageno):
    url = URL.format(pageno=pageno)
    req_proxy = RequestProxy()
    return req_proxy.generate_proxied_request(url)
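# read_one_pg above assumes a module-level URL template containing a {pageno}
# placeholder. A minimal sketch of that assumption, with an illustrative URL only:
URL = 'https://example.com/listing?page={pageno}'  # placeholder template

if __name__ == '__main__':
    resp = read_one_pg(1)
    if resp is not None:
        print(resp.status_code)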
import time
from http_request_randomizer.requests.proxy.requestProxy import RequestProxy

if __name__ == '__main__':
    start = time.time()
    req_proxy = RequestProxy()
    print("Initialization took: {0} sec".format((time.time() - start)))
    print("Size: {0}".format(len(req_proxy.get_proxy_list())))
    print("ALL = {0} ".format(list(map(lambda x: x.get_address(), req_proxy.get_proxy_list()))))

    test_url = 'http://ipv4.icanhazip.com'

    while True:
        start = time.time()
        request = req_proxy.generate_proxied_request(test_url)
        print("Proxied Request Took: {0} sec => Status: {1}".format((time.time() - start), request.__str__()))
        if request is not None:
            print("\t Response: ip={0}".format(u''.join(request.text).encode('utf-8')))
        print("Proxy List Size: {0}".format(len(req_proxy.get_proxy_list())))
        print("-> Going to sleep..")
        time.sleep(10)
i = (i - 1) % (len(SYMBOL))
file.close()
file = open(FILE_NAME, 'a', newline='', encoding='utf-8')
csvfile = csv.DictWriter(file, FIELDS)

req_proxy = RequestProxy()
stocktwit_url = "https://api.stocktwits.com/api/2/streams/symbol/" + SYMBOL[token] + ".json?" + access_token[token]
if last_message_id[token] is not None:
    stocktwit_url += "max=" + str(last_message_id[token])

api_hits = 0
while True:
    response = req_proxy.generate_proxied_request(stocktwit_url)
    if response is not None:
        if response.status_code == 429:
            print("###############")
            print("REQUEST IP RATE LIMITED FOR {} seconds !!!".format(
                int(response.headers['X-RateLimit-Reset']) - int(time.time())))
        if not response.status_code == 200:
            token = (token + 1) % (len(access_token))
            stocktwit_url = "https://api.stocktwits.com/api/2/streams/symbol/" + SYMBOL[token] + ".json?" + \
                            access_token[token] + "max=" + str(last_message_id[token])
            continue