def get_coords_string_from_url(input_string):
    response = get_request(input_string)
    response.raise_for_status()
    coordinate_strings = COORDS_FROM_URL_REGEX.search(response.text)
    if coordinate_strings is None:
        raise ValueError
    return coordinate_strings.group()
def get_constituency_results(page_url, year, parties=None):
    """
    This method is the main constituency results scraper. The constituency
    page URL and election year must be provided. A list of dictionaries
    containing candidate names, parties, and vote tallies is returned.
    """
    # Get page as soup
    request = get_request(page_url)
    request.raise_for_status()
    soup = Soup(request.text, 'html.parser')

    # Find results table
    election_table = False
    tables = soup.find_all('caption')
    for table in tables:
        if re.match('General [Ee]lection {}'.format(year), table.find('a').contents[0]):
            election_table = table.parent
            break

    # Try alternative scraper
    if not election_table:
        return alternative_constituency_results(soup, year, parties)

    # Process results table
    candidates = []
    for candidate in election_table.findChildren('tr', class_='vcard'):
        # Add candidate to list
        candidates.append(get_candidate_from_row(candidate, 3, parties))

    # Return candidates
    return candidates
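# Minimal usage sketch for the scraper above (not part of the original project): the
# constituency page URL and the 2015 election year are illustrative assumptions; any
# Wikipedia constituency article with a "General election <year>" results table should work.
if __name__ == '__main__':
    example_url = ('https://en.wikipedia.org/wiki/'
                   'Finchley_and_Golders_Green_(UK_Parliament_constituency)')
    for example_candidate in get_constituency_results(example_url, 2015):
        print(example_candidate)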
def _get_largest_timeoverlap_subfolder(url, expected_folder, offset=9):
    # Returns the subfolder whose start/end times overlap the expected folder the most.
    # `offset` marks the start of a %H%M_%H%M pattern holding the start and end times;
    # the default (9) matches the standard pattern yyyymmdd_HHMM_HHMM.
    f = get_request(url)
    soup = BeautifulSoup(f.text, 'html.parser')
    best_match = ''
    best_overlap = 0
    exp_start = datetime.strptime(expected_folder[offset:offset + 4], "%H%M")
    exp_end = datetime.strptime(expected_folder[offset + 5:offset + 9], "%H%M")
    for a in soup.find_all('a'):
        folder = a.get("href")
        if folder[:offset] != expected_folder[:offset]:
            continue
        start_max = max(exp_start, datetime.strptime(folder[offset:offset + 4], "%H%M"))
        end_min = min(exp_end, datetime.strptime(folder[offset + 5:offset + 9], "%H%M"))
        overlap = (end_min - start_max).total_seconds()
        if overlap > best_overlap and overlap > 0:
            best_match = folder
            best_overlap = overlap
    return best_match
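# Quick sanity check of the offset arithmetic used above; the folder name is hypothetical.
# With the default offset of 9 (yyyymmdd_HHMM_HHMM), characters [9:13] hold the start time
# and [14:18] hold the end time.
_example_folder = "20200101_0930_1200"
assert _example_folder[9:9 + 4] == "0930"
assert _example_folder[9 + 5:9 + 9] == "1200"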
def get_wca_id_from_access_token(access_token):
    """ Returns the user's WCA ID from the /me WCA API endpoint. """
    headers = {"Authorization": "Bearer " + access_token}
    me_data = get_request(__WCA_ME_API_URL, headers=headers).json()
    return me_data['me']['wca_id']
def crawl_robots(bfs, url, header_dict, counter, forms, keywords, seen):
    # If the URL doesn't already end with robots.txt (or robots.txt/), append it;
    # otherwise use the URL as-is.
    robots_url = ''
    if url.endswith('robots.txt') or url.endswith('robots.txt/'):
        robots_url = url
    else:
        if url.endswith('/'):
            robots_url = url + 'robots.txt'
        else:
            robots_url = url + '/robots.txt'

    try:
        print('\nTrying robots link: {}'.format(robots_url))
        textfile = get_request(robots_url, {})
    except socket.gaierror:
        print('\nCould not find {}'.format(robots_url))
        return

    split_text = textfile.split('\n')
    allow_disallow = list(
        filter(lambda x: x.startswith('Disallow:') or x.startswith('Allow:'), split_text))
    new_links = list(map(lambda x: x.split(' ')[1], allow_disallow))
    appended_links = list(map(lambda x: '{}{}'.format(url, x), new_links))

    for link in appended_links:
        if link not in seen:
            try:
                if bfs:
                    crawl_bfs(link, header_dict, counter, forms, keywords, seen)
                else:
                    crawl_dfs(link, header_dict, counter, forms, keywords, seen)
            except socket.gaierror:
                print('\nCould not find {}'.format(link))
def validate(self, paper_id):
    url_prefix = self.ARXIV_VALIDATE_URL_PREFIX
    if not paper_id.startswith(url_prefix):
        paper_ref = urljoin(url_prefix, paper_id)
    else:
        paper_ref = paper_id
    r = get_request(paper_ref)
    return r.status_code == 200
def brute_force(user, keywords, forms, user_agent):
    passwords = generate_all_passwords(keywords)
    results = {}
    for form in forms:
        print('\nAttempting to brute-force: ' + form + '\n')
        get_header = {'User-Agent': user_agent}
        html_doc = get_request(form, get_header)
        parser = HTMLParser(html_doc)
        done = False
        for password in passwords:
            if not done:
                login = parser.create_login_string(user, password)
                post = {
                    'User-Agent': user_agent,
                    'Content-Type': 'application/x-www-form-urlencoded',
                    'Content-Length': str(len(login))
                }
                response = post_request(form, post, login)
                if get_status(response) >= 500:
                    print('\nHold on, too many failed attempts! Waiting for the server '
                          'to accept more login requests...\n')
                    # Continually retry logging in if there is a server error (too many failed attempts)
                    while get_status(response) >= 500:
                        response = post_request(form, post, login)
                # Record the credential pair being attempted
                combination = 'User: ' + user + '\nPassword: ' + password
                print('Attempting to login...\n' + combination)
                if get_status(response) == 302:
                    print('Login Succeeded!\n')
                    done = True
                    results[form] = combination
                else:
                    print('Login Failed...\n')
        if not done:
            print('Ran out of passwords! Bruteforce failed!')
            results[form] = None
        sleep(5)  # Temporary pause between forms to see the end result of the current form

    # Print bruteforce results
    print('Bruteforcer Results')
    print('-' * 50)
    for form, combination in results.items():
        print('Form: ' + form)
        if combination is None:
            print('Bruteforce Failed')
        else:
            print(combination)
        print()
def get_sunrise_sunset_times():
    data = get_request(
        'https://api.sunrise-sunset.org/json?lat=37.983810&lng=23.727539&date=today&formatted=0'
    ).json()['results']
    return {
        'sunset': parse_utc_time_string(data['sunset']),
        'sunrise': parse_utc_time_string(data['sunrise'])
    }
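# Hedged sketch of what the `parse_utc_time_string` helper above is assumed to do: with
# `formatted=0` the sunrise-sunset.org API returns ISO 8601 strings such as
# "2015-05-21T05:05:35+00:00", which Python 3.7+ can parse directly. This implementation
# is an assumption for illustration, not necessarily the project's actual helper.
from datetime import datetime

def parse_utc_time_string(time_string: str) -> datetime:
    # fromisoformat understands the "+00:00" UTC offset used by the API
    return datetime.fromisoformat(time_string)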
def _get(self, path, data={}, key=None, admin=False):
    if not key:
        key = self.api_key if not admin else self.master_api_key
    data = data.copy()
    data['key'] = key
    if path.startswith("/api"):
        path = path[len("/api"):]
    url = "%s/%s" % (self.api_url, path)
    return get_request(url, params=data)
async def get_fresh_data(repository_url: str, excluded: Iterable) -> AsyncIterator[str]:
    """
    Retrieve a fresh batch of data from the repository.

    Parameters
    ----------
    repository_url: str
        URL for the repository (the zip file).

    excluded: Iterable
        Substrings of file paths that should be excluded from the results.

    Returns
    -------
    AsyncIterator[str]
        An async iterator of relative paths to the extracted files.
    """
    url = BASE_REPOSITORY + repository_url.lstrip(processor_settings.URL_SEPARATOR)

    # Requesting the latest files from the repository.
    async with Lock():
        response = get_request(url=url)
        logging.info(f"> Download request completed with "
                     f"status {response.status_code}: {repository_url}")

    if response.status_code != HTTPStatus.OK:
        raise RuntimeError(f"Failed to download the data from {url}: {response.text}")

    # `ZipFile` only understands files.
    data_bin = BytesIO(response.content)

    async with Lock():
        with ZipFile(data_bin, mode="r") as zip_obj:
            paths = zip_obj.namelist()
            # Extracting the contents into the temp directory.
            zip_obj.extractall(TEMP_DIR_PATH)

    logging.info("> Successfully extracted and stored the data")

    for path in paths:
        _, filename = split_path(path)

        if any(map(lambda p: p in path, excluded)):
            continue

        full_path = join_path(TEMP_DIR_PATH, path)

        # Discard directories
        if not isfile(full_path):
            continue

        logging.info(f"> Processing file '{path}'")
        yield path
def new(cls, url, timeout=5):
    response = get_request(url, timeout=timeout)
    response.raise_for_status()
    text = response.json()
    if not cls.is_unique(text):
        return cls.new(url, timeout)  # someday jokes will end
    return cls(text=text, user=current_user)
def get_data():
    execution_date = datetime.today().strftime('%Y-%m-%d')
    execution_date_minus_30 = (datetime.strptime(execution_date, '%Y-%m-%d')
                               - timedelta(days=30)).strftime('%Y-%m-%d')
    dates = [(datetime.strptime(execution_date, '%Y-%m-%d')
              - timedelta(days=day)).strftime('%Y-%m-%d') for day in range(30)]
    response: Response = get_request(
        f'{BASE_URL}/{HISTORY_ENDPOINT}?start_at={execution_date_minus_30}&end_at={execution_date}'
    )
    rates: Dict = response.json().get('rates')
    return rates
def recovery_flux_url(self, url):
    """
    Arguments:
        url: string containing the URL of the RSS feed
    Return:
        BeautifulSoup
    """
    req = get_request(url)
    data = req.text
    soup = BeautifulSoup(data, "lxml")
    return soup
def fetch_raw_data(url: str, encoding: str = 'windows-1251') -> Any:
    headers = default_headers()
    headers.update({
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
    })
    # Pass the headers as a keyword argument so requests treats them as HTTP headers,
    # not as query parameters.
    req = get_request(url, headers=headers)
    req.encoding = encoding
    return req
def main():
    audioData = open("audio.txt", "w+")
    monitor = PeakMonitor(SINK_NAME, METER_RATE)
    for sample in monitor:
        sample = sample >> DISPLAY_SCALE
        bar = '>' * sample
        spaces = ' ' * (MAX_SPACES - sample)
        #print ' %3d %s%s\r' % (sample, bar, spaces),
        sys.stdout.flush()
        spectrum = [0] * 6
        for i in range(6):
            try:
                spectrum[i] = monitor._samples.get(0)
            except:
                pass
        rgb = int((float(sum(spectrum)) / 400.0) * 100)
        rgb = ('#%02x%02x%02x' % (0, 0, rgb))[1:]
        print rgb
        get_request('http://192.168.1.13:8081/' + rgb)
def _get_html_from_url(url: str) -> str:
    response_body = None
    try:
        response: Response = get_request(url)
        # Fall back to an empty string so a missing Content-Type header does not raise.
        if 200 == response.status_code and response.headers.get('content-type', '').startswith('text/html'):
            if type(response.content) is bytes:
                response_body = response.content.decode('utf-8')
            else:
                response_body = response.content
    except RequestException:
        pass
    return response_body
def extractor(channel_id):
    extractor.proxies_tmp_list = []
    extractor.proxies = {""}
    html = get_request("https://t.me/s/" + channel_id).content
    soup = BeautifulSoup(html)
    links = soup.find_all("a")
    for tag in links:
        proxy = tag.get("href", None)
        # Require every marker of an MTProto proxy link to be present in the href.
        if proxy is not None and all(
                part in proxy for part in ("/proxy?", "&secret=", "&port=", "server=")):
            extractor.proxies.add(proxy)
    extractor.proxies.remove("")
def get(self, filename: str, offset: int = -1, maxlen: int = -1,
        headers: dict = None, cookies: dict = None):
    if not headers:
        headers = {}
    if not cookies:
        cookies = {}
    response = get_request(filename, headers=headers, cookies=cookies)
    ret = response.text
    if offset > 0:
        ret = ret[offset:]
    if maxlen > 0:
        ret = ret[:maxlen]
    return ret
def _get_best_matching_subfolder(url, expected_folder, filter_func=lambda x: True):
    f = get_request(url)
    soup = BeautifulSoup(f.text, 'html.parser')
    best_match = ''
    best_score = 0
    for a in soup.find_all('a'):
        folder = a.get("href")
        if not filter_func(folder):
            continue
        score = _matching_chars(folder, expected_folder)
        if score > best_score:
            best_match = folder
            best_score = score
    return best_match
def read_limb_flares(flare_list=DEFAULT_LIMB_FLARE_LOCATION):
    # load data
    if os.path.isfile(flare_list):
        with open(flare_list) as tsv:
            lines = tsv.read().split("\n")
    else:
        lines = get_request(flare_list).text.split("\n")

    flares = []
    for line in lines:
        # Sample row: 1, 2002 Mar 07, 17:50:44, C2.5, -961.5, -176.4, 21.4, 11.6, 5.86, 32.0, 10.1, 4.56, -0.4, 0.77, 12, ...
        cell = line.split("\t")
        if len(cell) > 2 and cell[0].isnumeric() and len(cell[1]) > 4:
            flares.append(
                datetime.strptime(cell[1] + "T" + cell[2], "%Y %b %dT%H:%M:%S"))
    return flares
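# Sanity check of the timestamp parsing above, using the date and time fields from the
# sample row in the comment:
from datetime import datetime
assert datetime.strptime("2002 Mar 07" + "T" + "17:50:44",
                         "%Y %b %dT%H:%M:%S") == datetime(2002, 3, 7, 17, 50, 44)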
def query_osm_results(query_box={'s': 0, 'n': 0, 'w': 0, 'e': 0}, filter_tag_or_tagval=''):
    '''
    Query OSM 'relation' data elements in a given bounding box.
    'relation' elements are used to organize multiple nodes or ways into a larger whole.
    Generates an OSM HTTP request and returns the response in JSON format, e.g.
    http://overpass-api.de/api/interpreter?data=[out:json];node(41.5,-122.0,41.7,-121.6);%3C;out%20meta;
    '''
    query = "node({s},{w},{n},{e});<;out meta;".format(s=query_box['s'],
                                                       n=query_box['n'],
                                                       w=query_box['w'],
                                                       e=query_box['e'])
    url_base = "http://overpass-api.de/api/interpreter?data=[out:json];"
    '''
    This call includes:
    - all nodes in the bounding box,
    - all ways that have such a node as member,
    - and all relations that have such a node or such a way as members.
    '''
    url = url_base + query
    try:
        sleep(0.5)
        req_start_time = time()
        osm_data = get_request(url).json()
        log.info(' Request took %.2f seconds' % (time() - req_start_time))
        if 'elements' in osm_data:
            if filter_tag_or_tagval != '':
                filtered_set = list()
                for itm in osm_data['elements']:
                    if 'tags' in itm:
                        if filter_tag_or_tagval in itm['tags']:
                            filtered_set.append(itm)
                        else:
                            for itag in itm['tags']:
                                if itm['tags'][itag] == filter_tag_or_tagval:
                                    filtered_set.append(itm)
                return filtered_set
            return osm_data['elements']
        else:
            log.warning('no elements found in query_box ' + str(query_box))
            return []
    except ValueError as e:
        log.warning('ValueError for query_box ' + str(query_box) + " | " + str(e))
        return []
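# Minimal usage sketch (values are illustrative, taken from the bounding box in the
# docstring above): fetch OSM elements between 41.5-41.7 N and -122.0--121.6 E and keep
# only those carrying an "amenity" tag or tag value.
if __name__ == '__main__':
    box = {'s': 41.5, 'n': 41.7, 'w': -122.0, 'e': -121.6}
    elements = query_osm_results(query_box=box, filter_tag_or_tagval='amenity')
    print('%d matching elements returned' % len(elements))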
def process_node(start_url, node, header_dict, forms, keywords, seen, add_next, counter):
    if counter.count >= counter.page_max:
        return

    # Make GET request to the current URL
    html_doc = get_request(node.url, header_dict)
    if html_doc is not None:
        print('Depth ' + str(node.depth) + ': Processing: ' + node.url)
        counter.count += 1
        parser = HTMLParser(html_doc)

        # Extract and add words from current page to the set of keywords
        keywords |= parser.extract_words()

        # If a login form is found, add it to the set
        form_found = parser.detect_login_form()
        if form_found:
            forms.add(node.url)

        # Add reachable URLs from current node if its depth < max depth
        if node.depth < counter.max_depth:
            # Retrieve set of URLs reachable from current node
            linked_urls = parser.extract_urls()
            for url in linked_urls:
                # Reformat if relative url given
                if 'http' not in url:
                    url = reformat_url(url, node.url)
                if url not in seen and url.startswith(start_url):
                    seen.add(url)
                    add_duplicate_url(url, seen)
                    # Traversal dependent function to add next node
                    add_next(url)
    else:
        print('4xx/5xx error at: ' + node.url)
def upload():
    image_found = False
    if request.method == 'POST' and request.files['photo']:
        extension = get_image_extension(request.files['photo'])
        print("IMAGE_STUFF: ", request.files['photo'])
        if extension not in ALLOWED_EXTENSIONS:
            flash(
                f"This image extension (.{extension}) is not supported. Upload {' '.join(ALLOWED_EXTENSIONS)} only.",
                "error")
            print(f"ERROR in upload(): The image extension is not supported.")
            return render_template('index.html', user_image=False)
        image = Image.open(request.files['photo'])
        image_found = True
    elif request.method == 'POST' and request.form['text']:
        link = request.form['text']
        extension = link.split('.')[-1].lower()
        if extension not in ALLOWED_EXTENSIONS:
            flash(f"Image URL must end with .png, .jpg or .jpeg", "error")
            print(f"ERROR in upload(): The image URL extension is not supported.")
            return redirect('/')
        response = get_request(link)
        image = Image.open(BytesIO(response.content))
        image_found = True

    user_image = None
    if image_found:
        user_image = detect_boxes(image)
    return render_template('index.html', user_image=user_image)
cli_parser.add_argument("--test", "-t", help="Send results to stdout", action="store_true") cli_parser.add_argument("--simple", "-s", help="Do not gather commute info", action="store_true") cli_args = cli_parser.parse_args() # sadly, https://ochdatabase.umd.edu/, doesn't have an API, but there is a degree of consistency to search queries and their matching URLs # the simplest way forward is to build a search manually and then copy/paste the URL below, as we have done url = "https://ochdatabase.umd.edu/housing/price-under+2100" page = get_request(url) soup = BeautifulSoup(page.content, "html.parser") search_results = soup.find(id="expo") postings = search_results.find_all( "article", class_=compile_regex(r"^ocp-property-search property-\d?.*")) parsed_posts = [] for post in postings: prop = collect_info(post, cli_args.test) if prop is not None: parsed_posts.append(prop) if not parsed_posts:
def resolve_rocket(self, info: ResolveInfo, rocketId):
    response = get_request('{}rockets/{}'.format(spacex_api_url, rocketId))
    return response.json()
if fd not in self.client_data:
    self.client_data[fd] = ''
if len(data) == 0:
    self.c_close(fd, 'empty string recv')
    return
self.client_data[fd] += data
response = ''
if len(self.client_data[fd]) > 0:
    request = self.client_data[fd]
    request = request.split('\r\n')
    method = request[0].split(' ')
    host = requests.get_host(request[1])
    if method[0] == 'GET':
        response = requests.get_request(method[1], self.hosts[host], self.media)
    elif method[0] not in self.methods:
        response = requests.bad_request()
    elif method[0] in self.methods:
        response = requests.not_implemented()
else:
    return
total_sent = 0
while total_sent < len(response):
    try:
        sent = self.clients[fd].send(response[total_sent:])
    except socket.error, (value, message):
        # Retry when the socket would block; otherwise close the connection.
        if value in (errno.EAGAIN, errno.EWOULDBLOCK):
            continue
        else:
            self.c_close(fd, 'send')
    # Advance past the bytes that were actually sent.
    total_sent += sent
from requests import get as get_request
from bs4 import BeautifulSoup as Soup
import json
import sys

from UKVotingMethods.wiki_scraper import get_constituency_results

# Get parties list
with open('./data/parties.json', encoding='utf-8') as file:
    parties = json.load(file)

# Get constituency list page
request = get_request(
    'https://en.wikipedia.org/wiki/Results_of_the_United_Kingdom_general_election,_2015_by_parliamentary_constituency'
)
request.raise_for_status()
soup = Soup(request.text, 'html.parser')

# Loop over table rows
constituencies = {}
table = soup.find('table', class_='wikitable')
for row in table.findChildren('tr'):
    # Skip header row
    if row.get('valign'):
        continue
    # Skip bottom rows
    if row.get('class'):
        continue
def latest_version(user: str, repo: str) -> str:
    from requests import get as get_request
    return get_request(
        f"https://api.github.com/repos/{user}/{repo}/releases/latest").json()["tag_name"]
async def get_request(url):
    data = requests.get_request(url)
    return url
from keras.layers import Dense, Concatenate, Flatten, UpSampling2D
from keras.models import Model
from os.path import join, exists
from os import makedirs
from requests import get as get_request

from src import logger

if __name__ == '__main__':
    logger.info("Start process")
    logger.info("Load definition model")
    model = make_yolov3_model()

    weight_path = join('extras', 'yolov3.weights')
    if not exists(weight_path):
        makedirs('extras', exist_ok=True)
        url = 'https://pjreddie.com/media/files/yolov3.weights'
        r = get_request(url)
        with open(weight_path, 'wb') as f:
            f.write(r.content)
        del r

    logger.info("Load weights")
    weight_reader = WeightReader(weight_path)
    weight_reader.load_weights(model)
    logger.info("End load weights")

    logger.info("Add new layer for project purpose")
    new_outputs = []
    up_sampling = 4
    for output in model.outputs:
        if up_sampling > 1:
            up_sampled_layer = UpSampling2D(up_sampling)(output)
        else:
            up_sampled_layer = output
def resolve_all_launches(self, info: ResolveInfo):
    response = get_request('{}launches/'.format(spacex_api_url))
    return response.json()