Example #1
def extract_player_urls(url):
    """
    Finding all the players urls from a nba team page

    Args:
        url (string): 

    Returns:
        list: all player urls
    """

    html = get_html(url)
    soup = BeautifulSoup(html, "html.parser")
    team_name = soup.find("h1", attrs={"id": "firstHeading"}).text
    team_name = team_name[8:-7]  # Strip the leading "2019–20 " and trailing " season" from the title
    soup_table = soup.find("table", attrs={"class": "toccolours"})
    soup_table_rows = soup_table.find_all("tr")[2].td.table.tbody.find_all(
        "tr")
    column_names = [
        row.find("td", attrs={"style": "text-align:left;"})
        for row in soup_table_rows
    ]
    column_names.remove(None)
    player_cells_html = [str(row.find("a")) for row in column_names]
    player_urls = filter_urls(html="\n".join(player_cells_html),
                              base_url="https://en.wikipedia.org")

    return player_urls
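
# Note: get_html (and filter_urls, used above) are helpers defined elsewhere and not
# shown in these examples. A minimal sketch of what get_html might look like, assuming
# it simply wraps requests.get and returns the raw HTML string; other examples below
# instead expect a response object or an (url, text) pair from it.
import requests

def get_html(url, params=None, **kwargs):
    """Hypothetical helper: fetch a page and return its raw HTML as a string."""
    response = requests.get(url, params=params, **kwargs)
    response.raise_for_status()
    return response.text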
Example #2
def extract_url(url):
    """
    Extracting team names from all the teams in semifinals

    Args:
        url (string): 

    Returns:
        team_names: list - of all teams
        team_urls: list - of each teams url
    """
    html = get_html(url)
    soup = BeautifulSoup(html, "html.parser")
    soup_table = soup.find(
        "table", attrs={"style": "font-size: 90%; margin:1em 2em 1em 1em;"})
    soup_table_rows = soup_table.find_all("tr")
    soup_table_rows = soup_table_rows[4:]
    soup_table_rows = soup_table_rows[::12] + soup_table_rows[2::12]
    team_cells_html = [str(row.find("a")) for row in soup_table_rows]
    team_names = [row.find("a").text for row in soup_table_rows]
    team_names = [
        name if len(name.split()) == 1 else name.split()[-1]
        for name in team_names
    ]
    team_urls = filter_urls(html="\n".join(team_cells_html),
                            base_url="https://en.wikipedia.org")
    return team_names, team_urls
Example #3
def extract_events(url):
    """
    Finding season table and making .md file   
 
    Args:
        url (string): 
    """

    html = get_html(url)
    soup = BeautifulSoup(html, "html.parser")
    table = soup.find('table', attrs={"class": 'wikitable'})
    new_table = table_to_2d(table)

    discipline = {
        'DH': 'Downhill',
        'SL': 'Slalom',
        'GS': 'Giant Slalom',
        'SG': 'Super Giant Slalom',
        'AC': 'Alpine Combined',
        'PG': 'Parallel Giant Slalom'
    }
    f = open('betting_slip_empty.md', 'w')

    f.write('*date*|*venue*|*discipline*|*Who wins?*\n')
    f.write(':-----|:-----:|:-----:|-----:')
    f.write('\n')
    for row in new_table[1:-1]:
        try:
            f.write(
                f"{row[2].strip()}|{row[3].strip()}|{discipline[row[4][:2]]}|\n"
            )
        except (AttributeError, IndexError, KeyError):
            # Skip rows that do not contain a full date/venue/discipline entry
            pass
    f.close()
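
# Quick, self-contained check of the discipline lookup used above: the first two
# characters of the event cell (a made-up "SL165" here) select the full name.
discipline_check = {'DH': 'Downhill', 'SL': 'Slalom', 'GS': 'Giant Slalom'}
print(discipline_check['SL165'[:2]])  # Slalom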
Example #4
def team_players(team_url):
    """ Function gets all player on a team.

    :param team_url (list): Team info [name, relative url].

    :return: All players on team.
    """

    # Get html
    html_content = get_html('https://en.wikipedia.org{}'.format(team_url[1]))

    document = BeautifulSoup(
        html_content[1], 'html.parser'
    )  # Parse html content at index 1 (url on index 0) from get_html()
    player_table = document.find_all("table", class_='toccolours')[0].find(
        "table")  # Gets inner table containing players of 2019-20 team table

    rows = player_table.find_all("tr")

    players = []

    # Go through each row except first (th-tags)
    for row in rows[1:]:
        cells = row.find_all("td")
        player_name = cells[2].get_text(strip=True)
        player_url = cells[2].a['href']

        player = [player_name, player_url]

        players.append(player)

    return players
Example #5
def find_articles(url, output=None):
    """
    Finds all Wikipedia article links within a Wikipedia page.
    Args:
        url (string): The URL of the wikipedia page to fetch
        [output] (string): Optional filename to write urls to
    Returns:
        list: List of Wikipedia article URLs
    """
    html = get_html(url)
    urls = find_urls(html, url)
    # Article test, URL must either be relative or have a wikipedia URL
    is_wiki_url = re.compile('(?:^|wikipedia.org)/wiki/')
    # Check if the URL is a namespace (contains a colon)
    is_namespace = re.compile('https://[^:]*:')
    # Create a list containing only URLs that pass the article test and are not namespaces
    article_urls = [
        url for url in urls
        if is_wiki_url.search(url) and not is_namespace.search(url)
    ]

    if output:
        write_to_file(output, article_urls)

    return article_urls
Example #6
def filter_urls(html, base_url, output = False):
    """
    uses regex to identify valid urls in HTML code
    args:
        html (file): file with HTML code
        base_url (string, optional): url from which html argument is fetched
        output (string, optional): name of output file urls are written to
    returns:
        urls (list): List of valid urls in string format
    """
    if html is None and base_url is not None:  # only website argument passed
        try:
            html = get_html(base_url)
        except TypeError:
            print("could not fetch html from base_url argument.")


    #regular_links = re.findall(r"<a\s+href=\"(https:\/\/.\w+.\w+.\w{2,3}[\/\w+]*)", html) Does not work
    regular_links = re.findall("(?=<a).*href=\"([h|\/]{1}[^\"#]*)", html)
    for ind, url in enumerate(regular_links):
        if url[0] == '/' and url[1] != '/':
            regular_links[ind] = base_url + url
        elif url[0] == '/':
            regular_links[ind] = "https:" + url

    if output:
        # write url string to file
        with open(output, 'w') as f:
            for elem in regular_links:
                f.write(f"{elem}\n")
    return regular_links
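
# Tiny usage sketch of filter_urls on made-up HTML: relative links get base_url
# prepended, protocol-relative links get "https:", and fragments are dropped.
sample_html = ('<a href="/wiki/NBA">NBA</a>\n'
               '<a href="//upload.wikimedia.org/logo.png">logo</a>')
print(filter_urls(sample_html, base_url="https://en.wikipedia.org"))
# ['https://en.wikipedia.org/wiki/NBA', 'https://upload.wikimedia.org/logo.png']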
def get_players(team_url, team_name, limit=None):
    """
    Gets a list of Players for an NBA team, and each of their stats
    Args:
        team_url (string): The url for the team's wikipedia page
        team_name (string): The name of the team. This is assigned to each Player
        [limit] (int): Limits the list of Players to the top [limit] players by points per game
    Returns:
        list: A list of Players for the team
    """
    # Find the table for the Roster
    html = get_html("https://en.wikipedia.org" + team_url)
    soup = BeautifulSoup(html, "html.parser")
    soup_header = soup.find("span", {"id": "Roster"})
    soup_table = soup_header.findNext("tbody").findNext("tbody")

    players = []
    for row in soup_table.findAll("tr"):
        cells = row.findAll("td")
        # Row must contain at least 3 cells for a player name cell to exist
        if len(cells) > 2:
            a = cells[2].find("a")
            # Create a Player-object with the name and url of the player
            player = Player(a["title"], team_name, a["href"])
            get_player_stats(player)
            players.append(player)
    # Limit the list to top [limit] players by points per game if the argument is given
    if limit:
        players.sort(reverse=True)
        players = players[:limit]
    return players
Example #8
def find_articles(url, params=None, output=None, **kwargs):
    """
    Find Wikipedia article URLs linked in a Wikipedia webpage.

    Requests data from the Wikipedia page of choice, finds all
    anchored hyperlinks and keeps only Wikipedia article URLs.

    Arguments
    ---------
    url : str
        URL of (Wikipedia) webpage to get
    params : dict, optional, default None
        Data to send in the URL's query string
    output : str, optional, default None
        Optional output filename to store the found article URLs. The filename
        can be given without an extension, as .txt is added regardless of
        whether it is specified or not.
    **kwargs
        Arbitrary keyword arguments are passed along to requests.get
        in get_html()

    Returns
    -------
    matches : list
        Found URLs

    Raises
    ------
    TypeError : if 'output' is not str
    """
    # response object
    r = get_html(url, params=params, **kwargs)

    # define regex pattern for base URL
    base_url_pattern = r'^.+?[^\/:](?=[?\/]|$)'
    # extract base URL from URL
    base_url = re.search(base_url_pattern, url).group(0)

    # find all anchored hyperlinks from raw HTML
    all_urls_list = find_urls(r.text, base_url=base_url)

    # define regex pattern for wikipedia articles only
    article_pattern = re.compile(r'^https?://(?!.*:[^_]).*wiki.*')
    # search for pattern match in list
    matches = list(filter(article_pattern.match, all_urls_list))

    if output is not None:
        if not isinstance(output, str):
            raise TypeError("'output' must be str")

        filename, file_extension = os.path.splitext(output)
        if file_extension != ".txt":
            file_extension = ".txt"

        filename = filename + file_extension
        with open(filename, "w") as f:
            for line in matches:
                f.write(line + "\n")

    return matches
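
# Quick check of the base-URL pattern used above, on the kind of URL these examples
# pass in: it keeps everything up to (but not including) the first path separator
# after the host.
import re
url = "https://en.wikipedia.org/wiki/Nobel_Prize"
print(re.search(r'^.+?[^\/:](?=[?\/]|$)', url).group(0))
# https://en.wikipedia.org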
def wiki_thread(goal,
                article,
                queue,
                visited,
                sub_articles,
                keywords,
                sleeptime=0.01):
    """
    Checks all sub articles from links in an article, grants them scores, and puts them in the queue
    Args:
        goal (string): The article it is trying to find a path to
        article (Article): The article the links were fetched from
        queue (ArticleList): The queue to insert sub articles into
        visited (ArticleList): The list of previously visited articles
        sub_articles (ArticleList): The list of articles to be checked
        keywords (list): List of keywords to score articles by
        sleeptime (float): Time to sleep between each article
    """
    title_exp = re.compile("<title>(.+) -.+</title>")

    while True:
        time.sleep(sleeptime)  # Slight delay to avoid denied responses
        l = sub_articles.next_article()
        # Stop when article list is empty
        if not l:
            break
        # Only search if link has not been visited already
        if not visited.contains(l):
            visited.append(l)  # Add to visited list
            if l == goal:
                # The correct link was found!
                print("Done!")
                sub_articles.clear()
                return Article(l, article).path
            elif "en.w" in l and "/Main_Page" not in l:
                # Only check english wiki links and do not go to the Main Page. No cheating!

                # Tries to get the html repeatedly until the request is accepted.
                # In case a request is denied.
                html = None
                while not html:
                    try:
                        html = get_html(l)
                    except Exception:
                        # Request failed or was denied; keep retrying
                        pass

                content = html.lower()
                item = Article(l, article)
                # Check for keywords in the HTML, to grant the Article a score
                for i in range(len(keywords)):
                    for k in keywords[i]:
                        if k in content:
                            # Grant points if keywords are in the HTML
                            item.score += [1, 10, 50][i]
                        if k.replace(" ", "_") in l.lower():
                            # Grant more points if keywords are in the URL
                            item.score += [5, 30, 100][i]
                # Insert Article into search queue in accordance to its score
                inserted = queue.insert(item)
Example #10
def get_player_points(player_url):
    """ Function that gets player stats for Rebounds Per Game, Blocks Per Game, and Points Per Game, for last season.

    :param player_url (list): Player info [name, relative url].

    :return: Player stats (rpg, bpg, and ppg) for last season.
    """

    # Get html
    html_content = get_html('https://en.wikipedia.org{}'.format(player_url[1]))

    # Get table of NBA regular season
    document = BeautifulSoup(html_content[1], 'html.parser')
    regular_season_section = document.find(
        'span', id='Regular_season')  # Find section for 'Regular season'
    if regular_season_section is None:  # Some player sites do not include span with id 'Regular_season'
        regular_season_section = document.find(
            'span', id='NBA')  # Instead, find section for 'NBA'

        if regular_season_section is None:  # Return if no section for a season in NBA was found
            return [0, 0, 0]  # Return zero list

    table = regular_season_section.find_next(
        'table'
    ).tbody  # Get the tbody section of table, where the player seasons are

    rows = table.find_all("tr")

    last_season = ''

    # Go through each season
    for row in rows:

        cells = row.find_all(["td", "th"])
        cells_text = [cell.get_text(strip=True) for cell in cells]

        # Get year 2019-20
        if re.match(
                r'2019[^\w]20', str(cells_text[0])
        ):  # The cell uses an en dash ("2019–20"), so match either kind of dash with a regex
            last_season = cells_text  # Save season 2019-20
            break

    # Get the player's rpg, bpg, and ppg for the last season (2019-20), or zeros if no stats are present.
    if last_season != '':
        rpg = last_season[8]
        bpg = last_season[11]
        ppg = last_season[12]
        player_stats = [rpg, bpg, ppg]
    else:
        player_stats = [0, 0, 0]

    return player_stats
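
# Quick check of the season-matching detail noted above: the Wikipedia cell uses an
# en dash ("2019–20"), so a plain string comparison with "2019-20" fails, while the
# [^\w] class in the regex accepts either kind of dash.
import re
print("2019–20" == "2019-20")                       # False (en dash vs hyphen)
print(bool(re.match(r'2019[^\w]20', "2019–20")))    # True
print(bool(re.match(r'2019[^\w]20', "2019-20")))    # True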
Example #11
def _get_tables(url):
    """Gets the 'wikitable plainrowheaders'-tables from URL.

    Args:
        url (str): The URL to parse for 'wikitable plainrowheaders'-tables.

    Returns:
        bs4.element.ResultSet: The tables found in the URL.
    """
    html = get_html(url).text
    soup = BeautifulSoup(html, "html.parser")

    return soup.find_all("table", class_="wikitable plainrowheaders")
def test():
    """
    Makes a set of calls to find_dates() with parameters given in an array of test arguments, 'tests'.
    """
    tests = [['https://en.wikipedia.org/wiki/Linus_Pauling', 'Linus_Pauling_output.txt'],
            ['https://en.wikipedia.org/wiki/Rafael_Nadal', 'Rafael_Nadal_output.txt'],
            ['https://en.wikipedia.org/wiki/J._K._Rowling', 'J._K._Rowling_output.txt'],
            ['https://en.wikipedia.org/wiki/Richard_Feynman', 'Richard_Feynman_output.txt'],
            ['https://en.wikipedia.org/wiki/Hans_Rosling', 'Hans_Rosling_output.txt']]

    for test in tests:
        html = requesting_urls.get_html(test[0]).text
        output = test[1]
        find_dates(html, output=output)
Example #13
def find_player(url):
    """
    Not finished
    """
    r = get_html(url)
    soup = BeautifulSoup(r.text, 'lxml')

    identifier = {"class": "wikitable sortable jquery-tablesorter"}
    table = soup.find("table", identifier)

    rows = table.find_all("tr")[1:]

    for row in rows:
        cells = row.find_all("td")
        if cells[0].text.strip() == "2019–20":
            print(True)
Example #14
def _extract_table(url, title):
    """Extracts the first table after title in HTML.

    Args:
        url (str): The URL to request and get table from.
        title (str): The title of the table.

    Returns:
        bs4.element.Tag: The first table after title. If not found, None is returned.
    """

    html = get_html(url).text
    soup = BeautifulSoup(html, "lxml")

    soup_title = soup.find(id=title)

    if soup_title is None:
        return None
    return soup_title.find_all_next("table")[0]
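
# Usage sketch: grab the first table following the "Bracket" heading on the 2020 NBA
# playoffs page (the same element id is used elsewhere in these examples).
bracket = _extract_table("https://en.wikipedia.org/wiki/2020_NBA_playoffs", "Bracket")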
Example #15
def find_articles(url, params=None, output=None, saveToFolder='filter_urls'):
    '''
    Receives a url and returns a list of all urls to Wikipedia articles found on the page.
    If argument 'params' is specified, the parameters are passed in the get request.
    If argument 'output' is specified, lists of all urls and Wikipedia articles found are saved to a text file with the specified name (e.g. output.txt).
    If the specified output filename already exists, its content is overwritten.
    The output folder can be specified with saveToFolder, but defaults to filter_urls.
    If no output argument is given, the list of Wikipedia articles is simply returned.

    Args:
        url:            The url from which the html is retrieved.
        params:         (Optional) Parameters to pass to the get function. Defaults to None.
        output:         (Optional) The filename (format as 'output.txt') for where to save the url lists. Defaults to None.
        saveToFolder:   (Optional) The folder for where to save the output file. Defaults to 'filter_urls'
    Returns:
        <Array<String>> wiki_urls:  A list of the urls found of Wikipedia articles.
    '''
    # Call requesting_urls.get_html() with url and optional params. Save response.text to variable html, and pass to find_urls to get a list of all urls.
    response = requesting_urls.get_html(url, params=params)
    html = response.text
    all_urls = find_urls(url, html)

    # Use regex to find all wikipedia articles in the list of all urls (join iterable all_urls to one string with separator |)
    regex_wikiarticles = r"(https*://[\w]+.wikipedia.org/[^|:]*)"
    wiki_urls = re.findall(regex_wikiarticles,
                           '|'.join(all_urls),
                           flags=re.VERBOSE)

    # Save output to file if output is specified.
    if output is not None:
        with open(f'./{saveToFolder}/{output}', 'w', encoding='utf-8') as f:
            f.write('ALL URLS:\n')
            for url in all_urls:
                f.write(f'{url}\n')
            f.write('\nWIKIPEDIA ARTICLES:\n')
            for url in wiki_urls:
                f.write(f'{url}\n')

    # Always return the list of wikipedia articles
    return wiki_urls
Example #16
def shortest_path(start, end):
    """ Function to find shortest path between two url's using BFS (Breadth First Search).

    :param start (str): The url to start from
    :param end (str):   The target url

    :return: Path of urls.
    """

    path = {}
    path[start] = [start]
    Q = deque([start])  # Double ended queue of pages to visit.

    while len(Q) != 0:

        page = Q.popleft()  # Check next page to visit

        print(page)

        html_content = get_html(page)  # Get html content
        links = find_urls(
            html_content[1], base_url=html_content[0]
        )  # First get all the links in page, html_content[0] is the base url
        articles = find_articles(links, language='en')  # Then get all articles

        # print(articles)

        for article in articles:  # Go through every article link on page

            if article == end:  # Check if article is destination
                return path[page] + [article]  # Done!

            if (article not in path) and (
                    article != page
            ):  # Check that the article is not already in path and is not the current page
                path[article] = path[page] + [article]
                Q.append(article)

    return None  # Return None if all links (articles) were checked without reaching the target
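
# Self-contained toy check of the same BFS path bookkeeping on a small made-up graph,
# so the path reconstruction can be verified without hitting Wikipedia.
from collections import deque

toy_graph = {"A": ["B", "C"], "B": ["D"], "C": ["D"], "D": []}

def toy_shortest_path(start, end):
    path = {start: [start]}
    queue = deque([start])
    while queue:
        page = queue.popleft()
        for nxt in toy_graph[page]:
            if nxt == end:
                return path[page] + [nxt]
            if nxt not in path:
                path[nxt] = path[page] + [nxt]
                queue.append(nxt)
    return None

print(toy_shortest_path("A", "D"))  # ['A', 'B', 'D']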
Example #17
def find_team(url):
    """
    Extract team player URLs from HTML table.

    Arguments
    ---------
    url : str
        URL of webpage to get

    Returns
    -------
    links : list
        Found URLs of team players
    """
    r = get_html(url)
    soup = BeautifulSoup(r.text, 'lxml')
    base_url = "https://en.wikipedia.org"

    identifier = {"class": "toccolours"}
    outer_table = soup.find("table", identifier)

    table = outer_table.find("table")

    rows = table.find_all("tr")[1:]
    links = []
    for row in rows:
        cells = row.find_all("td")
        name = cells[2]
        # print(name)
        link = name.find('a')
        links.append(link.get('href'))

    # concatenate base url to relative URLs
    for i, link in enumerate(links):
        if link.startswith('/'):
            links[i] = base_url + link

    return links
def get_teams(url):
    """
    Fetches a list of NBA teams in the semifinals from the NBA playoffs Wikipedia page
    Args:
        url (string): The URL of the Wikipedia page
    Returns:
        list: A list of the teams in the semifinals, where each item is a list of [team_url, team_name]
    """
    html = get_html(url)

    # Find the table containing the bracket
    soup = BeautifulSoup(html, "html.parser")
    soup_table = soup.find("table", {"border": "0"})

    # All winners are in bold text, and reaching the semifinals requires one series win,
    # so finding all bold names in the table finds all semifinal teams.
    # Put them in a set to remove duplicates
    teams = set(
        re.findall('<b>[^<]*<a href="([^"]*)"[^>]*>([^<]*)', str(soup_table)))

    # Remove results with "Conference" as these are noe actually teams in the table
    teams = [t for t in teams if not re.search("Conference", t[1])]
    return teams
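
# Quick check of the bold-anchor pattern used above on a made-up snippet of bracket
# markup: it captures (href, team name) pairs only for bold (winning) entries.
import re
sample = ('<td><b><a href="/wiki/Los_Angeles_Lakers">Lakers</a></b></td>'
          '<td><a href="/wiki/Portland_Trail_Blazers">Trail Blazers</a></td>')
print(re.findall('<b>[^<]*<a href="([^"]*)"[^>]*>([^<]*)', sample))
# [('/wiki/Los_Angeles_Lakers', 'Lakers')]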
Example #19
def extract_player_points(url, team):
    """
    Finding the players points for 2019-20.

    Args:
        url (string): nba player wikipedia site
        team (string): team of player. Used when players has played in two clubs that season.

    Returns:
        list: a list with the players name and points for that season
    """
    html = get_html(url)
    soup = BeautifulSoup(html, "html.parser")
    name = soup.find("h1", attrs={"class": "firstHeading"}).text

    try:
        for soup_table in soup.find_all("table",
                                        attrs={"class": "wikitable sortable"}):
            try:
                for row in soup_table.tbody.find_all("tr")[1:]:
                    try:
                        if str(row.find("td").a.text).rstrip() == "2019–20":
                            elms = row.find_all("td")

                            if team in elms[1].text.rstrip():
                                return [
                                    name, elms[1].text.rstrip(),
                                    float(elms[-1].text.replace("*", "")),
                                    float(elms[-2].text),
                                    float(elms[-5].text)
                                ]
                    except Exception:
                        pass
            except Exception:
                pass
    except Exception:
        pass
def get_player_stats(player):
    """
    Fetches the points/blocks/rebounds per game stats for a given Player
    Args:
        player: Player-object to fetch stats for and to write them back to
    """
    print("Fetching stats for", player.name)

    # Find the correct header for regular season stats
    html = get_html("https://en.wikipedia.org" + player.url)
    soup = BeautifulSoup(html, "html.parser")
    soup_header = soup.find("span", {"id": "Regular_season"})

    # If the header was not found, assume no stats for the season
    if soup_header:
        soup_table = soup_header.findNext("tbody")
        for row in soup_table.findAll("tr"):
            cells = row.findAll("td")
            # Row must have appropriate number of cells, and match the correct season date
            if len(cells) >= 13 and re.match("2019.20", cells[0].get_text()):
                # Extract the stats from each cell
                player.ppg = safe_float(cells[12].get_text())
                player.bpg = safe_float(cells[11].get_text())
                player.rpg = safe_float(cells[8].get_text())
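
# safe_float is another helper that is not shown in these examples. A minimal sketch
# of what it might look like (an assumption): fall back to 0.0 when the cell text is
# not a clean number.
def safe_float(text):
    try:
        return float(text.replace("*", "").strip())
    except ValueError:
        return 0.0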
Example #21
    urls = find_urls(html, base)

    # Remove URLs that are not normal Wikipedia articles (no special namespace articles or files)
    pattern_wiki = fr".*{language}\.wikipedia\.org/[^:]*$"
    articles = [url for url in urls if re.match(pattern_wiki, url)]

    if output is not None:
        with open(f"./filter_urls/{output}.txt", "w") as file:
            file.write(f"find_urls found {len(urls)} URLs\n")
            file.write("\n".join(urls))
            file.write("")
            file.write(f"\n\nfind_articles found {len(articles)} articles\n")
            file.write("\n".join(articles))

    return articles


if __name__ == "__main__":
    
    urls = ["https://en.wikipedia.org/wiki/Nobel_Prize",
            "https://en.wikipedia.org/wiki/Bundesliga",
            "https://en.wikipedia.org/wiki/2019%E2%80%9320_FIS_Alpine_Ski_World_Cup"]

    outputs = ["Nobel_Prize", "Bundesliga", "World Cup"]

    for url, output in zip(urls, outputs):
        html = get_html(url).text

        find_articles(html, base="https://en.wikipedia.org", output=output)
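
# Quick check of the article filter used above with language = "en": plain article
# URLs match, while namespace URLs (containing a colon after the domain) do not.
import re
pattern = r".*en\.wikipedia\.org/[^:]*$"
print(bool(re.match(pattern, "https://en.wikipedia.org/wiki/Nobel_Prize")))      # True
print(bool(re.match(pattern, "https://en.wikipedia.org/wiki/Category:Physics"))) # False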
Example #22
    identifier = {"class": "wikitable sortable jquery-tablesorter"}
    table = soup.find("table", identifier)

    rows = table.find_all("tr")[1:]

    for row in rows:
        cells = row.find_all("td")
        if cells[0].text.strip() == "2019–20":
            print(True)


if __name__ == "__main__":
    url = "https://en.wikipedia.org/wiki/2020_NBA_playoffs"
    base_url = "https://en.wikipedia.org"
    r = get_html(url)
    soup = BeautifulSoup(r.text, 'lxml')

    start_loc = soup.find(id="Bracket")
    table = start_loc.find_next("table")

    links = extract_url(table, base_url)

    print("SEMINFINAL LINKS:")
    print("")
    for link in links:
        print(link)
    print("")
    print("TEAM PLAYERS:")
    print("")
    for link in links:
def extract_events(url, createSlip=True):
    '''
    Takes a url and extracts data (date, venue, type) from the main table.
    If createSlip is True (default), creates an empty betting slip, saved to datetime_filter/betting_slip_empty.md.
    Returns the data as an array of (date, venue, type) arrays.

    Args:
        url:        The url of the site to extract events from.
        createSlip: (Optional) Whether to create a betting slip with the extracted data. Defaults to True.
    Returns:
        Array<Array<String>> data:  The extracted data, an array of (date, venue, type) arrays, one per row.
    '''
    response = requesting_urls.get_html(url)
    document = BeautifulSoup(response.content, "lxml")
    table = document.find('table', {"class": 'wikitable plainrowheaders'})
    rows = table.find_all("tr")

    # Remove filler rows (removing from the list while iterating over it would skip rows)
    rows = [row for row in rows if len(row.find_all(["td"])) >= 2]

    # Array to keep (date, venue, type) values for each row
    data = []

    # Extract date, venue and type for each row
    for i in range(0, len(rows)):
        cells = rows[i].find_all(["td"])
        cells = cells[0:5]  # Limit to first 5 data cells

        cells_text = [cell.get_text(strip=True)
                      for cell in cells]  # Strip for html tags

        # Use regex to filter out the first cells until date, this being '#' 'event' and empty cells. Makes date the first cell
        while re.search(r"^[\d]*$",
                        cells_text[0],
                        flags=re.VERBOSE | re.MULTILINE):
            cells_text.pop(0)

        # Use regex to filter out all cells after type. Makes type the last cell
        for j in range(len(cells_text)):
            if re.search(r"[A-Z]{2}[\dcnx]{3}", cells_text[j]):
                cells_text = cells_text[:j + 1]
                break

        # Check length of cells_text. If 2, venue is missing and is set equal to the previous row's venue
        if (len(cells_text) == 2 and i != 1):
            venue = data[i - 1][1]
            cells_text.insert(1, venue)

        # Add to data
        data.append(cells_text)

    # Format data
    for cells in data:
        (date, venue, type) = cells

        # Format date (extract only date string as D(D) Month YYYY)
        date = re.findall(r"([\d]{1,2} [\w]+ [\d]{4})", date, flags=0)[0]
        cells[0] = date

        # Format type (remove numbers after event key)
        type = re.sub(r"([\dcnx]{3})", '', type)
        cells[2] = type

    # Call createBettingSlip to create empty betting slip if createSlip==True
    if createSlip:
        createBettingSlip(data)

    # Always return data
    return data
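
# Quick checks of the two formatting regexes used above, on made-up cell text: the
# first pulls out the "D(D) Month YYYY" date, the second strips the digits that
# follow the event key.
import re
print(re.findall(r"([\d]{1,2} [\w]+ [\d]{4})", "Sunday 18 October 2020")[0])  # 18 October 2020
print(re.sub(r"([\dcnx]{3})", '', 'GS165'))                                   # GS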
Example #24
def write_to_file(output, urls):
    """
    Writes urls to file
    Args:
        output (string): filename to write to
        urls (list): list of urls to be written to file
    """
    f = open(output, "w")
    for i in urls:
        f.write(f"{i}\n")
    f.close()


if __name__ == "__main__":
    find_urls(get_html("https://en.wikipedia.org/wiki/Nobel_Prize"),
              "https://en.wikipedia.org/wiki/Nobel_Prize",
              "filter_urls/Nobel_Prize_urls.txt")
    find_urls(get_html("https://en.wikipedia.org/wiki/Bundesliga"),
              "https://en.wikipedia.org/wiki/Bundesliga",
              "filter_urls/Bundesliga_urls.txt")
    find_urls(
        get_html(
            "https://en.wikipedia.org/wiki/2019%E2%80%9320_FIS_Alpine_Ski_World_Cup"
        ),
        "https://en.wikipedia.org/wiki/2019%E2%80%9320_FIS_Alpine_Ski_World_Cup",
        "filter_urls/2019-20_FIS_Alpine_Ski_World_Cup_urls.txt")

    find_articles("https://en.wikipedia.org/wiki/Nobel_Prize",
                  "filter_urls/Nobel_Prize_articles.txt")
    find_articles("https://en.wikipedia.org/wiki/Bundesliga",
def wiki_race(start, goal, output=None, greed=2, threads=4, sleeptime=0.01):
    """
    Finds a short path between two Wikipedia pages.
    It usually finds a short path, but may not always find the shortest, as it can grow quite greedy.
    Tried BFS, that didn't terminate and Wikipedia started denying my requests, so here we are!
    Ranks all articles by their keyword scores, and goes to the most promising articles first.
    If no articles look promising, it performs regular BFS.
    Args:
        start (string): The starting Wikipedia Article to find the path from
        goal (string): The Wikipedia article to find the path to
        [output] (string): Optional output file to write the path to
        [greed] (int): Specifies which keywords to consider when scoring articles
            0: No keywords are considered, pure BFS (THIS WILL TAKE FOREVER)
            1: High Priority keywords only, this will probably take a long time
            2: Mid-High Priority, this usually finds the best path (DEFAULT)
            3: All keywords. Rarely finds the best path, but does find a path very quickly
        [threads] (int): The number of threads; too many might make Wikipedia deny requests eventually
            Default: 4, this works well as long as the algorithm isn't called excessively
        [sleeptime] (float): The time each thread sleeps before moving to next thread, to avoid denied requests
    Returns:
        list: A list containing the path from the start to the goal
    """

    goal_html = get_html(goal)

    # All the articles contents are in <p> objects
    goal_points = re.findall('<p>.*', goal_html)

    # Concatenate the content
    goal_content = ""
    for p in goal_points:
        goal_content += p
        goal_content += "\n"

    # Find (possibly) important keywords in the articles
    keywords = [
        # Low Priority, any single word from Mid and High priority
        [],
        # Mid Priority, the titles of any hyperlinks in the article
        re.findall('title="([^"#]*)"', goal_content),
        # High Priority, text that is in bold, as well as the title of the article
        re.findall('<b>([^<]*)</b>', goal_content) +
        [goal[goal.rindex("/"):].replace("_", " ")]
    ]

    # Remove special characters from keywords
    for i in range(len(keywords[1])):
        keywords[1][i] = (''.join(e for e in keywords[1][i]
                                  if e.isalnum() or e == " ")).lower()
    for i in range(len(keywords[2])):
        keywords[2][i] = (''.join(e for e in keywords[2][i]
                                  if e.isalnum() or e == " ")).lower()

    # Put all words in Mid and High priority in Low priority list
    for k in keywords[1] + keywords[2]:
        keywords[0] += k.split(' ')

    # Empty the keyword lists based on the greed argument
    if greed <= 0:
        keywords[2] = []
    if greed <= 1:
        keywords[1] = []
    if greed <= 2:
        keywords[0] = []

    # Remove short keywords to avoid words like "for", "of", etc.
    for i in range(len(keywords)):
        keywords[i] = [s for s in keywords[i] if len(s) > 3]

    # Print out the keywords
    print("LOW PRIORITY")
    for i in keywords[0]:
        print(i)
    print("\nMID PRIORITY")
    for i in keywords[1]:
        print(i)
    print("\nHIGH PRIORITY")
    for i in keywords[2]:
        print(i)
    print("\nSCANNING ARTICLES")

    # Create the thread-safe queue and visited lists
    queue = ArticleList([Article(start)])
    visited = ArticleList([start])

    while True:
        article = queue.next_article()
        # Stop when no articles remain in the queue
        if not article:
            break

        # Find all links in the article, and create a thread-safe list
        links = ArticleList(find_articles(article.url))

        # Print the score, url and number of sub-articles as a progress update
        print(f"{article.score}: {article.url} ({len(links)} sub-articles)")

        # Make 'threads' threads that check all the sub-articles for keywords. Any more than 4, and Wikipedia starts denying requests at some point
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [
                executor.submit(wiki_thread, goal, article, queue, visited,
                                links, keywords) for i in range(threads)
            ]

        # Check if any thread reported a valid path
        results = [f.result() for f in futures]
        for r in results:
            if r:
                if output:
                    f = open(output, "w")
                    for i in r:
                        f.write(f"{i}\n")
                    f.close()
                return r
Example #26
def extract_events(url, output='betting_slip_empty'):
    """ Function to get ski events, and create an betting slip for all the races.

     Regex used in this function:
        ([0123]?[0-9] [A-Z][a-z]{2,8} [0-9]{4})         Find dates in DMY format.
                                                        DMY format: 13 October 2020

        ([DSGAP][HLSGC])                                Type of race.
                                                        Types: GS | SL | DH | SG | PG | AC




    :param url (str):       The url to send get-request to.
    :param output (str):    Name of output txt file.
    """

    html_content = get_html(url)

    soup = BeautifulSoup(
        html_content[1],
        "html.parser")  # html content is as index 1 (url on index 0)
    soup_table = soup.find('table', {"class": 'wikitable plainrowheaders'})
    soup_table_rows = soup_table.findAll('tr')

    events = []

    dmy_regex = '([0123]?[0-9] [A-Z][a-z]{2,8} [0-9]{4})'
    type_regex = '([DSGAP][HLSGC])'

    venue = ""

    # Search each row in table (tr)
    for i in range(len(soup_table_rows)):
        soup_table_row = soup_table_rows[i].findAll(
            'td')  # gets rows with relevant data, e.g. td-tags

        # Search each cell in row (td)
        for j in range(len(soup_table_row)):

            cell = soup_table_row[j]

            # Check if current cell is a date
            is_date = re.search(dmy_regex, cell.getText())

            if is_date is not None:
                event = []
                event.append(is_date.group(0))

                next_cell = soup_table_row[j + 1]

                # Check if next cell (after date) is type (if previous rows is using rowspan > 1)
                is_type = re.match(type_regex, next_cell.getText())

                if is_type is None:

                    # Get a-element that contains Venue
                    a_element = next_cell.findAll('a', recursive=False)

                    venue = a_element[0].text
                    type = re.match(type_regex,
                                    soup_table_row[j + 2].getText()).group(0)

                    event.append(venue)
                    event.append(type)

                else:

                    # Set Venue from last row, that spans multiple rows
                    event.append(venue)
                    event.append(is_type.group(0))

                events.append(event)

    # Create betting slip
    if not os.path.exists('datetime_filter'):
        os.makedirs('datetime_filter')

    betting_slip_file = open('datetime_filter/{}.md'.format(output),
                             'w',
                             encoding='utf-8')

    betting_slip_file.write('BETTING SLIP\n\n')
    betting_slip_file.write('Name: \n\n')

    betting_slip_file.write('|DATE|VENUE|DISCIPLINE|Who Wins?|\n')
    betting_slip_file.write('|-----|-----|-----|-----|\n')

    for event in events:
        betting_slip_file.write('|{}|{}|{}||\n'.format(event[0], event[1],
                                                       event[2]))

    betting_slip_file.close()
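
# Quick check of the two patterns documented in the docstring above, on made-up
# cell text.
import re
print(re.search('([0123]?[0-9] [A-Z][a-z]{2,8} [0-9]{4})', 'Sun 18 October 2020').group(0))
# 18 October 2020
print(re.match('([DSGAP][HLSGC])', 'GS165').group(0))  # GS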
Example #27
def extract_url(url):
    """ Function that extracts urls for all teams in NBA conference semifinals.

    :param url (str):    The url to send get-request to.
    """

    html_content = get_html(url)

    # Get table in bracket section
    document = BeautifulSoup(
        html_content[1], 'html.parser'
    )  # Parse html content at index 1 (url on index 0) from get_html()
    title = document.find(id="Bracket")  # Find 'Bracket' section
    tables = title.find_all_next("table")  # Find tables in 'Bracket' section
    bracket_table = tables[0]  # Get first table (Conference)

    # Extract teams that made it to the conference semifinals (4th column)
    # Rows 5, 7, 17, 19, 29, 31, 41, 43 (extracting 4th column of all rows could also select teams from 'Conference Finals')
    row = bracket_table.find_all("tr")
    rows = [row[i] for i in (4, 6, 16, 18, 28, 30, 40, 42)]

    teams_semifinals = []

    # Go through rows for conference semifinals
    for row in rows:
        cells = row.find_all("td")
        team_name = cells[3].get_text(strip=True)  # Team name is in 4th column
        team_name = re.sub(r'[^\w]', '', team_name)  # Remove unwanted symbols
        team_url = cells[3].a['href']  # Gets the team url from href-attribute

        team = [team_name, team_url]
        teams_semifinals.append(
            team
        )  # Add to multidim-list with teams that made it to conference semifinals

    top_players = []

    # Go through every team that made it to conference semifinals
    for team in teams_semifinals:
        team.append(
            team_players(team)
        )  # Add all team players to the team as a list inside team list

        top_players_team = [
            ['', '', 0, 0, 0], ['', '', 0, 0, 0], ['', '', 0, 0, 0]
        ]  # Should hold top team players (name, team, rpg, bpg, ppg)

        # Go through every player in team (team[2] is the list of players inside team list)
        for player in team[2]:
            player.append(get_player_points(
                player))  # Get player score for season 2019-20

            # Remove unwanted symbols
            player_points = re.sub(r'\*', '', str(player[2][2]))
            top_players_team[0][4] = re.sub(r'\*', '',
                                            str(top_players_team[0][4]))
            top_players_team[1][4] = re.sub(r'\*', '',
                                            str(top_players_team[1][4]))
            top_players_team[2][4] = re.sub(r'\*', '',
                                            str(top_players_team[2][4]))

            # Check if player is in top 3 at current team
            if float(player_points) > float(
                    top_players_team[0]
                [4]):  # Higher Points Per Game than first player
                top_players_team[2] = list(
                    top_players_team[1])  # Place second player in third
                top_players_team[1] = list(
                    top_players_team[0])  # Place first player in second

                top_players_team[0][0] = player[0]  # Set player name
                top_players_team[0][1] = team[0]  # Set player's team
                top_players_team[0][2] = player[2][0]  # Set rpg
                top_players_team[0][3] = player[2][1]  # Set bpg
                top_players_team[0][4] = player[2][2]  # Set ppg
            elif float(player_points) > float(
                    top_players_team[1]
                [4]):  # Higher Points Per Game than second player
                top_players_team[2] = list(
                    top_players_team[1])  # Place second player in third

                top_players_team[1][0] = player[0]  # Set player name
                top_players_team[1][1] = team[0]  # Set player's team
                top_players_team[1][2] = player[2][0]  # Set rpg
                top_players_team[1][3] = player[2][1]  # Set bpg
                top_players_team[1][4] = player[2][2]  # Set ppg
            elif float(player_points) > float(top_players_team[2][4]):
                top_players_team[2][0] = player[0]  # Set player name
                top_players_team[2][1] = team[0]  # Set player's team
                top_players_team[2][2] = player[2][0]  # Set rpg
                top_players_team[2][3] = player[2][1]  # Set bpg
                top_players_team[2][4] = player[2][2]  # Set ppg

        top_players.append(top_players_team)

    # Create plot image for Points Per Game, Blocks Per Game, and Rebounds Per Game
    plot_top_players(top_players, 'ppg')
    plot_top_players(top_players, 'bpg')
    plot_top_players(top_players, 'rpg')
Example #28
    try:
        if from_format == "YMD":
            if date[5:8] in months:
                if date[-1].isnumeric():
                    return f"{date[:4]}/{str(months.index(date[5:8])+1).zfill(2)}/{date[-2:]}"
                return f"{date[:4]}/{str(months.index(date[5:8])).zfill(2)}"
        if from_format == "MDY":
            return f"{date[-4:]}/{str(months.index(date[0:3])+1).zfill(2)}/{date[-8:-6]}"
        if from_format == "DMY":
            return f"{date[-4:]}/{str(months.index(date[3:6])+1).zfill(2)}/{date[:2]}"
    except (ValueError, IndexError):  # Error in month or malformed date
        pass
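
# Minimal standalone check of the DMY branch above; months is assumed (it is not
# shown here) to be a list of three-letter month abbreviations.
months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
          "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
date = "13 Oct 2020"
print(f"{date[-4:]}/{str(months.index(date[3:6])+1).zfill(2)}/{date[:2]}")  # 2020/10/13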


if __name__ == '__main__':

    html = get_html('https://en.wikipedia.org/wiki/Rafael_Nadal')
    find_dates(html, output='example_rafael_nadal.txt')

    html = get_html('https://en.wikipedia.org/wiki/Linus_Pauling')
    find_dates(html, output='example_linus_pauling.txt')

    html = get_html("https://en.wikipedia.org/wiki/J._K._Rowling")
    find_dates(html, output='example_just_kidding_rowling.txt')

    html = get_html("https://en.wikipedia.org/wiki/Richard_Feynman")
    find_dates(html, output='example_richard_feynman.txt')

    html = get_html("https://en.wikipedia.org/wiki/Hans_Rosling")
    find_dates(html, output='example_hans_rosling.txt')
Example #29
    return matches


if __name__ == "__main__":

    # test cases
    urls = [
        "https://en.wikipedia.org/wiki/Nobel_Prize",
        "https://en.wikipedia.org/wiki/Bundesliga",
        "https://en.wikipedia.org/wiki/2019–20_FIS_Alpine_Ski"
    ]
    outputs = ["Nobel_Prize", "Bundesliga", "2019–20_FIS_Alpine_Ski"]

    path_all = "filter_urls/all_urls_"
    path_article = "filter_urls/articles_"
    base_url = "https://en.wikipedia.org"

    for url, output in zip(urls, outputs):
        r = get_html(url)
        m_all = find_urls(r.text, base_url=base_url, output=path_all + output)
        m_articles = find_articles(url, output=path_article + output)

    r = get_html("https://en.wikipedia.org/wiki/Studio_Ghibli")
    m = find_urls(r.text,
                  base_url="https://en.wikipedia.org",
                  output="filter_urls/all_urls_Studio_Ghibli")

    m = find_articles("https://en.wikipedia.org/wiki/Studio_Ghibli",
                      output="filter_urls/articles_Studio_Ghibli")
Example #30
    # Use the hashtag encapsulation to find only the previously formatted dates
    goal_format = "#([0-9]{4}/[0-9]{2}(?:/[0-9]{2})?)#"
    all_dates = re.findall(goal_format, html)
    all_dates.sort()

    # Write to file if argument was provided
    if output:
        write_to_file(output, all_dates)

    return all_dates
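
# Quick check of the hashtag-wrapped goal format above on a made-up, already
# formatted string.
import re
sample = "born #1994/05/14# ... retired #2020/11#"
print(re.findall("#([0-9]{4}/[0-9]{2}(?:/[0-9]{2})?)#", sample))
# ['1994/05/14', '2020/11']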

def write_to_file(output, dates):
    """
    Writes dates to file
    Args:
        output (string): filename to write to
        dates (list): list of dates to be written to file
    """
    f = open(output, "w")
    for d in dates:
        f.write(f"{d}\n")
    f.close()


if __name__ == "__main__":
    find_dates(get_html("https://en.wikipedia.org/wiki/Linus_Pauling"), "filter_dates_regex/Linus_Pauling.txt")
    find_dates(get_html("https://en.wikipedia.org/wiki/Rafael_Nadal"), "filter_dates_regex/Rafael_Nadal.txt")
    find_dates(get_html("https://en.wikipedia.org/wiki/J._K._Rowling"), "filter_dates_regex/J._K_Rowling.txt")
    find_dates(get_html("https://en.wikipedia.org/wiki/Richard_Feynman"), "filter_dates_regex/Richard_Feynman.txt")
    find_dates(get_html("https://en.wikipedia.org/wiki/Hans_Rosling"), "filter_dates_regex/Hans_Rosling.txt")