def extract_player_urls(url):
    """
    Find all player URLs from an NBA team's Wikipedia page.

    Args:
        url (string): URL of the team's Wikipedia page

    Returns:
        list: all player URLs on the roster
    """
    html = get_html(url)
    soup = BeautifulSoup(html, "html.parser")

    team_name = soup.find("h1", attrs={"id": "firstHeading"}).text
    team_name = team_name[8:-7]  # Takes away 2019-20 and season from title

    soup_table = soup.find("table", attrs={"class": "toccolours"})
    soup_table_rows = soup_table.find_all("tr")[2].td.table.tbody.find_all(
        "tr")

    # find() yields None for rows without a left-aligned name cell. Drop ALL
    # of them: the previous `.remove(None)` removed only the first None and
    # raised ValueError when there was none at all.
    name_cells = [
        row.find("td", attrs={"style": "text-align:left;"})
        for row in soup_table_rows
    ]
    name_cells = [cell for cell in name_cells if cell is not None]

    player_cells_html = [str(row.find("a")) for row in name_cells]
    player_urls = filter_urls(html="\n".join(player_cells_html),
                              base_url="https://en.wikipedia.org")
    return player_urls
def extract_url(url):
    """
    Extract the names and URLs of all teams in the semifinals.

    Args:
        url (string): URL of the playoffs Wikipedia page

    Returns:
        team_names: list - of all teams
        team_urls: list - of each team's URL
    """
    html = get_html(url)
    soup = BeautifulSoup(html, "html.parser")
    bracket = soup.find(
        "table", attrs={"style": "font-size: 90%; margin:1em 2em 1em 1em;"})

    # Semifinal entries start at row 5 and repeat every 12 rows,
    # at offsets 0 and 2 within each repetition.
    rows = bracket.find_all("tr")[4:]
    semifinal_rows = rows[::12] + rows[2::12]

    anchors_html = [str(r.find("a")) for r in semifinal_rows]

    team_names = []
    for r in semifinal_rows:
        full_name = r.find("a").text
        parts = full_name.split()
        # Keep single-word names as-is; otherwise use the last word only
        team_names.append(full_name if len(parts) == 1 else parts[-1])

    team_urls = filter_urls(html="\n".join(anchors_html),
                            base_url="https://en.wikipedia.org")
    return team_names, team_urls
def extract_events(url):
    """
    Find the season table at *url* and write an empty betting slip
    (betting_slip_empty.md) with one row per event.

    Args:
        url (string): URL of the season's Wikipedia page
    """
    html = get_html(url)
    soup = BeautifulSoup(html, "html.parser")
    table = soup.find('table', attrs={"class": 'wikitable'})
    new_table = table_to_2d(table)

    # Two-letter event codes -> full discipline names
    discipline = {
        'DH': 'Downhill',
        'SL': 'Slalom',
        'GS': 'Giant Slalom',
        'SG': 'Super Giant Slalom',
        'AC': 'Alpine Combined',
        'PG': 'Parallel Giant Slalom'
    }

    # `with` guarantees the file is closed even if a row raises
    with open('betting_slip_empty.md', 'w') as f:
        f.write('*date*|*venue*|*discipline*|Who wins?*\n')
        f.write(':-----|:-----:|:-----:|-----:')
        f.write('\n')
        # Skip header (first) and footer (last) rows
        for row in new_table[1:-1]:
            try:
                f.write(
                    f"{row[2].strip()}|{row[3].strip()}|{discipline[row[4][:2]]}\n"
                )
            except (AttributeError, IndexError, KeyError, TypeError):
                # Best-effort: filler rows and unknown discipline codes are
                # skipped (narrowed from a bare `except:` which also hid
                # KeyboardInterrupt/SystemExit)
                pass
def team_players(team_url):
    """
    Get every player on a team.

    :param team_url (list): Team info [name, relative url].
    :return: All players on the team as [name, relative url] pairs.
    """
    # get_html() returns (url, html); the html text is at index 1
    page = get_html('https://en.wikipedia.org{}'.format(team_url[1]))
    document = BeautifulSoup(page[1], 'html.parser')

    # Inner table of the first 'toccolours' table holds the 2019-20 roster
    roster = document.find_all("table", class_='toccolours')[0].find("table")

    players = []
    # First row holds th-tags (headers) — skip it
    for row in roster.find_all("tr")[1:]:
        name_cell = row.find_all("td")[2]
        players.append([name_cell.get_text(strip=True), name_cell.a['href']])
    return players
def find_articles(url, output=None):
    """
    Find all Wikipedia article links within a Wikipedia page.

    Args:
        url (string): The URL of the wikipedia page to fetch
        [output] (string): Optional filename to write urls to

    Returns:
        list: List of Wikipedia article URLs
    """
    html = get_html(url)
    candidates = find_urls(html, url)

    # An article URL is relative or on a wikipedia host...
    is_wiki_url = re.compile('(?:^|wikipedia.org)/wiki/')
    # ...and is not a namespace page (colon after the scheme)
    is_namespace = re.compile('https://[^:]*:')

    article_urls = []
    for candidate in candidates:
        if is_wiki_url.search(candidate) and not is_namespace.search(candidate):
            article_urls.append(candidate)

    if output:
        write_to_file(output, article_urls)
    return article_urls
def filter_urls(html, base_url, output=False):
    """
    Use regex to identify valid urls in HTML code.

    args:
        html (string): HTML code to scan (may be None to fetch base_url)
        base_url (string): url from which the html argument was fetched;
            used to absolutize relative links
        output (string, optional): name of output file urls are written to

    returns:
        urls (list): List of valid urls in string format
    """
    # Only the website argument passed — fetch the html ourselves
    if html is None and base_url is not None:
        try:
            html = get_html(base_url)
        except TypeError:
            print("could not fetch html from base_url argument.")

    regular_links = re.findall("(?=<a).*href=\"([h|\/]{1}[^\"#]*)", html)

    for ind, url in enumerate(regular_links):
        # startswith() also handles the one-character href "/" which the
        # previous url[1] index check crashed on
        if url.startswith('//'):
            # protocol-relative link
            regular_links[ind] = "https:" + url
        elif url.startswith('/'):
            # site-relative link
            regular_links[ind] = base_url + url

    if output:
        # `with` closes the file even on write errors
        with open(output, 'w') as f:
            for elem in regular_links:
                f.write(f"{elem}\n")

    return regular_links
def get_players(team_url, team_name, limit=None):
    """
    Get a list of Players for an NBA team, and each of their stats.

    Args:
        team_url (string): The url for the team's wikipedia page
        team_name (string): The name of the team. This is assigned to each Player
        [limit] (int): Limits the list of Players to the top [limit] players
            by points per game

    Returns:
        list: A list of Players for the team
    """
    html = get_html("https://en.wikipedia.org" + team_url)
    soup = BeautifulSoup(html, "html.parser")

    # Roster table is the second tbody after the "Roster" heading
    roster_header = soup.find("span", {"id": "Roster"})
    roster_table = roster_header.findNext("tbody").findNext("tbody")

    players = []
    for row in roster_table.findAll("tr"):
        cells = row.findAll("td")
        # Rows with fewer than 3 cells carry no player name — skip them
        if len(cells) <= 2:
            continue
        anchor = cells[2].find("a")
        # Build the Player from the link's title/href, then fill in its stats
        player = Player(anchor["title"], team_name, anchor["href"])
        get_player_stats(player)
        players.append(player)

    # Keep only the top [limit] players by points per game when requested
    if limit:
        players.sort(reverse=True)
        players = players[:limit]
    return players
def find_articles(url, params=None, output=None, **kwargs):
    """
    Find Wikipedia article URLs linked in a Wikipedia webpage.

    Fetches the page, collects all anchored hyperlinks, and keeps only the
    ones that look like Wikipedia articles.

    Arguments
    ---------
    url : str
        URL of (Wikipedia) webpage to get
    params : dict, optional, default None
        Data to send in the URL's query string
    output : str, optional, default None
        Optional output filename to store the found article URL's.
        A .txt extension is enforced regardless of what is specified.
    **kwargs
        Arbitrary keyword arguments passed along to requests.get in get_html()

    Returns
    -------
    matches : list
        Found URL's

    Raises
    ------
    TypeError : if 'output' is not str
    """
    r = get_html(url, params=params, **kwargs)

    # Base URL = everything up to the first path/query separator
    base_url_pattern = r'^.+?[^\/:](?=[?\/]|$)'
    base_url = re.search(base_url_pattern, url).group(0)

    all_urls_list = find_urls(r.text, base_url=base_url)

    # Wikipedia articles: http(s) URLs containing "wiki" with no extra colon
    article_pattern = re.compile(r'^https?://(?!.*:[^_]).*wiki.*')
    matches = [u for u in all_urls_list if article_pattern.match(u)]

    if output is not None:
        if not isinstance(output, str):
            raise TypeError("'output' must be str")
        filename, file_extension = os.path.splitext(output)
        if file_extension != ".txt":
            file_extension = ".txt"
        filename = filename + file_extension
        with open(filename, "w") as f:
            for line in matches:
                f.write(line + "\n")

    return matches
def wiki_thread(goal, article, queue, visited, sub_articles, keywords, sleeptime=0.01):
    """
    Check all sub-articles from links in an article, grant them scores,
    and put them in the queue.

    Args:
        goal (string): The article it is trying to find a path to
        article (Article): The article the links were fetched from
        queue (ArticleList): The queue to insert sub articles into
        visited (ArticleList): The list of previously visited articles
        sub_articles (ArticleList): The list of articles to be checked
        keywords (list): List of keyword lists (low/mid/high priority) to score by
        sleeptime (float): Time to sleep between each article

    Returns:
        list: the path to the goal if this thread found it, otherwise None
        (implicitly) once sub_articles is exhausted.
    """
    while True:
        time.sleep(sleeptime)  # Slight delay to avoid denied responses
        l = sub_articles.next_article()
        # Stop when article list is empty
        if not l:
            break
        # Only search if link has not been visited already
        if visited.contains(l):
            continue
        visited.append(l)  # Add to visited list
        if l == goal:
            # The correct link was found!
            print("Done!")
            sub_articles.clear()
            return Article(l, article).path
        elif "en.w" in l and "/Main_Page" not in l:
            # Only check english wiki links and do not go to the Main Page. No cheating!
            # Retry until the request is accepted, in case a request is denied.
            html = None
            while not html:
                try:
                    html = get_html(l)
                except Exception:
                    # Narrowed from a bare `except:` so Ctrl-C still works;
                    # request failures simply trigger another retry.
                    pass
            content = html.lower()
            item = Article(l, article)
            # Score the article: tiers 0/1/2 = low/mid/high priority keywords
            for i, group in enumerate(keywords):
                for k in group:
                    if k in content:
                        # Grant points if keywords are in the HTML
                        item.score += [1, 10, 50][i]
                        if k.replace(" ", "_") in l.lower():
                            # Grant more points if keywords are in the URL
                            item.score += [5, 30, 100][i]
            # Insert Article into the search queue according to its score
            queue.insert(item)
def get_player_points(player_url): """ Function that gets player stats for Rebounds Per Game, Blocks Per Game, and Points Per Game, for last season. :param player_url (list): Player info [name, relative url]. :return: Player stats (rpg, bpg, and ppg) for last season. """ # Get html html_content = get_html('https://en.wikipedia.org{}'.format(player_url[1])) # Get table of NBA regular season document = BeautifulSoup(html_content[1], 'html.parser') regular_season_section = document.find( 'span', id='Regular_season') # Find section for 'Regular season' if regular_season_section is None: # Some player sites do not include span with id 'Regular_season' regular_season_section = document.find( 'span', id='NBA') # Instead, find section for 'NBA' if regular_season_section is None: # Return if no section for a season in NBA was found return [0, 0, 0] # Return zero list table = regular_season_section.find_next( 'table' ).tbody # Get the tbody section of table, where the player seasons are rows = table.find_all("tr") last_season = '' # Go through each season for row in rows: cells = row.find_all(["td", "th"]) cells_text = [cell.get_text(strip=True) for cell in cells] # Get year 2019-20 if re.match( '2019[^\w]20', str(cells_text[0]) ): # No match with regular '-' symbol in 2019-20, so checked with regex last_season = cells_text # Save season 2019-20 break # Get players rpg, bpg, and ppg, for last season (2019-20), or set to zeros if no score are presented. if last_season != '': rpg = last_season[8] bpg = last_season[11] ppg = last_season[12] player_stats = [rpg, bpg, ppg] else: player_stats = [0, 0, 0] return player_stats
def _get_tables(url):
    """Return every 'wikitable plainrowheaders' table found at *url*.

    Args:
        url (str): The URL to parse for 'wikitable plainrowheaders'-tables.

    Returns:
        bs4.element.ResultSet: The tables found in the URL.
    """
    document = BeautifulSoup(get_html(url).text, "html.parser")
    return document.find_all("table", class_="wikitable plainrowheaders")
def test():
    """
    Run find_dates() on a fixed set of Wikipedia biography pages,
    writing the dates found on each page to its own output file.
    """
    cases = [
        ('https://en.wikipedia.org/wiki/Linus_Pauling', 'Linus_Pauling_output.txt'),
        ('https://en.wikipedia.org/wiki/Rafael_Nadal', 'Rafael_Nadal_output.txt'),
        ('https://en.wikipedia.org/wiki/J._K._Rowling', 'J._K._Rowling_output.txt'),
        ('https://en.wikipedia.org/wiki/Richard_Feynman', 'Richard_Feynman_output.txt'),
        ('https://en.wikipedia.org/wiki/Hans_Rosling', 'Hans_Rosling_output.txt'),
    ]
    for page_url, outfile in cases:
        page_html = requesting_urls.get_html(page_url).text
        find_dates(page_html, output=outfile)
def find_player(url):
    """
    Incomplete helper: prints True for each roster-table row whose first
    cell is the 2019–20 season. (Original marked "Not finished".)

    :param url (str): URL of the page to scan.
    """
    r = get_html(url)
    soup = BeautifulSoup(r.text, 'lxml')
    # NOTE(review): "jquery-tablesorter" is normally added client-side by
    # JavaScript and may be absent from the raw HTML, so this lookup can
    # find no table — TODO confirm the intended class string.
    identifier = {"class": "wikitable sortable jquery-tablesorter"}
    table = soup.find("table", identifier)
    if table is None:
        # No matching table: nothing to scan (previously crashed with
        # AttributeError on `table.find_all`)
        return
    rows = table.find_all("tr")[1:]
    for row in rows:
        cells = row.find_all("td")
        # Guard against spacer/header rows with no <td> cells
        if cells and cells[0].text.strip() == "2019–20":
            print(True)
def _extract_table(url, title):
    """Extract the first table after *title* in the page's HTML.

    Args:
        url (str): The URL to request and get table from.
        title (str): The id of the element preceding the table.

    Returns:
        bs4.element.Tag: The first table after title, or None when the
        title element is not present.
    """
    soup = BeautifulSoup(get_html(url).text, "lxml")
    anchor = soup.find(id=title)
    if anchor is None:
        return None
    return anchor.find_all_next("table")[0]
def find_articles(url, params=None, output=None, saveToFolder='filter_urls'):
    '''
    Return a list of all Wikipedia-article urls found on the page at *url*.

    When 'params' is given, it is passed along in the get request. When
    'output' is given, both the full url list and the wikipedia-article list
    are saved to that filename (overwriting any existing content) inside
    'saveToFolder' (default 'filter_urls'). Without 'output', the article
    list is simply returned.

    Args:
        url: The url from which the html is retrieved.
        params: (Optional) Parameters to pass to the get function. Defaults to None.
        output: (Optional) The filename (format as 'output.txt') for where to
            save the url lists. Defaults to None.
        saveToFolder: (Optional) The folder for where to save the output file.
            Defaults to 'filter_urls'

    Returns:
        <Array<String>> wiki_urls: A list of the urls found of Wikipedia articles.
    '''
    response = requesting_urls.get_html(url, params=params)
    html = response.text
    all_urls = find_urls(url, html)

    # Join all urls with '|' and regex out the wikipedia-article ones
    regex_wikiarticles = r"(https*://[\w]+.wikipedia.org/[^|:]*)"
    wiki_urls = re.findall(regex_wikiarticles,
                           '|'.join(all_urls),
                           flags=re.VERBOSE)

    if output is not None:
        with open(f'./{saveToFolder}/{output}', 'w', encoding='utf-8') as f:
            f.write('ALL URLS:\n')
            for found_url in all_urls:
                f.write(f'{found_url}\n')
            f.write('\nWIKIPEDIA ARTICLES:\n')
            for found_url in wiki_urls:
                f.write(f'{found_url}\n')

    return wiki_urls
def shortest_path(start, end):
    """
    Find the shortest path between two urls using BFS (Breadth First Search).

    :param start (str): The url to start from
    :param end (str): The target url
    :return: Path of urls, or None when every reachable article was checked.
    """
    paths = {start: [start]}
    pending = deque([start])  # Double-ended queue of pages to visit

    while pending:
        page = pending.popleft()  # Next page to visit
        print(page)
        html_content = get_html(page)  # (base_url, html) pair
        links = find_urls(html_content[1], base_url=html_content[0])
        articles = find_articles(links, language='en')

        for article in articles:
            if article == end:
                return paths[page] + [article]  # Done!
            # Enqueue unseen articles with their discovered path
            if article not in paths and article != page:
                paths[article] = paths[page] + [article]
                pending.append(article)

    return None
def find_team(url):
    """
    Extract team player URLs from the roster HTML table.

    Arguments
    ---------
    url : str
        URL of webpage to get

    Returns
    -------
    links : list
        Found URLs of team players (absolute)
    """
    r = get_html(url)
    soup = BeautifulSoup(r.text, 'lxml')
    base_url = "https://en.wikipedia.org"

    # Roster is the inner table of the 'toccolours' table
    outer_table = soup.find("table", {"class": "toccolours"})
    roster = outer_table.find("table")

    links = []
    for row in roster.find_all("tr")[1:]:  # skip header row
        name_cell = row.find_all("td")[2]
        links.append(name_cell.find('a').get('href'))

    # Absolutize any relative URLs
    return [base_url + link if link.startswith('/') else link
            for link in links]
def get_teams(url):
    """
    Fetch the NBA teams in the semifinals from the NBA playoffs
    Wikipedia page.

    Args:
        url (string): The URL of the Wikipedia page

    Returns:
        list: Teams in the semifinals, each as a (team_url, team_name) pair
    """
    html = get_html(url)
    soup = BeautifulSoup(html, "html.parser")
    bracket = soup.find("table", {"border": "0"})

    # Winners appear in bold, and reaching the semifinals requires at least
    # one series win — so every bold team link in the bracket is a
    # semifinalist. A set removes the duplicates.
    found = set(
        re.findall('<b>[^<]*<a href="([^"]*)"[^>]*>([^<]*)', str(bracket)))

    # "Conference" entries are labels, not actual teams
    return [entry for entry in found if not re.search("Conference", entry[1])]
def extract_player_points(url, team):
    """
    Find a player's stats for the 2019-20 season.

    Args:
        url (string): nba player wikipedia site
        team (string): team of player. Used when a player has played for
            two clubs that season.

    Returns:
        list: [name, team, points, <stat>, <stat>] for that season, or
        None (implicitly) when no matching row is found.
    """
    html = get_html(url)
    soup = BeautifulSoup(html, "html.parser")
    name = soup.find("h1", attrs={"class": "firstHeading"}).text

    for soup_table in soup.find_all("table",
                                    attrs={"class": "wikitable sortable"}):
        # Tables without a tbody cannot hold season rows
        if soup_table.tbody is None:
            continue
        for row in soup_table.tbody.find_all("tr")[1:]:
            # Best-effort per row: malformed rows are simply skipped.
            # Narrowed from three nested bare `except Exception: pass`
            # blocks to the errors row parsing can actually raise.
            try:
                if str(row.find("td").a.text).rstrip() == "2019–20":
                    elms = row.find_all("td")
                    if team in elms[1].text.rstrip():
                        return [
                            name,
                            elms[1].text.rstrip(),
                            float(elms[-1].text.replace("*", "")),
                            float(elms[-2].text),
                            float(elms[-5].text)
                        ]
            except (AttributeError, IndexError, ValueError):
                pass
def get_player_stats(player):
    """
    Fetch the points/blocks/rebounds per game stats for a given Player.

    Args:
        player: Player-object to fetch stats for and to write them back to
    """
    print("Fetching stats for", player.name)

    html = get_html("https://en.wikipedia.org" + player.url)
    soup = BeautifulSoup(html, "html.parser")
    header = soup.find("span", {"id": "Regular_season"})

    # No "Regular season" header means no stats for the season
    if not header:
        return

    stats_table = header.findNext("tbody")
    for row in stats_table.findAll("tr"):
        cells = row.findAll("td")
        # Need enough cells for all stat columns, and the 2019-20 season
        if len(cells) >= 13 and re.match("2019.20", cells[0].get_text()):
            player.ppg = safe_float(cells[12].get_text())
            player.bpg = safe_float(cells[11].get_text())
            player.rpg = safe_float(cells[8].get_text())
urls = find_urls(html, base) # Remove URLs that are not normal Wikipedia articles (no special namespace articles or files) pattern_wiki = fr".*{language}\.wikipedia\.org/[^:]*$" articles = [url for url in urls if re.match(pattern_wiki, url)] if output is not None: with open(f"./filter_urls/{output}.txt", "w") as file: file.write(f"find_urls found {len(urls)} URLs\n") file.write("\n".join(urls)) file.write("") file.write(f"\n\nfind_articles found {len(articles)} articles\n") file.write("\n".join(articles)) return articles if __name__ == "__main__": urls = ["https://en.wikipedia.org/wiki/Nobel_Prize", "https://en.wikipedia.org/wiki/Bundesliga", "https://en.wikipedia.org/wiki/2019%E2%80%9320_FIS_Alpine_Ski_World_Cup"] outputs = ["Nobel_Prize", "Bundesliga", "World Cup"] for url, output in zip(urls, outputs): html = get_html(url).text find_articles(html, base="https://en.wikipedia.org", output=output)
# NOTE(review): the first part of this chunk is the tail of a roster-scanning
# helper — `soup` is defined outside this view; function-body indentation
# restored below.
    identifier = {"class": "wikitable sortable jquery-tablesorter"}
    table = soup.find("table", identifier)
    rows = table.find_all("tr")[1:]
    for row in rows:
        cells = row.find_all("td")
        # Print True for rows whose first cell is the 2019–20 season
        if cells[0].text.strip() == "2019–20":
            print(True)


if __name__ == "__main__":
    url = "https://en.wikipedia.org/wiki/2020_NBA_playoffs"
    base_url = "https://en.wikipedia.org"
    r = get_html(url)
    soup = BeautifulSoup(r.text, 'lxml')
    # Locate the playoff bracket and the first table after it
    start_loc = soup.find(id="Bracket")
    table = start_loc.find_next("table")
    links = extract_url(table, base_url)
    print("SEMINFINAL LINKS:")
    print("")
    for link in links:
        print(link)
    print("")
    print("TEAM PLAYERS:")
    print("")
    # NOTE(review): the body of this loop continues beyond the visible
    # chunk — left truncated here, unchanged.
    for link in links:
def extract_events(url, createSlip=True):
    '''
    Takes a url and extracts data (date, venue, type) from the main table.
    If createSlip is True (default), creates an empty betting slip, saved to
    datetime_filter/betting_slip_empty.md
    Returns the data as an array of (date, venue, type) arrays.

    Args:
        url: The url of which site to extract events from.
        createSlip: (Optional) Whether to create a betting slip with the
            extracted data. Defaults to True.

    Returns:
        Array<Array<String>> data: Returns the set of data (array of each
            (date, venue, type) for each row) extracted.
    '''
    response = requesting_urls.get_html(url)
    document = BeautifulSoup(response.content, "lxml")
    table = document.find('table', {"class": 'wikitable plainrowheaders'})
    rows = table.find_all("tr")
    # Remove filler rows (rows with fewer than 2 data cells).
    # NOTE(review): removing from `rows` while iterating it skips the element
    # following each removal, so some filler rows can survive this pass. The
    # venue backfill below (data[i - 1] and the i != 1 guard) depends on the
    # exact surviving row set — verify against real pages before changing.
    for row in rows:
        if (len(row.find_all(["td"])) < 2):
            rows.remove(row)
    # Array to keep (date, venue, type) values for each row
    data = []
    # Extract date, venue and type for each row
    for i in range(0, len(rows)):
        cells = rows[i].find_all(["td"])
        cells = cells[0:5]  # Limit to first 5 data cells
        cells_text = [cell.get_text(strip=True)
                      for cell in cells]  # Strip for html tags
        # Use regex to filter out the first cells until date, this being '#'
        # 'event' and empty cells (the pattern also matches the empty
        # string). Makes date the first cell.
        while re.search(r"^[\d]*$",
                        cells_text[0],
                        flags=re.VERBOSE | re.MULTILINE):
            cells_text.pop(0)
        # Use regex to filter out all cells after type. Makes type the last cell
        for j in range(len(cells_text)):
            if re.search(r"[A-Z]{2}[\dcnx]{3}", cells_text[j]):
                cells_text = cells_text[:j + 1]
                break
        # If only 2 cells remain, the venue is missing (spanned from the
        # previous row via rowspan) and is copied from the previous row's data
        if (len(cells_text) == 2 and i != 1):
            venue = data[i - 1][1]
            cells_text.insert(1, venue)
        # Add to data
        data.append(cells_text)
    # Format data in place (each `cells` list is shared with `data`)
    for cells in data:
        # NOTE(review): `type` shadows the builtin of the same name
        (date, venue, type) = cells
        # Format date (extract only date string as D(D) Month YYYY)
        date = re.findall(r"([\d]{1,2} [\w]+ [\d]{4})", date, flags=0)[0]
        cells[0] = date
        # Format type (remove numbers after event key)
        type = re.sub(r"([\dcnx]{3})", '', type)
        cells[2] = type
    # Call createBettingSlip to create empty betting slip if createSlip==True
    if createSlip:
        createBettingSlip(data)
    # Always return data
    return data
def write_to_file(output, urls):
    """
    Writes urls to file, one per line.

    Args:
        output (string): filename to write to
        urls (list): list of urls to be written to file
    """
    # NOTE(review): plain open/close — a `with` block would also close the
    # file if a write raises.
    f = open(output, "w")
    for i in urls:
        f.write(f"{i}\n")
    f.close()


if __name__ == "__main__":
    # Demo: dump all URLs, then article-only URLs, for three Wikipedia pages
    find_urls(get_html("https://en.wikipedia.org/wiki/Nobel_Prize"),
              "https://en.wikipedia.org/wiki/Nobel_Prize",
              "filter_urls/Nobel_Prize_urls.txt")
    find_urls(get_html("https://en.wikipedia.org/wiki/Bundesliga"),
              "https://en.wikipedia.org/wiki/Bundesliga",
              "filter_urls/Bundesliga_urls.txt")
    find_urls(
        get_html(
            "https://en.wikipedia.org/wiki/2019%E2%80%9320_FIS_Alpine_Ski_World_Cup"
        ),
        "https://en.wikipedia.org/wiki/2019%E2%80%9320_FIS_Alpine_Ski_World_Cup",
        "filter_urls/2019-20_FIS_Alpine_Ski_World_Cup_urls.txt")
    find_articles("https://en.wikipedia.org/wiki/Nobel_Prize",
                  "filter_urls/Nobel_Prize_articles.txt")
    # NOTE(review): this final call is truncated — its remaining arguments
    # continue beyond the visible chunk.
    find_articles("https://en.wikipedia.org/wiki/Bundesliga",
def wiki_race(start, goal, output=None, greed=2, threads=4, sleeptime=0.01):
    """
    Find a short path between two Wikipedia pages.

    Usually finds a short path, but not always the shortest, as the search
    is greedy: articles are ranked by keyword scores derived from the goal
    page and the most promising ones are visited first. When no article
    looks promising this degenerates to regular BFS.

    Args:
        start (string): The starting Wikipedia article to find the path from
        goal (string): The Wikipedia article to find the path to
        [output] (string): Optional output file to write the path to
        [greed] (int): Which keyword tiers to score articles with
            0: No keywords, pure BFS (THIS WILL TAKE FOREVER)
            1: High priority keywords only, probably slow
            2: Mid+High priority, usually finds the best path (DEFAULT)
            3: All keywords. Rarely the best path, but very fast
        [threads] (int): Number of wiki_thread tasks submitted per article;
            too many may get requests denied. Default 4.
        [sleeptime] (float): Delay each thread sleeps between articles.
            NOTE(review): accepted but never forwarded to wiki_thread below,
            which always runs with its own default — TODO confirm intent.

    Returns:
        list: the path from start to goal, or None (implicitly) if the
        queue empties first.
    """
    goal_html = get_html(goal)
    # All the articles contents are in <p> objects
    goal_points = re.findall('<p>.*', goal_html)
    # Concatenate the content
    goal_content = ""
    for p in goal_points:
        goal_content += p
        goal_content += "\n"
    # Find (possibly) important keywords in the articles
    keywords = [
        # Low Priority, any single word from Mid and High priority
        [],
        # Mid Priority, the titles of any hyperlinks in the article
        re.findall('title="([^"#]*)"', goal_content),
        # High Priority, text that is in bold, as well as the title of the article
        re.findall('<b>([^<]*)</b>', goal_content) +
        [goal[goal.rindex("/"):].replace("_", " ")]
    ]
    # Remove special characters from keywords (keep alphanumerics and spaces)
    for i in range(len(keywords[1])):
        keywords[1][i] = (''.join(e for e in keywords[1][i]
                                  if e.isalnum() or e == " ")).lower()
    for i in range(len(keywords[2])):
        keywords[2][i] = (''.join(e for e in keywords[2][i]
                                  if e.isalnum() or e == " ")).lower()
    # Put all words in Mid and High priority in Low priority list
    for k in keywords[1] + keywords[2]:
        keywords[0] += k.split(' ')
    # Empty the keyword lists based on the greed argument
    if greed <= 0:
        keywords[2] = []
    if greed <= 1:
        keywords[1] = []
    if greed <= 2:
        keywords[0] = []
    # Remove short keywords to avoid words like "for", "of", etc.
    for i in range(len(keywords)):
        keywords[i] = [s for s in keywords[i] if len(s) > 3]
    # Print out the keywords
    print("LOW PRIORITY")
    for i in keywords[0]:
        print(i)
    print("\nMID PRIORITY")
    for i in keywords[1]:
        print(i)
    print("\nHIGH PRIORITY")
    for i in keywords[2]:
        print(i)
    print("\nSCANNING ARTICLES")
    # Create the thread-safe queue and visited lists
    queue = ArticleList([Article(start)])
    visited = ArticleList([start])
    while True:
        article = queue.next_article()
        # Stop when no articles remain in the queue
        if not article:
            break
        # Find all links in the article, and create a thread-safe list
        links = ArticleList(find_articles(article.url))
        # Print the score, url and number of sub-articles as a progress update
        print(f"{article.score}: {article.url} ({len(links)} sub-articles)")
        # Make `threads` tasks that check all the sub-articles for keywords.
        # Any more than 4, and Wikipedia starts denying requests eventually.
        # NOTE(review): max_workers is left at the executor's default —
        # `threads` only controls how many tasks are submitted.
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [
                executor.submit(wiki_thread, goal, article, queue, visited,
                                links, keywords) for i in range(threads)
            ]
            # Check if any thread reported a valid path
            results = [f.result() for f in futures]
            for r in results:
                if r:
                    if output:
                        f = open(output, "w")
                        for i in r:
                            f.write(f"{i}\n")
                        f.close()
                    return r
def extract_events(url, output='betting_slip_empty'):
    """
    Get ski events and create a betting slip for all the races.

    Regex used in this function:
    ([0123]?[0-9] [A-Z][a-z]{2,8} [0-9]{4})  Find dates in DMY format,
        e.g. 13 October 2020
    ([DSGAP][HLSGC])  Type of race. Types: GS | SL | DH | SG | PG | AC

    :param url (str): The url to send get-request to.
    :param output (str): Name of output .md file (without extension).
    """
    html_content = get_html(url)
    soup = BeautifulSoup(
        html_content[1],
        "html.parser")  # html content is at index 1 (url on index 0)
    soup_table = soup.find('table', {"class": 'wikitable plainrowheaders'})
    soup_table_rows = soup_table.findAll('tr')

    events = []
    dmy_regex = '([0123]?[0-9] [A-Z][a-z]{2,8} [0-9]{4})'
    type_regex = '([DSGAP][HLSGC])'
    venue = ""  # Carried between rows for rowspan venues

    # Search each row in table (tr)
    for i in range(len(soup_table_rows)):
        cells = soup_table_rows[i].findAll('td')  # relevant data cells
        # Search each cell in row (td)
        for j in range(len(cells)):
            # Only rows that contain a date describe an event
            is_date = re.search(dmy_regex, cells[j].getText())
            if is_date is None:
                continue
            event = [is_date.group(0)]
            next_cell = cells[j + 1]
            # If the cell after the date is already the race type, the venue
            # cell was absorbed by a rowspan in a previous row
            is_type = re.match(type_regex, next_cell.getText())
            if is_type is None:
                # Venue cell present: read it, then the type two cells on
                a_element = next_cell.findAll('a', recursive=False)
                venue = a_element[0].text
                race_type = re.match(type_regex,
                                     cells[j + 2].getText()).group(0)
                event.append(venue)
                event.append(race_type)
            else:
                # Reuse the venue from the last row that spanned this one
                event.append(venue)
                event.append(is_type.group(0))
            events.append(event)

    # Create betting slip
    if not os.path.exists('datetime_filter'):
        os.makedirs('datetime_filter')
    # `with` ensures the slip file is closed even if a write fails
    # (the previous open/close pair leaked the handle on error)
    with open('datetime_filter/{}.md'.format(output), 'w',
              encoding='utf-8') as betting_slip_file:
        betting_slip_file.write('BETTING SLIP\n\n')
        betting_slip_file.write('Name: \n\n')
        # Header typo fixed: VANUE -> VENUE
        betting_slip_file.write('|DATE|VENUE|DISCIPLINE|Who Wins?|\n')
        betting_slip_file.write('|-----|-----|-----|-----|\n')
        for event in events:
            betting_slip_file.write('|{}|{}|{}||\n'.format(
                event[0], event[1], event[2]))
def extract_url(url: str) -> None:
    """
    Function that extracts urls for all teams in NBA conference semifinals,
    then collects each team's top three scorers and plots their statistics.

    Side effects: calls plot_top_players() three times (ppg, bpg, rpg);
    returns nothing.

    :param url (str): The url to send get-request to.
    """
    html_content = get_html(url)
    # Get table in bracket section
    document = BeautifulSoup(
        html_content[1], 'html.parser'
    )  # Parse html content at index 1 (url on index 0) from get_html()
    title = document.find(id="Bracket")  # Find 'Bracket' section
    tables = title.find_all_next("table")  # Find tables in 'Bracket' section
    bracket_table = tables[0]  # Get first table (Conference)
    # Extract teams that made it to the conference semifinals (4th column)
    # Rows 5, 7, 17, 19, 29, 31, 41, 43 (extracting 4th column of all rows
    # could also select teams from 'Conference Finals')
    rows = []
    row = bracket_table.find_all("tr")
    rows.append(row[4])
    rows.append(row[6])
    rows.append(row[16])
    rows.append(row[18])
    rows.append(row[28])
    rows.append(row[30])
    rows.append(row[40])
    rows.append(row[42])
    teams_semifinals = []
    # Go through rows for conference semifinals
    for row in rows:
        cells = row.find_all("td")
        team_name = cells[3].get_text(strip=True)  # Team name is in 4th column
        team_name = re.sub(r'[^\w]', '', team_name)  # Remove unwanted symbols
        team_url = cells[3].a['href']  # Gets the team url from href-attribute
        team = [team_name, team_url]
        teams_semifinals.append(
            team
        )  # Add to multidim-list with teams that made it to conference semifinals
    top_players = []
    # Go through every team that made it to conference semifinals
    for team in teams_semifinals:
        team.append(
            team_players(team)
        )  # Add all team players to the team as a list inside team list
        top_players_team = [
            ['', '', 0, 0, 0], ['', '', 0, 0, 0], ['', '', 0, 0, 0]
        ]  # Should hold top team players (name, team, rpg, bpg, ppg)
        # Go through every player in team (team[2] is the list of players
        # inside team list)
        for player in team[2]:
            player.append(get_player_points(
                player))  # Get player score for season 2019-20
            # Remove unwanted symbols ('*' markers) so values parse as floats.
            # NOTE(review): presumably '*' flags league-leader stats on the
            # Wikipedia page — confirm against the scraped tables.
            player_points = re.sub(r'\*', '', str(player[2][2]))
            top_players_team[0][4] = re.sub(r'\*', '',
                                            str(top_players_team[0][4]))
            top_players_team[1][4] = re.sub(r'\*', '',
                                            str(top_players_team[1][4]))
            top_players_team[2][4] = re.sub(r'\*', '',
                                            str(top_players_team[2][4]))
            # Check if player is in top 3 at current team.
            # Insertion keeps the list sorted by ppg (index 4), shifting the
            # displaced entries down; list(...) copies avoid aliasing rows.
            if float(player_points) > float(
                    top_players_team[0]
                [4]):  # Higher Points Per Game than first player
                top_players_team[2] = list(
                    top_players_team[1])  # Place second player in third
                top_players_team[1] = list(
                    top_players_team[0])  # Place first player in second
                top_players_team[0][0] = player[0]  # Set player name
                top_players_team[0][1] = team[0]  # Set player's team
                top_players_team[0][2] = player[2][0]  # Set rpg
                top_players_team[0][3] = player[2][1]  # Set bpg
                top_players_team[0][4] = player[2][2]  # Set ppg
            elif float(player_points) > float(
                    top_players_team[1]
                [4]):  # Higher Points Per Game than second player
                top_players_team[2] = list(
                    top_players_team[1])  # Place second player in third
                top_players_team[1][0] = player[0]  # Set player name
                top_players_team[1][1] = team[0]  # Set player's team
                top_players_team[1][2] = player[2][0]  # Set rpg
                top_players_team[1][3] = player[2][1]  # Set bpg
                top_players_team[1][4] = player[2][2]  # Set ppg
            elif float(player_points) > float(top_players_team[2][4]):
                # Higher Points Per Game than third player: overwrite in place
                top_players_team[2][0] = player[0]  # Set player name
                top_players_team[2][1] = team[0]  # Set player's team
                top_players_team[2][2] = player[2][0]  # Set rpg
                top_players_team[2][3] = player[2][1]  # Set bpg
                top_players_team[2][4] = player[2][2]  # Set ppg
        top_players.append(top_players_team)
    # Create plot image for Points Per Game, Blocks Per Game, and Rebounds Per Game
    plot_top_players(top_players, 'ppg')
    plot_top_players(top_players, 'bpg')
    plot_top_players(top_players, 'rpg')
    # Convert a parsed date string into YYYY/MM(/DD) depending on its
    # source format. NOTE(review): this fragment belongs to a function whose
    # signature (date, from_format, months) is defined above this chunk.
    try:
        if from_format == "YMD":
            if date[5:8] in months:
                if date[-1].isnumeric():
                    # Full date: "YYYY Mon DD" -> "YYYY/MM/DD"
                    return f"{date[:4]}/{str(months.index(date[5:8])+1).zfill(2)}/{date[-2:]}"
                # Year and month only -> "YYYY/MM".
                # NOTE(review): the other branches use index+1; the missing
                # +1 here looks like an off-by-one month — confirm.
                return f"{date[:4]}/{str(months.index(date[5:8])).zfill(2)}"
        if from_format == "MDY":
            # "Mon DD, YYYY" -> "YYYY/MM/DD"
            return f"{date[-4:]}/{str(months.index(date[0:3])+1).zfill(2)}/{date[-8:-6]}"
        if from_format == "DMY":
            # "DD Mon YYYY" -> "YYYY/MM/DD"
            return f"{date[-4:]}/{str(months.index(date[3:6])+1).zfill(2)}/{date[:2]}"
    except:  # Error in month
        # NOTE(review): bare except silently returns None on any failure —
        # consider narrowing to ValueError (months.index) / IndexError.
        pass


if __name__ == '__main__':
    # Demo runs: fetch each article and dump the dates found in it.
    html = get_html('https://en.wikipedia.org/wiki/Rafael_Nadal')
    find_dates(html, output='example_rafael_nadal.txt')
    html = get_html('https://en.wikipedia.org/wiki/Linus_Pauling')
    find_dates(html, output='example_linus_pauling.txt')
    html = get_html("https://en.wikipedia.org/wiki/J._K._Rowling")
    find_dates(html, output='example_just_kidding_rowling.txt')
    html = get_html("https://en.wikipedia.org/wiki/Richard_Feynman")
    find_dates(html, output='example_richard_feynman.txt')
    html = get_html("https://en.wikipedia.org/wiki/Hans_Rosling")
    find_dates(html, output='example_hans_rosling.txt')
    # Tail of a function defined above this chunk: hand back the urls found.
    return matches


if __name__ == "__main__":
    # test cases
    urls = [
        "https://en.wikipedia.org/wiki/Nobel_Prize",
        "https://en.wikipedia.org/wiki/Bundesliga",
        "https://en.wikipedia.org/wiki/2019–20_FIS_Alpine_Ski"
    ]
    outputs = ["Nobel_Prize", "Bundesliga", "2019–20_FIS_Alpine_Ski"]
    path_all = "filter_urls/all_urls_"
    path_article = "filter_urls/articles_"
    base_url = "https://en.wikipedia.org"
    # For each test page, dump both every url and only the article urls
    for url, output in zip(urls, outputs):
        r = get_html(url)
        m_all = find_urls(r.text, base_url=base_url, output=path_all + output)
        m_articles = find_articles(url, output=path_article + output)
    # Extra standalone case
    r = get_html("https://en.wikipedia.org/wiki/Studio_Ghibli")
    m = find_urls(r.text,
                  base_url="https://en.wikipedia.org",
                  output="filter_urls/all_urls_Studio_Ghibli")
    m = find_articles("https://en.wikipedia.org/wiki/Studio_Ghibli",
                      output="filter_urls/articles_Studio_Ghibli")
# Use the hashtag encapsulation to find only the previously formatted dates goal_format = "#([0-9]{4}/[0-9]{2}(?:/[0-9]{2})?)#" all_dates = re.findall(goal_format, html) all_dates.sort() # Write to file if argument was provided if output: write_to_file(output, all_dates) return all_dates def write_to_file(output, dates): """ Writes dates to file Args: output (string): filename to write to dates (list): list of dates to be written to file """ f = open(output, "w") for d in dates: f.write(f"{d}\n") f.close() if __name__ == "__main__": find_dates(get_html("https://en.wikipedia.org/wiki/Linus_Pauling"), "filter_dates_regex/Linus_Pauling.txt") find_dates(get_html("https://en.wikipedia.org/wiki/Rafael_Nadal"), "filter_dates_regex/Rafael_Nadal.txt") find_dates(get_html("https://en.wikipedia.org/wiki/J._K._Rowling"), "filter_dates_regex/J._K_Rowling.txt") find_dates(get_html("https://en.wikipedia.org/wiki/Richard_Feynman"), "filter_dates_regex/Richard_Feynman.txt") find_dates(get_html("https://en.wikipedia.org/wiki/Hans_Rosling"), "filter_dates_regex/Hans_Rosling.txt")