from mediawikiapi import MediaWikiAPI
from bs4 import BeautifulSoup


def headings(page):
    """Return the h3 headings of a Wikipedia page, trimmed of any trailing ' (...)' suffix."""
    mediawikiapi = MediaWikiAPI()
    page = mediawikiapi.page(page)
    soup = BeautifulSoup(page.html(), 'html.parser')
    data = []
    for headline in soup.find_all("h3"):
        text = headline.text.strip()
        # find() returns -1 when ' (' is absent, which would cut the last
        # character; only slice when a match was actually found
        cut = text.find(' (')
        data.append(text[:cut] if cut != -1 else text)
    return data
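# A quick usage sketch for headings() (assumes network access; the page
# title below is just an illustrative example, any page with h3 section
# headings works):
# print(headings('List of video games notable for negative reception'))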
def wikitable(page):
    """
    Parses the first wikitable of a Wikipedia page with BeautifulSoup
    into a list of rows. Deals with spanning: multirow and multicolumn
    cells should format as expected.
    """
    mediawikiapi = MediaWikiAPI()
    page = mediawikiapi.page(page)
    soup = BeautifulSoup(page.html(), 'html.parser')
    table = soup.find("table", {"class": "wikitable"})
    rows = table.find_all("tr")
    nrows = len(rows)
    # Sum colspans so rows whose cells span several columns still count fully
    ncols = max(sum(int(c.get('colspan', 1)) for c in row.find_all(['th', 'td']))
                for row in rows)

    # Preallocate the table structure
    # (this is required because we need to move forward in the table
    # structure once we've found a row span)
    data = [['' for _ in range(ncols)] for _ in range(nrows)]

    # Fill the table with data:
    # move across cells and use spans to fill extra cells
    for i, row in enumerate(rows):
        cells = row.find_all(["td", "th"])
        for j, cell in enumerate(cells):
            cspan = int(cell.get('colspan', 1))
            rspan = int(cell.get('rowspan', 1))
            l = 0
            for k in range(rspan):
                # Shift to the first empty cell of this row to
                # avoid replacing previously inserted content
                while data[i + k][j + l]:
                    l += 1
                for m in range(cspan):
                    data[i + k][j + l + m] += cell.text.strip("\n")
    return data
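# Example follow-up (a sketch, assumes pandas is installed): because
# wikitable() pads every row to the same width, the result can be fed
# straight into a DataFrame, using the first row as the header:
# import pandas as pd
# rows = wikitable('List of video games notable for negative reception')
# df = pd.DataFrame(rows[1:], columns=rows[0])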
# Quick demo on a page that has both wikitables and h3 headings
mediawikiapi = MediaWikiAPI()
test_page = mediawikiapi.page('List of video games notable for negative reception')

# To check the page URL:
print(test_page.url)

soup = BeautifulSoup(test_page.html(), 'html.parser')
# tables = soup.findAll("table", {"class": "wikitable"})
# headings = soup.findAll('h3')
# df_test = wikitable_to_dataframe(tables[0])
# print(df_test)

for headline in soup.find_all("h3"):
    text = headline.text.strip()
    cut = text.find(' (')
    print(text[:cut] if cut != -1 else text)
import re
import unicodedata

import pandas as pd
from mediawikiapi import MediaWikiAPI

# print(names)

# One bucket per letter of the alphabet
toStore = [[] for _ in range(26)]


def remove_accents(input_str):
    """Strip combining accent marks so names can be bucketed by ASCII letter."""
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    return u"".join(c for c in nfkd_form if not unicodedata.combining(c))


mode = "wiki"  # set to anything else to read names from in.txt instead

if mode == "wiki":
    mediawikiapi = MediaWikiAPI()
    countries = mediawikiapi.page("List_of_national_capitals")
    table = pd.read_html(countries.url, attrs={"class": "wikitable"})[0]
    names = table["City/Town"]
else:
    with open("in.txt", "r", encoding="utf8") as f:
        names = f.readlines()

for name in names:
    name = re.sub(r'\d*', '', name).lower().strip()
    if not name:
        continue
    # Bucket by the accent-stripped first letter so that, e.g., 'É' lands
    # under 'e' instead of raising a ValueError on the alphabet lookup
    second = remove_accents(name)
    toStore["abcdefghijklmnopqrstuvwxyz".index(second[0])].append(name)
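# A small sketch to inspect the result (assumes the loop above has run):
# print each letter alongside the names filed under it, skipping empty buckets.
for letter, bucket in zip("abcdefghijklmnopqrstuvwxyz", toStore):
    if bucket:
        print(letter, bucket)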