Example #1
from bs4 import BeautifulSoup
from mediawikiapi import MediaWikiAPI

def headings(page):
    """Return each h3 heading with any trailing " (...)" suffix removed."""
    mediawikiapi = MediaWikiAPI()
    page = mediawikiapi.page(page)
    soup = BeautifulSoup(page.html(), 'html.parser')
    data = []
    for headline in soup.find_all("h3"):
        text = headline.text.strip()
        # str.find returns -1 when " (" is absent, which would silently
        # drop the last character, so only slice on a real match.
        cut = text.find(' (')
        data.append(text[:cut] if cut != -1 else text)
    return data
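
A quick usage sketch; the title below is the page exercised in Example #3:

sections = headings('List of video games notable for negative reception')
print(sections[:5])  # first few section headings, parentheticals stripped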
Example #2
from bs4 import BeautifulSoup
from mediawikiapi import MediaWikiAPI

def wikitable(page):
    """
    Exports a Wikipedia table parsed by BeautifulSoup as a list of rows.
    Deals with spanning: rowspan and colspan cells are expanded so the
    result is a plain rectangular grid.
    """
    mediawikiapi = MediaWikiAPI()
    page = mediawikiapi.page(page)
    soup = BeautifulSoup(page.html(), 'html.parser')
    # Select the first table with class "wikitable" on the page.
    table = soup.find("table", {"class": "wikitable"})
    rows = table.find_all("tr")
    nrows = len(rows)
    ncols = max(len(r.find_all(['th', 'td'])) for r in rows)

    # Preallocate the full grid up front: a row span has to write into
    # rows below the one currently being parsed.
    data = [['' for _ in range(ncols)] for _ in range(nrows)]

    # fill the table with data:
    # move across cells and use span to fill extra cells
    for i, row in enumerate(rows):
        cells = row.find_all(["td", "th"])
        for j, cell in enumerate(cells):
            cspan = int(cell.get('colspan', 1))
            rspan = int(cell.get('rowspan', 1))
            shift = 0
            for k in range(rspan):
                # Shift to the first empty cell of this row to avoid
                # overwriting previously inserted content
                while data[i + k][j + shift]:
                    shift += 1
                for m in range(cspan):
                    data[i + k][j + shift + m] += cell.text.strip("\n")
    return data
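
A minimal usage sketch, assuming the first wikitable on the page is the one
wanted and that its first row is the header; the pandas wrapping is
illustrative, not part of the snippet:

import pandas as pd

grid = wikitable('List of video games notable for negative reception')
df = pd.DataFrame(grid[1:], columns=grid[0])  # assumes row 0 is the header
print(df.head())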
Example #3
            cspan = int(cell.get('colspan', 1))
            rspan = int(cell.get('rowspan', 1))
            shift = 0
            for k in range(rspan):
                # Shifts to the first empty cell of this row to avoid
                # overwriting previously inserted content
                while data[i + k][j + shift]:
                    shift += 1
                for m in range(cspan):
                    data[i + k][j + shift + m] += cell.text.strip("\n")

    return data


mediawikiapi = MediaWikiAPI()
test_page = mediawikiapi.page('List of video games notable for negative reception')
# to check page URL:
print(test_page.url)
soup = BeautifulSoup(test_page.html(), 'html.parser')
# tables = soup.findAll("table", { "class" : "wikitable" })
# headings = soup.findAll('h3')
# df_test = wikitable_to_dataframe(tables[0])
# print(df_test)

for headline in soup.find_all("h3"):
    text = headline.text.strip()
    # Only slice when " (" is actually present (find returns -1 otherwise).
    cut = text.find(' (')
    print(text[:cut] if cut != -1 else text)


def headings(page):
    mediawikiapi = MediaWikiAPI()
    page = mediawikiapi.page(page)
    soup = BeautifulSoup(page.html(), 'html.parser')
    data = []
    for headline in soup.find_all("h3"):
        text = headline.text.strip()
        cut = text.find(' (')
        data.append(text[:cut] if cut != -1 else text)
    return data
Example #4
import re
import unicodedata

import pandas as pd
from mediawikiapi import MediaWikiAPI

# print(names)

mode = "wiki"  # assumed flag: any other value reads names from in.txt

toStore = [[] for _ in range(26)]  # one bucket per letter a-z


def remove_accents(input_str):
    """Return input_str with combining accent marks removed (NFKD)."""
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    return u"".join(c for c in nfkd_form if not unicodedata.combining(c))


if mode == "wiki":
    mediawikiapi = MediaWikiAPI()

    countries = mediawikiapi.page("List_of_national_capitals")

    table = pd.read_html(countries.url, attrs={"class": "wikitable"})[0]
    names = table["City/Town"]

else:
    with open("in.txt", "r", encoding="utf8") as f:
        names = f.readlines()

for name in names:

    name = re.sub(r'\d+', '', name).lower().strip()
    second = remove_accents(name)
    if not second:
        continue  # skip blank lines

    # Index by the accent-stripped first letter so accented names still
    # land in an a-z bucket.
    toStore["abcdefghijklmnopqrstuvwxyz".index(second[0])].append(name)
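
To sanity-check the buckets, a short sketch that prints per-letter counts
(variable names as above):

for letter, bucket in zip("abcdefghijklmnopqrstuvwxyz", toStore):
    if bucket:
        print(f"{letter}: {len(bucket)} names, e.g. {bucket[0]}")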