def _page_img_links(page_title, min_size, max_size):
    """Return all image links from a Wikipedia page.

    Args:
        page_title: A string containing the page title.
        min_size: An integer giving the minimum size (the larger of an
            image's height and width) that an image must have in order to
            be returned.
        max_size: An integer giving the maximum size allowed for a
            returned image.

    Returns:
        A tuple containing a list of image links and a list of the
        corresponding image sizes (the larger of height and width).
    """
    import xml.etree.ElementTree as ET
    import wiki

    data = wiki.get_wiki_json(page_title)
    tree = ET.fromstring(data['text']['*'])
    req_keys = set(['alt', 'src', 'width', 'height'])

    img_links = []
    sizes = []
    for img in tree.findall(".//img"):
        attributes = set(img.attrib.keys())
        if len(attributes.intersection(req_keys)) == 4:
            width = int(img.attrib['width'])
            height = int(img.attrib['height'])
            size = max(height, width)
            if min_size <= size <= max_size:
                # keep only JPEG and PNG images; 'src' starts with '//'
                if img.attrib['src'][-3:] in ['jpg', 'png']:
                    img_links.append("https://" + img.attrib['src'][2:])
                    sizes.append(size)

    return img_links, sizes
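
# Hedged usage sketch for _page_img_links. The page title is a hypothetical
# example; it assumes the `wiki` module imported above can fetch (or has
# cached) the parsed page JSON.
def _example_page_img_links():
    links, sizes = _page_img_links("Impressionism", min_size=200,
                                   max_size=2000)
    for url, size in zip(links, sizes):
        print("{0:d}px  {1:s}".format(size, url))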
def _wiki_page_revisions(page_title):
    """Return the revision history of a Wikipedia page.

    Args:
        page_title: A string containing the page title.

    Returns:
        A list of revision dictionaries as returned by the MediaWiki API.
    """
    import requests
    import wiki

    base_api_url = 'https://' + 'en' + '.wikipedia.org/w/api.php?'

    page_json = wiki.get_wiki_json(page_title)
    pageid = page_json['pageid']
    revid = page_json['revid']

    api_query = base_api_url + \
        "action=query&" + \
        "format=json&" + \
        "prop=revisions&" + \
        "rvprop=ids|size|timestamp|comment|user&" + \
        "rvlimit=max&" + \
        "pageids={0:d}&".format(pageid) + \
        "rvstartid={0:d}&".format(revid)

    req = requests.get(api_query)
    page_data = req.json()
    rev_data = page_data['query']['pages'][str(pageid)]['revisions']

    # the API returns at most 'rvlimit' revisions per request; follow the
    # 'continue' token until the full history has been downloaded
    while 'continue' in page_data:
        rvcontinue = page_data['continue']['rvcontinue']
        api_query_continue = api_query + \
            "rvcontinue={0:s}&".format(rvcontinue)

        req = requests.get(api_query_continue)
        page_data = req.json()
        rev_data += page_data['query']['pages'][str(pageid)]['revisions']

        msg = "Loaded {0:d} revisions, through {1:s}"
        print(msg.format(len(rev_data), rev_data[-1]['timestamp']))

    return rev_data
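
# Hedged usage sketch for _wiki_page_revisions. The page title is a
# hypothetical example; the call issues live requests against the English
# Wikipedia API, so the number of revisions returned will vary over time.
def _example_wiki_page_revisions():
    revisions = _wiki_page_revisions("Impressionism")
    # each revision dictionary carries the rvprop fields requested above
    for rev in revisions[:5]:
        print(rev['timestamp'], rev['user'], rev['size'])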
def link_to_geo(link):
    """Extract geolocation information from a Wikipedia page.

    Args:
        link: A string giving the link to a Wikipedia page.

    Returns:
        Either None, if no coordinates are found, or a tuple giving the
        (lat, lon) from the page's metadata.
    """
    data = wiki.get_wiki_json(link)
    tree = ET.fromstring(data['text']['*'])

    geo = tree.find(".//span[@class='geo']")
    if geo is None:
        return None

    result = re.split(';', geo.text)
    if len(result) != 2:
        return None

    try:
        lat = float(result[0])
        lon = float(result[1])
    except ValueError:
        return None

    return lat, lon
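
# Hedged usage sketch for link_to_geo. The page name is a hypothetical
# example; pages without a 'geo' span simply return None.
def _example_link_to_geo():
    coords = link_to_geo("Richmond,_Virginia")
    if coords is not None:
        lat, lon = coords
        print("lat={0:f}, lon={1:f}".format(lat, lon))
    else:
        print("no coordinates found on the page")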
def link_to_section(link):
    """Extract the text of a Wikipedia page, grouped by section heading.

    Args:
        link: A string giving the link to a Wikipedia page.

    Returns:
        A list of dictionaries, one per heading, with the keys 'heading'
        and 'text'.
    """
    data = wiki.get_wiki_json(link)
    tree = ET.fromstring(data['text']['*'])

    output = []
    temp = []
    heading = "Header"
    for child in tree:
        if child.tag == "p":
            temp.append("".join(list(child.itertext())))
        elif child.tag == "h2":
            # a new section starts; store the text collected so far
            if temp:
                output.append(
                    dict(heading=heading, text=clean_text("".join(temp))))
            temp = []
            heading = child.find('.//span[@class="mw-headline"]')
            if heading is not None:
                heading = heading.text
            else:
                heading = ""

    # store the text of the final section
    if temp:
        output.append(dict(heading=heading, text=clean_text("".join(temp))))

    return output
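
# Hedged usage sketch for link_to_section. The page name is a hypothetical
# example; paragraphs before the first <h2> tag are grouped under the
# default "Header" heading.
def _example_link_to_section():
    sections = link_to_section("Impressionism")
    for sec in sections:
        print(sec['heading'], len(sec['text']))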
def _compute_meta_dataframe(links):
    """Convert a list of Wikipedia links to a pandas DataFrame of metadata.
    """
    import pandas as pd
    from wiki import get_wiki_json

    meta = dict(link=[], title=[], doc=[], num_sections=[], num_images=[],
                num_ilinks=[], num_elinks=[], num_langs=[], langs=[],
                ilinks=[], lat=[], lon=[])

    for link in links:
        data = get_wiki_json(link)
        tree = ET.fromstring(data['text']['*'])

        meta['link'].append(re.sub(' ', '_', data['title']))
        meta['title'].append(re.sub('<[^>]+>', '', data['displaytitle']))
        meta['doc'].append(_tree_to_doc(tree))
        meta['num_sections'].append(len(data['sections']))
        meta['num_images'].append(len(data['images']))
        meta['num_ilinks'].append(len(data['links']))
        meta['num_elinks'].append(len(data['externallinks']))
        meta['num_langs'].append(len(data['langlinks']))
        meta['langs'].append([x['lang'] for x in data['langlinks']])
        meta['ilinks'].append([re.sub(' ', '_', x['*'])
                               for x in data['links'] if x['ns'] == 0])

        # append per-page coordinates rather than overwriting a scalar
        lat, lon = _tree_to_geo(tree)
        meta['lat'].append(lat)
        meta['lon'].append(lon)

    pdf = pd.DataFrame(meta).drop_duplicates(subset='link', keep="first")
    return pdf.reset_index()
def _compute_meta_dataframe(links):
    """Convert a list of Wikipedia links to a pandas DataFrame of metadata.
    """
    import pandas as pd
    from wiki import get_wiki_json

    meta = dict(link=[], title=[], doc=[], first_p=[], num_sections=[],
                num_images=[], num_ilinks=[], num_elinks=[], num_langs=[],
                langs=[], ilinks=[], first_img=[], lat=[], lon=[])

    for link in links:
        data = get_wiki_json(link)
        tree = ET.fromstring(data['text']['*'])

        meta['link'].append(re.sub(' ', '_', data['title']))
        meta['title'].append(re.sub('<[^>]+>', '', data['displaytitle']))

        next_doc, next_first_p = tree_to_doc(tree)
        meta['doc'].append(next_doc)
        meta['first_p'].append(next_first_p)

        meta['num_sections'].append(len(data['sections']))
        meta['num_images'].append(len(data['images']))
        meta['num_ilinks'].append(len(data['links']))
        meta['num_elinks'].append(len(data['externallinks']))
        meta['num_langs'].append(len(data['langlinks']))
        meta['langs'].append([(x['lang'], x['url'])
                              for x in data['langlinks']])
        meta['ilinks'].append(
            [re.sub(' ', '_', x['*']) for x in data['links']
             if x['ns'] == 0])

        # append per-page coordinates rather than overwriting a scalar
        lat, lon = _tree_to_geo(tree)
        meta['lat'].append(lat)
        meta['lon'].append(lon)

        # add the first sufficiently large image to the dataset
        first_img = ''
        for item in tree.findall('.//img'):
            if int(item.attrib.get('width', 0)) >= 150:
                first_img = 'https:' + item.attrib['src']
                break
        meta['first_img'].append(first_img)

    meta['eigen'] = _compute_centrality(links, meta)

    pdf = pd.DataFrame(meta)
    return pdf.reset_index()
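
# Hedged usage sketch for _compute_meta_dataframe. The page names are
# hypothetical examples; the call assumes the helper functions referenced
# above (tree_to_doc, _tree_to_geo, _compute_centrality) are defined in this
# module and that the pages can be fetched or read from the local cache.
def _example_compute_meta_dataframe():
    pdf = _compute_meta_dataframe(["Impressionism", "Claude_Monet"])
    print(pdf[['link', 'title', 'num_ilinks', 'num_langs', 'first_img']])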
def get_data(labels=None):
    """Download the labelled pages and cache them as JSON files."""
    init_dir(train_dir)
    init_dir(test_dir)

    if labels is None:
        with (data_dir / db_filename).open() as f:
            labels = json.load(f)

    data = []
    for key, value in _zip(TRAIN, "train") + _zip(TEST, "test"):
        output = get_wiki_json(WIKI_MAP[key])
        output["y_true"] = labels[key]
        output["type"] = value
        output["title"] = key
        data.append(output)

        # write one JSON file per page into the train/ or test/ directory
        with (data_dir / value / f"{key}.json").open("w+") as f:
            json.dump(output, f, indent=4)

    with (data_dir / 'train_test.json').open("w+") as f:
        json.dump(data, f)

    return data
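
# Hedged usage sketch for get_data. It assumes the module-level constants
# (TRAIN, TEST, WIKI_MAP, data_dir, db_filename) and helpers (init_dir, _zip,
# get_wiki_json) referenced above are defined elsewhere in this module.
def _example_get_data():
    data = get_data()
    print("{0:d} labelled pages written under {1}".format(len(data), data_dir))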
def link_to_p(link):
    """Return each paragraph of a Wikipedia page as a string.

    Args:
        link: A string giving the link to a Wikipedia page.

    Returns:
        A list of non-empty strings, one per paragraph.
    """
    data = wiki.get_wiki_json(link)
    tree = ET.fromstring(data['text']['*'])

    output = []
    for child in tree.findall('.//p'):
        text = "".join(list(child.itertext()))
        text = clean_text(text)
        if text:
            output.append(text)

    return output
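
# Hedged usage sketch for link_to_p. The page name is a hypothetical example.
def _example_link_to_p():
    paragraphs = link_to_p("Impressionism")
    print("{0:d} non-empty paragraphs".format(len(paragraphs)))
    if paragraphs:
        print(paragraphs[0][:80])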
def link_to_lilinks(link):
    """Extract internal Wikipedia links from list items.

    Args:
        link: A string giving the link to a Wikipedia page.

    Returns:
        A sorted list of unique internal links.
    """
    data = wiki.get_wiki_json(link)
    tree = ET.fromstring(data['text']['*'])

    output = []
    for ilink in tree.findall(".//li/a"):
        if 'href' in ilink.attrib:
            href = ilink.attrib['href']
            if href[:6] == "/wiki/":
                output.append(href[6:])

    # keep only links that the API also reports for the page
    ilinks = [re.sub(' ', '_', x) for x in wiki.links_as_list(data)]
    output = list(set(output).intersection(ilinks))

    return sorted(output)
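
# Hedged usage sketch for link_to_lilinks. The page name is a hypothetical
# example; only links that also appear in the page's API link list survive
# the intersection.
def _example_link_to_lilinks():
    lilinks = link_to_lilinks("List_of_American_novelists")
    print("{0:d} internal links found in list items".format(len(lilinks)))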
def get_internal_links(data):
    """Extract internal Wikipedia links.

    Args:
        data: Either a string describing the name of a Wikipedia page or a
            dictionary object already pulled with `wiki.get_wiki_json`.

    Returns:
        A dictionary with three elements: 'ilinks' (all of the internal
        links for the page), 'ilinks_p' (links from the page found inside
        paragraph tags), and 'ilinks_li' (links found inside list items).
        All links are checked to make sure they actually exist.
    """
    from wiki import get_wiki_json

    if isinstance(data, str):
        data = get_wiki_json(data)

    ilinks = [x['*'] for x in data['links'] if x['ns'] == 0 and 'exists' in x]
    ilinks = [re.sub(' ', '_', x) for x in ilinks]

    tree = ET.fromstring(data['text']['*'])

    output_p = []
    for child in tree:
        if child.tag == "p":
            for ilink in child.findall(".//a"):
                # anchors without an href attribute are skipped
                href = ilink.attrib.get('href', '')
                if href[:6] == "/wiki/":
                    output_p.append(href[6:])

    output_li = []
    for ilink in tree.findall(".//li/a"):
        if 'href' in ilink.attrib:
            href = ilink.attrib['href']
            if href[:6] == "/wiki/":
                output_li.append(href[6:])

    output_p = sorted(list(set(output_p).intersection(ilinks)))
    output_li = sorted(list(set(output_li).intersection(ilinks)))

    return dict(ilinks=ilinks, ilinks_p=output_p, ilinks_li=output_li)
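
# Hedged usage sketch for get_internal_links. The page name is a hypothetical
# example; a dictionary already returned by wiki.get_wiki_json can be passed
# instead of the string.
def _example_get_internal_links():
    links = get_internal_links("Impressionism")
    print(len(links['ilinks']), len(links['ilinks_p']), len(links['ilinks_li']))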
create_zip_file(rr_links, 'birthday-cake')

# richmond, va
data_json = wiki.download_wiki_json("Richmond,_Virginia")
rr_links = wiki.links_as_list(data_json)
create_zip_file(rr_links, 'richmond-va')

# philosophy
links_us = wikitext.get_internal_links(
    'List_of_important_publications_in_philosophy')
create_zip_file(links_us['ilinks'], 'philosophy')

# impressionists
page_links = wikitext.get_internal_links("Impressionism")['ilinks'] + [
    "Impressionism"
]
create_zip_file(page_links, 'impressionists-text')

# novelists and poets
data = wiki.get_wiki_json("List_of_American_novelists")
data_html = data['text']['*']
authors = re.findall('<li><a href="/wiki/([^"]+)"', data_html)
nov_authors = authors[:(authors.index('Leane_Zugsmith') + 1)]

data = wiki.get_wiki_json("List_of_poets_from_the_United_States")
data_html = data['text']['*']
authors = re.findall('<li><a href="/wiki/([^"]+)"', data_html)
poe_authors = authors[:(authors.index('Louis_Zukofsky') + 1)]

create_zip_file(nov_authors + poe_authors, "novel-poem")