import json
import time

from tqdm import tqdm
from wikitables import import_tables


def merge_task(task_list, args, pid):
    # `parse` is a text-normalisation helper defined elsewhere in the original script
    context = []
    outputname = 'LIST-{}'.format(pid)
    for wt in tqdm(task_list, desc='pages-{}'.format(pid), mininterval=30):
        try:
            tables = import_tables(wt)
        except Exception:
            continue
        if len(tables) > 1:
            # wt[8:] presumably strips the leading "List of " prefix (8 characters)
            title = parse(wt[8:].lower())
            if title != '':
                ent_list = []
                for row in tables[0].rows:
                    key = list(row.keys())[0]
                    if key == 'Rank':
                        # skip a leading rank column and use the next one
                        key = list(row.keys())[1]
                    ent = parse(str(row[key]).lower())
                    if ent != '':
                        ent_list.append(ent)
                if len(ent_list) >= 5:
                    context.append(json.dumps({'title': title, 'ents': ent_list}))
        time.sleep(3)
    if context:
        with open('{}/{}'.format(args.output_dir, outputname), 'w+') as f:
            f.write('\n'.join(context))
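# A minimal driver sketch for merge_task above. It assumes the original
# script's `parse` text-cleaning helper is in scope; the --output_dir flag,
# the worker count, and the example "List of ..." titles below are all
# illustrative rather than taken from the source.
if __name__ == '__main__':
    import argparse
    import multiprocessing
    import os

    cli = argparse.ArgumentParser()
    cli.add_argument('--output_dir', default='list_output')  # hypothetical flag
    cli_args = cli.parse_args()
    os.makedirs(cli_args.output_dir, exist_ok=True)

    # example page titles to split across workers
    titles = ['List of sovereign states', 'List of largest cities']

    n_workers = 2
    chunks = [titles[i::n_workers] for i in range(n_workers)]
    workers = [
        multiprocessing.Process(target=merge_task, args=(chunk, cli_args, pid))
        for pid, chunk in enumerate(chunks)
    ]
    for w in workers:
        w.start()
    for w in workers:
        w.join()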
import wikitables
import pandas as pd


def get_highest_grossing_franchises():
    tbs = wikitables.import_tables('List of best-selling video game franchises')
    # the article groups franchises into one table per sales bracket
    categories = {
        0: '100 millions',
        1: '50 millions',
        2: '20 millions',
        3: '10 millions',
        4: '5 millions',
    }
    games = []
    grossing_category = []
    for i, tb in enumerate(tbs):
        fn = [row.get('Franchise name') for row in tb.rows][::2]
        games.extend(fn)
        grossing_category.extend([categories.get(i)] * len(fn))
    df = pd.DataFrame.from_dict({
        'franchise': games,
        'copies_sold': grossing_category,
    })
    return df
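# Hedged usage sketch for get_highest_grossing_franchises above, relying on
# the wikitables/pandas imports already in scope. Wikipedia reshuffles this
# article from time to time, so missing 'Franchise name' cells are dropped
# defensively before printing.
if __name__ == '__main__':
    franchises = get_highest_grossing_franchises()
    franchises = franchises.dropna(subset=['franchise'])
    print(franchises.head(10).to_string(index=False))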
def main():
    parser = ArgumentParser(description='wikitables v%s' % version)
    parser.add_argument('-l', '--lang', dest='lang',
                        help='article language (default: %(default)s)',
                        default='en')
    parser.add_argument('-p', '--pretty', action='store_true',
                        help='pretty-print json output')
    parser.add_argument('-d', '--debug', action='store_true',
                        help='enable debug output')
    parser.add_argument('article', help='article title')
    args = parser.parse_args()

    if not args.article:
        print('usage: wikitables <article title>')
        sys.exit(1)

    if args.debug:
        logging.basicConfig(level=logging.DEBUG)
        log.debug('debug logging enabled')
    else:
        logging.basicConfig(level=logging.WARN)

    from wikitables import import_tables
    from wikitables.util import jprint

    tables = import_tables(args.article, lang=args.lang)
    d = {t.name: t.rows for t in tables}
    if args.pretty:
        jprint(d)
    else:
        print(json.dumps(d, cls=TableJSONEncoder))
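# Entry-point sketch so the CLI above can also be run as a plain script,
# e.g. `python cli.py -p 'List of S&P 500 companies'`. The file name is
# illustrative; in the wikitables package this main() appears to be exposed
# through a console-script entry point instead.
if __name__ == '__main__':
    main()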
from wikitables import import_tables
import pandas as pd


def get_djia_tickers():
    tables = import_tables("Dow Jones Industrial Average")
    data = pd.DataFrame(tables[0].rows)
    data = data.astype(str)
    # strip stray whitespace from the string columns
    str_cols = data.select_dtypes(['object'])
    data[str_cols.columns] = str_cols.apply(lambda x: x.str.strip())
    return data.reset_index()[['Symbol', 'Industry']]
def get_sp500_tickers():
    """Get ticker symbols and sectors for the S&P 500 constituents.

    Returns:
        pd.DataFrame: one row per company with the 'Ticker symbol' and
        'GICS Sector' columns from the Wikipedia constituents table
    """
    tables = import_tables("List of S&P 500 companies")
    data = pd.DataFrame(tables[0].rows)
    data = data.astype(str)
    # strip stray whitespace from the string columns
    str_cols = data.select_dtypes(['object'])
    data[str_cols.columns] = str_cols.apply(lambda x: x.str.strip())
    return data.reset_index()[['Ticker symbol', 'GICS Sector']]
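# Hedged usage sketch for the two ticker helpers above (it assumes
# `from wikitables import import_tables` and `import pandas as pd` are in
# scope). Wikipedia occasionally renames these columns, e.g. 'Ticker symbol'
# vs 'Symbol', so a KeyError here usually means the article layout changed.
if __name__ == '__main__':
    djia = get_djia_tickers()
    sp500 = get_sp500_tickers()
    print('DJIA constituents:', len(djia))
    print('S&P 500 constituents:', len(sp500))
    # tickers that appear in both indices
    overlap = set(djia['Symbol']) & set(sp500['Ticker symbol'])
    print('DJIA tickers also in the S&P 500:', sorted(overlap))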
def find_event_links():
    """Find URL links to all UFC events."""
    events = import_tables("List of UFC events")[1]
    event_links = []
    for event in events.rows:
        name = str(event["Event"])
        # Wikipedia markup links are either [[link]] or [[link|text]]
        link = re.findall(r"\[\[(.*?)\|", name)
        if link:
            event_links.append(link[0])
            continue
        link = re.findall(r"\[\[(.*?)\]\]", name)
        if link:
            event_links.append(link[0])
            continue
        event_links.append(name)
    return event_links
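# Hedged follow-up sketch: resolve a few of the links returned by
# find_event_links above into their own tables via the same import_tables
# call. The [:5] slice is only there to keep the example quick; event pages
# without parseable tables are skipped.
def fetch_event_tables(limit=5):
    event_tables = {}
    for title in find_event_links()[:limit]:
        tables = import_tables(title)
        if tables:
            event_tables[title] = tables[0].rows
    return event_tables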
from wikitables import import_tables
import json

tables = import_tables('List_of_people_who_died_climbing_Mount_Everest', 'en')
t = tables[0].json()

with open('everest-deaths.txt', 'w') as outfile:
    json.dump(t, outfile)
import wikipedia
from wikitables import import_tables

tables = import_tables('List of cities in Italy')  # returns a list of WikiTable objects

# def wiki_search(query):
#     r = requests.get(API_URL, params=params, headers=headers)
#     raw_results = _wiki_request(search_params)


def wiki_lookup(answer):
    try:
        ny = wikipedia.page(answer)
    except Exception:
        return ""
    return ny

# question = "Which of these is a common material used in 3D printers"
# wiki_lookup("3D printers")

# def get_wikis(answers):
#     answer_wikis = {}
#     for answer in answers:
#         answer_wiki = wiki_lookup(answer)
#         answer_wikis[answer] = answer_wiki
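# Hedged usage sketch for wiki_lookup above; wikipedia.page() returns a
# WikipediaPage, so .title and .summary are available. The query string is
# just an example.
if __name__ == '__main__':
    page = wiki_lookup('3D printing')
    if page:
        print(page.title)
        print(page.summary[:200])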
from wikitables import import_tables
import wptools
import wikipedia
import json
import requests

allTeamsTable = import_tables('List_of_Formula_One_constructors')

teams = []
for row in allTeamsTable[0].rows:
    teams.append({
        "name": '{Constructor}'.format(**row),
        "engine": '{Engine}'.format(**row),
        "from": '{Based in}'.format(**row),
        "seasons": '{Seasons}'.format(**row),
        "race_entries": '{Races Entered}'.format(**row),
        "race_start": '{Races Started}'.format(**row),
        "drivers": '{Drivers}'.format(**row),
        "total_entries": '{Total Entries}'.format(**row),
        "wins": '{Wins}'.format(**row),
        "points": '{Points}'.format(**row),
        "poles": '{Poles}'.format(**row),
        "fastest_laps": '{FL}'.format(**row),
        "wcc": '{WCC}'.format(**row),
        "wdc": '{WDC}'.format(**row),
        "related_teams": '{Antecedent teams}'.format(**row),
        "active": True,
    })

for row in allTeamsTable[1].rows:
    teams.append({
def main(in_directory):
    # google maps api key
    gmaps_api = 'AIzaSyAm4vrzQVlfDxV2L13PuWgHEiwVaS8IPHc'

    # load osm data
    osmpath = os.path.join(in_directory, 'amenities-vancouver.json')
    osm_data = pd.read_json(osmpath, lines=True)

    # load wikidata
    # wikiclient = Client()
    # wikient = wikiclient.get('Q3472954', load=True)

    # load wikipedia data
    wiki = wk.Wikipedia('en')
    can_chains = wiki.page('List of Canadian restaurant chains')
    can_chains = pd.DataFrame(
        {'restaurant_name': can_chains.sections[0].sections})
    can_chains = can_chains.applymap(lambda x: x.title)

    wiki_table = import_tables('List of restaurant chains in the United States')
    us_chains = pd.DataFrame(wiki_table[0].rows)
    # necessary loop due to wikitable library
    for table in wiki_table[1:29]:
        us_chains = us_chains.append(pd.DataFrame(table.rows), sort=False)
    us_chains = us_chains.reset_index()

    # gather restaurant and fast food osm data
    osm_rest = osm_data[osm_data['amenity'].isin(['restaurant', 'fast_food'])]
    # drop any restaurants without names
    osm_rest = osm_rest.dropna(subset=['name'])

    # detect chains utilizing multiple methods
    # first method: find restaurants that occur multiple times in osm data
    osm_chains = osm_rest[osm_rest.duplicated(['name'])]
    osm_indep = osm_rest[~osm_rest.duplicated(['name'])]

    # second method: include restaurants in the lists of US and CAN chains
    osm_chains_can = osm_rest[osm_rest['name'].isin(
        can_chains['restaurant_name'])]
    osm_chains_us = osm_rest[osm_rest['name'].isin(us_chains['Name'])]

    # combine lists
    osm_chains_can = pd.concat(
        [osm_chains_can, osm_chains_us],
        sort=True)  # .drop_duplicates().reset_index(drop=True)
    osm_chains_can = osm_chains_can.drop_duplicates(
        subset=['lat', 'lon']).reset_index()

    # combine methods
    osm_chains = pd.concat(
        [osm_chains, osm_chains_can],
        sort=True)  # .drop_duplicates().reset_index(drop=True)
    osm_chains = osm_chains.drop_duplicates(
        subset=['lat', 'lon']).reset_index()

    # segment osm data into a grid
    # find bounds of grid
    ymin = osm_rest['lat'].min()
    ymax = osm_rest['lat'].max()
    xmin = osm_rest['lon'].min()
    xmax = osm_rest['lon'].max()

    # build grid (np.linspace needs an integer bin count)
    grid_density = 120
    x_grid = np.linspace(xmin, xmax, int(grid_density * 1.3))
    y_grid = np.linspace(ymin, ymax, grid_density)
    dens_rest, x_edges, y_edges = np.histogram2d(osm_rest['lon'].values,
                                                 osm_rest['lat'].values,
                                                 bins=[x_grid, y_grid])
    dens_chain, _, _ = np.histogram2d(osm_chains['lon'].values,
                                      osm_chains['lat'].values,
                                      bins=[x_grid, y_grid])
    dens_indep, _, _ = np.histogram2d(osm_indep['lon'].values,
                                      osm_indep['lat'].values,
                                      bins=[x_grid, y_grid])
    dens_rest = dens_rest.T
    dens_chain = dens_chain.T
    dens_indep = dens_indep.T

    # density of chains over independents
    dens_chain_vs_indep = dens_chain - dens_indep
    dens_chain_vs_indep[dens_chain_vs_indep == 0] = np.nan
    x_mesh, y_mesh = np.meshgrid(x_edges, y_edges)

    # plot heatmap
    fig, ax = plt.subplots()
    ax.set_axis_off()

    # set up separate colors for locations with more chains vs independents
    min_dens = np.nanmin(dens_chain_vs_indep)
    max_dens = np.nanmax(dens_chain_vs_indep)
    color_chain = plt.cm.coolwarm(np.linspace(0.15, 0, int(max_dens)))
    color_indep = plt.cm.coolwarm(np.linspace(0.5, 0.95, int(abs(min_dens))))
    color_chain_indep = np.vstack((color_indep, color_chain))
    color_chain_indep = colors.LinearSegmentedColormap.from_list(
        'color_chain_indep', color_chain_indep)
    # separate colormap for chains and independent restaurants
    divnorm = colors.TwoSlopeNorm(vmin=min_dens, vcenter=0, vmax=max_dens)

    # plot heatmap
    ax.pcolormesh(x_mesh, y_mesh, dens_chain_vs_indep,
                  cmap=color_chain_indep, norm=divnorm)

    # save heatmap
    img_bound = ax.get_window_extent().transformed(
        fig.dpi_scale_trans.inverted())
    fig.savefig('chainvsindep_heatmap.png',
                format='png',
                transparent=True,
                bbox_inches=img_bound,
                pad_inches=0)

    # overlay heatmap on google map
    lat_mid = np.mean(osm_rest['lat'])
    lon_mid = np.mean(osm_rest['lon'])
    gmap = gmplot.GoogleMapPlotter(lat_mid, lon_mid, 11, apikey=gmaps_api)

    # adjust for grid spacing
    grid_size = grid_density / (grid_density - 1)
    north = (ymax - lat_mid) * grid_size + lat_mid
    south = (ymin - lat_mid) * grid_size + lat_mid
    east = (xmax - lon_mid) * grid_size + lon_mid
    west = (xmin - lon_mid) * grid_size + lon_mid
    bounds = {'north': north, 'south': south, 'east': east, 'west': west}
    gmap.ground_overlay('chainvsindep_heatmap.png', bounds, opacity=0.8)
    gmap.draw('map_chainvsindep.html')

    # heatmaps for chains and independents separately on two maps
    gmap = gmplot.GoogleMapPlotter(lat_mid, lon_mid, 11, apikey=gmaps_api)
    gmap.heatmap(osm_chains['lat'].values, osm_chains['lon'].values)
    gmap.draw('map_chain.html')
    gmap.heatmap(osm_indep['lat'].values, osm_indep['lon'].values)
    gmap.draw('map_indep.html')

    ############################################################
    #### heatmap zoom in ###
    grid_density = 300
    x_grid = np.linspace(xmin, xmax, int(grid_density * 1.5))
    y_grid = np.linspace(ymin, ymax, grid_density)
    dens_rest, x_edges, y_edges = np.histogram2d(osm_rest['lon'].values,
                                                 osm_rest['lat'].values,
                                                 bins=[x_grid, y_grid])
    dens_chain, _, _ = np.histogram2d(osm_chains['lon'].values,
                                      osm_chains['lat'].values,
                                      bins=[x_grid, y_grid])
    dens_indep, _, _ = np.histogram2d(osm_indep['lon'].values,
                                      osm_indep['lat'].values,
                                      bins=[x_grid, y_grid])
    dens_rest = dens_rest.T
    dens_chain = dens_chain.T
    dens_indep = dens_indep.T

    # density of chains over independents
    dens_chain_vs_indep = dens_chain - dens_indep
    dens_chain_vs_indep[dens_chain_vs_indep == 0] = np.nan
    x_mesh, y_mesh = np.meshgrid(x_edges, y_edges)

    # plot heatmap
    fig, ax = plt.subplots()
    ax.set_axis_off()

    # set up separate colors for locations with more chains vs independents
    min_dens = np.nanmin(dens_chain_vs_indep)
    max_dens = np.nanmax(dens_chain_vs_indep)
    color_chain = plt.cm.coolwarm(np.linspace(0.15, 0, int(max_dens)))
    color_indep = plt.cm.coolwarm(np.linspace(0.5, 0.95, int(abs(min_dens))))
    color_chain_indep = np.vstack((color_indep, color_chain))
    color_chain_indep = colors.LinearSegmentedColormap.from_list(
        'color_chain_indep', color_chain_indep)
    # separate colormap for chains and independent restaurants
    divnorm = colors.TwoSlopeNorm(vmin=min_dens, vcenter=0, vmax=max_dens)

    # plot heatmap
    ax.pcolormesh(x_mesh, y_mesh, dens_chain_vs_indep,
                  cmap=color_chain_indep, norm=divnorm)

    # save heatmap
    img_bound = ax.get_window_extent().transformed(
        fig.dpi_scale_trans.inverted())
    fig.savefig('chainvsindep_heatmap_zoom.png',
                format='png',
                dpi=1000,
                transparent=True,
                bbox_inches=img_bound,
                pad_inches=0)

    # overlay heatmap on google map
    lat_mid = np.mean(osm_rest['lat'])
    lon_mid = np.mean(osm_rest['lon'])
    gmap = gmplot.GoogleMapPlotter(lat_mid, lon_mid, 11, apikey=gmaps_api)

    # adjust for grid spacing
    grid_size = grid_density / (grid_density - 1)
    north = (ymax - lat_mid) * grid_size + lat_mid
    south = (ymin - lat_mid) * grid_size + lat_mid
    east = (xmax - lon_mid) * grid_size + lon_mid
    west = (xmin - lon_mid) * grid_size + lon_mid
    bounds = {'north': north, 'south': south, 'east': east, 'west': west}
    gmap.ground_overlay('chainvsindep_heatmap_zoom.png', bounds, opacity=0.8)
    gmap.draw('map_chainvsindep_zoom.html')
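# Hedged runner sketch for main(in_directory) above: how the analysis script
# is launched is not shown in the original, so taking the data directory from
# the command line here is an assumption.
if __name__ == '__main__':
    import sys
    main(sys.argv[1])  # directory containing amenities-vancouver.json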
from wikitables import import_tables
import wikipedia
import json
import requests

print("Hello, world!")

allDriversTable = import_tables('List of Formula One drivers')

WIKI_REQUEST = 'http://en.wikipedia.org/w/api.php?action=query&prop=pageimages&format=json&piprop=original&titles='


def get_wiki_image(search_term):
    try:
        result = wikipedia.search(search_term, results=1)
        wikipedia.set_lang('en')
        wkpage = wikipedia.WikipediaPage(title=result[0])
        title = wkpage.title
        response = requests.get(WIKI_REQUEST + title)
        json_data = json.loads(response.text)
        img_link = list(
            json_data['query']['pages'].values())[0]['original']['source']
        return img_link
    except Exception:
        return 0


drivers = []


def show_todo():
    for row in allDriversTable[1].rows: