Example #1
import json
import time

from tqdm import tqdm
from wikitables import import_tables


def merge_task(task_list, args, pid):
    # `parse` is a text-normalisation helper defined elsewhere (not shown in this snippet).
    context = []
    outputname = 'LIST-{}'.format(pid)
    for wt in tqdm(task_list, desc='pages-{}'.format(pid), mininterval=30):
        try:
            tables = import_tables(wt)
        except Exception:
            # skip pages whose tables cannot be fetched or parsed
            continue
        if len(tables) > 1:
            title = parse(wt[8:].lower())
            if title != '':
                ent_list = []
                for row in tables[0].rows:
                    key = list(row.keys())[0]
                    if key == 'Rank':
                        key = list(row.keys())[1]
                    ent = parse(str(row[key]).lower())
                    if ent != '':
                        ent_list.append(ent)
                if len(ent_list) >= 5:
                    context.append(
                        json.dumps({
                            'title': title,
                            'ents': ent_list
                        }))
        time.sleep(3)  # pause between pages to avoid hammering Wikipedia

    if context:
        with open('{}/{}'.format(args.output_dir, outputname), "w+") as f:
            f.write('\n'.join(context))
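
The (task_list, args, pid) signature suggests merge_task is meant to be fanned out across several worker processes; below is a minimal, hypothetical driver sketch (run_workers, the chunking scheme, and num_procs are assumptions; args is expected to carry output_dir).

def run_workers(pages, args, num_procs=4):
    # Hypothetical driver: split the page titles into chunks and run merge_task
    # in one process per chunk.
    from multiprocessing import Process
    chunks = [pages[i::num_procs] for i in range(num_procs)]
    procs = [Process(target=merge_task, args=(chunk, args, pid))
             for pid, chunk in enumerate(chunks)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()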
Example #2
import pandas as pd
import wikitables


def get_highest_grossing_franchises():

    tbs = wikitables.import_tables(
        'List of best-selling video game franchises')
    categories = {
        0: '100 millions',
        1: '50 millions',
        2: '20 millions',
        3: '10 millions',
        4: '5 millions'
    }

    games = []
    grossing_category = []

    for i, tb in enumerate(tbs):
        fn = [row.get('Franchise name') for row in tb.rows][::2]
        games.extend(fn)
        grossing_category.extend([categories.get(i)] * len(fn))

    df = pd.DataFrame.from_dict({
        'franchise': games,
        'copies_sold': grossing_category,
    })
    return df
Example #3
def main():
    # `version`, `log` and `TableJSONEncoder` are module-level names from the
    # wikitables CLI module; ArgumentParser, json, logging and sys are assumed
    # to be imported at the top of that module.
    parser = ArgumentParser(description='wikitables v%s' % version)
    parser.add_argument('-l','--lang',
                        dest='lang',
                        help='article language (default: %(default)s)',
                        default='en')
    parser.add_argument('-p','--pretty',
                        action='store_true',
                        help='pretty-print json output')
    parser.add_argument('-d', '--debug',
                        action='store_true',
                        help='enable debug output')
    parser.add_argument('article', help='article title')

    args = parser.parse_args()

    if not args.article:
        print('usage: wikitables <article title>')
        sys.exit(1)

    if args.debug:
        logging.basicConfig(level=logging.DEBUG)
        log.debug('debug logging enabled')
    else:
        logging.basicConfig(level=logging.WARN)

    from wikitables import import_tables
    from wikitables.util import jprint

    tables = import_tables(args.article, lang=args.lang)
    d = { t.name:t.rows for t in tables }
    if args.pretty:
        jprint(d)
    else:
        print(json.dumps(d, cls=TableJSONEncoder))
Example #4
import pandas as pd
from wikitables import import_tables


def get_djia_tickers():
    tables = import_tables("Dow Jones Industrial Average")
    data = pd.DataFrame(tables[0].rows)
    data = data.astype(str)
    str_cols = data.select_dtypes(['object'])
    data[str_cols.columns] = str_cols.apply(lambda x: x.str.strip())

    return data.reset_index()[['Symbol', 'Industry']]
Example #5
import pandas as pd
from wikitables import import_tables


def get_sp500_tickers():
    """
    Get the ticker symbol and GICS sector for each S&P 500 constituent.
    Returns:
        pd.DataFrame: one row per company with 'Ticker symbol' and 'GICS Sector' columns
    """

    tables = import_tables("List of S&P 500 companies")
    data = pd.DataFrame(tables[0].rows)
    data = data.astype(str)
    str_cols = data.select_dtypes(['object'])
    data[str_cols.columns] = str_cols.apply(lambda x: x.str.strip())

    return data.reset_index()[['Ticker symbol', 'GICS Sector']]
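
A quick, hypothetical usage sketch for the function above: count constituents per GICS sector using the columns it returns.

sp500 = get_sp500_tickers()
print(sp500.groupby('GICS Sector')['Ticker symbol'].count())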
Example #6
import re

from wikitables import import_tables


def find_event_links():
    """Find url links to all events"""

    events = import_tables("List of UFC events")[1]
    event_links = []
    for event in events.rows:
        name = str(event["Event"])
        # Wikipedia markup links are either [link] or [link|text]
        link = re.findall(r"\[\[(.*?)\|", name)
        if link:
            event_links.append(link[0])
            continue

        link = re.findall(r"\[\[(.*?)\]\]", name)
        if link:
            event_links.append(link[0])
            continue

        event_links.append(name)

    return event_links
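
As a hedged follow-up sketch (fetch_event_tables and its limit parameter are hypothetical, not part of the original), each returned title can be passed straight back to import_tables to pull that event's tables:

def fetch_event_tables(limit=3):
    # Hypothetical usage of find_event_links(): fetch the tables of the first
    # few linked event pages; pages that fail to parse are skipped.
    cards = {}
    for title in find_event_links()[:limit]:
        try:
            cards[title] = import_tables(title)
        except Exception:
            continue
    return cards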
Example #7
def main():
    parser = ArgumentParser(description='wikitables v%s' % version)
    parser.add_argument('-l',
                        '--lang',
                        dest='lang',
                        help='article language (default: %(default)s)',
                        default='en')
    parser.add_argument('-p',
                        '--pretty',
                        action='store_true',
                        help='pretty-print json output')
    parser.add_argument('-d',
                        '--debug',
                        action='store_true',
                        help='enable debug output')
    parser.add_argument('article', help='article title')

    args = parser.parse_args()

    if not args.article:
        print('usage: wikitables <article title>')
        sys.exit(1)

    if args.debug:
        logging.basicConfig(level=logging.DEBUG)
        log.debug('debug logging enabled')
    else:
        logging.basicConfig(level=logging.WARN)

    from wikitables import import_tables
    from wikitables.util import jprint

    tables = import_tables(args.article, lang=args.lang)
    d = {t.name: t.rows for t in tables}
    if args.pretty:
        jprint(d)
    else:
        print(json.dumps(d, cls=TableJSONEncoder))
Example #8
from wikitables import import_tables
import json

tables = import_tables('List_of_people_who_died_climbing_Mount_Everest', 'en')
t = tables[0].json()
with open('everest-deaths.txt', 'w') as outfile:
    json.dump(t, outfile)
Example #9
import wikipedia
from wikitables import import_tables

tables = import_tables(
    'List of cities in Italy')  #returns a list of WikiTable objects
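# Each WikiTable in the returned list exposes a .name and a list of .rows
# (one dict-like object per table row), e.g.:
#   for t in tables:
#       print(t.name, len(t.rows))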

#
# def wiki_search(query):
#
#     r = requests.get(API_URL, params=params, headers=headers)
#
#     raw_results = _wiki_request(search_params)


def wiki_lookup(answer):
    try:
        ny = wikipedia.page(answer)
    except Exception:
        return ""
    return ny


# question = "Which of these is a common material used in 3D printers"
# wiki_lookup("3D printers")

# def get_wikis(answers):
#     answer_wikis = {}
#     for answer in answers:
#         answer_wiki = wiki_lookup(answer)
#         answer_wikis[answer] = answer_wiki
#
Example #10
from wikitables import import_tables
import wptools
import wikipedia
import json
import requests

allTeamsTable = import_tables('List_of_Formula_One_constructors')

teams = []
for row in allTeamsTable[0].rows:
    teams.append(({
        "name": '{Constructor}'.format(**row),
        "engine": '{Engine}'.format(**row),
        "from": '{Based in}'.format(**row),
        "seasons": '{Seasons}'.format(**row),
        "race_entries": '{Races Entered}'.format(**row),
        "race_start": '{Races Started}'.format(**row),
        "drivers": '{Drivers}'.format(**row),
        "total_entries": '{Total Entries}'.format(**row),
        "wins": '{Wins}'.format(**row),
        "points": '{Points}'.format(**row),
        "poles": '{Poles}'.format(**row),
        "fastest_laps": '{FL}'.format(**row),
        "wcc": '{WCC}'.format(**row),
        "wdc": '{WDC}'.format(**row),
        "related_teams": '{Antecedent teams}'.format(**row),
        "active": True,
    }))

for row in allTeamsTable[1].rows:
    teams.append(({
Example #11
import os

import gmplot
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import wikipediaapi as wk  # assumption: `wk` is the wikipedia-api package
from matplotlib import colors
from wikitables import import_tables


def main(in_directory):
    # google maps api key
    gmaps_api = 'AIzaSyAm4vrzQVlfDxV2L13PuWgHEiwVaS8IPHc'

    # load osm data
    osmpath = os.path.join(in_directory, 'amenities-vancouver.json')
    osm_data = pd.read_json(osmpath, lines=True)

    # load wikidata
    #wikiclient = Client()
    #wikient = wikiclient.get('Q3472954', load=True)

    # load wikipedia data
    wiki = wk.Wikipedia('en')
    can_chains = wiki.page('List of Canadian restaurant chains')
    can_chains = pd.DataFrame(
        {'restaurant_name': can_chains.sections[0].sections})
    can_chains = can_chains.applymap(lambda x: x.title)

    wiki_table = import_tables(
        'List of restaurant chains in the United States')
    us_chains = pd.DataFrame(wiki_table[0].rows)

    # wikitables returns each wiki table separately, so gather rows from the
    # remaining tables as well
    for table in wiki_table[1:29]:
        us_chains = pd.concat([us_chains, pd.DataFrame(table.rows)], sort=False)

    us_chains = us_chains.reset_index()

    # gather restaurant and fast food osm data
    osm_rest = osm_data[osm_data['amenity'].isin(['restaurant', 'fast_food'])]

    # drop any restaurants without names
    osm_rest = osm_rest.dropna(subset=['name'])

    # detect chains utilizing multiple methods
    # first method, find restaurants that occur multiple times in osm data
    osm_chains = osm_rest[osm_rest.duplicated(['name'])]
    osm_indep = osm_rest[~osm_rest.duplicated(['name'])]

    # second method, include restaurants in list of US and CAN chains
    osm_chains_can = osm_rest[osm_rest['name'].isin(
        can_chains['restaurant_name'])]
    osm_chains_us = osm_rest[osm_rest['name'].isin(us_chains['Name'])]

    # combine lists
    osm_chains_can = pd.concat(
        [osm_chains_can, osm_chains_us],
        sort=True)  #.drop_duplicates().reset_index(drop=True)
    osm_chains_can = osm_chains_can.drop_duplicates(
        subset=['lat', 'lon']).reset_index()

    # combine methods
    osm_chains = pd.concat(
        [osm_chains, osm_chains_can],
        sort=True)  #.drop_duplicates().reset_index(drop=True)
    osm_chains = osm_chains.drop_duplicates(
        subset=['lat', 'lon']).reset_index()

    # segment osm data into grid
    # find bounds of grid
    ymin = osm_rest['lat'].min()
    ymax = osm_rest['lat'].max()
    xmin = osm_rest['lon'].min()
    xmax = osm_rest['lon'].max()

    # build grid
    grid_density = 120
    x_grid = np.linspace(xmin, xmax, int(grid_density * 1.3))
    y_grid = np.linspace(ymin, ymax, grid_density)

    dens_rest, x_edges, y_edges = np.histogram2d(osm_rest['lon'].values,
                                                 osm_rest['lat'].values,
                                                 bins=[x_grid, y_grid])
    dens_chain, _, _ = np.histogram2d(osm_chains['lon'].values,
                                      osm_chains['lat'].values,
                                      bins=[x_grid, y_grid])
    dens_indep, _, _ = np.histogram2d(osm_indep['lon'].values,
                                      osm_indep['lat'].values,
                                      bins=[x_grid, y_grid])

    dens_rest = dens_rest.T
    dens_chain = dens_chain.T
    dens_indep = dens_indep.T

    # density of chains over independents
    dens_chain_vs_indep = dens_chain - dens_indep

    dens_chain_vs_indep[dens_chain_vs_indep == 0] = np.nan

    x_mesh, y_mesh = np.meshgrid(x_edges, y_edges)

    # plot heatmap
    fig, ax = plt.subplots()
    ax.set_axis_off()

    # set up separate colors for locations with more chains vs independents
    min_dens = np.nanmin(dens_chain_vs_indep)
    max_dens = np.nanmax(dens_chain_vs_indep)
    color_chain = plt.cm.coolwarm(np.linspace(0.15, 0, int(max_dens)))
    color_indep = plt.cm.coolwarm(np.linspace(0.5, 0.95, int(abs(min_dens))))
    color_chain_indep = np.vstack((color_indep, color_chain))
    color_chain_indep = colors.LinearSegmentedColormap.from_list(
        'color_chain_indep', color_chain_indep)

    # separate colormap for chains and independent restaurants
    divnorm = colors.TwoSlopeNorm(vmin=min_dens, vcenter=0, vmax=max_dens)

    # plot heatmap
    ax.pcolormesh(x_mesh,
                  y_mesh,
                  dens_chain_vs_indep,
                  cmap=color_chain_indep,
                  norm=divnorm)

    # save heatmap
    img_bound = ax.get_window_extent().transformed(
        fig.dpi_scale_trans.inverted())
    fig.savefig('chainvsindep_heatmap.png',
                format='png',
                transparent=True,
                bbox_inches=img_bound,
                pad_inches=0)

    # overlap heatmap on google map
    lat_mid = np.mean(osm_rest['lat'])
    lon_mid = np.mean(osm_rest['lon'])

    gmap = gmplot.GoogleMapPlotter(lat_mid, lon_mid, 11, apikey=gmaps_api)

    # adjust for grid spacing
    grid_size = grid_density / (grid_density - 1)
    north = (ymax - lat_mid) * grid_size + lat_mid
    south = (ymin - lat_mid) * grid_size + lat_mid
    east = (xmax - lon_mid) * grid_size + lon_mid
    west = (xmin - lon_mid) * grid_size + lon_mid

    bounds = {'north': north, 'south': south, 'east': east, 'west': west}
    gmap.ground_overlay('chainvsindep_heatmap.png', bounds, opacity=0.8)

    gmap.draw('map_chainvsindep.html')

    # heatmaps for chains and independents separately on two maps
    gmap = gmplot.GoogleMapPlotter(lat_mid, lon_mid, 11, apikey=gmaps_api)

    gmap.heatmap(osm_chains['lat'].values, osm_chains['lon'].values)
    gmap.draw('map_chain.html')

    gmap.heatmap(osm_indep['lat'].values, osm_indep['lon'].values)
    gmap.draw('map_indep.html')

    ############################################################
    #### heatmap zoom in ###
    grid_density = 300
    x_grid = np.linspace(xmin, xmax, int(grid_density * 1.5))
    y_grid = np.linspace(ymin, ymax, grid_density)

    dens_rest, x_edges, y_edges = np.histogram2d(osm_rest['lon'].values,
                                                 osm_rest['lat'].values,
                                                 bins=[x_grid, y_grid])
    dens_chain, _, _ = np.histogram2d(osm_chains['lon'].values,
                                      osm_chains['lat'].values,
                                      bins=[x_grid, y_grid])
    dens_indep, _, _ = np.histogram2d(osm_indep['lon'].values,
                                      osm_indep['lat'].values,
                                      bins=[x_grid, y_grid])

    dens_rest = dens_rest.T
    dens_chain = dens_chain.T
    dens_indep = dens_indep.T

    # density of chains over independents
    dens_chain_vs_indep = dens_chain - dens_indep

    dens_chain_vs_indep[dens_chain_vs_indep == 0] = np.nan

    x_mesh, y_mesh = np.meshgrid(x_edges, y_edges)

    # plot heatmap
    fig, ax = plt.subplots()
    ax.set_axis_off()

    # set up separate colors for locations with more chains vs independents
    min_dens = np.nanmin(dens_chain_vs_indep)
    max_dens = np.nanmax(dens_chain_vs_indep)
    color_chain = plt.cm.coolwarm(np.linspace(0.15, 0, int(max_dens)))
    color_indep = plt.cm.coolwarm(np.linspace(0.5, 0.95, int(abs(min_dens))))
    color_chain_indep = np.vstack((color_indep, color_chain))
    color_chain_indep = colors.LinearSegmentedColormap.from_list(
        'color_chain_indep', color_chain_indep)

    # separate colormap for chains and independent restaurants
    divnorm = colors.TwoSlopeNorm(vmin=min_dens, vcenter=0, vmax=max_dens)

    # plot heatmap
    ax.pcolormesh(x_mesh,
                  y_mesh,
                  dens_chain_vs_indep,
                  cmap=color_chain_indep,
                  norm=divnorm)

    # save heatmap
    img_bound = ax.get_window_extent().transformed(
        fig.dpi_scale_trans.inverted())
    fig.savefig('chainvsindep_heatmap_zoom.png',
                format='png',
                dpi=1000,
                transparent=True,
                bbox_inches=img_bound,
                pad_inches=0)

    # overlap heatmap on google map
    lat_mid = np.mean(osm_rest['lat'])
    lon_mid = np.mean(osm_rest['lon'])

    gmap = gmplot.GoogleMapPlotter(lat_mid, lon_mid, 11, apikey=gmaps_api)

    # adjust for grid spacing
    grid_size = grid_density / (grid_density - 1)
    north = (ymax - lat_mid) * grid_size + lat_mid
    south = (ymin - lat_mid) * grid_size + lat_mid
    east = (xmax - lon_mid) * grid_size + lon_mid
    west = (xmin - lon_mid) * grid_size + lon_mid

    bounds = {'north': north, 'south': south, 'east': east, 'west': west}
    gmap.ground_overlay('chainvsindep_heatmap_zoom.png', bounds, opacity=0.8)

    gmap.draw('map_chainvsindep_zoom.html')
Example #12
from wikitables import import_tables
import wikipedia
import json
import requests
print("Hello, world!")

allDriversTable = import_tables('List of Formula One drivers')

WIKI_REQUEST = 'http://en.wikipedia.org/w/api.php?action=query&prop=pageimages&format=json&piprop=original&titles='


def get_wiki_image(search_term):
    try:
        result = wikipedia.search(search_term, results=1)
        wikipedia.set_lang('en')
        wkpage = wikipedia.WikipediaPage(title=result[0])
        title = wkpage.title
        response = requests.get(WIKI_REQUEST + title)
        json_data = json.loads(response.text)
        img_link = list(
            json_data['query']['pages'].values())[0]['original']['source']
        return img_link
    except Exception:
        return 0


drivers = []


def show_todo():
    for row in allDriversTable[1].rows: