Example #1
def normalize_etpinard_df(df='https://plot.ly/~etpinard/191.csv', columns='x y size text'.split(),
                          category_col='category', possible_categories=None):
    """Reformat a dataframe in etpinard's format for use in plot functions and sklearn models"""
    if possible_categories is None:
        possible_categories = ['Africa', 'Americas', 'Asia', 'Europe', 'Oceania']
    df = pd.read_csv(df) if isinstance(df, str) else df  # load first, then clean the headers
    df.columns = clean_columns(df.columns)
    columns = clean_columns(list(columns))
    df2 = pd.DataFrame(columns=columns)
    # One block of rows per category, each block labeled with its category name
    df2[category_col] = np.concatenate([np.array([categ] * len(df)) for categ in possible_categories])
    # Pair each output column with its per-category source columns in the wide table
    columns = zip(columns, [[clean_columns(categ + ', ' + column) for categ in possible_categories] for column in columns])
    for col, category_cols in columns:
        df2[col] = np.concatenate([df[label].values for label in category_cols])
    return df2
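A minimal usage sketch, assuming the function above plus pandas, numpy, and the project's clean_columns helper are in scope; with the default arguments the result stacks one block of rows per category:

# Usage sketch: calling with defaults downloads and reshapes the etpinard CSV.
df_long = normalize_etpinard_df()
print(df_long.columns.tolist())  # ['x', 'y', 'size', 'text', 'category']
print(len(df_long))              # 5 * rows in the wide table: one block per category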
Example #2
def download(names=None, verbose=True):
    """ Download CSV or HTML tables listed in `names` and save them to DATA_PATH/`names`.csv

    Uses table in data_info.csv (internal DATA_INFO) to determine URL or file path from dataset name.
    Also looks

    TODO: if name is a valid URL then download it and create a name
          and store the name: url in data_info.csv before downloading
    """
    names = [names] if isinstance(names, str) else names
    names = names or list(BIG_URLS.keys())  # download them all if none are specified
    file_paths = {}
    for name in names:
        name = name.lower().strip()
        if name in BIG_URLS:
            file_paths[name] = download_file(BIG_URLS[name][0],
                                             data_path=BIGDATA_PATH,
                                             size=BIG_URLS[name][1],
                                             verbose=verbose)
            if file_paths[name].endswith('.tar.gz'):
                logger.info('Extracting {}'.format(file_paths[name]))
                untar(file_paths[name])
                file_paths[name] = file_paths[name][:-7]  # FIXME: rename tar.gz file so that it mimics contents
            if file_paths[name].endswith('.zip'):
                file_paths[name] = unzip(file_paths[name])
        else:
            df = pd.read_html(DATA_INFO['url'][name],
                              **DATA_INFO['downloader_kwargs'][name])[-1]
            df.columns = clean_columns(df.columns)
            file_paths[name] = os.path.join(DATA_PATH, name + '.csv')
            df.to_csv(file_paths[name])
    return file_paths
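A hedged usage sketch for download(); the dataset name below is hypothetical and would need to match a key in BIG_URLS or a row in DATA_INFO:

# 'some_dataset' is a hypothetical name; substitute a real BIG_URLS or DATA_INFO key.
file_paths = download('some_dataset')
print(file_paths)  # {'some_dataset': '/path/under/BIGDATA_PATH/or/DATA_PATH/...'}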
Example #3
def get_data(name='food_carbon', url=None):
    """ Retrieve data from local cache in data/ or download from url

    >>> get_data('food_carbon').shape
    (16, 4)
    """
    name = make_name(name)  # replace spaces with '_', etc.
    if name == 'food_carbon':
        df = pd.read_html(
            'http://www.greeneatz.com/foods-carbon-footprint.html',
            header=0)[0]
        df.to_csv('food_carbon.csv')
    elif name == 'capitals':
        df = pd.read_html(
            'https://en.wikipedia.org/wiki/List_of_capitals_in_the_United_States',
            header=0)[0]
    else:
        # fail loudly for unknown names instead of hitting a NameError below
        raise ValueError('Unknown dataset name: {}'.format(name))
    df.columns = clean_columns(df.columns)
    return df
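The capitals branch can be exercised the same way; the exact headers are an assumption, since they come from the live Wikipedia page via clean_columns:

# Scrape and normalize the Wikipedia capitals table; headers are snake_cased.
capitals = get_data('capitals')
print(capitals.columns.tolist())  # exact names depend on the current page markup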
Example #4
def download_unzip(names=None, verbose=True):
    """ Download CSV or HTML tables listed in `names`, unzip and to DATA_PATH/`names`.csv .txt etc

    Also normalizes file name extensions (.bin.gz -> .w2v.bin.gz).
    Uses table in data_info.csv (internal DATA_INFO) to determine URL or file path from dataset name.
    Also looks

    TODO: if name is a valid URL then download it and create a name
          and store the name: url in data_info.csv before downloading
    """
    names = [names] if isinstance(names, str) else names
    names = names or list(BIG_URLS.keys())  # download them all if none are specified
    file_paths = {}
    for name in names:
        name = name.lower().strip()
        if name in BIG_URLS:
            file_paths[name] = download_file(BIG_URLS[name][0],
                                             data_path=BIGDATA_PATH,
                                             size=BIG_URLS[name][1],
                                             verbose=verbose)
            if file_paths[name].lower().endswith('.tar.gz'):
                logger.info('Extracting {}'.format(file_paths[name]))
                file_paths[name] = untar(file_paths[name])
            if file_paths[name].lower().endswith('.zip'):
                file_paths[name] = unzip(file_paths[name])
                logger.debug('download_unzip.filepaths=' + str(file_paths))
        else:
            df = pd.read_html(DATA_INFO['url'][name],
                              **DATA_INFO['downloader_kwargs'][name])[-1]
            df.columns = clean_columns(df.columns)
            file_paths[name] = os.path.join(DATA_PATH, name + '.csv')
            df.to_csv(file_paths[name])
        logger.debug('download_unzip.filepaths=' + str(file_paths))
        new_file_paths = normalize_ext(file_paths[name])
        logger.debug('download_unzip.new_filepaths=' + str(new_file_paths))
        file_paths[name] = rename_file(file_paths[name], new_file_paths)
        logger.debug('download_unzip.filepaths=' + str(file_paths))
    return file_paths
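A hedged call sketch; 'w2v' is a hypothetical name, and the returned path reflects any untar/unzip step plus the normalize_ext renaming described in the docstring:

# Hypothetical name; a real call needs a key from BIG_URLS or DATA_INFO.
file_paths = download_unzip(['w2v'])
print(file_paths['w2v'])  # final path after unpacking and extension normalization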
Example #5
def offline_plotly_scatter_bubble(
    df,
    x='x',
    y='y',
    size_col='size',
    text_col='text',
    category_col='category',
    possible_categories=None,
    filename='offline_plotly_scatter_bubble.html',
    config={'displaylogo': False},
    xscale=None,
    yscale='log',
    layout={
        'hovermode': 'closest',
        'showlegend': False,
        'autosize': True
    },
    marker={'sizemode': 'area'},
):
    """Interactive scatterplot of a DataFrame with the size and color of circles linke to two columns

    config keys:
      fillFrame setBackground displaylogo sendData showLink linkText staticPlot scrollZoom plot3dPixelRatio displayModeBar
      showTips workspace doubleClick autosizable editable

    layout keys:
      angularaxis annotations autosize bargap bargroupgap barmode barnorm boxgap boxgroupgap boxmode calendar
      direction dragmode font geo height hiddenlabels hiddenlabelssrc hidesources hovermode images legend
      mapbox margin orientation paper_bgcolor plot_bgcolor radialaxis scene separators shapes showlegend sliders smith
      ternary title titlefont updatemenus width xaxis yaxis

    marker keys:
      autocolorscale blend border cauto cmax cmin color colorbar colors colorscale colorsrc colorssrc line maxdisplayed
      opacity opacitysrc outliercolor reversescale showscale size sizemax sizemin sizemode sizeref sizesrc symbol symbolsrc

    marker['sizeref'] gives the denominator of the circle scaling factor.
      Typically it should be about a tenth of the minimum 'size' column value

    >>> from nlpia.data.loaders import get_data
    >>> from nlpia.plots import offline_plotly_scatter_bubble
    >>> df = get_data('cities_us_wordvectors_pca2_meta')
    >>> html = offline_plotly_scatter_bubble(
    ...     df.sort_values('population', ascending=False)[:350].copy().sort_values('population'),
    ...     filename='plotly_scatter_bubble.html',
    ...     x='x', y='y',
    ...     size_col='population', text_col='name', category_col='timezone',
    ...     xscale=None, yscale=None,  # 'log' or None
    ...     layout={}, marker={'sizeref': 3000})
    """
    config_default = dict(DEFAULT_PLOTLY_CONFIG)
    marker_default = {
        'size': size_col,
        'sizemode': 'area',
        'sizeref': int(df[size_col].min() * .8)
    }
    marker_default.update(marker)
    size_col = marker_default.pop('size')
    layout_default = {
        'xaxis': graph_objs.XAxis(title=x, type=xscale),
        'yaxis': graph_objs.YAxis(title=y, type=yscale),
    }
    layout_default.update(**layout)
    if config is not None:
        config_default.update(config)
    df.columns = clean_columns(df.columns)
    if possible_categories is None and category_col is not None:
        if category_col in df.columns:
            category_labels = df[category_col]
        else:
            category_labels = np.array(category_col)
        possible_categories = list(set(category_labels))
    possible_categories = [None] if possible_categories is None else possible_categories
    if category_col in df:
        masks = [
            np.array(df[category_col] == label)
            for label in possible_categories
        ]
    else:
        masks = [np.array([True] * len(df))] * len(possible_categories)
    logger.debug(marker_default)  # assumes this module defines a logger like the loaders above
    data = {
        'data': [
            graph_objs.Scatter(x=df[x][mask].values,
                               y=df[y][mask].values,
                               text=df[text_col][mask].values,
                               marker=graph_objs.Marker(
                                   size=df[size_col][mask]
                                   if size_col in df.columns else size_col,
                                   **marker_default),
                               mode='markers',
                               name=str(category_name))
            for (category_name, mask) in zip(possible_categories, masks)
        ],
        'layout':
        graph_objs.Layout(**layout_default)
    }
    return offline_plotly_data(data, filename=filename, config=config_default)
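Since the code above defaults sizeref to 0.8 times the minimum of the size column, while the docstring recommends roughly a tenth of that minimum, a sketch of overriding it (reusing the DataFrame from the doctest) might look like:

# Sketch: override the computed sizeref with the docstring's rule of thumb.
df = get_data('cities_us_wordvectors_pca2_meta')
html = offline_plotly_scatter_bubble(
    df, size_col='population', text_col='name', category_col='timezone',
    marker={'sizeref': df['population'].min() / 10., 'sizemode': 'area'})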
Example #6
columns
capitals = capitals2
columns[2] = 'statehood_year'  # rename the third column header
capitals
capitals.columns = columns
capitals
capitals['city'] = capitals.capital
capitals['population'] = capitals.population.astype(int)       # first try: wrong source column
capitals['population'] = capitals.population_2010.astype(int)  # corrected to the 2010 census column
capitals['capital_since'] = capitals.capital_since.astype(int)
capitals['population_2010_metro'] = capitals.notes.astype(int)    # first try: int cast
capitals['population_2010_metro'] = capitals.notes.astype(float)  # retried as float
capitals['population_2010_rank'] = capitals.unnamed_8.astype(int)
''.join(re.findall(r'\w', '\thello_w orld& '))  # quick regex experiment for cleaning column names
capitals
capitals.columns = clean_columns(capitals.columns)
%paste  # re-paste an updated clean_columns definition, then re-apply it below
capitals.columns = clean_columns(capitals.columns)
%paste
capitals.columns = clean_columns(capitals.columns)
capitals.columns
capitals.unnamed_8
capitals.unnamed_8 == capitals.population_2010_rank
(capitals.unnamed_8 == capitals.population_2010_rank).all()  # confirm the columns are identical
del capitals['unnamed_8']  # drop the duplicate
capitals.population_2010_rank
!pwd
!whoami
!uname
hist
cities
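Consolidated, the session above amounts to roughly this script; the column names are read off the history and may not match the current page:

# Hedged reconstruction of the interactive cleanup session above.
import pandas as pd

capitals = pd.read_html(
    'https://en.wikipedia.org/wiki/List_of_capitals_in_the_United_States',
    header=0)[0]
capitals.columns = clean_columns(capitals.columns)  # project helper; snake_cases headers
capitals['city'] = capitals.capital
capitals['population'] = capitals.population_2010.astype(int)
capitals['capital_since'] = capitals.capital_since.astype(int)
capitals['population_2010_metro'] = capitals.notes.astype(float)
capitals['population_2010_rank'] = capitals.unnamed_8.astype(int)
del capitals['unnamed_8']  # duplicate of population_2010_rank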