Example #1
def __from_pandas_data_frame__(cls, df: PandasDataFrameBase, user: str,
                               dataset: str, table: str) -> "DataDotWorldTable":
    # Stream the wrapped DataFrame to data.world as "<table>.csv", then
    # return a table handle that points at the uploaded file.
    with dw.open_remote_file(
        "{0}/{1}".format(user, dataset), "{0}.csv".format(table)
    ) as w:
        df.inner_data.to_csv(w, index=False)
    uri = "dw://{0}/{1}/{2}".format(user, dataset, table)
    return DataDotWorldTable.from_uri(uri, source=df)
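For context, a minimal read-back sketch, assuming placeholder names for user, dataset, and table; dw.query runs SQL against a dataset, and the table name is the normalized file name:

import datadotworld as dw

# Placeholder identifiers standing in for a real upload.
user, dataset, table = "my-user", "my-dataset", "my_table"

results = dw.query("{0}/{1}".format(user, dataset),
                   "SELECT * FROM {0}".format(table))
print(results.dataframe.head())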
Example #2
import io
import os

import datadotworld as dw
import numpy as np
import pandas as pd


def upload(set_name, emb_path="", metadata=None, summary="", sep=","):
    '''Upload a new embedding or update files and associated metadata.

    Args:
        set_name (str): Name of the dataset being created (format: owner/id)
        emb_path (str): Absolute path to the local embedding file
        metadata (dict, optional): Dictionary in the format {metadata field: value}
        summary (str, optional): Description of the embedding and its source

    Returns:
        None (creates or updates a data.world dataset with the shared embedding)
    '''
    if metadata is None:  # avoid a mutable default argument
        metadata = {}
    dw_api = dw.api_client()
    set_name = set_name.replace(' ', '-').replace('_', '-')
    metadata_str, app_num = "", 0
    usr_name, title = set_name.split("/")
    emb_name = os.path.basename(emb_path)

    # Serialize the metadata dict into "key:value, " pairs.
    for key, val in metadata.items():
        metadata_str += str(key) + ":" + str(val) + ", "

    # Infer the embedding dimensionality from the first row and build a header.
    with io.open(emb_path, 'r', encoding='utf-8') as f:
        first_row = f.readline().split(sep)
    header = ['text']
    header.extend([u"d" + str(n) for n in range(len(first_row) - 1)])

    # Embeddings over ~1 GB are split into appendix chunks across datasets.
    if os.path.getsize(emb_path) > 1e9:
        emb_reader = pd.read_csv(emb_path,
                                 chunksize=400000,
                                 names=header,
                                 encoding='utf-8',
                                 sep=sep)
        index_df = pd.DataFrame()
        for app_num, emb_chunk in enumerate(emb_reader):
            app_title = emb_name[:-4].lower().replace(' ', '-').replace(
                '_', '-') + "-appx" + str(app_num)
            app_setname = usr_name + "/" + app_title
            app_fname = app_title + ".csv"

            words = emb_chunk.iloc[:, 0].reset_index(drop=True)
            app_sets = pd.Series(app_setname,
                                 index=np.arange(len(emb_chunk)),
                                 name="app_setname")
            app_file = pd.Series(app_fname,
                                 index=np.arange(len(emb_chunk)),
                                 name="app_fname")

            tmp_df = pd.concat((words, app_sets, app_file), axis=1, copy=False)
            index_df = pd.concat([index_df, tmp_df], ignore_index=True)
            emb_chunk = emb_chunk.round(4)
            try:
                dw_api.create_dataset(usr_name, title=app_title, description=summary,
                                      license='Public Domain', tags=['vecshare appx'],
                                      visibility='OPEN')
            except Exception:
                # The appendix dataset already exists; update it instead.
                dw_api.update_dataset(app_setname, description=summary)
            with dw.open_remote_file(app_setname, app_fname) as app:
                emb_chunk.to_csv(app, index=False, encoding='utf-8')
        try:
            metadata_str += "app_num:" + str(app_num + 1) + ",vs_format:large"
            dw_api.create_dataset(usr_name, title=title, summary=metadata_str,
                                  description=summary, license='Public Domain',
                                  tags=['vecshare large'], visibility='OPEN')
        except Exception:
            dw_api.update_dataset(
                usr_name + '/' +
                title.lower().replace(' ', '-').replace('_', '-'),
                summary=metadata_str,
                description=summary)
        with dw.open_remote_file(
                set_name.lower().replace(' ', '-').replace('_', '-'),
                emb_name) as index:
            index_df.to_csv(index, index=False, encoding='utf-8')
    else:
        emb = pd.read_csv(emb_path, names=header, encoding='utf-8', sep=sep)
        try:
            metadata_str += "app_num:" + str(1) + ",vs_format:small"
            dw_api.create_dataset(usr_name, title=title, summary=metadata_str,
                                  description=summary, license='Public Domain',
                                  tags=['vecshare small'], visibility='OPEN')
        except Exception:
            dw_api.update_dataset(set_name,
                                  summary=metadata_str,
                                  description=summary)
        with dw.open_remote_file(set_name, emb_name) as index:
            emb.to_csv(index, index=False, encoding='utf-8')
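A minimal usage sketch for upload, assuming a hypothetical owner, dataset name, and a local GloVe-style text file (token followed by sep-delimited values on each line):

# Hypothetical call; the owner/id and path are placeholders.
upload('my-user/glove-50d',
       emb_path='/tmp/glove.6B.50d.txt',
       metadata={'dimensions': 50, 'corpus': 'wikipedia'},
       summary='50-dimensional GloVe vectors',
       sep=' ')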
Example #3
import datadotworld as dw
import csv
"""
    Script for generating data.
    Here, you should use your own script for generating data.
"""

# Download the official data.world example dataset
with dw.open_remote_file(
        'jonloyens/an-intro-to-dataworld-dataset',
        'DataDotWorldBBallStats.csv',
        mode='r') as r:

    reader = csv.reader(r)
    # csv.writer needs a text-mode file in Python 3; newline='' avoids
    # blank lines on Windows.
    with open("data/DataDotWorldBBallStats.csv", "w", newline="") as output_file:
        writer = csv.writer(output_file,
                            delimiter=",",
                            quotechar='"',
                            quoting=csv.QUOTE_ALL)

        for row in reader:
            writer.writerow(row)
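For a one-shot download like this, the datadotworld loader is usually enough; a sketch against the same public dataset, where the dataframe key is assumed to be the normalized file name:

import datadotworld as dw

# load_dataset caches the dataset locally and exposes each file as a
# pandas DataFrame, keyed by its normalized file name (an assumption
# worth checking via stats.dataframes.keys()).
stats = dw.load_dataset('jonloyens/an-intro-to-dataworld-dataset')
df = stats.dataframes['datadotworldbballstats']
df.to_csv('data/DataDotWorldBBallStats.csv', index=False)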
Example #4
import datadotworld as dw
import csv
"""
    Script for uploading your local changes to data.world
"""

with dw.open_remote_file(
        'https://data.world/fran1307/2018-datadotworldbballstats',
        'DataDotWorldBBallStats.csv') as w:

    # Stream the local CSV into the remote file row by row.
    with open("data/DataDotWorldBBallStats.csv", "r", newline="") as input_file:
        reader = csv.reader(input_file)
        writer = csv.writer(w, delimiter=",", quotechar='"',
                            quoting=csv.QUOTE_ALL)

        for row in reader:
            writer.writerow(row)
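When the file already exists on disk, pushing it through the API client avoids the row-by-row copy; a sketch against the same dataset:

import datadotworld as dw

api = dw.api_client()
# upload_files sends local files to the dataset as-is.
api.upload_files('fran1307/2018-datadotworldbballstats',
                 ['data/DataDotWorldBBallStats.csv'])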
Example #5
def _get_csv_data(self):
    # Materialize the rows before the remote file closes; returning the
    # raw DictReader would hand the caller a reader over a closed handle.
    with dw.open_remote_file(self.dataset, self.filename, mode='r') as r:
        return list(csv.DictReader(r))
Example #6
import os

import datadotworld as dw


def upload(data):
    # A data.world API token must be present either in the module-level
    # config or in the DW_AUTH_TOKEN environment variable.
    if not config['dw_token'] and not os.environ.get('DW_AUTH_TOKEN', None):
        raise Exception("Expecting data.world token.")
    dataset = '{}/{}'.format(config['dw_username'], config['dw_dataset'])
    with dw.open_remote_file(dataset, 'data.csv') as w:
        w.write(data)
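The snippet above assumes a module-level config mapping; a hypothetical shape for it:

import os

# Hypothetical config; datadotworld itself honors DW_AUTH_TOKEN, so
# dw_token may stay empty when that environment variable is set.
config = {
    'dw_token': os.environ.get('DW_AUTH_TOKEN', ''),
    'dw_username': 'my-user',    # placeholder owner
    'dw_dataset': 'my-dataset',  # placeholder dataset id
}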
Example #7
    home_table = tables[1]

    # Print the game score table.
    print(game_score)

    # Clean the URL to pull the team-vs-team match name and build
    # per-team file name variables.
    cleaned_url = url.split('/d1/')[1].replace('/', "_").replace('-', '_')

    home_csv = cleaned_url + '_' + home_team + '.csv'
    away_csv = cleaned_url + '_' + away_team + '.csv'
    game_csv = cleaned_url + '_game_results' + '.csv'

    # Optional local exports; the variables above already include the
    # .csv extension, so no extra suffix is needed.
    # home_table.to_csv('game_htmls/' + home_csv)
    # away_table.to_csv('game_htmls/' + away_csv)
    # game_score.to_csv('game_htmls/' + game_csv)

    # Close browser.
    browser.quit()
    # Write the CSVs to data.world.
    with dw.open_remote_file('rustygentile/ncaa-etl-2018', home_csv) as w:
        home_table.to_csv(w, index=False)

    with dw.open_remote_file('rustygentile/ncaa-etl-2018', away_csv) as w:
        away_table.to_csv(w, index=False)

    with dw.open_remote_file('rustygentile/ncaa-etl-2018', game_csv) as w:
        game_score.to_csv(w, index=False)
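Since the three uploads differ only in file name and frame, a small loop keeps them in sync; a sketch using the same names:

    # Upload each (file name, dataframe) pair to the same dataset.
    outputs = [(home_csv, home_table), (away_csv, away_table),
               (game_csv, game_score)]
    for fname, frame in outputs:
        with dw.open_remote_file('rustygentile/ncaa-etl-2018', fname) as w:
            frame.to_csv(w, index=False)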
Example #8
# Dependencies
from bs4 import BeautifulSoup
import requests
import pandas as pd
import datadotworld as dw

bracket_url = 'https://www.ncaa.com/brackets/basketball-men/d1/2018'
bracket_html = requests.get(bracket_url)

# Soup setup
soup = BeautifulSoup(bracket_html.text, 'html.parser')

# Get all the urls
urls = []
for a in soup.find_all('a', href=True):
    urls.append(a['href'])

# Keep only the game URLs and build absolute links
base_game_url = "https://www.ncaa.com"
game_urls = []
for url in urls:
    if '/game/basketball-men/' in url:
        game_urls.append(f'{base_game_url}{url}')

game_urls_df = pd.DataFrame({'urls': game_urls})

with dw.open_remote_file('rustygentile/ncaa-etl', 'game_urls_2018.csv') as w:
    game_urls_df.to_csv(w, index=False)
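A later ETL stage can pull the list back down; a sketch where force_update refreshes the local cache and the dataframe key is assumed to be the normalized file name:

import datadotworld as dw

# Re-download so the next stage sees the file written above.
dataset = dw.load_dataset('rustygentile/ncaa-etl', force_update=True)
game_urls_df = dataset.dataframes['game_urls_2018']  # assumed key
print(game_urls_df.head())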