def deleteDataWorld(tbl_def):
    '''
    Removes a table from data.world.
    tbl_def is { "owner_id": DW_USER,
                 "dw_title": table_name,
                 "gh_url": GH_URL + table_name,
                 "visibility": "OPEN",
                 "license": "Public Domain",
                 "files": {table_name + '.csv': {"url": GH_URL + table_name + '.csv'}},
                 "dw_url": DW_DB_URL + table_name + '.csv',
                 "dw_dataset_id": DW_USER + "/" + table_name
                }
    '''

    dw.api_client().delete_dataset(tbl_def["dw_dataset_id"])


def sync_dataset(DATASET_URL=DATASET_URL):
    sys.stdout.write("\n> Syncing files at: https://data.world/" +
                     DATASET_URL + " -> ")
    with Spinner():
        api_client = dw.api_client()
        api_client.sync_files(DATASET_URL)
        print("\n")
Example #3
def upload(set_name, emb_path="", metadata={}, summary=None):
    '''Upload a new embedding or update files and associated metadata.

    Args:
        set_name (str): Name of the dataset being created (format: owner/id)
        emb_path (str): Absolute path to local embedding
        metadata (dict, opt): Dictionary in the format '{metadata field: value}'
        summary (str, opt): Optional description of embedding and source

    Returns: None (Create a new/updated data.world dataset with the shared embedding)
    '''
    if os.path.getsize(emb_path) > 1000000000:
        raise ValueError(
            "Uploads only supported for embeddings up to 1GB. Consider reducing file size with vecshare.format()."
        )
    dw_api = dw.api_client()
    metadata_str = ""
    for key, val in metadata.items():
        metadata_str += str(key) + ":" + str(val) + ", "
    try:
        usr_name, title = set_name.split("/")
        dw_api.create_dataset(usr_name, title=title, summary=metadata_str,
                              description=summary, license='Public Domain',
                              tags=['vecshare'], visibility='OPEN')
    except Exception:
        # Dataset already exists (or creation failed); update its metadata instead.
        dw_api.update_dataset(set_name,
                              summary=metadata_str,
                              description=summary)

    if emb_path:
        dw_api.upload_files(set_name, [emb_path])
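
A hedged call sketch for upload(); the dataset key, file path, and metadata below are illustrative rather than taken from the original project.

upload("my-user/glove-50d",
       emb_path="/data/glove_50d.csv",
       metadata={"dimension": 50, "corpus": "wikipedia"},
       summary="50-dimensional GloVe vectors (illustrative example)")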
Example #4
def send_to_dw(doc):

    client = dw.api_client()

    username = '******'

    title = doc.find_first_value("Root.Title")
    key = join(username, slugify(title))

    d = dict(title=doc.find_first_value("Root.Title"),
             description=doc.find_first_value("Root.Description"),
             summary=doc.markdown,
             visibility='OPEN',
             files=get_resource_urls(doc))

    try:
        # get_dataset raises RestApiError if the dataset does not exist yet.
        ds = client.get_dataset(key)
        ds = client.replace_dataset(key, **d)
        ds = client.get_dataset(key)
    except RestApiError:
        ds = client.create_dataset('ericbusboom', **d)
        ds = client.get_dataset(key)
Example #5
def avgrank_refresh(tolerance=0.60, sig_cnt=5000, stopword_cnt=100):
    '''
    If there are changes to the set of shared embeddings, refresh the AvgRank signature.

    Generate a set of at most `stopword_cnt` stopwords that occur in at least
    `tolerance` * emb_cnt embeddings. Generate signatures for the embeddings
    using the `sig_cnt` most common remaining words.

    Args:
        tolerance (float): Fraction of embeddings in which a word must appear
            to be treated as a stopword
        sig_cnt (int): Number of words in each AvgRank signature
        stopword_cnt (int): Max number of stopwords

    Returns:
        None. Uploads new ar_sig.txt (serialized signatures) to data store.
    '''
    stopwords, emb_vocab, signatures = [], {}, {}
    DW_API_TOKEN = os.environ['DW_AUTH_TOKEN']

    #emb_list = dw.query(info.INDEXER, 'SELECT embedding_name, dataset_name FROM ' + info.INDEX_FILE).dataframe
    emb_list = pd.read_csv(info.INDEX_FILE_PATH)
    threshold = int(0.5 + tolerance * emb_list.shape[0])
    for ind, row in emb_list.iterrows():
        if row['vs_format'] == 'large':
            emb_name, set_name = row['embedding_name'] + "-appx0", row['dataset_name'] + "-appx0"
        else:
            emb_name, set_name = row['embedding_name'], row['dataset_name']
        query_url = "https://query.data.world/file_download/" + set_name + "/" + emb_name + '.csv'
        payload, headers = "{}", {'authorization': 'Bearer ' + DW_API_TOKEN}
        emb_text = StringIO(requests.request("GET", query_url, data=payload, headers=headers).text)
        emb_df = pd.read_csv(emb_text, nrows=int(1.5 * sig_cnt))

        wordlist = emb_df.iloc[0:2 * stopword_cnt, 0].values
        signatures.update({emb_name: emb_df.iloc[:, 0].values})
        for word in wordlist:
            word = str(word).lower()
            if word not in emb_vocab:
                emb_vocab[word] = 1
            else:
                emb_vocab[word] += 1

    stopwords.extend(list(string.digits))
    stopwords.extend(list(string.punctuation))
    for key in emb_vocab:
        if emb_vocab[key] >= threshold:
            stopwords.append(key)

    for emb_name, emb_sig in signatures.items():
        emb_sig = emb_sig.tolist()
        for word in stopwords:
            if word in emb_sig:
                emb_sig.remove(word)
        emb_sig = emb_sig[:sig_cnt]
        print("Generated AvgRank signature for: " + emb_name)
        signatures.update({emb_name: emb_sig})
    signatures.update({'stopwords': stopwords})

    with io.open(info.AR_SIG_PATH, "wb") as sig_file:
        pickle.dump(signatures, sig_file)
    dw_api = dw.api_client()
    print("Uploading AvgRank signatures")
    dw_api.upload_files(info.SIGNATURES, info.AR_SIG_PATH)
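
A hedged usage sketch: the parameter values are illustrative, and the DW_AUTH_TOKEN environment variable is assumed to be set with a real data.world token before the call.

os.environ.setdefault('DW_AUTH_TOKEN', '<your data.world API token>')  # assumption: real token supplied elsewhere
avgrank_refresh(tolerance=0.5, sig_cnt=3000, stopword_cnt=50)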
Example #6
def updateDataWorld(tbl_def):
    '''
        Updates an existing data.world dataset from a csv file definition.
        tbl_def is { "owner_id": DW_USER,
                     "dw_title": table_name,
                     "gh_url": GH_URL + table_name,
                     "visibility": "OPEN",
                     "license": "Public Domain",
                     "files": {table_name + '.csv': {"url": GH_URL + table_name + '.csv'}},
                     "dw_url": DW_DB_URL + table_name + '.csv',
                     "dw_dataset_key": DW_USER + "/" + table_name
                    }
    '''
    # owner_id=tbl_def["owner_id"],
    dw.api_client().update_dataset(tbl_def["dw_dataset_key"],
                                   title=tbl_def["dw_title"],
                                   visibility=tbl_def["visibility"],
                                   license=tbl_def['license'],
                                   files=tbl_def["files"])


def loadDataWorld(tbl_def):
    '''
        Takes a csv file and imports it into data.world.
        tbl_def is { "owner_id": DW_USER,
                     "dw_title": table_name,
                     "dw_desc": <dataset description>,
                     "gh_url": GH_URL + table_name,
                     "visibility": "OPEN",
                     "license": "Public Domain",
                     "files": {table_name + '.csv': {"url": GH_URL + table_name + '.csv'}},
                     "dw_url": DW_DB_URL + table_name + '.csv'
                    }
    '''
    # api_client.create_dataset(
    dw.api_client().create_dataset(owner_id=tbl_def["owner_id"],
                                   title=tbl_def["dw_title"],
                                   description=tbl_def["dw_desc"],
                                   visibility=tbl_def["visibility"],
                                   license=tbl_def['license'],
                                   files=tbl_def["files"])
Example #8
def package_info(doc):
    client = dw.api_client()

    username = '******'

    title = doc.find_first_value("Root.Title")
    key = join(username, slugify(title))
    try:
        ds = client.get_dataset(key)
        prt(json.dumps(ds, indent=4))
    except RestApiError as e:
        err(e)
Example #9
def submit_online_status(status_list: List[models.Status]):
    """
    After online status per every endpoint has been ascertained, submit that
    information to data.world Stream API.
    """
    logger.info('Submitting results of check to data.world...')

    api_client: RestApiClient = dw.api_client()

    for status in status_list:
        # FIXME we get error 429 here and I haven't found a way to send
        #   multiple records at a time.
        time.sleep(1)

        api_client.append_records(
            dataset_key=settings.DATADOTWORLD['dataset'],
            stream_id=settings.DATADOTWORLD['status-stream'],
            body=dataclasses.asdict(status))

    return status_list
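
A hedged sketch of invoking the submitter; models.Status and the settings keys are project-specific, so the field names below are assumptions used only for illustration.

# Hypothetical Status instances; the real field names live in models.Status.
checked = [models.Status(endpoint="https://example.org/api", is_online=True)]
submit_online_status(checked)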
Example #10
def upload(set_name, emb_path="", metadata={}, summary="", sep=","):
    '''Upload a new embedding or update files and associated metadata.

    Args:
        set_name (str): Name of the dataset being created (format: owner/id)
        emb_path (str): Absolute path to local embedding
        metadata (dict, opt): Dictionary in the format '{metadata field: value}'
        summary (str, opt): Optional description of embedding and source
        sep (str, opt): Field delimiter used in the embedding file

    Returns: None (Create a new/updated data.world dataset with the shared embedding)
    '''
    dw_api = dw.api_client()
    set_name = set_name.replace(' ', '-').replace('_', '-')
    metadata_str, dimensions, app_num = "", 0, 0
    usr_name, title = set_name.split("/")
    emb_name = os.path.basename(emb_path)

    for key, val in metadata.items():
        metadata_str += str(key) + ":" + str(val) + ", "

    with io.open(emb_path, 'r', encoding='utf-8') as f:
        first_row = f.readline().split(sep)
    header = ['text']
    header.extend([u"d" + str(n) for n in range(len(first_row) - 1)])

    if os.path.getsize(emb_path) > 1E9:
        emb_reader = pd.read_csv(emb_path,
                                 chunksize=int(4E5),
                                 names=header,
                                 encoding='utf-8',
                                 sep=sep)
        index_df = pd.DataFrame()
        for app_num, emb_chunk in enumerate(emb_reader):
            app_title = emb_name[:-4].lower().replace(' ', '-').replace(
                '_', '-') + "-appx" + str(app_num)
            app_setname = usr_name + "/" + app_title
            app_fname = app_title + ".csv"

            words = emb_chunk.iloc[:, 0].reset_index(drop=True)
            app_sets = pd.Series(app_setname,
                                 index=np.arange(len(emb_chunk)),
                                 name="app_setname")
            app_file = pd.Series(app_fname,
                                 index=np.arange(len(emb_chunk)),
                                 name="app_fname")

            tmp_df = pd.concat((words, app_sets, app_file), axis=1, copy=False)
            index_df = pd.concat([index_df, tmp_df], ignore_index=True)
            emb_chunk = emb_chunk.round(4)
            try:
                dw_api.create_dataset(usr_name, title=app_title, description=summary,
                                      license='Public Domain', tags=['vecshare appx'],
                                      visibility='OPEN')
            except Exception:
                # Appendix dataset already exists; refresh its description.
                dw_api.update_dataset(app_setname, description=summary)
            with dw.open_remote_file(app_setname, app_fname, mode='wb') as app:
                emb_chunk.to_csv(app, index=False, mode='wb', encoding='utf-8')
        try:
            metadata_str += "app_num:" + str(app_num + 1) + ",vs_format:large"
            dw_api.create_dataset(usr_name, title=title, summary=metadata_str,
                                  description=summary, license='Public Domain',
                                  tags=['vecshare large'], visibility='OPEN')
        except Exception:
            # Index dataset already exists; update its metadata instead.
            dw_api.update_dataset(
                usr_name + '/' +
                title.lower().replace(' ', '-').replace('_', '-'),
                summary=metadata_str,
                description=summary)
        with dw.open_remote_file(set_name.lower().replace(' ', '-').replace(
                '_', '-'),
                                 emb_name,
                                 mode='wb') as index:
            index_df.to_csv(index, index=False, mode='wb', encoding='utf-8')
    else:
        emb = pd.read_csv(emb_path, names=header, encoding='utf-8', sep=sep)
        try:
            metadata_str += "app_num:" + str(1) + ",vs_format:small"
            dw_api.create_dataset(usr_name, title=title, summary=metadata_str,
                                  description=summary, license='Public Domain',
                                  tags=['vecshare small'], visibility='OPEN')
        except Exception:
            # Dataset already exists; update its metadata instead.
            dw_api.update_dataset(set_name,
                                  summary=metadata_str,
                                  description=summary)
        with dw.open_remote_file(set_name, emb_name, mode='wb') as remote:
            emb.to_csv(remote, index=False, mode='wb', encoding='utf-8')
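
A hedged call sketch for this chunked variant; the dataset key and file path are illustrative, and files over 1 GB are split into '-appxN' appendix datasets as shown above.

upload("my-user/fasttext-300d",
       emb_path="/data/fasttext_300d.csv",
       metadata={"dimension": 300, "corpus": "common-crawl"},
       summary="Illustrative upload of a large embedding",
       sep=",")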
Example #11
import datadotworld as dw
import os

os.environ['DW_AUTH_TOKEN'] = (
    'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiJwcm9kLXVzZXItY2xpZW50OnRyaWxvZ3l'
    'lZCIsImlzcyI6ImFnZW50OnRyaWxvZ3llZDo6ZTVkMTBkNDgtODRmYy00ZTVjLTk'
    'zNTUtMGIwOGMzYjIxNGNlIiwiaWF0IjoxNTAzMTAxNDIzLCJyb2xlIjpbInVzZXJ'
    'fYXBpX3dyaXRlIiwidXNlcl9hcGlfcmVhZCJdLCJnZW5lcmFsLXB1cnBvc2UiOnR'
    'ydWV9.HpopfqxXh0VqNgb1b8tpP6G1bkr-WblRNeS3UlhF-05sSTxx1CHJgRuAjd'
    'nP8MoBIsHsysJANP27ioXqCKChgw')

url = 'trilogyed/dataviz-unit-11-hwk'
download_dir = 'Resources'

if os.path.isdir(download_dir):
    print("Resources Directory Already Exists!")
    print("Please Remove the existing Resources folder and re-run this script")
    exit()

client = dw.api_client()

print("Downloading Data...")
client.download_datapackage(url, download_dir)
print("Download Complete!")
def test_toplevel_api_client(dw_instances, profile):
    assert_that(datadotworld.api_client(),
                equal_to(dw_instances[profile].api_client))
Example #13
def refresh(force_update=False):
    '''
    Crawls for new embeddings carrying the vecshare tag and updates the index
    file with new embedding sets or changes to existing shared embeddings.

    Args:
        force_update (bool, opt): Hard reset, re-index ALL available embeddings.
            If False, only index new embedding sets or embeddings whose
            metadata has changed since the last indexing.
    Returns:
        bool. True if any embedding was (re)indexed; also uploads the new
        index_file.csv to the indexer on the data store.
    '''
    # Retrieve source for data.world:vecshare search results
    display = Display(visible=0, size=(800, 600))
    display.start()
    wd = webdriver.Firefox(executable_path="/usr/bin/firefox",
                           capabilities={"marionette": False})

    page_num, set_count, sets = 1, 1000, []

    while set_count > len(sets):
        wd.get(info.DATASETS_URL + "?page=" + str(page_num))
        try:
            WebDriverWait(wd, 5).until(
                EC.visibility_of_element_located((By.CLASS_NAME, info.DW_CLASS_TAG)))
        except Exception:
            pass
        soup = BeautifulSoup(wd.page_source, 'lxml')
        set_txt = soup.find('h1', 'TopicView__headline___2_0-1').text
        set_count = [int(s) for s in set_txt.split() if s.isdigit()][0]
        sets.extend([s["href"][1:] for s in soup.find_all('a', info.DW_CLASS_TAG)])
        page_num += 1
    dw_api = dw.api_client()
    wd.close()
    print("Found " + str(len(sets)) + " sets with the " + info.EMB_TAG + " tag.")

    embeddings, prev_indexed, updated = [], [], False
    if not force_update:
        prev_query = dw.query(info.INDEXER, 'SELECT dataset_name, embedding_name FROM '+ info.INDEX_FILE).dataframe
        for ind, row in prev_query.iterrows():
            prev_indexed.append("/".join(row.values))

    for set_name in sets:
        curr_set = dw.load_dataset(set_name, force_update=True)  # Embedding
        curr_meta = dw_api.get_dataset(set_name)
        set_updated = parse(curr_meta['updated'])
        meta_dict = dict()
        contrib = curr_meta["owner"]
        resources = curr_set.describe()['resources']

        summary = StringIO(curr_meta["summary"])
        for line in summary:
            for field in line.split(","):
                for sent in field.split("."):
                    try:
                        meta_field = field.split(":")
                        if len(meta_field) == 2:
                            meta_dict[meta_field[0].strip().lower().replace(" ", "_").replace("-", "_")] = meta_field[1].strip()
                    except: pass

        for each in curr_meta['files']:
            emb_name = each['name'][:-4]
            emb_updated = parse(each['updated'])
            try:
                ind_query = 'SELECT last_updated FROM ' + info.INDEX_FILE + \
                    ' WHERE dataset_name = "' + set_name + '" and embedding_name = "' + emb_name + '"'
                query_results = dw.query(info.INDEXER, ind_query).dataframe.iloc[0].values[0]
                last_indexed = parse(query_results)
                if emb_updated > set_updated:
                    last_updated = emb_updated
                else:
                    last_updated = set_updated
            except Exception:
                # Not previously indexed; treat the embedding as new.
                last_updated = datetime.datetime.utcnow().replace(tzinfo=pytz.UTC)
                last_indexed = datetime.datetime.utcnow().replace(tzinfo=pytz.UTC)

            # Index if new embedding or if metadata/embedding updated since last Index
            if force_update or (set_name + '/' + emb_name not in prev_indexed) or (last_indexed < last_updated):
                try:
                    curr_emb = curr_set.describe(emb_name.lower())
                except Exception:
                    continue
                updated = True
                emb_dim = len(curr_emb['schema']['fields']) - 1
                file_format = curr_emb['format']
                try:
                    vocab_size = dw.query(set_name, "SELECT COUNT(text) FROM " + emb_name).dataframe.iloc[0][0]
                except Exception:
                    vocab_size = ""
                emb_simset = vecshare.extract(emb_name, 'sim_vocab', set_name=set_name,
                                              case_sensitive=True, progress=False)
                score_dict = sim_benchmark._eval_all(emb_simset)

                temp_0 = 'original/' + emb_name.lower() + '.csv'
                temp_1 = emb_name.lower()

                for d in resources:
                    if d['name'] == temp_0:
                        try:
                            description = StringIO(d['description'])
                            for line in description:
                                for sent in line.split("."):
                                    for field in sent.split(","):
                                        meta_field = field.split(":")
                                        if len(meta_field) == 2:
                                            meta_dict[meta_field[0].strip().lower().replace(" ", "_")] = meta_field[1].strip()
                        except: pass
                    if d['name'] == temp_1:
                        try:
                            description = StringIO(d['description'])
                            for line in description:
                                for sent in line.split('.'):
                                    for field in sent.split(","):
                                        meta_field = field.split(":")
                                        if len(meta_field) == 2:
                                            meta_dict[meta_field[0].strip().lower().replace(" ", "_")] = meta_field[1].strip()
                        except: pass
                print ("Newly Indexed embedding: " + emb_name+ " from dataset " + set_name + ".")
                meta_dict.update(score_dict)
                meta_dict.update({
                            u'embedding_name': emb_name,
                            u"dataset_name": set_name,
                            u"contributor":contrib,
                            u"dimension":emb_dim,
                            u"vocab_size":vocab_size,
                            u"file_format":file_format,
                            u"last_updated": last_updated})
                embeddings.append(deepcopy(meta_dict))
            else:
                print("Re-indexed embedding: " + emb_name + " from dataset " + set_name + ".")
                query = 'SELECT * FROM ' + info.INDEX_FILE + ' WHERE dataset_name = "' + \
                    set_name + '" and embedding_name = "' + emb_name + '"'
                prev_row = dw.query(info.INDEXER, query).dataframe
                embeddings.extend(prev_row.to_dict(orient='records'))

    with io.open(info.INDEX_FILE_PATH, 'w', encoding="utf-8") as ind:
        meta_header = set().union(*embeddings)
        csv_writer = csv.DictWriter(ind, fieldnames=meta_header)
        csv_writer.writeheader()
        for emb in embeddings:
            csv_writer.writerow(emb)

    print ("Updating index file at " + info.INDEXER_URL)
    dw_api.upload_files(info.INDEXER, info.INDEX_FILE_PATH)
    if updated:
        #_emb_rank()
        print ("Updating avg_rank signatures")
        avgrank_refresh()
        return updated
    else: return False
def save_to_dw(df, filename):
    file_path = f'/tmp/{filename}'
    df.to_csv(file_path, index=True)
    client = dw.api_client()
    # upload_files expects a list of local file paths.
    client.upload_files('fryanpan13/covid-tracking-racial-data', files=[file_path])
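
A hedged usage sketch; the DataFrame contents are illustrative, and the dataset key is the one hard-coded in save_to_dw above.

import pandas as pd

sample = pd.DataFrame({"date": ["2020-05-01"], "cases": [123]})   # illustrative data
save_to_dw(sample, "racial_data_latest.csv")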