Example #1
def simscore(test_set="score"):
    '''
    Select the embedding scoring highest on the `test_set` word comparison task.
    The score is the average Spearman correlation between the word vectors'
    cosine similarities and the human-rated similarity for each word pair.
    Missing words are substituted with random words.

    Args:
        test_set (str, opt): Return the embedding with the highest score on
            `test_set`. If test_set is not specified, return the embedding with
            the highest average score over all similarity tasks (excluding MC-30).

    Returns:
        emb_name: Name of the embedding with the highest simscore

    Word Pair Similarity Tasks:
    * WS-353: Finkelstein et al., 2002
    * MC-30: Miller and Charles, 1991
        (Excluded from the composite score; already included in the SimLex-999 test set)
    * MEN: Bruni et al., 2012
    * MTurk-287: Radinsky et al., 2011
    * MTurk-771: Halawi and Dror, 2012
    * Rare-Word: Luong et al., 2013
    * SimLex-999: Hill et al., 2014
    * SimVerb-3500: Gerz et al., 2016
    * Verb-144: Baker et al., 2014
    '''
    # Truncate the top score to two decimals, then return the first embedding
    # at or above it.
    max_query = "SELECT MAX(" + test_set + ") FROM " + info.INDEX_FILE
    max_simscore = np.floor(100 * dw.query(info.INDEXER, max_query).dataframe.iloc[0, 0]) / 100
    emb_query = ("SELECT embedding_name FROM " + info.INDEX_FILE +
                 " WHERE " + test_set + " >= " + str(max_simscore))
    top_emb = dw.query(info.INDEXER, emb_query).dataframe
    return top_emb.iloc[0, 0]
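A minimal usage sketch, assuming datadotworld credentials are configured (e.g. via `dw configure`) and that info.INDEXER and info.INDEX_FILE point at the vecshare index; the task column name "simlex_999" is an assumption, not taken from the source:

import datadotworld as dw
import numpy as np

best = simscore(test_set="simlex_999")  # hypothetical task column name
print("Top-scoring embedding:", best)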
def test_toplevel_query(dw_instances, profile):
    datadotworld.query('agentid/datasetid',
                       'SELECT * FROM Tables',
                       profile=profile)
    assert_that(
        dw_instances[profile].query,
        called().times(1).with_args(equal_to('agentid/datasetid'),
                                    equal_to('SELECT * FROM Tables'),
                                    query_type=equal_to('sql')))
Example #3
def maxtkn():
    '''Returns the embedding trained on the most tokens.

    Args: None

    Returns: emb_name
    '''
    # Find the largest token count, then return the first embedding matching it.
    max_query = "SELECT MAX(token_count) FROM " + info.INDEX_FILE
    token_count = dw.query(info.INDEXER, max_query).dataframe.iloc[0, 0]
    emb_query = ("SELECT embedding_name FROM " + info.INDEX_FILE +
                 " WHERE token_count >= " + str(token_count))
    top_emb = dw.query(info.INDEXER, emb_query).dataframe
    return top_emb.iloc[0, 0]
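If data.world's SQL dialect accepts ORDER BY with LIMIT (it generally does), the two round trips above can be collapsed into one; a sketch:

def maxtkn_single_query():
    # Sort the index by token_count and take the top row in a single query.
    emb_query = ("SELECT embedding_name FROM " + info.INDEX_FILE +
                 " ORDER BY token_count DESC LIMIT 1")
    return dw.query(info.INDEXER, emb_query).dataframe.iloc[0, 0]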
Example #4
def join_population_age_range(df):
    # Load population data
    population_query = dw.query(
        'fryanpan13/covid-tracking-racial-data',
        'SELECT * FROM population_estimates_age_ranges')
    population_df = population_query.dataframe

    population_df = population_df.rename(
        columns={
            'state': COL_STATE,
            'state_name': 'State Name',
            'age_range': COL_AGE_GROUP,
            'race_ethnicity': COL_RACE_ETHNICITY,
            'dataset': 'Dataset',
            'geo_state_name': 'Geo State Name',
            'standard_weight': COL_STANDARD_WEIGHT,
            'population': COL_POPULATION
        })

    population_df[COL_POPULATION] = population_df[COL_POPULATION].astype(
        'Int64')

    # Load region data
    region_query = dw.query('fryanpan13/covid-tracking-racial-data',
                            'SELECT * FROM regions')
    region_df = region_query.dataframe

    region_df = region_df.rename(columns={'state': COL_STATE, 'region': 'Region', 'subregion': 'Subregion'}) \
                         .drop(columns=['state_name'])

    population_df = population_df.set_index([COL_STATE])
    region_df = region_df.set_index([COL_STATE])
    population_df = population_df.merge(region_df, on=[COL_STATE])

    population_index = [
        COL_DATASET, COL_STATE, COL_AGE_GROUP, COL_RACE_ETHNICITY
    ]
    population_df = population_df.reset_index(
        drop=False).set_index(population_index)

    # Set expected deaths to a dummy value
    population_df['Expected Deaths'] = 0

    print(population_df.columns)
    print(df.columns)

    # Join population
    return df.join(population_df, population_index)
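A usage sketch; the COL_* names are module-level constants assumed by the snippet, and df stands for a deaths-by-demographic DataFrame keyed on the same Dataset / State / Age Group / Race-Ethnicity columns:

# Hypothetical call; df must carry the same key columns the function indexes on.
enriched = join_population_age_range(df)
print(enriched[[COL_POPULATION, 'Expected Deaths']].head())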
Example #5
def query(words, emb_name, set_name=None, case_sensitive=False):
    """Query a set of word vectors from an indexed embedding.

    Args:
        words (list of str): The set of word vectors being queried
        emb_name (str): The embedding from which word vectors are being queried
        set_name (str, opt): Name of dataset being queried (format: owner/id)
        case_sensitive (bool, opt): Flag for matching exact case in query

    Returns:
        Pandas DataFrame, each row specifying a word vector.
    """
    set_name = _error_check(emb_name, set_name)
    if case_sensitive:
        title = 'text'
    else:
        words = [word.lower() for word in words]
        title = 'lower(text)'
    try:
        if len(words) > 1:
            query = ('SELECT * FROM ' + emb_name + ' where ' + title +
                     ' in' + str(tuple(words)))
        else:
            # str(tuple(...)) on a single word leaves a trailing comma,
            # so one-word queries use an equality condition instead.
            query = ('SELECT * FROM ' + emb_name + ' where ' + title +
                     ' = "' + words[0] + '"')
        results = dw.query(set_name, query).dataframe
    except Exception:
        raise RuntimeError(
            "Embedding is formatted improperly. Check headers at: " + query)
    if results.empty:
        raise RuntimeError("No matching word vector found.")
    return results
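A usage sketch, assuming an embedding named glove_test indexed in a dataset owner/embeddings (both names hypothetical):

vecs = query(['king', 'queen', 'Throne'],
             emb_name='glove_test',
             set_name='owner/embeddings',
             case_sensitive=False)  # 'Throne' is matched as 'throne'
print(vecs.shape)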
Example #6
def _emb_rank():
    query = 'SELECT embedding_name, dataset_name, contributor, embedding_type, dimension, score \
        FROM ' + info.INDEX_FILE
    results = dw.query(info.INDEXER, query).dataframe
    results = results.nlargest(10, 'score')
    # Link each embedding name to its dataset page.
    for ind, row in results.iterrows():
        results.loc[ind, 'embedding_name'] = \
            "[`" + row['embedding_name'] + "`](" + info.BASE_URL + row['dataset_name'] + ")"

    results = results.drop('dataset_name', axis=1)
    md_table = tabulate(results, headers=list(results), tablefmt="pipe", showindex=False)
    # Splice the refreshed leaderboard between the README's marker comments.
    with io.open('../README.md', 'r', encoding='utf-8') as readme:
        pre, post = True, False
        pre_table, post_table = '', ''
        for line in readme:
            if pre:
                pre_table += line
                if line == '[comment]: <> (Leaderboard Start)\n':
                    pre = False
            if post:
                post_table += line
            if line == '[comment]: <> (Leaderboard End)\n':
                post_table = line
                post = True
    with io.open('../README.md', 'w', encoding='utf-8') as readme:
        readme.write(pre_table + '\n')
        readme.write(md_table + '\n\n')
        readme.write(post_table)
def college_score(name):
    dataset_key = 'https://data.world/education/university-rankings-2017'
    try:
        results = dw.query(
            dataset_key=dataset_key,
            query=
            "SELECT * FROM national_universities_rankings WHERE name = '%s'" %
            name)
        df = results.dataframe
        rank = df.iloc[0]['rank']
        # Bucket the rank into a score; the elif chain keeps the buckets
        # exclusive. Ranks outside the stated ranges keep the default of 1.
        score = 1
        if rank <= 5:
            score = 6
        elif rank <= 10:
            score = 5
        elif 50 < rank < 100:
            score = 4
        elif 100 < rank < 150:
            score = 3
        elif 150 < rank < 200:
            score = 2
        return score
    except Exception:
        print("Your college isn't in here!")
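Usage sketch; the argument must match the name column of national_universities_rankings exactly:

score = college_score('Princeton University')
print("Score bucket:", score)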
Example #8
def _error_check(emb_name, set_name=None, vs_format=None):
    if set_name and vs_format:
        return set_name, vs_format
    emb_list = dw.query(info.INDEXER_URL,
                        'SELECT * FROM ' + info.INDEX_FILE).dataframe
    emb_names = emb_list.embedding_name

    if len(emb_list.loc[emb_list['embedding_name'] ==
                        emb_name]) > 1 and set_name is None:
        raise ValueError("More than one embedding exists with name: " +
                         emb_name + ". Try again specifying set_name.")
    if emb_name not in emb_names.values:
        raise ValueError("No embedding exists with name: " + emb_name)
    if emb_list.file_format[emb_names == emb_name].iloc[0] not in [
            'csv', 'tsv', 'xls', 'rdf', 'json'
    ]:
        raise TypeError(
            emb_name +
            " was not uploaded in a queryable format. Try reuploading as: csv, tsv, xls, rdf, or json."
        )

    # Look up the dataset and vector-store format for the named embedding.
    set_name = emb_list.loc[emb_list['embedding_name'] ==
                            emb_name].dataset_name.iloc[0]
    vs_format = emb_list.loc[emb_list['embedding_name'] ==
                             emb_name].vs_format.iloc[0]
    return set_name, vs_format
Example #9
def download(emb_name, set_name=None, vs_format=None):
    '''Loads the full shared embedding `emb_name` and saves the embedding to the
    current working directory.

    Args:
        emb_name (str): Name of the selected embedding
        set_name (str, opt): Specify if multiple embeddings exist with same name

    Returns:
        .csv embedding saved to the current working directory
    '''
    DW_API_TOKEN = os.environ['DW_AUTH_TOKEN']
    proc_cnt = 16
    multiproc = pp.ProcessPool(proc_cnt)

    def emb_download(appx_num):
        # Each appendix of a large embedding is stored as its own file.
        query_url = ("https://query.data.world/file_download/" + set_name +
                     "-appx" + str(appx_num) + "/" + emb_name + "-appx" +
                     str(appx_num) + '.csv')
        payload, headers = "{}", {'authorization': 'Bearer ' + DW_API_TOKEN}
        emb_text = requests.request("GET",
                                    query_url,
                                    data=payload,
                                    headers=headers).text
        with io.open(emb_name + "-appx" + str(appx_num) + '.csv',
                     'w', encoding='utf-8') as download_emb:
            download_emb.write(emb_text)

    set_name, vs_format = _error_check(emb_name,
                                       set_name=set_name,
                                       vs_format=vs_format)
    if vs_format == 'large':
        num_appx = ('SELECT app_num FROM ' + info.INDEX_FILE +
                    ' WHERE embedding_name = "' + emb_name +
                    '" and dataset_name = "' + set_name + '"')
        app_count = dw.query(info.INDEXER, num_appx).dataframe.iloc[0, 0]
        multiproc.map(emb_download, list(range(app_count)))

        # Stitch the appendices back together, keeping only the first header.
        with io.open(emb_name + '.csv', 'w', encoding='utf-8') as compiled:
            with io.open(emb_name + '-appx0.csv', 'r',
                         encoding='utf-8') as first_appx:
                compiled.write(first_appx.read())
            for i in range(1, app_count):
                with io.open(emb_name + "-appx" + str(i) + '.csv',
                             'r', encoding='utf-8') as appx:
                    next(appx)  # skip the duplicate header row
                    for line in appx:
                        compiled.write(line)
                os.remove(emb_name + "-appx" + str(i) + '.csv')
    else:
        query_url = ("https://query.data.world/file_download/" + set_name +
                     "/" + emb_name + '.csv')
        payload, headers = "{}", {'authorization': 'Bearer ' + DW_API_TOKEN}
        emb_text = requests.request("GET",
                                    query_url,
                                    data=payload,
                                    headers=headers).text
        with io.open(emb_name + '.csv', 'w', encoding='utf-8') as download_emb:
            download_emb.write(emb_text)
    return pd.read_csv(emb_name + '.csv')
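A sketch of a typical call; it assumes your data.world token is exported as DW_AUTH_TOKEN and that an embedding named glove_test (hypothetical) is indexed:

emb_df = download('glove_test')  # also writes glove_test.csv to the cwd
print(emb_df.head())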
Example #10
def query_dataworld():
    # Cache today's pull so repeated runs don't re-query data.world.
    save_name = 'raw/dataworld/all_' + str(datetime.now().date())
    if os.path.exists('data/' + save_name + '.pkl.tgz'):
        print('data found in', save_name)
        return pd.read_pickle('data/' + save_name + '.pkl.tgz')
    else:
        results = dw.query('covid-19-data-resource-hub/covid-19-case-counts',
                           'SELECT * FROM covid_19_cases')
        df = results.dataframe
        save(df, name=save_name)
        return df
Example #11
def check():
    """Displays indexed word embeddings and associated metadata.

    Args: None
    Returns:
        Pandas DataFrame containing available word embeddings and metadata
    """
    embedding_list = dw.query(info.INDEXER_URL,
                              'SELECT * FROM ' + info.INDEX_FILE)
    df = embedding_list.dataframe
    cols = df.columns.tolist()
    # Surface the identifying columns first.
    title = ["embedding_name", "dataset_name", "contributor"]
    return df[title + [field for field in cols if field not in title]]
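Typical interactive use:

embeddings = check()
# Quick scan of the identifying columns, which check() puts first.
print(embeddings[['embedding_name', 'dataset_name', 'contributor']].head())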
Example #12
# datadotworld module has been imported as dw
import datadotworld as dw

# We've written a SPARQL query for you and assigned it to the `sparql_query` variable:
sparql_query = """
PREFIX GOT: <https://tutorial.linked.data.world/d/sparqltutorial/>
SELECT ?FName ?LName
WHERE {
    ?person GOT:col-got-house "Stark" .
    ?person GOT:col-got-fname ?FName .
    ?person GOT:col-got-lname ?LName .
}
"""

# Use the pre-defined SPARQL query to query dataset http://data.world/tutorial/sparqltutorial and return the results to a queryResults variable
queryResults = dw.query('http://data.world/tutorial/sparqltutorial', sparql_query, query_type='sparql')

# Use the dataframe property of the resulting query to create a dataframe variable named `houseStark`
houseStark = queryResults.dataframe

# Use pp.pprint() to print the dataframe to the screen.
pp.pprint(houseStark)
Example #13
import os
from pathlib import Path

import datadotworld as dw
import geopandas as gpd
import geoviews as gv
import pandas as pd
from cartopy import crs as ccrs

gv.extension('bokeh', 'matplotlib')

# Define project root directory
# ROOT_DIR = Path(__file__).parent.parent
ROOT_DIR = Path(os.getcwd()).parent

# %% [markdown]
# We next query and download the latest covid data from data.world servers.

# %%
# Get latest covid data
results = dw.query('covid-19-data-resource-hub/covid-19-case-counts',
                   'SELECT * FROM covid_19_cases')
results_df = results.dataframe.drop(['prep_flow_runtime', 'difference'],
                                    axis="columns")
results_df = (results_df[results_df["table_names"] == "Time Series"].drop(
    ["table_names"], axis='columns'))
results_df['date'] = pd.to_datetime(results_df['date'])

# %%

### US STATISTICS (county level)

# Import a shapefile of US states
states_shp = gpd.read_file(
    ROOT_DIR.joinpath("data/tl_2017_us_states/tl_2017_us_state.shp"))
states_shp = states_shp[["NAME", "geometry"]].rename(columns={'NAME': 'State'})
Example #14
import datadotworld as d

td = intro_dataset = d.load_dataset('rfabbri/test1')

q = """SELECT * WHERE {?s ?p ?o}"""

q2 = """
PREFIX po: <http://purl.org/socialparticipation/po/>
SELECT ?s WHERE {?s a po:Participant}
"""

r = d.query('rfabbri/test1', q2, query_type='sparql')
def download_dataframe():
    query = """SELECT does_this_tweet_contain_hate_speech, tweet_text
               FROM twitter_hate_speech_classifier_dfe_a845520"""
    result = dw.query('crowdflower/hate-speech-identification', query)
    df = result.dataframe
    return df
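A usage sketch; the SELECT keeps only the label and text columns, so the transfer stays small:

df = download_dataframe()
print(df['does_this_tweet_contain_hate_speech'].value_counts())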
Example #16
# datadotworld module has been imported as dw
import datadotworld as dw

## Complete the SQL query to select state, the count of farmers markets (fmid), and average obesity rate from agriculture.`national-farmers-markets`.export, LEFT JOINED against health.`obesity-by-state-2014`.adult_obese on state and location
sql_query = "SELECT state, count(fmid) as count, Avg(obesity.Value) as obesityAvg FROM Export LEFT JOIN health.`obesity-by-state-2014`.`adult_obese` as obesity ON state = obesity.location GROUP BY state ORDER BY count desc"

## Use the `query` method of the datadotworld module to run the `sql_query` against the `https://data.world/agriculture/national-farmers-markets` dataset. Assign the results to a `queryResults` variable.
queryResults = dw.query('https://data.world/agriculture/national-farmers-markets', sql_query)

## Use the dataframe property of the resulting query to create a dataframe variable named `stateStats`
stateStats = queryResults.dataframe

## Plot the stateStats results using state as the x-axis (matplotlib is already imported)
stateStats.plot(x='state')

plt.show()
Example #17
def join_population_all(df):
    # Load population data
    population_query = dw.query('fryanpan13/covid-tracking-racial-data',
                                'SELECT * FROM population_data')
    population_df = population_query.dataframe

    population_df = population_df.rename(
        columns={
            'state': COL_STATE,
            'state_name': 'State Name',
            'race': COL_RACE_ETHNICITY,
            'dataset': 'Dataset',
            'geo_state_name': 'Geo State Name',
            'population': COL_POPULATION
        })

    population_df[COL_POPULATION] = population_df[COL_POPULATION].astype(
        'Int64')
    print('unknown ethnicity data')
    print(population_df[population_df[COL_RACE_ETHNICITY] ==
                        'Unknown Ethnicity'])

    # Load region data
    region_query = dw.query('fryanpan13/covid-tracking-racial-data',
                            'SELECT * FROM regions')
    region_df = region_query.dataframe

    region_df = region_df.rename(columns={'state': COL_STATE, 'region': 'Region', 'subregion': 'Subregion'}) \
                         .drop(columns=['state_name'])

    # Load expected deaths data
    expected_deaths_query = dw.query('fryanpan13/covid-tracking-racial-data',
                                     'SELECT * FROM expected_deaths')
    expected_deaths_df = expected_deaths_query.dataframe[[
        'state', 'race', 'dataset', 'expected_deaths'
    ]]
    expected_deaths_df = expected_deaths_df.rename(
        columns={
            'state': COL_STATE,
            'dataset': COL_DATASET,
            'race': COL_RACE_ETHNICITY,
            'expected_deaths': 'Expected Deaths'
        })
    expected_deaths_df['Expected Deaths'] = expected_deaths_df[
        'Expected Deaths'].astype('float')

    population_df = population_df.set_index([COL_STATE])
    region_df = region_df.set_index([COL_STATE])
    population_df = population_df.merge(region_df, on=[COL_STATE])

    population_index = [COL_DATASET, COL_STATE, COL_RACE_ETHNICITY]
    population_df = population_df.reset_index(
        drop=False).set_index(population_index)

    expected_deaths_df = expected_deaths_df.reset_index(
        drop=False).set_index(population_index)
    population_df = population_df.merge(expected_deaths_df,
                                        on=population_index)

    print(population_df.columns)
    print(df.columns)

    # Join population
    return df.join(population_df, population_index)
# datadotworld module has been imported as dw
import datadotworld as dw

## Complete the SQL query to select all rows from the `unhcr_all` table where `Year` equals 2010. Assign the query string to a `sql_query` variable.
sql_query = "SELECT * FROM `unhcr_all` WHERE Year = 2010"

## Use the `query` method of the datadotworld module to run the `sql_query` against the `https://data.world/nrippner/refugee-host-nations` dataset. Assign the results to a `query2010` variable.
query2010 = dw.query("https://data.world/nrippner/refugee-host-nations",
                     sql_query)

## Use the dataframe property of the resulting query to create a dataframe variable named `unhcr2010`
unhcr2010 = query2010.dataframe

## Print the first 5 rows using the head method.
pp.pprint(unhcr2010.head(5))
def get_state_reporting_category():
    state_reporting_df = dw.query('fryanpan13/covid-tracking-racial-data',
                                  'SELECT * FROM state_reporting_category')
    return state_reporting_df.dataframe
def get_age_ranges():
    # Load standard weights by age range
    age_range_query = dw.query('fryanpan13/covid-tracking-racial-data',
                               'SELECT * FROM standard_population_weights')
    return age_range_query.dataframe
"""
  email    : [email protected]
             [email protected]

DataDotWorldBBallStats.name  pointspergame  assistspergame  \
0                         Jon           20.4             1.3   
1                         Rob           15.5             8.0   
2                      Sharon           30.1            11.2   
3                        Alex            8.2             0.5   
4                     Rebecca           12.3            17.0   
5                      Ariane           18.1             3.0   
6                       Bryon           16.0             8.5   
7                        Matt           13.0             2.1   

  DataDotWorldBBallTeam.name  height handedness  
0                        Jon    6'5"      Right  
1                        Rob  6'7.5"       Left  
2                     Sharon    6'3"      Right  
3                       Alex    6'2"      Right  
4                    Rebecca      7'      Right  
5                     Ariane    5'8"       Left  
6                      Bryon      7'      Right  
7                       Matt    5'5"      Right  
"""

import datadotworld as dw
name = 'jonloyens/an-intro-to-dataworld-dataset'
results = dw.query(name,
    'SELECT * FROM `DataDotWorldBBallStats`, `DataDotWorldBBallTeam` '
    'WHERE DataDotWorldBBallTeam.Name = DataDotWorldBBallStats.Name')
df = results.dataframe
print(df)
Example #23
def refresh(force_update=False):
    '''
    Crawls for new embeddings with the vecshare tag and updates the index file
    with new embedding sets or changes to existing shared embeddings.

    Args:
        force_update (bool, opt): Hard reset; re-index ALL available embeddings.
            If False, only index new or updated embedding sets.
    Returns:
        True if any embedding was (re)indexed, else False. Uploads a new
        index_file.csv to the indexer on the data store.
    '''
    # Retrieve source for data.world:vecshare search results
    display = Display(visible=0, size=(800, 600))
    display.start()
    wd = webdriver.Firefox(executable_path="/usr/bin/firefox", capabilities={"marionette": False})

    page_num, set_count, sets = 1, 1000, []

    # Page through the search results until every tagged set is collected.
    while set_count > len(sets):
        wd.get(info.DATASETS_URL + "?page=" + str(page_num))
        try:
            WebDriverWait(wd, 5).until(EC.visibility_of_element_located(
                (By.CLASS_NAME, info.DW_CLASS_TAG)))
        except Exception:
            pass
        soup = BeautifulSoup(wd.page_source, 'lxml')
        set_txt = soup.find('h1', 'TopicView__headline___2_0-1').text
        set_count = [int(s) for s in set_txt.split() if s.isdigit()][0]
        sets.extend([s["href"][1:] for s in soup.find_all('a', info.DW_CLASS_TAG)])
        page_num += 1
    dw_api = dw.api_client()
    wd.close()
    print("Found " + str(len(sets)) + " sets with the " + info.EMB_TAG + " tag.")

    embeddings, prev_indexed, updated = [], [], False
    if not force_update:
        prev_query = dw.query(info.INDEXER, 'SELECT dataset_name, embedding_name FROM '+ info.INDEX_FILE).dataframe
        for ind, row in prev_query.iterrows():
            prev_indexed.append("/".join(row.values))

    for set_name in sets:
        curr_set = dw.load_dataset(set_name, force_update=True)  # Embedding
        curr_meta = dw_api.get_dataset(set_name)
        set_updated = parse(curr_meta['updated'])
        meta_dict = dict()
        contrib = curr_meta["owner"]
        resources = curr_set.describe()['resources']

        # Dataset summaries carry "key: value" metadata separated by
        # periods or commas.
        summary = StringIO(curr_meta["summary"])
        for line in summary:
            for sent in line.split("."):
                for field in sent.split(","):
                    meta_field = field.split(":")
                    if len(meta_field) == 2:
                        meta_dict[meta_field[0].strip().lower().replace(" ", "_").replace("-", "_")] = meta_field[1].strip()

        for each in curr_meta['files']:
            emb_name = each['name'][:-4]
            emb_updated = parse(each['updated'])
            try:
                ind_query = ('SELECT last_updated FROM ' + info.INDEX_FILE +
                             ' WHERE dataset_name = "' + set_name +
                             '" and embedding_name = "' + emb_name + '"')
                query_results = dw.query(info.INDEXER, ind_query).dataframe.iloc[0].values[0]
                last_indexed = parse(query_results)
                if emb_updated > set_updated:
                    last_updated = emb_updated
                else:
                    last_updated = set_updated
            except Exception:
                # Not indexed before; treat it as new.
                last_updated = datetime.datetime.utcnow().replace(tzinfo=pytz.UTC)
                last_indexed = datetime.datetime.utcnow().replace(tzinfo=pytz.UTC)

            # Index if the embedding is new, or if its data/metadata changed
            # since the last indexing run.
            if force_update or (set_name + '/' + emb_name not in prev_indexed) or (last_indexed < last_updated):
                try:
                    curr_emb = curr_set.describe(emb_name.lower())
                except Exception:
                    continue
                updated = True
                emb_dim = len(curr_emb['schema']['fields']) - 1
                file_format = curr_emb['format']
                try:
                    vocab_size = dw.query(set_name, "SELECT COUNT(text) FROM " + emb_name).dataframe.iloc[0, 0]
                except Exception:
                    vocab_size = ""
                emb_simset = vecshare.extract(emb_name, 'sim_vocab', set_name=set_name,
                                              case_sensitive=True, progress=False)
                score_dict = sim_benchmark._eval_all(emb_simset)

                temp_0 = 'original/' + emb_name.lower() + '.csv'
                temp_1 = emb_name.lower()

                # Pull "key: value" metadata out of the file descriptions
                # (the two candidate file names share the same parsing logic).
                for d in resources:
                    if d['name'] in (temp_0, temp_1):
                        try:
                            description = StringIO(d['description'])
                            for line in description:
                                for sent in line.split('.'):
                                    for field in sent.split(","):
                                        meta_field = field.split(":")
                                        if len(meta_field) == 2:
                                            meta_dict[meta_field[0].strip().lower().replace(" ", "_")] = meta_field[1].strip()
                        except Exception:
                            pass
                print("Newly Indexed embedding: " + emb_name + " from dataset " + set_name + ".")
                meta_dict.update(score_dict)
                meta_dict.update({
                            u'embedding_name': emb_name,
                            u"dataset_name": set_name,
                            u"contributor": contrib,
                            u"dimension": emb_dim,
                            u"vocab_size": vocab_size,
                            u"file_format": file_format,
                            u"last_updated": last_updated})
                embeddings.append(deepcopy(meta_dict))
            else:
                print("Re-indexed embedding: " + emb_name + " from dataset " + set_name + ".")
                query = ('SELECT * FROM ' + info.INDEX_FILE + ' WHERE dataset_name = "' +
                         set_name + '" and embedding_name = "' + emb_name + '"')
                prev_row = dw.query(info.INDEXER, query).dataframe
                embeddings.extend(prev_row.to_dict(orient='records'))

    with io.open(info.INDEX_FILE_PATH, 'w', encoding="utf-8") as ind:
        meta_header = set().union(*embeddings)
        csv_writer = csv.DictWriter(ind, fieldnames=meta_header)
        csv_writer.writeheader()
        for emb in embeddings:
            csv_writer.writerow(emb)

    print("Updating index file at " + info.INDEXER_URL)
    dw_api.upload_files(info.INDEXER, info.INDEX_FILE_PATH)
    if updated:
        # _emb_rank()
        print("Updating avg_rank signatures")
        avgrank_refresh()
        return updated
    else:
        return False
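The summary and description parsers above recover flat "key: value" metadata separated by periods or commas. A sketch of the format they expect, with illustrative field names:

summary = "Corpus: Wikipedia 2017, Vocab Size: 400000. Case: lowercased"
# refresh() folds a summary like this into meta_dict as:
# {'corpus': 'Wikipedia 2017', 'vocab_size': '400000', 'case': 'lowercased'}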
Example #25
def query(words,
          emb_name,
          set_name=None,
          case_sensitive=False,
          download=False,
          vs_format=None):
    """Query a set of word vectors from an indexed embedding.

    Args:
        words (list of str): The set of word vectors being queried
        emb_name (str): The embedding from which word vectors are being queried
        set_name (str, opt): Name of dataset being queried (format: owner/id)
        case_sensitive (bool, opt): Flag for matching exact case in query
        download (bool, opt): Flag for saving query results
        vs_format (str, opt): Storage format of the embedding, if known

    Returns:
        Pandas DataFrame, each row specifying a word vector.
    """

    # Set names should have '-' and file names should use '_'
    def partial_query(args):
        return dw.query(args[0], args[1]).dataframe

    if case_sensitive:
        title = 'text'
    else:
        words = [word.lower() for word in words]
        title = 'lower(text)'
    set_name, vs_format = _error_check(emb_name,
                                       set_name=set_name,
                                       vs_format=vs_format)
    query_list, proc_cnt = [], 16
    ind_results, combined_vecs = pd.DataFrame(), pd.DataFrame()
    multiproc = pp.ProcessPool(proc_cnt)

    if vs_format == 'large':
        # Large embeddings are sharded into appendix tables. First look up
        # which appendix holds each requested word.
        try:
            ind_query = 'SELECT * FROM ' + emb_name + ' where ' + title
            if len(words) == 1:
                cond = ' = "' + words[0] + '"'
                ind_results = dw.query(set_name, ind_query + cond).dataframe
            else:
                # Batch the IN clauses to respect query-size limits.
                for i in range(0, len(words), 400):
                    query_words = words[i:i + 400]
                    query_list.append([
                        set_name, ind_query + ' in' + str(tuple(query_words))
                    ])
                word_index = multiproc.map(partial_query, query_list)
                ind_results = pd.concat(word_index)
        except Exception:
            raise RuntimeError(
                "Embedding is formatted improperly. Check upload at: " +
                set_name)
        num_appx = ('SELECT app_num FROM ' + info.INDEX_FILE +
                    ' WHERE embedding_name = "' + emb_name +
                    '" and dataset_name = "' + set_name + '"')
        app_count = dw.query(info.INDEXER, num_appx).dataframe.iloc[0, 0]

    query_list = []
    if vs_format == 'large':
        for each in range(app_count):
            base_query = ('SELECT * FROM ' + emb_name.lower() + "_appx" +
                          str(each) + ' where ' + title)
            # Words that the index places in this appendix.
            appx_words = list(ind_results[ind_results['app_setname'] ==
                                          set_name + "-appx" + str(each)]['text'])
            if not case_sensitive:
                appx_words = [w.lower() for w in appx_words]
            for i in range(0, len(appx_words), 400):
                chunk = appx_words[i:i + 400]
                if len(chunk) > 1:
                    cond = ' in' + str(tuple(chunk))
                else:
                    cond = ' = "' + chunk[0] + '"'
                query_list.append(
                    [set_name + "-appx" + str(each), base_query + cond])
    else:
        base_query = ('SELECT * FROM ' + emb_name.lower().replace('-', "_") +
                      ' where ' + title)
        for i in range(0, len(words), 400):
            if len(words) > 1:
                cond = ' in' + str(tuple(words[i:i + 400]))
            else:
                cond = ' = "' + words[0] + '"'
            query_list.append([set_name, base_query + cond])

    try:
        word_vecs = multiproc.map(partial_query, query_list)
    except Exception:
        raise RuntimeError("Improper Query: " + query_list[0][1])
    combined_vecs = pd.concat(word_vecs) if word_vecs else pd.DataFrame()
    multiproc.terminate()

    if download:
        with io.open(emb_name + '_query.csv', 'w',
                     encoding='utf-8') as extract_csv:
            combined_vecs.to_csv(extract_csv, encoding='utf-8', index=False)

    return combined_vecs
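The repeated range(0, len(words), 400) slicing batches the IN clauses to stay under data.world's query-size limits; a small helper (not part of the original module) makes the pattern reusable:

def batched(seq, size=400):
    """Yield successive fixed-size slices of seq."""
    for i in range(0, len(seq), size):
        yield seq[i:i + size]

# e.g. build the IN-clause conditions batch by batch:
# for chunk in batched(words):
#     query_list.append([set_name, base_query + ' in' + str(tuple(chunk))])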
Example #26
from pymake.utils.common.prettymessaging import PrettyMessaging
import datadotworld

pm = PrettyMessaging('Workshop-Spark')

results = datadotworld.query('jrm/traffic-violations',
                             "SELECT * FROM traffic_violations")

results_df = results.dataframe

pm.print_dict(results_df)

results_df.to_csv('data/raw.csv', index=False)
Example #27
def partial_query(args):
    return dw.query(args[0], args[1]).dataframe
                                          how='left',
                                          left_on='state',
                                          right_on='stusab')

## Add a 'citystate' column to the merged_dataframe dataframe, populating it with the concatenated values from the 'city' and 'state_name' columns, separated by ', '.
merged_dataframe['citystate'] = merged_dataframe[
    'city'] + ', ' + merged_dataframe['state_name']

## Print first 5 rows of merged_dataframe
pp.pprint(merged_dataframe.head())

## Complete the SQL query to select all rows from the `unhcr_all` table where `Year` equals 2010. Assign the query string to a `sql_query` variable.
sql_query = "SELECT * FROM `unhcr_all` WHERE Year = 2010"

## Use the `query` method of the datadotworld module to run the `sql_query` against the `https://data.world/nrippner/refugee-host-nations` dataset. Assign the results to a `query2010` variable.
query2010 = dw.query('https://data.world/nrippner/refugee-host-nations',
                     query=sql_query)

## Use the dataframe property of the resulting query to create a dataframe variable named `unhcr2010`
unhcr2010 = query2010.dataframe

## Print the first 5 rows using the head method.
pp.pprint(unhcr2010.head())

## Complete the SQL query to select state, the count of farmers markets (fmid), and average obesity rate from agriculture.`national-farmers-markets`.export, LEFT JOINED against health.`obesity-by-state-2014`.adult_obese on state and location
sql_query = "SELECT state, count(FMID) as count, Avg(obesity.value) as obesityAvg FROM export LEFT JOIN health.`obesity-by-state-2014`.`adult_obese` as obesity ON state = obesity.location GROUP BY state ORDER BY count desc"

## Use the `query` method of the datadotworld module to run the `sql_query` against the `https://data.world/agriculture/national-farmers-markets` dataset. Assign the results to a `queryResults` variable.
queryResults = dw.query(
    "https://data.world/agriculture/national-farmers-markets", sql_query)

## Use the dataframe property of the resulting query to create a dataframe variable named `stateStats`
stateStats = queryResults.dataframe
Example #30
import datetime
from datetime import date
from datetime import timedelta

import datadotworld as dw
from plotnine import *

# Open and format the data.world SQL query.
file = open("C:/Users/Riesser/Documents/python_popup/dw_pull.txt")
line = file.read().replace("\n", " ")

# Collapse extra whitespace into a single query string.
line = ' '.join(line.split())
file.close()

# Run the query using the "line" variable.
results = dw.query('associatedpress/johns-hopkins-coronavirus-case-tracker',
                   line)
df = results.dataframe

#read temp file with data for testing
# df = pd.read_csv('temp.csv')
# df = df.drop('Unnamed: 0', 1)

#create variable and column for today's date
today = date.today()
df['recorded_date'] = today

old_data_date = today - datetime.timedelta(days=2)

# append dataset into older data
df.to_csv('C:/Users/Riesser/Documents/python_popup/df_storage.csv',
          mode='a',