import dask.dataframe as df


def uniq_parks_counts_dask(filename):
    '''
    Write a Python script using Dask that counts the number of trees
    treated in each park and prints a list of "park,count" pairs in a
    CSV manner ordered alphabetically by the park name.
    Every element in the list must be printed on a new line.
    Test file: tests/test_uniq_parks_counts_dask.py
    Note: The return value should be a CSV string.
          Have a look at the file *tests/list_parks_count.txt* to get
          the exact return format.
    '''
    dd = df.read_csv(filename, dtype={'Nom_parc': str})
    # Keep only the rows that actually have a park name.
    dd_filtered = dd[dd.Nom_parc.notnull()]
    db = dd_filtered.to_bag()
    # Column 6 holds 'Nom_parc'; count how many trees fall in each park.
    result = (db.filter(lambda row: row[6] is not None)
                .map(lambda row: row[6])
                .frequencies()
                .compute())
    # Build the "park,count" CSV string, ordered alphabetically by park name.
    string = ""
    for park, count in sorted(result):
        string += park + ',' + str(count) + '\n'
    return string
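
# A minimal sketch of the same count done purely with dask.dataframe,
# avoiding the hard-coded column index used in the bag pipeline above.
# This helper is illustrative only (not referenced by the tests) and
# assumes the CSV exposes the park name in a 'Nom_parc' column.
def _uniq_parks_counts_ddf_sketch(filename):
    ddf = df.read_csv(filename, dtype={'Nom_parc': str})
    counts = (ddf[ddf.Nom_parc.notnull()]
              .groupby('Nom_parc')
              .size()
              .compute()
              .sort_index())
    return ''.join('{},{}\n'.format(park, n) for park, n in counts.items())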
def median(bmin, bmax, bcount):
    '''
    Approximate the median of the values in `bag` (a Dask bag assumed to
    be defined in the enclosing scope) by binary search over the value
    range [bmin, bmax]. `bcount` is the total number of elements and
    `epsilon` (also from the enclosing scope) is the stopping tolerance.
    '''
    step = (bmax - bmin) / 2.
    proposal = bmin + step
    while step > epsilon:
        # Count how many elements lie above the current proposal.
        largercount = bag.filter(lambda x: x > proposal).count().compute()
        if largercount == bcount // 2:
            return proposal
        step /= 2
        # Too many elements above the proposal: the median is higher;
        # too few: the median is lower.
        if largercount > bcount // 2:
            proposal += step
        else:
            proposal -= step
    return None
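
# A minimal usage sketch for median(), assuming the module-level `bag`
# and `epsilon` globals the function expects. The sample values and
# tolerance below are hypothetical, chosen only to exercise the search;
# for this sequence the search converges immediately and returns 6.0.
def _median_usage_sketch():
    import dask.bag
    global bag, epsilon
    bag = dask.bag.from_sequence([1.0, 3.0, 5.0, 7.0, 9.0, 11.0])
    epsilon = 1e-6
    bcount = bag.count().compute()
    bmin = bag.min().compute()
    bmax = bag.max().compute()
    return median(bmin, bmax, bcount)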
def frequent_parks_count_dask(filename):
    '''
    Write a Python script using Dask that prints the list of the 10 parks
    with the highest number of treated trees. Parks must be ordered by
    decreasing number of treated trees and by alphabetical order when
    they have a similar number.
    Every list element must be printed on a new line.
    Test file: tests/test_frequent_parks_count_dask.py
    Note: The return value should be a CSV string.
          Have a look at the file *tests/frequent.txt* to get the exact
          return format.
    '''
    dd = df.read_csv(filename, dtype={'Nom_parc': str})
    # Keep only the rows that actually have a park name.
    dd_filtered = dd[dd.Nom_parc.notnull()]
    db = dd_filtered.to_bag()
    # Column 6 holds 'Nom_parc'; keep the 10 parks with the highest counts.
    result = (db.filter(lambda row: row[6] is not None)
                .map(lambda row: row[6])
                .frequencies()
                .topk(10, key=1)
                .compute())
    # Order by decreasing count, breaking ties alphabetically by park name.
    listoftuples = sorted(result, key=lambda pair: (-pair[1], pair[0]))
    separator = '\n'
    string = ""
    for park, count in listoftuples:
        string += park + ',' + str(count) + separator
    return string
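
# Quick usage sketch for the two Dask solutions above. The CSV path is a
# hypothetical placeholder; the test suite supplies the real input file.
#
#   print(uniq_parks_counts_dask('path/to/trees.csv'))
#   print(frequent_parks_count_dask('path/to/trees.csv'))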
def filter_func(data):
    # Keep only records whose payload originates from the USA.
    return data['payload']['originationCountryCode'] == 'USA'


# LOGGER, URLPATH1, th_get (the threaded scheduler's get function) and the
# dask.bag import are assumed to be defined earlier in this script.
for num_workers in [2, 3, 4]:
    test_name = "dsk_filter_cnt_{}_{}".format('threaded', num_workers)
    LOGGER.info('BEGIN: Running test: {}'.format(test_name))

    LOGGER.info('START: Creating dask bag with filter')
    bag = dask.bag.read_avro(
        URLPATH1,
        storage_options={'config_kwargs': {'max_pool_connections': 500}},
        blocksize=None)
    bag = bag.filter(filter_func)
    LOGGER.info('FINISH: Dask bag created')

    LOGGER.info('START: Creating dask dataframe')
    ddf = bag.to_dataframe(meta={'payload': 'object', 'metadata': 'object'})
    LOGGER.info('FINISH: Dask dataframe created')

    LOGGER.info('START: Starting count')
    # Note: newer Dask releases replace the `get=` keyword with
    # `scheduler=`, e.g. compute(scheduler='threads', num_workers=...).
    cnt = ddf.payload.count().compute(get=th_get, num_workers=num_workers)
    LOGGER.info('FINISH: Count is %s', cnt)

    LOGGER.info('COMPLETE: Running test: {}'.format(test_name))

test_name = "dsk_filter_cnt_{}_{}".format('synchronous', 1)
LOGGER.info('BEGIN: Running test: {}'.format(test_name))
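
# Sketch of what the single-worker synchronous variant of the count above
# could look like (the original continuation is not shown here). It
# assumes the same bag/dataframe construction and swaps in Dask's
# synchronous scheduler.
#
#   cnt = ddf.payload.count().compute(get=dask.get)            # older Dask
#   cnt = ddf.payload.count().compute(scheduler='synchronous')  # newer Dask
#   LOGGER.info('FINISH: Count is %s', cnt)
#   LOGGER.info('COMPLETE: Running test: {}'.format(test_name))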