Example #1
0
def uniq_parks_counts_dask(filename):
    '''
    Count, with Dask, how many rows are recorded for each park and return the
    counts as CSV text.

    The CSV at *filename* is loaded with Dask, rows whose ``Nom_parc`` is
    missing are dropped, and the occurrences of each remaining park name are
    counted (one row per treated tree, in the wording of the assignment).
    The result holds one "park,count" line per park, ordered by park name,
    and every line ends with a newline.

    Test file: tests/test_uniq_parks_counts_dask.py
    Note: The return value is a CSV string; see
          *tests/list_parks_count.txt* for the exact expected format.

    :param filename: path of the CSV file to read; it must provide a
        ``Nom_parc`` column.
    :return: all "park,count" lines concatenated into one string (the empty
        string when no row carries a park name).
    '''
    frame = df.read_csv(filename, dtype={'Nom_parc': str})
    named_rows = frame[frame.Nom_parc.notnull()]

    # Rows become plain tuples once converted to a bag, so the park name is
    # read by position (assumes ``Nom_parc`` is the 7th column -- TODO confirm
    # against the CSV header).
    park_names = (named_rows.to_bag()
                  .filter(lambda row: row[6] is not None)
                  .map(lambda row: row[6]))

    # frequencies() yields (park, count) pairs with one pair per park, so
    # sorting the pairs orders them by park name.
    counts = sorted(park_names.frequencies().compute())

    return ''.join('{},{}\n'.format(park, count) for park, count in counts)
Example #2
0
 def median(bmin, bmax, bcount):
     '''
     Bisect the interval [bmin, bmax] for a value that has exactly
     ``bcount // 2`` elements of ``bag`` strictly greater than it.

     ``bag`` and ``epsilon`` are taken from the enclosing scope. Every
     iteration triggers one filtered count over ``bag``.

     :param bmin: lower bound of the search interval.
     :param bmax: upper bound of the search interval.
     :param bcount: number of elements the caller attributes to ``bag``.
     :return: the first candidate meeting the condition, or ``None`` when the
         step size drops to ``epsilon`` or below without finding one.
     '''
     half = bcount // 2
     width = (bmax - bmin) / 2.
     guess = bmin + width
     while width > epsilon:
         above = bag.filter(lambda x: x > guess).count().compute()
         if above == half:
             return guess
         width /= 2
         if above <= half:
             # Too few elements above the guess: move the guess down.
             guess -= width
         else:
             # Too many elements above the guess: move the guess up.
             guess += width
     return None
Example #3
0
def frequent_parks_count_dask(filename):
    '''
    Return, as CSV text, the 10 parks with the highest number of recorded
    rows, computed with Dask.

    The CSV at *filename* is loaded with Dask, rows whose ``Nom_parc`` is
    missing are dropped, and the occurrences of each remaining park name are
    counted (one row per treated tree, in the wording of the assignment).
    Parks are ordered by decreasing count, and alphabetically by name when
    counts are equal. Each "park,count" pair sits on its own line and every
    line ends with a newline.

    Test file: tests/test_frequent_parks_count_dask.py
    Note: The return value is a CSV string; see *tests/frequent.txt* for the
          exact expected format.

    :param filename: path of the CSV file to read; it must provide a
        ``Nom_parc`` column.
    :return: at most 10 "park,count" lines concatenated into one string (the
        empty string when no row carries a park name).
    '''
    frame = df.read_csv(filename, dtype={'Nom_parc': str})
    named_rows = frame[frame.Nom_parc.notnull()]

    # Rows become plain tuples once converted to a bag, so the park name is
    # read by position (assumes ``Nom_parc`` is the 7th column -- TODO confirm
    # against the CSV header).
    park_names = (named_rows.to_bag()
                  .filter(lambda row: row[6] is not None)
                  .map(lambda row: row[6]))

    # Rank every (park, count) pair before truncating. Sorting on
    # (-count, name) gives decreasing counts with alphabetical tie-breaking,
    # and slicing afterwards makes the cut at the 10th place follow the same
    # rule instead of an arbitrary pick among equally frequent parks.
    ranked = sorted(park_names.frequencies().compute(),
                    key=lambda pair: (-pair[1], pair[0]))

    return ''.join('{},{}\n'.format(park, count)
                   for park, count in ranked[:10])
Example #4
0
def filter_func(data):
    """Tell whether a record's payload origination country code is 'USA'.

    Raises ``KeyError`` if the record has no 'payload' entry or the payload
    has no 'originationCountryCode' entry.
    """
    payload = data['payload']
    country_code = payload['originationCountryCode']
    return country_code == 'USA'


# Benchmark loop: for each worker count, read the Avro data at URLPATH1, keep
# the records accepted by filter_func, convert the bag to a dataframe and
# count the non-null ``payload`` values, logging START/FINISH markers around
# every stage so the stages can be timed from the log.
for num_workers in [2, 3, 4]:
    # Label used in the BEGIN/COMPLETE log lines of this run.
    test_name = "dsk_filter_cnt_{}_{}".format('threaded', num_workers)
    LOGGER.info('BEGIN: Running test: {}'.format(test_name))

    LOGGER.info('START: Creating dask bag with filter')
    # NOTE(review): storage_options is presumably forwarded to the remote
    # filesystem client selected by URLPATH1's scheme -- confirm.
    # NOTE(review): per the dask docs, blocksize=None means files are not
    # split (one partition per file) -- confirm for the dask version in use.
    bag = dask.bag.read_avro(
        URLPATH1,
        storage_options={'config_kwargs': {
            'max_pool_connections': 500
        }},
        blocksize=None)
    bag = bag.filter(filter_func)
    LOGGER.info('FINISH: Dask bag created')

    LOGGER.info('START: Creating dask dataframe')
    # meta declares both columns as generic objects.
    df = bag.to_dataframe(meta={'payload': 'object', 'metadata': 'object'})
    LOGGER.info('FINISH: Dask dataframe created')

    LOGGER.info('START: Starting count')
    # NOTE(review): dask collections are generally lazy, so most of the work
    # presumably happens inside this compute() call rather than in the
    # stages logged above -- keep in mind when reading the timings.
    # NOTE(review): th_get is defined outside this chunk (presumably the
    # threaded scheduler's get). Newer dask releases replaced the ``get=``
    # keyword with ``scheduler=`` -- confirm the targeted dask version.
    cnt = df.payload.count().compute(get=th_get, num_workers=num_workers)
    LOGGER.info('FINISH: Count is %s', cnt)

    LOGGER.info('COMPLETE: Running test: {}'.format(test_name))

# Build the label for the 'synchronous' run (with 1 in the worker-count slot
# of the label) and log its start.
test_name = "dsk_filter_cnt_{}_{}".format('synchronous', 1)
LOGGER.info('BEGIN: Running test: {}'.format(test_name))