def calculate_weights(rdd_in: RDD) -> RDD:
    # Collect the (key, count) pairs, record the grand total in the module-level
    # list, and return an RDD of (key, count, percentage-of-total) triples.
    rdd_in = rdd_in.collect()
    total: int = compute_sum_of_values(rdd_in)
    total_counts_for_the_four.append(total)
    return SC.parallelize(rdd_in).map(
        lambda x: (x[0], x[1], round(100 * x[1] / total)))
from collections import Counter
from typing import Collection

from pyspark import RDD


def assert_rdd_equal(expected: Collection, result: RDD, check_order: bool = True):
    """
    Compare two RDDs, or one RDD with a Collection.

    :param expected: A Collection to compare. For convenience, doesn't need to be an RDD.
    :param result: The RDD to compare.
    :param check_order: Compare the order of values.
    """
    if isinstance(expected, RDD):
        expected = expected.collect()
    else:
        expected = [_ for _ in expected]
    result = result.collect()

    # length comparison
    msg = f'RDD length {len(result)} does not match expected {len(expected)}'
    assert len(expected) == len(result), msg

    # value comparison
    if check_order is True:
        assert expected == result
    else:
        assert Counter(expected) == Counter(result)
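# A minimal usage sketch for assert_rdd_equal (illustrative only): the local
# SparkContext, the demo function name, and the sample data below are
# assumptions, not part of the original snippet.
from pyspark import SparkContext


def _demo_assert_rdd_equal() -> None:
    sc = SparkContext(master="local[1]", appName="assert_rdd_equal_demo")
    result_rdd = sc.parallelize([("a", 1), ("b", 2), ("c", 3)])
    # expected can be a plain Python list; ordering is ignored with check_order=False
    assert_rdd_equal([("b", 2), ("a", 1), ("c", 3)], result_rdd, check_order=False)
    # with check_order=True (the default) the exact ordering must also match
    assert_rdd_equal([("a", 1), ("b", 2), ("c", 3)], result_rdd)
    sc.stop()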
def run(log: logging.Log4j,
        config: Dict[str, str],
        n_cols: int,
        vrn_rdd_tfm: RDD,
        results_rdd: RDD,
        prices_rdd_tfm: RDD) -> None:
    """Runner of the Load phase.

    Loads the transformed RDDs back into the respective worksheets in GSheets:
    - vrn_rdd_tfm - "VRNCleaned" worksheet
    - results_rdd - "Results" worksheet
    - prices_rdd_tfm - "Prices" worksheet

    Args:
        log: Log4j object
        config: Key-value mappings of config values
        n_cols: Number of columns in the original VRN worksheet
        vrn_rdd_tfm: Transformed VRN RDD
        results_rdd: Results RDD
        prices_rdd_tfm: Transformed car prices RDD
    """
    # config values used
    spreadsheet_id = config[constants.CONFIG_GSHEET_SPREADSHEET_ID_DEV]
    ws_title_vrn_cleaned = config[constants.CONFIG_GSHEET_WS_VRN_CLEANED]
    ws_title_results = config[constants.CONFIG_GSHEET_WS_RESULTS]
    ws_title_prices = config[constants.CONFIG_GSHEET_WS_PRICES]

    # load VRN RDD and save to "VRNCleaned" worksheet
    vrn_data_tfm_flattened = vrn_rdd_tfm.collect()
    # split this flat list into chunks, where each chunk is one row of n_cols elements
    vrn_data_tfm = list(genhelpers._chunks(vrn_data_tfm_flattened, n_cols))
    vrn_resp = gsheet.save_to_worksheet(spreadsheet_id, ws_title_vrn_cleaned, vrn_data_tfm, False)
    _log_load_resp(log, ws_title_vrn_cleaned, vrn_resp)

    # load results RDD and save to "Results" worksheet
    results_data_flattened = results_rdd.collect()
    # split this flat list into chunks, where each chunk is one row of n_cols elements
    results_data = list(genhelpers._chunks(results_data_flattened, n_cols))
    results_resp = gsheet.save_to_worksheet(spreadsheet_id, ws_title_results, results_data, False)
    _log_load_resp(log, ws_title_results, results_resp)

    # load prices RDD and save to "Prices" worksheet
    prices_data_tfm = prices_rdd_tfm.collect()
    prices_resp = gsheet.save_to_worksheet(spreadsheet_id, ws_title_prices, prices_data_tfm, True)
    _log_load_resp(log, ws_title_prices, prices_resp)

    return None
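# For illustration only: genhelpers._chunks is assumed to behave like the
# generator below, yielding successive rows of n elements from a flat list.
# This is a hypothetical equivalent, not the project's actual implementation.
from typing import Iterator, List, Sequence


def _chunks_sketch(flat: Sequence, n: int) -> Iterator[List]:
    for i in range(0, len(flat), n):
        yield list(flat[i:i + n])


# e.g. list(_chunks_sketch(["a", 1, "b", 2], 2)) == [["a", 1], ["b", 2]]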
def writeFile(rdd: RDD):
    num = rdd.count()
    if num > 0:
        # merge the RDD's (key, value) pairs into the totals loaded by
        # open_result() and write the result back to `path` as JSON
        result_dic = open_result()
        print(result_dic)
        rdd_c = rdd.collect()
        # re-parse the collected list via its repr (keeps the original behaviour)
        lst = ast.literal_eval(str(rdd_c))
        for item in lst:
            key = item[0]
            value = item[1]
            clean_key = str(key).replace("'", '')
            if clean_key in result_dic:
                result_dic[clean_key] = result_dic[clean_key] + value
            else:
                result_dic[clean_key] = value
        result = open(path, 'w', encoding='utf-8')
        result.write(
            json.dumps(result_dic).encode('gb18030').decode('unicode_escape'))
        result.close()
def writeFile(rdd: RDD, f_type):
    num = rdd.count()
    global save1
    global save2
    if num > 0:
        result_dic = open_result(f_type)
        rdd_c = rdd.collect()
        lst = ast.literal_eval(str(rdd_c))
        for item in lst:
            key = item[0]
            value = item[1]
            clean_key = str(key).replace("'", '')
            if clean_key in result_dic:
                result_dic[clean_key] = result_dic[clean_key] + value
            else:
                result_dic[clean_key] = value
        if f_type == 0:
            save1 = result_dic
        if f_type == 1:
            save2 = result_dic
        result = open(path[f_type], 'w', encoding='utf-8')
        result.write(json.dumps(result_dic).encode('gb18030').decode('unicode_escape'))
        result.close()
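# A hedged sketch of how writeFile might be invoked, assuming it is meant as a
# Spark Streaming foreachRDD callback (the per-RDD count/collect pattern suggests
# this). The socket source, host/port, batch interval, and word-count pipeline
# below are assumptions for illustration, not part of the original code.
from pyspark import SparkContext
from pyspark.streaming import StreamingContext


def _demo_streaming_write() -> None:
    sc = SparkContext(master="local[2]", appName="writeFile_demo")
    ssc = StreamingContext(sc, batchDuration=5)
    lines = ssc.socketTextStream("localhost", 9999)
    counts = (lines.flatMap(lambda line: line.split())
                   .map(lambda word: (word, 1))
                   .reduceByKey(lambda a, b: a + b))
    # f_type=0 writes the merged totals to path[0] and caches them in save1
    counts.foreachRDD(lambda rdd: writeFile(rdd, 0))
    ssc.start()
    ssc.awaitTermination()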
def m_o(ctx: SparkContext, data: pr.RDD) -> None:
    assert isinstance(ctx, SparkContext)
    assert 1 == len(data.collect())