Example #1
0
def load_count_vec_data(spark, hdfs_dir_name, dict_dir_name):
    """Load the count-vector matrix and its column dictionary.

    Args:
        spark: Active Spark session, forwarded to the matrix loader.
        hdfs_dir_name: HDFS directory containing 'count_vector_matrix'.
        dict_dir_name: Directory containing 'col_dict.json'.

    Returns:
        Tuple of (col_dict, count_vector_matrix).
    """
    matrix_path = add_slash_to_dir(hdfs_dir_name) + 'count_vector_matrix'
    dict_path = add_slash_to_dir(dict_dir_name) + 'col_dict.json'

    # Load the matrix first, then the dictionary (same order as callers expect).
    loaded_matrix = load_count_vector_matrix(spark, matrix_path)
    loaded_dict = load_dict(dict_path)
    return loaded_dict, loaded_matrix
Example #2
0
def save_dict(output_dir_dict, out_dict, filename='col_dict.json'):
    """Serialize *out_dict* as JSON into *output_dir_dict*/*filename*.

    Args:
        output_dir_dict: Target directory (trailing slash appended if missing).
        out_dict: JSON-serializable dictionary to persist.
        filename: Output file name (default 'col_dict.json').
    """
    output_dir_dict = add_slash_to_dir(output_dir_dict)
    output_name_dict = output_dir_dict + filename
    make_sure_path_exists(output_dir_dict)
    # Context manager guarantees the handle is flushed and closed, even if
    # json.dump raises — the original open() call leaked the file object.
    with open(output_name_dict, mode='w') as out_file:
        json.dump(out_dict, out_file)
Example #3
0
def save_rdd_mat(output_dir_rdd, rdd_matrix, filename='count_vector_matrix'):
    """Persist an RDD matrix as text files under *output_dir_rdd*/*filename*.

    Args:
        output_dir_rdd: Target directory (trailing slash appended if missing).
        rdd_matrix: Spark RDD to write via ``saveAsTextFile``.
        filename: Subdirectory name for the saved matrix.
    """
    target_path = add_slash_to_dir(output_dir_rdd) + filename
    rdd_matrix.saveAsTextFile(target_path)
    print('***** RDD matrix saved. *****')
Example #4
0
def load_id_list(dir_name):
    """Load the pickled id list from *dir_name*/test_ids.pickle.

    Args:
        dir_name: Directory containing 'test_ids.pickle' (trailing slash
            appended if missing).

    Returns:
        The unpickled object (the list written by ``save_id_list``).
    """
    # Context manager closes the file deterministically — the original
    # pickle.load(open(...)) left the handle to be reclaimed by GC.
    with open(add_slash_to_dir(dir_name) + 'test_ids.pickle',
              mode='rb') as in_file:
        return pickle.load(in_file)
Example #5
0
def save_id_list(id_list, dir_name):
    """Pickle *id_list* to *dir_name*/test_ids.pickle.

    Args:
        id_list: Picklable object (the id list) to persist.
        dir_name: Target directory (trailing slash appended if missing).
    """
    # Context manager guarantees the file is flushed and closed even if
    # pickle.dump raises — the original open() call leaked the handle and
    # could leave an unflushed, partially-written file.
    with open(add_slash_to_dir(dir_name) + 'test_ids.pickle',
              mode='wb') as out_file:
        pickle.dump(id_list, out_file)