Example 1
    def test_distributed_df_creation_with_from_dict_function_orient_index(
            self):
        df = ParallelDataFrame.from_dict(self.dict2, orient='index')
        self.assertEqual(df.globalShape, (4, 3))
        self.assertEqual(set(list(df.globalIndex)),
                         set(['key1', 'key2', 'key3', 'key4']))
        self.assertEqual(list(df.globalColumns), [0, 1, 2])
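The self.dict2 fixture is not part of the snippet. A minimal sketch of what it could look like, inferred from the assertions above (a (4, 3) global shape, index keys 'key1' through 'key4', and integer columns 0 to 2); the concrete values are illustrative only:

    # Hypothetical fixture (e.g. defined in setUp); only the shape and the
    # keys are implied by the assertions, the values are made up.
    self.dict2 = {
        'key1': [10, 11, 12],
        'key2': [20, 21, 22],
        'key3': [30, 31, 32],
        'key4': [40, 41, 42],
    }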
Example 2
    def test_inplace_dropping_single_row_in_index_distributed_dataframe(self):
        df = ParallelDataFrame.from_dict(self.dict2, orient='index')
        self.assertEqual(df.globalShape, (4, 3))
        df.drop('key4', axis=0, inplace=True)
        self.assertEqual(set(list(df.globalIndex)),
                         set(['key1', 'key2', 'key3']))
        self.assertEqual(list(df.globalColumns), [0, 1, 2])
Example 3
    def test_replicated_df_creation_with_from_dict_function_orient_index(self):
        pd_df = pd.DataFrame.from_dict(self.dict2, orient='index')
        df = ParallelDataFrame.from_dict(self.dict2,
                                         orient='index',
                                         dist='replicated')

        self.assertTrue(df.equals(pd_df))
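Passing dist='replicated' appears to keep a complete copy of the frame on every rank, which is why the result can be compared directly against the plain pd.DataFrame.from_dict output with equals; the distributed examples above, by contrast, expose the full picture only through the global* properties.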
Example 4
    def test_global_to_local_functionality_with_index_distribution(self):
        df = ParallelDataFrame.from_dict(self.dict2, orient='index')
        self.assertTrue(isinstance(df.global_to_local, dict))
        self.assertEqual(len(df.global_to_local), 4)
        self.assertEqual(set(list(df.global_to_local.keys())),
                         set(['key1', 'key2', 'key3', 'key4']))
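Only the keys of global_to_local are asserted here; the snippet implies it is a plain dict with one entry per global index label (presumably mapping each label to its local location on the owning rank), but the value format is not pinned down by this test.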
Example 5
import argparse
import os
import time

import pandas as pd
from mpi4py import MPI
# ParallelDataFrame is provided by the MPIpandas package; the exact import
# path below is an assumption and may need adjusting to the installed layout.
from MPIpandas.ParallelDataFrame import ParallelDataFrame


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--stats_file',
        help="Name and location of file where the stats should be saved")
    args = parser.parse_args()
    if args.stats_file:
        file_stats = args.stats_file
    else:
        file_stats = "./microbenchmark_stats.txt"

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    number_processors = comm.Get_size()

    data_folder1 = './MPIpandas/examples/microbenchmarks/test_data1_microbenchmarks'
    data_folder2 = './MPIpandas/examples/microbenchmarks/test_data2_microbenchmarks'

    # If the stats file does not exist yet, write the header line first.
    if rank == 0 and not os.path.isfile(file_stats):
        with open(file_stats, 'a') as file:
            file.write("#Processors, Function, Time(ms)\n")

    #---------------------------------------------------------------------------------------
    #value_counts function------------------------------------------------------------------
    #---------------------------------------------------------------------------------------
    pd_data = pd.read_csv(data_folder1 + "/adult_vc_transpose_40K.data",
                          low_memory=False)
    #pd_data = pd.read_csv(data_folder2+"/adult_vc_transpose_80K.data", low_memory = False)
    pd_data.index = ['workclass', 'education', 'educationNum']
    dist_data = ParallelDataFrame(pd_data, dist_data=False)

    dist_data = dist_data.apply(lambda x: x.str.lower()
                                if x.dtype == 'object' else x)
    dist_data = dist_data.apply(lambda x: x.str.replace('-', '')
                                if x.dtype == 'object' else x)
    dist_data = dist_data.apply(lambda x: x.str.replace(' ', '')
                                if x.dtype == 'object' else x)
    dist_series = dist_data.loc['education']
    comm.Barrier()
    t0 = time.time()
    dist_series.value_counts()
    comm.Barrier()
    if rank == 0:
        with open(file_stats, 'a') as file:
            file.write('{}, value_counts, {}\n'.format(
                number_processors, (time.time() - t0) * 1000))

    #---------------------------------------------------------------------------------------
    #apply function------------------------------------------------------------------
    #---------------------------------------------------------------------------------------
    pd_data = pd.read_csv(data_folder1 + "/adult_80K.data")
    #pd_data = pd.read_csv(data_folder2+"/adult_163K.data")
    pd_data.columns = [
        'age', 'workclass', 'fnlwgt', 'education', 'educationNum',
        'maritalStatus', 'occupation', 'relationship', 'race', 'sex',
        'capitalGain', 'capitalLoss', 'hoursPerWeek', 'nativeCountry',
        'payPerYear'
    ]
    dist_data = ParallelDataFrame(pd_data, dist_data=False)

    comm.Barrier()
    t0 = time.time()
    new_dist_data = dist_data.apply(lambda x: x.str.lower()
                                    if x.dtype == 'object' else x)
    comm.Barrier()
    if rank == 0:
        with open(file_stats, 'a') as file:
            file.write('{}, apply, {}\n'.format(number_processors,
                                                (time.time() - t0) * 1000))

    #---------------------------------------------------------------------------------------
    #from_dict function------------------------------------------------------------------
    #---------------------------------------------------------------------------------------
    pd_data = pd.read_csv(data_folder1 + "/adult_80K.data")
    #pd_data = pd.read_csv(data_folder2+"/adult_163K.data")
    pd_data.columns = [
        'age', 'workclass', 'fnlwgt', 'education', 'educationNum',
        'maritalStatus', 'occupation', 'relationship', 'race', 'sex',
        'capitalGain', 'capitalLoss', 'hoursPerWeek', 'nativeCountry',
        'payPerYear'
    ]
    data_dict = pd_data.to_dict()

    comm.Barrier()
    t0 = time.time()
    dist_data = ParallelDataFrame.from_dict(data_dict, orient='index')
    comm.Barrier()
    if rank == 0:
        with open(file_stats, 'a') as file:
            file.write('{}, from_dict, {}\n'.format(number_processors,
                                                    (time.time() - t0) * 1000))

    #---------------------------------------------------------------------------------------
    #corr function------------------------------------------------------------------
    #---------------------------------------------------------------------------------------
    pd_data = pd.read_csv(data_folder1 + "/adult_discretized_82K.data")
    #pd_data = pd.read_csv(data_folder2+"/adult_discretized_164K.data")
    dist_data = ParallelDataFrame(pd_data)

    comm.Barrier()
    t0 = time.time()
    dist_data.corr()
    comm.Barrier()
    if rank == 0:
        with open(file_stats, 'a') as file:
            file.write('{}, corr, {}\n'.format(number_processors,
                                               (time.time() - t0) * 1000))
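The snippet defines main() but does not show the entry point or the launch command. A minimal sketch, assuming the script is saved as microbenchmark.py (hypothetical file name) and run from the directory that contains the MPIpandas checkout:

if __name__ == '__main__':
    main()

Launched under MPI, for example with mpiexec -n 4 python microbenchmark.py --stats_file ./microbenchmark_stats.txt, every rank executes main() and rank 0 appends one "#Processors, Function, Time(ms)" row per benchmarked function to the stats file.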