Esempio n. 1
0
def create_inverted_index(dataFrame, total_words, top_words):
    """Build a word-by-file table of relative word frequencies.

    For every column of ``dataFrame`` the values are counted with
    ``value_counts`` and, when ``total_words[column]`` is non-zero, divided
    by that total. The per-column results are concatenated side by side, so
    row labels are words and column labels are the columns of ``dataFrame``.
    ``top_words`` is then concatenated in front of the result and every
    missing entry is replaced by ``0``.

    Args:
        dataFrame: frame whose columns are processed one by one
            (presumably one column of words per file -- confirm with caller).
        total_words: object indexable by the column labels of ``dataFrame``;
            the value is used as the divisor for that column's counts.
        top_words: pandas object concatenated column-wise in front of the
            index so that all top words are included in the result.

    Returns:
        The concatenated frame with missing values filled with ``0``.
    """
    inverted_index = dpd.ParallelDataFrame()

    # The `sort` keyword of pd.concat is only passed for pandas >= 0.23.
    concat_options = {"axis": 1}
    if get_pandas_version() >= 0.23:
        concat_options["sort"] = False

    for onefile in dataFrame.columns:
        if dataFrame[onefile].isnull().all():
            # Placeholder row for an all-null column; the '##' label is
            # removed again once every column has been processed.
            word_weights_per_file = dpd.ParallelDataFrame(np.nan,
                                                          index=['##'],
                                                          columns=[onefile])
        else:
            word_weights_per_file = dpd.ParallelDataFrame(
                dataFrame[onefile].value_counts())

        word_weights_per_file.index.name = "words"

        # Skip the division when the total is zero.
        if total_words[onefile] != 0:
            word_weights_per_file = word_weights_per_file.div(
                total_words[onefile])

        inverted_index = pd.concat([inverted_index, word_weights_per_file],
                                   **concat_options)
        del word_weights_per_file

    inverted_index.columns = dataFrame.columns

    if '##' in inverted_index.index:
        inverted_index.drop('##', axis=0, inplace=True)

    inverted_index.fillna(np.float32(0), inplace=True)

    # Makes sure to include all top words in the inverted index.
    inverted_index = pd.concat([top_words, inverted_index], **concat_options)
    inverted_index.fillna(np.float32(0), inplace=True)

    return inverted_index
Esempio n. 2
0
def read_files(path_input):
    """Read the documents under ``path_input`` into a one-column frame.

    The mapping returned by ``parallelIO.read_all(..., return_type='dict')``
    is turned into a frame with ``orient='index'``: one row per key, a single
    column named ``text`` and an index named ``filename``.

    Args:
        path_input: location handed unchanged to ``parallelIO.read_all``.

    Returns:
        The resulting frame. When it is empty an error message is printed
        and ``exit_program()`` is called before returning.
    """
    file_name_and_text = parallelIO.read_all(path_input, return_type='dict')

    if get_pandas_version() >= 0.23:
        # The `columns` keyword of from_dict is only used for pandas >= 0.23.
        dataFrame = pd.DataFrame.from_dict(file_name_and_text,
                                           orient='index',
                                           columns=['text'])
    else:
        dataFrame = pd.DataFrame.from_dict(file_name_and_text,
                                           orient='index')
        dataFrame = dataFrame.rename(columns={0: 'text'})
    dataFrame.index.name = "filename"

    # from_dict always returns a frame, so only emptiness needs checking.
    if dataFrame.empty:
        print(
            "ERROR: Either the directory does not exist!\nOR no documents in the directory!\nOR reduce the number of processors!"
        )
        exit_program()

    return dataFrame
Esempio n. 3
0
 def loc(self):
     """Return a ``_CustomLocIndexer`` for label-based access on this object.

     The indexer is constructed as ``(name, obj)`` for pandas 0.25 and later
     and as ``(obj, name=...)`` for earlier versions.
     """
     from .ParallelPandasUtils import _CustomLocIndexer

     if get_pandas_version() >= 0.25:
         return _CustomLocIndexer("loc", self)
     return _CustomLocIndexer(self, name='loc')
Esempio n. 4
0
    def __recv_and_process_for_corr(self, source, number_processors, rank, df,
                                    method, min_periods, output):
        """Receive columns from ``source`` and merge their correlations into ``output``.

        Helper routine for the correlation function. When ``source`` is a
        valid rank it performs three blocking receives on ``self.comm``
        (shape at ``tag + 100``, column labels at ``tag + 10``, float32 data
        at ``tag``, with ``tag = source + rank``), combines the received
        columns with the columns of ``df``, computes their correlation matrix
        and appends it to ``output``. Rows with the same label are merged
        with ``min`` and the rows labelled after the received columns are
        dropped again.

        Args:
            source: rank to receive from; ignored when outside
                ``[0, number_processors)``.
            number_processors: number of ranks taking part.
            rank: rank of the calling process.
            df: local frame whose columns are correlated with the received ones.
            method: forwarded to ``DataFrame.corr``.
            min_periods: forwarded to ``DataFrame.corr``.
            output: frame accumulated so far.

        Returns:
            The updated ``output`` (unchanged when ``source`` is out of range).
        """
        pandas_version = get_pandas_version()
        if not (source >= 0 and source < number_processors):
            return output

        tag = source + rank

        # `np.int` no longer exists in current NumPy; 'l' is the C long type
        # code, which matches the MPI.LONG datatype used for the receive.
        recv_shape = np.zeros(2, dtype='l')
        self.comm.Recv([recv_shape, 2, MPI.LONG],
                       source=source,
                       tag=tag + 100)

        # Labels of the columns that are about to arrive.
        data_labels = self.comm.recv(source=source, tag=tag + 10)

        recv_data = np.zeros([recv_shape[0], recv_shape[1]],
                             dtype=np.float32)
        self.comm.Recv(
            [recv_data, recv_shape[0] * recv_shape[1], MPI.FLOAT],
            source=source,
            tag=tag)

        # Build a temporary frame: received columns first, then local ones.
        temp_df = pd.DataFrame()
        for position, a_label in enumerate(data_labels):
            temp_df[a_label] = recv_data[:, position]
        del recv_data

        for column in df.columns:
            temp_df[column] = df[column].values
        # The two lines below only restore the local row labels.
        temp_df['index'] = df.index.values
        temp_df.set_index('index', inplace=True)

        temp_output = temp_df.corr(method=method, min_periods=min_periods)

        # NOTE(review): DataFrame.append was removed in pandas 2.0; this
        # routine still relies on it and needs porting for newer pandas.
        if (pandas_version >= 0.23):
            output = output.append(temp_output, sort=False)
            # Merge duplicate rows, if any.
            output = output.groupby(output.index, sort=False,
                                    axis=0).min(axis=1, skipna=True)
        else:
            output = output.append(temp_output)
            # Merge duplicate rows, if any.
            output = output.groupby(output.index, sort=False, axis=0).min()
        del temp_output

        # Drop the rows labelled after the received columns: that data
        # exists on other processors.
        for a_label in data_labels:
            if a_label in output.index:
                output.drop(a_label, axis=0, inplace=True)

        return output
Esempio n. 5
0
 def from_dict(cls,
               data,
               orient="columns",
               columns=None,
               dtype=None,
               comm=MPI.COMM_WORLD,
               dist='distributed') -> "ParallelDataFrame":
     """
 Create a dataframe from a dictionary, using either index or columns
 orientation.

 With dist == 'distributed' the data is first passed through
 ParallelDataFrame._get_distributed_data and the class is constructed with
 dist_data=True. For any other value a plain pandas frame is built from the
 whole dictionary and the class is constructed with dist='replicated'.
 """
     if dist != 'distributed':
         # `columns` is only passed on to pandas for versions >= 0.23.
         build_options = {"orient": orient, "dtype": dtype}
         if get_pandas_version() >= 0.23:
             build_options["columns"] = columns
         whole_frame = pd.DataFrame.from_dict(data, **build_options)
         return cls(data=whole_frame,
                    comm=comm,
                    dtype=dtype,
                    dist='replicated')

     local_part = ParallelDataFrame._get_distributed_data(
         data, orient, columns, dtype, comm)
     return cls(data=local_part,
                comm=comm,
                dtype=dtype,
                dist_data=True,
                orient=orient)
Esempio n. 6
0
    def apply(self,
              func,
              axis=0,
              raw=False,
              result_type=None,
              args=(),
              **kwds):
        """Apply ``func`` along an axis and re-wrap the pandas result.

        The call is delegated to the parent class's ``apply``. A resulting
        ``pd.Series`` is wrapped with this class's sliced constructor and a
        resulting ``pd.DataFrame`` with its regular constructor, carrying
        over ``self.dist`` and ``self.comm`` (and ``self.orient`` for frames).

        Args:
            func: function to apply.
            axis: axis along which ``func`` is applied.
            raw: forwarded to the parent ``apply``.
            result_type: forwarded to the parent ``apply`` for pandas >= 0.23
                only.
            args: positional arguments for ``func``.
            **kwds: keyword arguments for ``func``; forwarded to the parent
                ``apply`` (they used to be dropped silently).

        Returns:
            The wrapped series or frame; any other kind of result is
            returned as produced by the parent ``apply``.
        """
        if get_pandas_version() >= 0.23:
            super_return = super().apply(func,
                                         axis=axis,
                                         raw=raw,
                                         result_type=result_type,
                                         args=args,
                                         **kwds)
        else:
            super_return = super().apply(func,
                                         axis=axis,
                                         raw=raw,
                                         args=args,
                                         **kwds)

        if isinstance(super_return, pd.Series):
            return self.__constructor_sliced(data=super_return,
                                             dist=self.dist,
                                             comm=self.comm)
        if isinstance(super_return, pd.DataFrame):
            return self.__constructor(data=super_return,
                                      dist=self.dist,
                                      comm=self.comm,
                                      orient=self.orient)
        # Neither a series nor a frame: hand the result back untouched
        # instead of implicitly returning None.
        return super_return
Esempio n. 7
0
 def __sort_series_by_value(self, ascending, inplace):
     """Sort this object by value along axis 0.

     Uses ``sort_values`` for pandas 0.17 and later and ``sort`` (the
     pandas 0.16 spelling) otherwise; the result of that call is returned.
     """
     sort_options = {"axis": 0, "ascending": ascending, "inplace": inplace}
     if get_pandas_version() >= 0.17:
         sorter = self.sort_values
     else:
         sorter = self.sort  # for pandas 0.16
     return sorter(**sort_options)
def print_pairs(similarity_matrix, file_path, pair_number):
    """Write one numbered line per non-NaN cell of ``similarity_matrix``.

    Columns are visited in order; for every non-NaN entry a line made of the
    running pair number, the column label, the row label and the cell value
    is handed to ``print_to_file`` together with ``file_path``.

    Args:
        similarity_matrix: frame whose cells are printed.
        file_path: passed unchanged to ``print_to_file``.
        pair_number: number given to the first printed pair.

    Returns:
        The pair number following the last one printed.
    """
    for col_label, column in similarity_matrix.items():
        # Iterate over a NaN-free copy instead of mutating the column, which
        # belongs to the caller's frame.
        for row_label, content in column.dropna().items():
            to_print = "{} {} {}\t\t\t\t {} \n".format(pair_number, col_label,
                                                       row_label, content)
            print_to_file(to_print, file_path)
            pair_number += 1
    return pair_number
Esempio n. 9
0
 def _get_distributed_data(data, orient, columns, dtype, comm):
   """
   Helper routine that builds a pandas frame from the subset of dictionary
   entries selected by ParallelDataFrame.__dictionary_info for this call.
   """
   _, keys, indices = ParallelDataFrame.__dictionary_info(data, comm)
   # Keep only the entries whose positions were returned in `indices`.
   selected_items = {keys[position]: data[keys[position]] for position in indices}

   # `columns` is only passed on to pandas for versions >= 0.23.
   build_options = {"orient": orient, "dtype": dtype}
   if get_pandas_version() >= 0.23:
     build_options["columns"] = columns
   return pd.DataFrame.from_dict(selected_items, **build_options)
def read_files(path):
    """Load every entry of directory ``path`` into a single-row frame.

    Each entry returned by ``os.listdir(path)`` is opened in binary mode and
    its content stored, converted with ``str()``, under the entry's name.

    Args:
        path: directory to read; a trailing path separator is optional.

    Returns:
        A frame with one column per file name and a single row labelled
        ``text``.
    """
    file_name_and_text = {}
    for filename in os.listdir(path):
        # os.path.join works whether or not `path` ends with a separator,
        # unlike plain string concatenation.
        with open(os.path.join(path, filename), "rb") as myfile:
            # NOTE(review): on Python 3, str() of a bytes object yields its
            # repr ("b'...'"); kept unchanged here -- confirm that downstream
            # processing expects this form.
            file_name_and_text[filename] = [str(myfile.read())]

    # The call is the same for every pandas version, so no version check.
    dataFrame = pd.DataFrame.from_dict(file_name_and_text, orient='columns')
    dataFrame.index = ['text']

    return dataFrame
Esempio n. 11
0
def create_inverted_index(dataFrame, total_words, top_words):
    """Create the inverted index and make sure all top words are in it.

    The index itself (row labels = words, column labels = filenames) comes
    from ``serial.create_inverted_index``. ``top_words`` is then
    concatenated column-wise in front of it and missing values are replaced
    by ``0``.
    """
    local_index = serial.create_inverted_index(dataFrame, total_words,
                                               top_words)

    # The `sort` keyword of pd.concat is only passed for pandas >= 0.23.
    concat_options = {"axis": 1}
    if get_pandas_version() >= 0.23:
        concat_options["sort"] = False

    # Makes sure to include all top words in the inverted index.
    combined = pd.concat([top_words, local_index], **concat_options)
    combined.fillna(np.float32(0), inplace=True)
    return combined
def get_similar_documents(similarity_matrix):
    """Return the highest score of every column, best first.

    Args:
        similarity_matrix: frame of scores; the maximum of each column is
            taken with NaN values skipped.

    Returns:
        A frame with a single ``score`` column and an index named
        ``filename``, sorted by descending score. Columns without any
        non-NaN value get the score ``-10``.
    """
    column_maxima = similarity_matrix.max(skipna=True)
    max_sorted = pd.DataFrame(column_maxima).rename(columns={0: "score"})

    # Detect the available sorting API directly instead of comparing a
    # numeric version: `sort_values` exists from pandas 0.17 on, older
    # releases sort by column through `sort_index(by=...)`.
    if hasattr(max_sorted, "sort_values"):
        max_sorted = max_sorted.sort_values(by="score", ascending=False)
    else:
        max_sorted = max_sorted.sort_index(by="score", ascending=False)
    max_sorted.index.name = "filename"

    max_sorted = max_sorted.fillna(value=-10)

    return max_sorted
Esempio n. 13
0
def sort_df(df, col_name):
    """Return ``df`` sorted by ``col_name`` in descending order.

    The sorting method is chosen by capability rather than by comparing a
    numeric version: ``sort_values`` when the frame provides it (pandas
    0.17 and later), the older ``sort`` otherwise.

    Args:
        df: frame to sort; it is not modified.
        col_name: label of the column to sort by.

    Returns:
        A new frame with the rows in descending order of ``col_name``.
    """
    if hasattr(df, "sort_values"):
        return df.sort_values(by=col_name, ascending=False)
    return df.sort(col_name, ascending=False)
Esempio n. 14
0
def create_similarity_matrix(inverted_index):
    """Build the similarity matrix for ``inverted_index`` across all MPI ranks.

    The local part is computed with ``serial.create_similarity_matrix``.
    Then, for ``step = 1 .. number_processors - 1``, the local values are
    sent to ranks ``rank + step`` and ``rank - step`` and the data arriving
    from ``rank - step`` and ``rank + step`` is folded into the matrix by
    ``_recv_and_process_for_simi_matrix`` (Pearson correlation,
    ``min_periods=1``). Finally the matrix is transposed, cells whose row
    and column labels coincide are set to NaN, and all-NaN columns and rows
    are dropped.

    Args:
        inverted_index: local frame whose values and column labels are
            exchanged between ranks.

    Returns:
        The resulting similarity matrix.
    """
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    number_processors = comm.Get_size()

    # Correlation settings are identical for every exchange round.
    method = 'pearson'
    min_periods = 1

    similarity_matrix = serial.create_similarity_matrix(inverted_index)
    # Send a contiguous float32 copy of the values.
    to_send = np.array(inverted_index.values, order='C', dtype=np.float32)
    to_send_shape = np.array(
        [inverted_index.shape[0], inverted_index.shape[1]])

    req = []
    # Each rank exchanges with the ranks one step away on both sides, then
    # two steps away, and so on, until the last one.
    for step in range(1, number_processors):
        rank_above = rank + step
        rank_below = rank - step

        # Sends: first towards rank + step, then towards rank - step.
        req = _send_for_simi_matrix(to_send, inverted_index.columns,
                                    to_send_shape, rank_above,
                                    number_processors, req,
                                    inverted_index.shape, rank, comm)
        req = _send_for_simi_matrix(to_send, inverted_index.columns,
                                    to_send_shape, rank_below,
                                    number_processors, req,
                                    inverted_index.shape, rank, comm)

        # Blocking receives: first from rank - step, then from rank + step.
        similarity_matrix = _recv_and_process_for_simi_matrix(
            rank_below, number_processors, rank, inverted_index, method,
            min_periods, similarity_matrix, comm)
        similarity_matrix = _recv_and_process_for_simi_matrix(
            rank_above, number_processors, rank, inverted_index, method,
            min_periods, similarity_matrix, comm)

        # Wait for all the sends of this round to complete.
        if (len(req) != 0):
            MPI.Request.Waitall(req)
            req = []
    del to_send
    del to_send_shape

    similarity_matrix = similarity_matrix.transpose()

    # Remove the self comparison of files. A single .loc call writes to the
    # frame itself; the chained form `frame[i].loc[i] = ...` may assign to a
    # temporary object. Labels without a matching row are left alone.
    for label in similarity_matrix.columns.values:
        if label in similarity_matrix.index:
            similarity_matrix.loc[label, label] = np.nan

    similarity_matrix.dropna(axis=1, how='all', inplace=True)
    similarity_matrix.dropna(axis=0, how='all', inplace=True)

    return similarity_matrix
Esempio n. 15
0
class ParallelDataFrameTest(unittest.TestCase):
    def setUp(self):
        self.dict1 = {
            'key1': [10, 11, 22],
            'key2': [23, 34, 56],
            'key3': [1, 2, 3]
        }
        self.dict2 = {
            'key1': [10, 11, 22],
            'key2': [23, 34, 56],
            'key3': [1, 2, 3],
            'key4': [29, 38, 47]
        }
        self.dict3 = {
            'key1': [10, 11, 22],
            'key2': [23, 34, 56],
            'key3': [1, 2, 3],
            'key4': [29, 38, 47],
            'key5': [10, 11, 22],
            'key6': [23, 34, 56],
            'key7': [1, 2, 3],
            'key8': [29, 38, 47]
        }
        self.pd_df1 = pd.DataFrame([[4.0, 9.0, 16.0, 25.0, 36.0]] * 5,
                                   columns=['A', 'B', 'C', 'D', 'E'])

        self.pd_df2 = pd.DataFrame(
            {
                'angles': [0, 3, 4],
                'degrees': [360, 180, 360],
                'equalsides': [0, 3, 2]
            },
            index=['circle', 'triangle', 'rectangle'])

        self.df_multindex = pd.DataFrame(
            {
                'angles': [0, 3, 4, 4, 5, 6],
                'degrees': [360, 180, 360, 360, 540, 720],
                'equalsides': [0, 3, 2, 4, 5, 6]
            },
            index=[['A', 'A', 'A', 'B', 'B', 'B'],
                   [
                       'circle', 'triangle', 'rectangle', 'square', 'pentagon',
                       'hexagon'
                   ]])

    def test_canary(self):
        self.assertTrue(True)

    #Testing constructor-------------------------------
    def test_creation_of_empty_parallel_dataframe(self):
        df = ParallelDataFrame()

        self.assertTrue(isinstance(df, ParallelDataFrame))
        self.assertTrue(df.empty)

    def test_replicated_df_creation_with_constructor_input_dictionary(self):
        df = pd.DataFrame(self.dict1)
        rep_df = ParallelDataFrame(self.dict1, dist='replicated')

        self.assertEqual(df.shape, rep_df.shape)
        self.assertTrue(isinstance(rep_df, ParallelDataFrame))
        self.assertEqual(rep_df.dist, 'replicated')

    def test_distributed_df_creation_with_constructor_input_dictionary(self):
        df = ParallelDataFrame(self.dict2, dist_data=False)

        self.assertEqual(df.globalShape, (3, 4))
        self.assertEqual(df.dist, "distributed")

    def test_distributed_df_creation_with_constructor_input_dataframe(self):
        df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6]]))
        dist_df = ParallelDataFrame(df, dist_data=False)

        self.assertTrue(isinstance(dist_df, ParallelDataFrame))
        self.assertEqual(dist_df.dist, 'distributed')
        self.assertEqual(dist_df.globalShape, df.shape)
        self.assertNotEqual(dist_df.shape, dist_df.globalShape)

    #Testing from_dict function----------------------------------
    def test_distributed_df_creation_with_from_dict_function_orient_index(
            self):
        df = ParallelDataFrame.from_dict(self.dict2, orient='index')
        self.assertEqual(df.globalShape, (4, 3))
        self.assertEqual(set(list(df.globalIndex)),
                         set(['key1', 'key2', 'key3', 'key4']))
        self.assertEqual(list(df.globalColumns), [0, 1, 2])

    def test_distributed_df_creation_with_from_dict_function_orient_columns(
            self):
        df = ParallelDataFrame.from_dict(self.dict2, orient='columns')
        self.assertEqual(df.globalShape, (3, 4))
        self.assertEqual(set(list(df.globalColumns)),
                         set(['key1', 'key2', 'key3', 'key4']))
        self.assertEqual(list(df.globalIndex), [0, 1, 2])

    def test_replicated_df_creation_with_from_dict_function_orient_index(self):
        pd_df = pd.DataFrame.from_dict(self.dict2, orient='index')
        df = ParallelDataFrame.from_dict(self.dict2,
                                         orient='index',
                                         dist='replicated')

        self.assertTrue(df.equals(pd_df))

    def test_replicated_df_creation_with_from_dict_function_orient_columns(
            self):
        pd_df = pd.DataFrame.from_dict(self.dict2, orient='columns')
        df = ParallelDataFrame.from_dict(self.dict2,
                                         orient='columns',
                                         dist='replicated')

        self.assertTrue(df.equals(pd_df))

    #Testing global_to_local property---------------------------------
    def test_global_to_local_functionality_with_column_distribution(self):
        df = ParallelDataFrame(self.dict2, dist_data=False)
        self.assertTrue(isinstance(df.global_to_local, dict))
        self.assertEqual(len(df.global_to_local), 4)
        self.assertEqual(set(list(df.global_to_local.keys())),
                         set(['key1', 'key2', 'key3', 'key4']))

    def test_global_to_local_functionality_with_index_distribution(self):
        df = ParallelDataFrame.from_dict(self.dict2, orient='index')
        self.assertTrue(isinstance(df.global_to_local, dict))
        self.assertEqual(len(df.global_to_local), 4)
        self.assertEqual(set(list(df.global_to_local.keys())),
                         set(['key1', 'key2', 'key3', 'key4']))

    #Testing 'drop' function---------------------------------------
    def test_inplace_dropping_multiple_columns_in_column_distributed_dataframe(
            self):
        df = ParallelDataFrame(self.dict3, dist_data=False)
        self.assertEqual(df.globalShape, (3, 8))
        df.drop(['key4', 'key8'], axis=1, inplace=True)
        self.assertEqual(set(list(df.globalColumns)),
                         set(['key1', 'key2', 'key3', 'key5', 'key6', 'key7']))
        self.assertEqual(list(df.globalIndex), [0, 1, 2])

    if (get_pandas_version() >= 0.21):

        def test_inplace_dropping_multiple_columns_in_column_distributed_dataframe_specifying_columns(
                self):
            df = ParallelDataFrame(self.dict3, dist_data=False)
            self.assertEqual(df.globalShape, (3, 8))
            df.drop(columns=['key4', 'key8'], inplace=True)
            self.assertEqual(
                set(list(df.globalColumns)),
                set(['key1', 'key2', 'key3', 'key5', 'key6', 'key7']))
            self.assertEqual(list(df.globalIndex), [0, 1, 2])

    def test_non_inplace_dropping_single_column_in_column_distributed_dataframe(
            self):
        df = ParallelDataFrame(self.dict2, dist_data=False)
        self.assertEqual(df.globalShape, (3, 4))
        new_df = df.drop('key4', axis=1, inplace=False)
        self.assertEqual(set(list(new_df.globalColumns)),
                         set(['key1', 'key2', 'key3']))
        self.assertEqual(list(new_df.globalIndex), [0, 1, 2])

    def test_inplace_dropping_single_row_in_index_distributed_dataframe(self):
        df = ParallelDataFrame.from_dict(self.dict2, orient='index')
        self.assertEqual(df.globalShape, (4, 3))
        df.drop('key4', axis=0, inplace=True)
        self.assertEqual(set(list(df.globalIndex)),
                         set(['key1', 'key2', 'key3']))
        self.assertEqual(list(df.globalColumns), [0, 1, 2])

    def test_non_inplace_dropping_single_row_in_index_distributed_dataframe(
            self):
        df = ParallelDataFrame.from_dict(self.dict2, orient='index')
        self.assertEqual(df.globalShape, (4, 3))
        new_df = df.drop('key4', axis=0, inplace=False)
        self.assertEqual(set(list(new_df.globalIndex)),
                         set(['key1', 'key2', 'key3']))
        self.assertEqual(list(new_df.globalColumns), [0, 1, 2])

    def test_inplace_dropping_single_column_in_index_distributed_dataframe(
            self):
        df = ParallelDataFrame.from_dict(self.dict2, orient='index')
        self.assertEqual(df.globalShape, (4, 3))
        df.drop(1, axis=1, inplace=True)
        self.assertEqual(set(list(df.globalIndex)),
                         set(['key1', 'key2', 'key3', 'key4']))
        self.assertEqual(list(df.globalColumns), [0, 2])

    def test_inplace_dropping_single_row_in_column_distributed_dataframe(self):
        df = ParallelDataFrame(self.dict2, dist_data=False)
        self.assertEqual(df.globalShape, (3, 4))
        df.drop(2, axis=0, inplace=True)
        self.assertEqual(set(list(df.globalColumns)),
                         set(['key1', 'key2', 'key3', 'key4']))
        self.assertEqual(list(df.globalIndex), [0, 1])

    if (get_pandas_version() >= 0.21):

        def test_inplace_dropping_single_row_in_column_distributed_dataframe_specifying_index(
                self):
            df = ParallelDataFrame(self.dict2, dist_data=False)
            self.assertEqual(df.globalShape, (3, 4))
            df.drop(index=2, inplace=True)
            self.assertEqual(set(list(df.globalColumns)),
                             set(['key1', 'key2', 'key3', 'key4']))
            self.assertEqual(list(df.globalIndex), [0, 1])

    def test_inplace_dropping_single_row_replicated_dataframe(self):
        df = ParallelDataFrame(self.dict2, dist='replicated')
        df.drop(2, axis=0, inplace=True)
        self.assertEqual(set(list(df.globalColumns)),
                         set(['key1', 'key2', 'key3', 'key4']))
        self.assertEqual(list(df.globalIndex), [0, 1])

    def test_non_inplace_dropping_single_column_replicated_dataframe(self):
        df = ParallelDataFrame(self.dict2, dist='replicated')
        new_df = df.drop('key4', axis=1, inplace=False)
        self.assertEqual(set(list(new_df.globalColumns)),
                         set(['key1', 'key2', 'key3']))
        self.assertEqual(list(new_df.globalIndex), [0, 1, 2])

    #new index/column introduced in Pandas version 0.21
    if (get_pandas_version() >= 0.21):

        def test_non_inplace_dropping_multiple_columns_replicated_dataframe(
                self):
            df = ParallelDataFrame(self.dict3, dist='replicated')
            new_df = df.drop(columns=['key4', 'key7'], inplace=False)
            self.assertEqual(
                set(list(new_df.globalColumns)),
                set(['key1', 'key2', 'key3', 'key5', 'key6', 'key8']))
            self.assertEqual(list(new_df.globalIndex), [0, 1, 2])

        def test_non_inplace_dropping_multiple_columns_and_row_in_same_call_replicated_dataframe(
                self):
            df = ParallelDataFrame(self.dict3, dist='replicated')
            new_df = df.drop(columns=['key4', 'key7'], index=1, inplace=False)
            self.assertEqual(
                set(list(new_df.globalColumns)),
                set(['key1', 'key2', 'key3', 'key5', 'key6', 'key8']))
            self.assertEqual(list(new_df.globalIndex), [0, 2])

    #Testing apply function----------------------------------------------------
    #The examples below have been inspired by the examples given in the Pandas documentation
    def test_column_distributed_df_apply_function_sqrt_returns_distributed_df(
            self):
        df1 = ParallelDataFrame(self.pd_df1, dist_data=False)
        result = df1.apply(np.sqrt)
        df3 = result.apply(np.square)

        self.assertTrue(isinstance(result, ParallelDataFrame))
        self.assertEqual(result.dist, 'distributed')
        self.assertFalse(result.equals(df1))
        self.assertTrue(df1.equals(df3))

    def test_column_distributed_df_apply_function_sum_returns_distributed_series_raw_True(
            self):
        df1 = ParallelDataFrame(self.pd_df1, dist_data=False)

        pd_result = self.pd_df1.apply(np.sum, axis=0, raw=True)
        result = df1.apply(np.sum, axis=0, raw=True)

        self.assertTrue(isinstance(result, ParallelSeries))
        self.assertEqual(result.dist, 'distributed')
        self.assertEqual(set(list(result.globalIndex)),
                         set(list(pd_result.index)))
        self.assertTrue(result.collect().sort_index().equals(
            pd_result.sort_index()))

    def test_column_distributed_df_apply_function_sum_returns_distributed_series_raw_False(
            self):
        df1 = ParallelDataFrame(self.pd_df1, dist_data=False)

        pd_result = self.pd_df1.apply(np.sum, axis=0, raw=False)
        result = df1.apply(np.sum, axis=0, raw=False)

        self.assertTrue(isinstance(result, ParallelSeries))
        self.assertEqual(result.dist, 'distributed')
        self.assertEqual(set(list(result.globalIndex)),
                         set(list(pd_result.index)))
        self.assertTrue(result.collect().sort_index().equals(
            pd_result.sort_index()))

    def test_replicated_df_apply_function_sqrt_returns_replicated_df(self):
        df1 = ParallelDataFrame(self.pd_df1, dist='replicated')

        pd_result = self.pd_df1.apply(np.sqrt)
        result = df1.apply(np.sqrt)

        self.assertTrue(result.equals(pd_result))
        self.assertEqual(result.dist, 'replicated')

    def test_replicated_df_apply_function_sum_axis0_returns_replicated_series(
            self):
        df1 = ParallelDataFrame(self.pd_df1, dist='replicated')

        pd_result = self.pd_df1.apply(np.sum, axis=0)
        result = df1.apply(np.sum, axis=0)

        self.assertTrue(isinstance(result, ParallelSeries))
        self.assertEqual(result.dist, 'replicated')
        self.assertTrue(result.equals(pd_result))

    def test_replicated_df_apply_function_sum_axis1_returns_replicated_series(
            self):
        df1 = ParallelDataFrame(self.pd_df1, dist='replicated')

        pd_result = self.pd_df1.apply(np.sum, axis=1)
        result = df1.apply(np.sum, axis=1)

        self.assertTrue(isinstance(result, ParallelSeries))
        self.assertEqual(result.dist, 'replicated')
        self.assertTrue(result.equals(pd_result))

    def test_replicated_df_apply_function_list_like_result_returns_replicated_series(
            self):
        df = ParallelDataFrame(self.pd_df1, dist='replicated')

        pd_result = self.pd_df1.apply(lambda x: [1, 2], axis=1)
        result = df.apply(lambda x: [1, 2], axis=1)

        self.assertTrue(isinstance(result, ParallelSeries))
        self.assertEqual(result.dist, 'replicated')
        self.assertTrue(result.equals(pd_result))

    if (get_pandas_version() >= 0.23):

        def test_replicated_df_apply_function_list_like_result_expand_returns_replicated_df(
                self):
            df = ParallelDataFrame(self.pd_df1, dist='replicated')

            pd_result = self.pd_df1.apply(lambda x: [1, 2],
                                          axis=1,
                                          result_type='expand')
            result = df.apply(lambda x: [1, 2], axis=1, result_type='expand')

            self.assertTrue(isinstance(result, ParallelDataFrame))
            self.assertEqual(result.dist, 'replicated')
            self.assertTrue(result.equals(pd_result))

    #Testing 'div' function---------------------------------------------
    #The examples below have been inspired by the examples from the Pandas documentation

    def test_div_constant_replicated_df(self):
        df = ParallelDataFrame(self.pd_df1, dist='replicated')

        result = df.div(10)
        pd_result = self.pd_df1.div(10)
        self.assertTrue(result.equals(pd_result))

    def test_div_constant_distributed_df(self):
        df1 = ParallelDataFrame(self.pd_df1, dist_data=False)
        pd_df2 = self.pd_df1.div(10)
        df2 = ParallelDataFrame(pd_df2, dist_data=False)

        result = df1.div(10)
        self.assertTrue(result.equals(df2))

    def test_div_by_multiIndex_by_level_replicated_df(self):
        df = ParallelDataFrame(self.pd_df2, dist='replicated')
        rep_multindex = ParallelDataFrame(self.df_multindex, dist='replicated')

        result = df.div(rep_multindex, level=1, fill_value=0)

        pd_result = self.pd_df2.div(self.df_multindex, level=1, fill_value=0)
        self.assertTrue(result.equals(pd_result))

    #Testing slicing--------------------------------------------------
    def test_slicing_with_single_label_getting_dist_series_from_column_distributed_df(
            self):
        d = {'col1': [1, 2], 'col2': [3, 4], 'col3': [5, 6]}
        pd_df = pd.DataFrame(data=d)
        dist_df = ParallelDataFrame(data=d, dist_data=False)

        dist_series = dist_df.loc[1]
        pd_series = pd_df.loc[1]

        self.assertTrue(isinstance(dist_series, ParallelSeries))
        self.assertEqual(dist_series.dist, 'distributed')
        self.assertTrue(dist_series.collect().sort_index().equals(
            pd_series.sort_index()))

    def test_slicing_with_slice_object_getting_dist_df_in_column_distributed_df(
            self):
        # .loc[0:1] on a frame built with dist_data=False should give a
        # 'distributed' ParallelDataFrame equal to the same-built frame of the
        # pandas slice.
        columns = {'col1': [1, 2], 'col2': [3, 4], 'col3': [5, 6]}
        reference = pd.DataFrame(data=columns)
        parallel = ParallelDataFrame(data=columns, dist_data=False)

        sliced = parallel.loc[0:1]
        expected = ParallelDataFrame(data=reference.loc[0:1], dist_data=False)

        self.assertTrue(isinstance(sliced, ParallelDataFrame))
        self.assertEqual(sliced.dist, 'distributed')
        self.assertTrue(sliced.equals(expected))

    def test_slicing_with_single_label_getting_rep_series_from_replicated_df(
            self):
        # .loc[label] on a replicated frame should give a 'replicated'
        # ParallelSeries holding the same row as pandas.
        columns = {'col1': [1, 2], 'col2': [3, 4], 'col3': [5, 6]}
        row_label = 1
        reference = pd.DataFrame(data=columns)
        replicated = ParallelDataFrame(data=columns, dist='replicated')

        row = replicated.loc[row_label]
        expected_row = reference.loc[row_label]

        self.assertTrue(isinstance(row, ParallelSeries))
        self.assertEqual(row.dist, 'replicated')
        # Sort both sides so the comparison ignores label ordering.
        self.assertTrue(row.sort_index().equals(expected_row.sort_index()))

    def test_slicing_with_list_of_labels_getting_rep_df_from_replicated_df(
            self):
        # .loc[[labels]] on a replicated frame should give a 'replicated'
        # ParallelDataFrame with the same rows pandas selects.
        columns = {
            'col1': [1, 2, 4, 5],
            'col2': [3, 4, 6, 7],
            'col3': [5, 6, 1, 3],
        }
        reference = pd.DataFrame(data=columns)
        replicated = ParallelDataFrame(data=columns, dist='replicated')

        selected = replicated.loc[[0, 3]]
        expected = reference.loc[[0, 3]]

        self.assertTrue(isinstance(selected, ParallelDataFrame))
        self.assertEqual(selected.dist, 'replicated')
        self.assertTrue(selected.sort_index().equals(expected.sort_index()))

    def test_slicing_getting_cell_value_in_replicated_df(self):
        # .loc[row, column] on a replicated frame should return the same
        # single cell value as pandas.
        columns = {'col1': [1, 2], 'col2': [3, 4], 'col3': [5, 6]}
        reference = pd.DataFrame(data=columns)
        replicated = ParallelDataFrame(data=columns, dist='replicated')

        cell = replicated.loc[1, 'col2']
        expected_cell = reference.loc[1, 'col2']

        self.assertEqual(cell, expected_cell)

    def test_slicing_with_boolean_array_getting_rep_df_from_replicated_df(
            self):
        # A boolean mask passed to .loc on a replicated frame should keep the
        # same rows pandas keeps.
        columns = {
            'col1': [1, 2, 4, 5],
            'col2': [3, 4, 6, 7],
            'col3': [5, 6, 1, 3],
        }
        reference = pd.DataFrame(data=columns)
        replicated = ParallelDataFrame(data=columns, dist='replicated')

        masked = replicated.loc[[True, False, False, True]]
        expected = reference.loc[[True, False, False, True]]

        self.assertTrue(masked.sort_index().equals(expected.sort_index()))

    #testing corr----------------------------------------------------------------------
    def test_corr_with_col_distributed_dataframe(self):
        # corr() on a frame built with dist_data=False should match pandas
        # row by row once values are rounded to 6 decimals.
        samples = [(.2, .3, .4), (.0, .6, .9), (.6, .0, .6), (.2, .1, .1)]
        reference = pd.DataFrame(samples, columns=['dogs', 'cats', 'rats'])
        parallel = ParallelDataFrame(reference, dist_data=False)

        parallel_corr = parallel.corr()
        expected_corr = reference.corr()

        # Each row of the parallel result is collected, then both sides are
        # sorted by label and rounded before the element-wise comparison.
        for label in parallel_corr.globalIndex:
            actual_row = parallel_corr.loc[label].collect().sort_index()
            expected_row = expected_corr.loc[label].sort_index()
            self.assertEqual(list(actual_row.round(6)),
                             list(expected_row.round(6)))

    def test_corr_with_replicated_dataframe(self):
        # corr() on a replicated frame should equal the pandas correlation
        # matrix of the same data.
        samples = [(.2, .3, .4), (.0, .6, .9), (.6, .0, .6), (.2, .1, .1)]
        reference = pd.DataFrame(samples, columns=['dogs', 'cats', 'rats'])
        replicated = ParallelDataFrame(reference, dist='replicated')

        actual = replicated.corr()
        expected = reference.corr()

        self.assertTrue(actual.equals(expected))
Esempio n. 16
0
    def drop(self,
             labels=None,
             axis=0,
             index=None,
             columns=None,
             level=None,
             inplace=False,
             errors='raise'):
        """Drop rows or columns from this frame.

        When the frame is distributed and the drop runs along the axis it is
        split on (columns for ``orient == 'columns'``, rows for
        ``orient == 'index'``), every requested label must be a key of
        ``self.global_to_local``; only the labels present in the local
        partition are then removed.  In every other case (replicated frame,
        or a drop across the distribution axis) the call is forwarded as-is
        to the parent class's ``drop``.

        Args:
            labels: A single label or a list of labels to drop.
            axis: 0/'index' to drop rows, 1/'columns' to drop columns.
                Ignored for routing when ``index`` or ``columns`` is given.
            index: Alternative to ``labels`` with ``axis=0``.
            columns: Alternative to ``labels`` with ``axis=1``.
            level: Forwarded to the parent ``drop``.
            inplace: If true, modify this object and return it.
            errors: Forwarded to the parent ``drop``.

        Returns:
            ``self`` when ``inplace`` is true; otherwise a new object built
            with ``self.__constructor`` using this frame's ``dist``, ``comm``
            and ``orient``.

        Raises:
            Exception: On the distributed path, if a requested label is not
                a key of ``self.global_to_local``.
        """
        pandas_version = get_pandas_version()
        if axis == 'columns':
            axis = 1
        elif axis == 'index':
            axis = 0

        # ``index``/``columns`` name their axis explicitly, so they decide the
        # routing; ``axis`` (default 0) only counts when neither is given.
        # Identity checks are used because ``!= None`` is ambiguous for
        # array-like arguments.
        if index is not None:
            target_axis = 0
        elif columns is not None:
            target_axis = 1
        else:
            target_axis = axis

        along_distribution = (
            (target_axis == 1 and self.orient == 'columns')
            or (target_axis == 0 and self.orient == 'index'))

        super_return = self
        if self.dist == 'distributed' and along_distribution:
            # Fold the keyword form into (labels, axis) so the rest of this
            # branch has a single representation to work with.
            if index is not None:
                axis = 0
                labels = index
                index = None
            elif columns is not None:
                axis = 1
                labels = columns
                columns = None

            known_labels = self.global_to_local.keys()
            local_axis_values = (self.columns.values
                                 if axis == 1 else self.index.values)

            if isinstance(labels, list):
                local_labels = []
                for a_label in labels:
                    if a_label not in known_labels:
                        raise Exception("Column/Row does not exist!")
                    if a_label in local_axis_values:
                        local_labels.append(a_label)
                has_local_labels = bool(local_labels)
            else:
                if labels not in known_labels:
                    raise Exception("Column does not exist!")
                # A scalar label is either held locally or it is not; do not
                # call len() on it (ints have no len, '' has length 0).
                has_local_labels = labels in local_axis_values
                local_labels = labels if has_local_labels else None

            # NOTE(review): presumably this invalidates a cached
            # global-to-local mapping before the in-place drop -- confirm.
            if inplace and self.get_global_to_local() is not None:
                self._global_to_local = None

            if has_local_labels:
                # Keyword arguments keep the call valid on pandas >= 2.0,
                # where everything after ``labels`` is keyword-only.
                if pandas_version >= 0.21:
                    super_return = super().drop(labels=local_labels,
                                                axis=axis,
                                                index=index,
                                                columns=columns,
                                                level=level,
                                                inplace=inplace,
                                                errors=errors)
                else:
                    super_return = super().drop(labels=local_labels,
                                                axis=axis,
                                                level=level,
                                                inplace=inplace,
                                                errors=errors)
            elif inplace:
                # Some other node holds the item to be dropped.
                return self
            else:
                # NOTE(review): the result of this call is discarded, as in
                # the original code; kept in case construction has side
                # effects -- confirm whether it can be removed.
                self.__constructor(data=self,
                                   dist=self.dist,
                                   comm=self.comm,
                                   orient=self.orient,
                                   dist_data=True)

        # Replicated distribution, OR dropping a row in a column-distribution,
        # OR dropping a column in a row-distribution.
        else:
            if pandas_version >= 0.21:
                super_return = super().drop(labels=labels,
                                            axis=axis,
                                            index=index,
                                            columns=columns,
                                            level=level,
                                            inplace=inplace,
                                            errors=errors)
            else:
                super_return = super().drop(labels=labels,
                                            axis=axis,
                                            level=level,
                                            inplace=inplace,
                                            errors=errors)

        if inplace:
            return self
        return self.__constructor(data=super_return,
                                  dist=self.dist,
                                  comm=self.comm,
                                  orient=self.orient)