Example #1
    # requires at module level: numpy as np, pandas as pd,
    # collections.defaultdict, and multiprocessing.Pool
    def get_dict_features_from_df_parallel(self, df, nworkers=8):

        print("extracting features...")

        # split the data frame into nworkers chunks and extract features
        # from each chunk in a separate process
        df_split = np.array_split(df, nworkers)
        pool = Pool(nworkers)
        res_dicts = pool.map(self.get_dict_features_from_df, df_split)
        pool.close()  # informs the pool that no new tasks will be added
        pool.join()  # waits for all results to be finished and collected
                     # before proceeding with the rest of the method

        big_dic = defaultdict(lambda: defaultdict(int))

        # merge the feature dictionaries created for the data frame splits
        # into one big dictionary; the splits are disjoint, so their keys
        # never collide and plain assignment is safe
        for dic in res_dicts:
            for k, v in dic.items():
                big_dic[k] = v

        # join_axes was removed from pandas; reindexing on df.index gives
        # the same row alignment
        return pd.concat([
            pd.get_dummies(df[df.columns.difference(["event", "venue"])],
                           prefix="@",
                           columns=["month", "weekday"]),
            pd.DataFrame.from_dict(big_dic, orient='index')
        ], axis=1).reindex(df.index).fillna(0.)
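A minimal usage sketch, assuming the method lives on a class (here called FeatureExtractor, a made-up name) whose get_dict_features_from_df maps each row index of a chunk to a {feature: value} dict; the sample frame and the feature logic are illustrative only. On platforms that spawn worker processes (Windows, recent macOS), the call must sit under an if __name__ == "__main__": guard so workers don't re-run it on import.

import pandas as pd

class FeatureExtractor:
    # hypothetical host class; the original's per-chunk method presumably
    # returns {row_index: {feature_name: value}} for its chunk
    def get_dict_features_from_df(self, chunk):
        return {idx: {"title_words": len(str(row["event"]).split())}
                for idx, row in chunk.iterrows()}

    # the get_dict_features_from_df_parallel method from Example #1
    # goes here unchanged

if __name__ == "__main__":
    df = pd.DataFrame({"event": ["rock concert", "tech meetup"],
                       "venue": ["hall_a", "hall_b"],
                       "month": [1, 2],
                       "weekday": [3, 5]})
    features = FeatureExtractor().get_dict_features_from_df_parallel(df, nworkers=2)
    print(features.columns.tolist())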
Example #2
    def parallelize_dataframe(self, df, func):

        # split the data frame, map func over the pieces in worker
        # processes, and concatenate the results back together; with a
        # pool of size 1 this runs serially
        df_split = np.array_split(df, 1)
        pool = Pool(1)
        rr = pool.map(func, df_split)
        df = pd.concat(rr)
        pool.close()
        pool.join()

        return df
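A hedged sketch of a call site for the wrapper above; add_lengths and Processor are made-up names. The function handed to pool.map has to be defined at module level, because lambdas and locally defined functions cannot be pickled for the worker processes.

import pandas as pd

def add_lengths(chunk):
    # hypothetical per-chunk transform: annotate each row with a text length
    chunk = chunk.copy()
    chunk["text_len"] = chunk["text"].str.len()
    return chunk

class Processor:
    # hypothetical host class; the parallelize_dataframe method from
    # Example #2 goes here unchanged
    pass

if __name__ == "__main__":
    df = pd.DataFrame({"text": ["alpha", "beta", "gamma", "delta"]})
    result = Processor().parallelize_dataframe(df, add_lengths)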
Example #3
from multiprocessing import Pool


def parallel_launcher(data_dir, data, worker, pool_size, files_num):
    # modified_get_files is an external helper assumed to return the list
    # of files under data_dir
    files = modified_get_files(data_dir)

    # group the files into batches of files_num, tagging each batch with
    # its index so the worker can tell them apart
    batches = [(files[i:i + files_num], data, j)
               for j, i in enumerate(range(0, len(files), files_num))]

    pool = Pool(pool_size)
    output = pool.starmap(worker, batches)  # unpacks each batch tuple into worker's arguments

    pool.close()
    pool.join()

    return output
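A sketch of one way to drive parallel_launcher, under stated assumptions: modified_get_files is external to the snippet, so a stand-in that lists a directory is supplied here, and count_bytes is a made-up worker matching the (files, data, batch_index) tuple shape that starmap unpacks.

import os

def modified_get_files(data_dir):
    # stand-in for the external helper: list the files under data_dir
    return [os.path.join(data_dir, f) for f in sorted(os.listdir(data_dir))]

def count_bytes(files, data, batch_index):
    # hypothetical worker: total size of the batch's files, tagged with its index
    return batch_index, sum(os.path.getsize(f) for f in files)

if __name__ == "__main__":
    sizes = parallel_launcher("data", data=None, worker=count_bytes,
                              pool_size=4, files_num=10)
    print(sizes)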