def read_text_using_dask(self):
    # Producer that JSON-encodes each payload as UTF-8 bytes.
    self.ip_producer_name = KafkaProducer(
        bootstrap_servers=self.bootstrap_servers_list,
        value_serializer=lambda x: json.dumps(x).encode('utf-8'))
    # Read the input file into a Dask dataframe using the configured schema.
    self.ipdf_name = readtextfile.ReadTextFile(
        self.ipfile, self.ipschemafile, self.delimiter, self.skiprows,
        self.parallel, self.compression).read_using_dask()
    # Publish one message per partition. to_dict(orient="records") yields a
    # list of row dicts, which the value_serializer then JSON-encodes once.
    # (to_json would double-encode here, since the serializer already calls
    # json.dumps on what would be an already-serialized string.)
    for i in range(self.ipdf_name.npartitions):
        self.ip_producer_name.send(
            self.ip_topic_name,
            self.ipdf_name.get_partition(i).compute().to_dict(
                orient="records"))
    self.ip_producer_name.close()
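# --- Hedged sketch (not part of the original source): a minimal consumer for
# the messages published above, assuming the producer's json.dumps/encode
# serializer. The topic name, group id, and broker address are illustrative
# placeholders, since ip_topic_name and bootstrap_servers_list are configured
# elsewhere in the class.
from kafka import KafkaConsumer
import json

consumer = KafkaConsumer(
    'input_topic',                       # hypothetical topic name
    bootstrap_servers='localhost:9092',  # assumed broker address
    group_id='dataprocessor-demo',       # hypothetical consumer group
    auto_offset_reset='earliest',
    value_deserializer=lambda m: json.loads(m.decode('utf-8')))

for message in consumer:
    # Each message value is the list of row dicts for one Dask partition.
    print('received {} records'.format(len(message.value)))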
for actor_name in actor_names:
    try:
        # Reuse the actor if it is already registered with Ray.
        flow1_actors[actor_name] = ray.get_actor(actor_name)
        print('Actor already registered: {}'.format(actor_name))
    except ValueError:
        # Not registered yet: create a named, detached actor so it
        # outlives this driver process.
        flow1_actors[actor_name] = Pipeline.options(
            name=actor_name, lifetime="detached").remote()
print("duration =", timer() - start, " seconds for registering actors")

df = readtextfile.ReadTextFile(
    ipfile='/tmp/data/5m_Sales_Records.csv',
    ipschemafile='/Users/sriyan/Documents/dataprocessor/schema/sample_csv_file.schema',
    delimiter=',',
    skiprows=1,
    parallel=3).read_using_dask()

# Dispatch one transform task per registered actor.
result_ids = []
for i, actor_name in enumerate(actor_names):
    result_ids.append(flow1_actors[actor_name].transform.remote(df, i))

# Pass each result to process_incremental as soon as it is available
# instead of blocking on the whole batch.
total = 0
while len(result_ids):
    done_id, result_ids = ray.wait(result_ids)
    total = process_incremental(total, ray.get(done_id[0]))
print("duration =", timer() - start, " seconds")
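# --- Hedged sketch (assumption, not in the original source): one possible
# shape for the Pipeline actor and process_incremental helper used above.
# Only the signatures (transform(df, i), process_incremental(total, result))
# are taken from the call sites; the transform body is a placeholder.
import ray

@ray.remote
class Pipeline:
    def transform(self, df, i):
        # Placeholder transform: materialize partition i and count its rows.
        return len(df.get_partition(i).compute())

def process_incremental(total, result):
    # Fold each finished task's result into the running aggregate.
    return total + result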
# Copyright 2020 The Nadi Data Authors. All rights reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.

import readtextfile
import readparquetfile
import writeparquetfile
import writetextfile

if __name__ == "__main__":
    """ Read uncompressed text file"""
    df = readtextfile.ReadTextFile(
        ipfile='/Users/sriyan/Downloads/1500000_Sales_Records.csv',
        ipschemafile='schema/sample_csv_file.txt',
        delimiter=',',
        skiprows=1,
        parallel=4).read_using_dask()

    """ Read fixed width text file"""
    fwf_df = readtextfile.ReadTextFile(
        ipfile='/Users/sripri/Downloads/sample_fwf.txt',
        ipschemafile='/Users/sripri/Documents/dataprocessor/schema/sample_fwf.types',
        delimiter='fixed width',
        parallel=4).read_using_dask()

    """ Write compressed text file with single_file as True"""
    # The original call was truncated after sep='|'; closing it with
    # write_using_dask() assumes a writer method mirroring read_using_dask().
    writetextfile.WriteTextFile(
        ipdf=df,
        filename="/Users/sriyan/Downloads/sample_textfile.gz",
        single_file=True,
        encoding='utf-8',
        sep='|').write_using_dask()
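    # --- Hedged sketch (assumption, not in the original source): reading the
    # compressed file back to spot-check the write. The compression keyword
    # follows ReadTextFile's signature as used elsewhere in this repo; the
    # schema path and skiprows value are assumed to match the written output.
    check_df = readtextfile.ReadTextFile(
        ipfile='/Users/sriyan/Downloads/sample_textfile.gz',
        ipschemafile='schema/sample_csv_file.txt',
        delimiter='|',
        skiprows=1,
        parallel=1,
        compression='gzip').read_using_dask()
    print(check_df.head())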
from kafka import KafkaConsumer, KafkaProducer
import json

import readtextfile

bootstrap_servers_list = "localhost:9092"
topic_name = 'sample_csv_file'

# Producer that JSON-encodes each payload as UTF-8 bytes.
producer = KafkaProducer(
    bootstrap_servers=bootstrap_servers_list,
    value_serializer=lambda x: json.dumps(x).encode('utf-8'))

df = readtextfile.ReadTextFile(
    ipfile='/Users/sriyan/Downloads/1500000_Sales_Records.csv',
    ipschemafile='schema/sample_csv_file.txt',
    delimiter=',',
    skiprows=1,
    parallel=4).read_using_dask()

# Publish one message per Dask partition, each a list of row dicts.
for i in range(df.npartitions):
    producer.send(topic_name,
                  df.get_partition(i).compute().to_dict(orient="records"))
producer.close()
print("All messages published to input topic")
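# --- Hedged sketch (assumption, not in the original source): a variant of the
# publish loop above that blocks on each send's future to confirm delivery.
# It would replace the loop above and must run before producer.close().
# In kafka-python, send() returns a FutureRecordMetadata whose get() raises
# a KafkaError on failure and otherwise returns the record's partition and
# offset.
for i in range(df.npartitions):
    future = producer.send(
        topic_name, df.get_partition(i).compute().to_dict(orient="records"))
    metadata = future.get(timeout=30)
    print("dask partition {} -> kafka partition {}, offset {}".format(
        i, metadata.partition, metadata.offset))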