Example #1
    def read_text_using_dask(self):
        """Read the input text file with Dask and publish each partition
        to the configured Kafka topic as JSON records."""
        self.ip_producer_name = KafkaProducer(
            bootstrap_servers=self.bootstrap_servers_list,
            value_serializer=lambda x: json.dumps(x).encode('utf-8'))

        self.ipdf_name = readtextfile.ReadTextFile(
            self.ipfile, self.ipschemafile, self.delimiter, self.skiprows,
            self.parallel, self.compression).read_using_dask()

        # Publish one partition at a time; to_dict avoids double JSON
        # encoding, since the producer's value_serializer already calls
        # json.dumps on each payload.
        for i in range(self.ipdf_name.npartitions):
            self.ip_producer_name.send(
                self.ip_topic_name,
                self.ipdf_name.get_partition(i).compute().to_dict(
                    orient="records"))

        self.ip_producer_name.close()
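
# A self-contained sketch (not part of the original example) of the
# partition-by-partition pattern used above: each Dask partition is
# materialized one at a time, so only one chunk is held in memory per send.
import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({'region': ['EU', 'US', 'APAC'], 'units': [10, 20, 30]})
ddf = dd.from_pandas(pdf, npartitions=3)
for i in range(ddf.npartitions):
    # Stand-in for producer.send(topic, records).
    print(ddf.get_partition(i).compute().to_dict(orient='records'))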
Example #2
import ray
from timeit import default_timer as timer
import readtextfile

# actor_names and flow1_actors are defined earlier in the original file.
start = timer()
for actor_name in actor_names:
    try:
        flow1_actors[actor_name] = ray.get_actor(actor_name)
        print('Actor already registered: {}'.format(actor_name))
    except ValueError:
        # ray.get_actor raises ValueError when no such actor exists yet.
        flow1_actors[actor_name] = Pipeline.options(
            name=actor_name, lifetime="detached").remote()

print("duration =", timer() - start, " seconds for registering actors")
'''
for actor_name in actor_names:
    flow1_actors[actor_name] = ray.get_actor(actor_name)
'''
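
# Pipeline and process_incremental are not shown in this example; the real
# definitions live elsewhere in the original project. Hypothetical minimal
# sketches with the signatures used below, kept disabled here like the
# block above:
'''
@ray.remote
class Pipeline:
    def transform(self, df, i):
        # Materialize one Dask partition and return a summary value.
        return len(df.get_partition(i).compute())

def process_incremental(total, result):
    # Fold one finished result into the running total.
    return total + result
'''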

df = readtextfile.ReadTextFile(
    ipfile='/tmp/data/5m_Sales_Records.csv',
    ipschemafile=
    '/Users/sriyan/Documents/dataprocessor/schema/sample_csv_file.schema',
    delimiter=',',
    skiprows=1,
    parallel=3).read_using_dask()

result_ids = []
for i, actor_name in enumerate(actor_names):
    result_ids.append(flow1_actors[actor_name].transform.remote(df, i))

total = 0  # avoid shadowing the builtin sum()
while result_ids:
    # Process each result as soon as it becomes available instead of
    # blocking until the slowest actor finishes.
    done_id, result_ids = ray.wait(result_ids)
    total = process_incremental(total, ray.get(done_id[0]))

print("duration =",
Example #3
# Copyright 2020 The Nadi Data Authors. All rights reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.

import readtextfile
import readparquetfile
import writeparquetfile
import writetextfile

if __name__ == "__main__":
    """ Read uncompressed text file"""
    df = readtextfile.ReadTextFile(
        ipfile='/Users/sriyan/Downloads/1500000_Sales_Records.csv',
        ipschemafile='schema/sample_csv_file.txt',
        delimiter=',',
        skiprows=1,
        parallel=4).read_using_dask()

    fwf_df = readtextfile.ReadTextFile(
        ipfile='/Users/sripri/Downloads/sample_fwf.txt',
        ipschemafile=
        '/Users/sripri/Documents/dataprocessor/schema/sample_fwf.types',
        delimiter='fixed width',
        parallel=4).read_using_dask()
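
    # For comparison, plain pandas reads fixed-width files directly; a
    # rough equivalent of the call above, with hypothetical column widths
    # and names standing in for the .types schema file:
    import pandas as pd
    fwf_pdf = pd.read_fwf('/Users/sripri/Downloads/sample_fwf.txt',
                          widths=[10, 15, 8],
                          names=['col_a', 'col_b', 'col_c'])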
    """ Write compressed text file with single_file as True"""
    writetextfile.WriteTextFile(
        ipdf=df,
        filename="/Users/sriyan/Downloads/sample_textfile.gz",
        single_file=True,
        encoding='utf-8',
        sep='|')
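
    # A minimal read-back sketch (not part of the original snippet),
    # assuming the file above was written as a single gzip-compressed,
    # pipe-delimited file:
    import dask.dataframe as dd
    readback = dd.read_csv(
        '/Users/sriyan/Downloads/sample_textfile.gz',
        sep='|',
        compression='gzip',
        blocksize=None)  # gzip files cannot be split into blocks
    print(readback.head())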
Example #4
from kafka import KafkaConsumer, KafkaProducer
import json
import readtextfile

bootstrap_servers_list = "localhost:9092"
topic_name = 'sample_csv_file'

producer = KafkaProducer(
    bootstrap_servers=bootstrap_servers_list,
    value_serializer=lambda x: json.dumps(x).encode('utf-8'))

df = readtextfile.ReadTextFile(
    ipfile='/Users/sriyan/Downloads/1500000_Sales_Records.csv',
    ipschemafile='schema/sample_csv_file.txt',
    delimiter=',',
    skiprows=1,
    parallel=4).read_using_dask()

for i in range(df.npartitions):
    producer.send(topic_name,
                  df.get_partition(i).compute().to_dict(orient="records"))

producer.close()
print("All messages published to input topic")