Example #1
def transform_local(stream_dtc_file, window_dtc_file, raw_json_files,
                    output_file) -> int:
    runner = LocalRunner(stream_dtc_file, window_dtc_file)
    # Build per-identity records from the raw JSON files and run the DTCs over them.
    result = runner.execute(
        runner.get_identity_records_from_json_files(raw_json_files,
                                                    SimpleJsonDataProcessor()))
    runner.write_output_file(output_file, result)
    return 0  # success code; assumed here so the declared int return type is satisfied
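A minimal invocation sketch for the function above; the DTC and event file paths are placeholders, not files that ship with the library.

# Hypothetical call; the paths are illustrative only.
exit_code = transform_local(
    stream_dtc_file='dtcs/sessions_stream.yml',
    window_dtc_file='dtcs/sessions_window.yml',
    raw_json_files=['data/events.json'],
    output_file='out/sessions.csv')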
Example #2
    def get_record_rdd_from_json_files(
            self,
            json_files: List[str],
            data_processor: DataProcessor = SimpleJsonDataProcessor(),
            spark_session: Optional['SparkSession'] = None) -> 'RDD':
        # Read all input files into one RDD of raw JSON lines.
        spark_context = get_spark_session(spark_session).sparkContext
        raw_records: 'RDD' = spark_context.union(
            [spark_context.textFile(file) for file in json_files])
        # Turn each line into (identity, TimeAndRecord) pairs and group them per identity.
        return raw_records.mapPartitions(
            lambda x: self.get_per_identity_records(
                x, data_processor)).groupByKey().mapValues(list)
Example #3
    def get_identity_records_from_json_files(
            self,
            json_files: List[str],
            data_processor: DataProcessor = SimpleJsonDataProcessor()
    ) -> Dict[str, List[TimeAndRecord]]:
        # Collect (identity, TimeAndRecord) pairs from every file, grouped by identity.
        identity_records = defaultdict(list)
        for file in json_files:
            with smart_open(file) as file_stream:
                for identity, record_with_datetime in self.get_per_identity_records(
                        file_stream, data_processor):
                    identity_records[identity].append(record_with_datetime)
        return identity_records
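A short usage sketch, assuming a LocalRunner constructed from two DTC files as in Example #1; the file names are placeholders.

# Hypothetical usage: build the per-identity map locally, then inspect it.
runner = LocalRunner('stream_dtc.yml', 'window_dtc.yml')
identity_records = runner.get_identity_records_from_json_files(['events.json'])
for identity, records in identity_records.items():
    print(identity, len(records))  # each value is a list of TimeAndRecord entries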
Example #4
    def get_record_rdd_from_json_files(
            self,
            json_files: List[str],
            data_processor: DataProcessor = SimpleJsonDataProcessor(),
            spark_session: Optional['SparkSession'] = None) -> 'RDD':
        """
        Reads the data from the given JSON file paths and converts each event into the `Record`
        format for processing. `data_processor` is used to process the per-event data in those
        files and convert it into `Record`s.

        :param json_files: List of json file paths. Regular Spark path wildcards are accepted.
        :param data_processor: `DataProcessor` to process each event in the json files.
        :param spark_session: `SparkSession` to use for execution. If None is provided then a basic
            `SparkSession` is created.
        :return: RDD containing Tuple[Identity, List[TimeAndRecord]] which can be used in
            `execute()`
        """
        spark_context = get_spark_session(spark_session).sparkContext
        raw_records: 'RDD' = spark_context.union(
            [spark_context.textFile(file) for file in json_files])
        return raw_records.mapPartitions(
            lambda x: self.get_per_identity_records(
                x, data_processor)).groupByKey().mapValues(list)
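A sketch of how the returned RDD might be consumed, following the docstring's note that it feeds into execute(); the SparkRunner name and the file paths are assumptions here, not confirmed by the snippet.

# Hypothetical usage; SparkRunner and the input paths are assumed.
runner = SparkRunner('stream_dtc.yml', 'window_dtc.yml')
per_identity_rdd = runner.get_record_rdd_from_json_files(['s3://bucket/events/*.json'])
result = runner.execute(per_identity_rdd)  # per the docstring, the RDD can be used in execute()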
Example #5
def test_simple_json_processor_success():
    data_processor = SimpleJsonDataProcessor()
    assert data_processor.process_data('{"test": 1}') == [Record({'test': 1})]
Example #6
def test_simple_json_processor_invalid_json_error():
    data_processor = SimpleJsonDataProcessor()
    with pytest.raises(Exception):
        data_processor.process_data('a')