Example #1
 def process(self, cardo_context: CardoContextBase,
             cardo_dataframe: CardoDataFrame,
             gt: CardoDataFrame) -> CardoDataFrame:
     dataframe = cardo_dataframe.dataframe
     ground_truth_dataframe = gt.dataframe
     true_positive_count, all_positives_count = self.__get_intersections(
         dataframe, ground_truth_dataframe)
     precision_value = self.__get_precision_value(true_positive_count,
                                                  all_positives_count)
     if self.precision_column:
         cardo_dataframe.dataframe = dataframe.withColumn(
             self.precision_column, F.lit(precision_value))
     source_name = self.source_name if self.source_name else cardo_dataframe.table_name
     cardo_context.logger.info(
         f"precision calculation for {source_name} -> "
         f"matches: {true_positive_count}, intersection: {all_positives_count}, "
         f"precision: {precision_value}, "
         f"is friendly: {self.friendly_precision}",
         extra={
             "gt_match": true_positive_count,
             "id":
             f"{self.match_column}_for_{self.intersection_column}_{gt.table_name}",
             "log_type": self.log_type,
             "count": all_positives_count,
             "statistic_value": precision_value,
             "statistic_type": "precision",
             "table_name": source_name,
             "base_table": gt.table_name
         })
     return cardo_dataframe
Example #2
 def test_created_with_pandas_returns_rdd_correctly(self):
     pandas_df = self.context.spark.createDataFrame(
         [['aa']], 'column1: string').toPandas()
     cardo_dataframe = CardoDataFrame(pandas_df, '6')
     self.assertIsInstance(cardo_dataframe.rdd, RDD)
     self.assertItemsEqual(pandas_df.values[0][0],
                           cardo_dataframe.rdd.collect()[0][0])
Example #3
 def process(self, cardo_context, cardo_dataframe=None):
     # type: (CardoContextBase, CardoDataFrame) -> CardoDataFrame
     df = cardo_context.spark.sql(self.query)
     cardo_context.logger.info(
         'finished reading query: {query} from Hive metastore'.format(
             query=self.query))
     return CardoDataFrame(df, self.table)
Example #4
	def process(self, cardo_context, cardo_dataframe=None):
		# type: (CardoContext, CardoDataFrame) -> CardoDataFrame
		cardo_context.spark.catalog.refreshByPath(self.path)
		data = CardoDataFrame(cardo_context.spark.read.load(self.path, self.format, self.schema, **self.options))
		cardo_context.logger.info(
			u'read data from HDFS path {path} successfully'.format(path=self.path))
		return data
Example #5
 def test_created_with_dataframe_returns_pandas_correctly(self):
     dataset = self.context.spark.createDataFrame([['a']],
                                                  'column1: string')
     cardo_dataframe = CardoDataFrame(dataset, '6')
     self.assertIsInstance(cardo_dataframe.pandas, pandas.DataFrame)
     self.assertItemsEqual(dataset.collect()[0][0],
                           cardo_dataframe.pandas.values[0][0])
Example #6
 def test_created_with_dataframe_returns_dataframe_correctly(self):
     dataset = self.context.spark.createDataFrame([['a']],
                                                  'column1: string')
     cardo_dataframe = CardoDataFrame(dataset, '6')
     self.assertIsInstance(cardo_dataframe.dataframe, DataFrame)
     self.assertEqual(dataset.collect(),
                      cardo_dataframe.dataframe.collect())
Example #7
	def process(self, cardo_context, cardo_dataframe=None):
		# type: (CardoContext, CardoDataFrame) -> CardoDataFrame
		cardo_context.logger.info("Reading data from: {} index".format(self.resource))
		function_keeper = self.override_spark_to_str()
		df = cardo_context.spark.read.format(ELASTIC_FORMAT).options(**self.options_dict).load()
		self.return_to_str_to_original_function(function_keeper)
		cardo_context.logger.info("Read data from: {} index successfully".format(self.resource))
		return CardoDataFrame(df)
Example #8
 def test_condition_runtime_test(self):
     dataset = CardoDataFrame(
         self.context.spark.createDataFrame([['a'], ['b']], 'col1: string'))
     acc_test = StepAccumulatorRuntimeTest(lambda x: x == 'a', 'unittest')
     acc_test.test(self.context, dataset).dataframe.collect()
     self.assertEqual(
         1,
         sum(map(lambda record: record.pm_value, self.log_handler.records)))
Example #9
 def test_rdd_accumulation_without_rows_without_special_columns(self):
     dataset = CardoDataFrame(
         self.context.spark.sparkContext.parallelize(['1']))
     acc_test = StepAccumulatorRuntimeTest(lambda x: x, 'unittest')
     acc_test.test(self.context, dataset).rdd.collect()
     self.assertEqual(
         1,
         sum(map(lambda record: record.pm_value, self.log_handler.records)))
Example #10
    def test_empty_table(self):
        # Arrange
        dataset_no_rows = CardoDataFrame(
            self.context.spark.createDataFrame([], schema="column: string"))
        dataset_no_cols = CardoDataFrame(
            self.context.spark.createDataFrame([[]]))
        no_rows_test = StepAccumulatorRuntimeTest(lambda x: x, 'unittest')
        no_cols_test = StepAccumulatorRuntimeTest(lambda x: x, 'unittest')

        # Act
        no_rows_test.test(self.context, dataset_no_rows).dataframe.collect()
        no_cols_test.test(self.context, dataset_no_cols).dataframe.collect()

        # Assert
        self.assertEqual(
            0,
            sum(map(lambda record: record.pm_value, self.log_handler.records)))
Example #11
 def process(self, cardo_context, cardo_dataframe=None):
     # type: (CardoContextBase, CardoDataFrame) -> CardoDataFrame
     df = cardo_context.spark.table(self.table_name)
     if self.partitions is not None:
         df = df.repartition(self.partitions)
     cardo_context.logger.info(
         'finished reading table: {table_name} from Hive metastore'.format(
             table_name=self.table_name))
     return CardoDataFrame(df, self.table_name)
Example #12
 def process(self, cardo_context, *cardo_dataframes):
     pool = ThreadPool(self.parallel)
     Elasticsearch(self.host).indices.delete(index=self.index,
                                             ignore=[400, 404])
     pool.map(
         lambda cardo_dataframe: self.__write_dataframe_to_elastic(
             cardo_context, cardo_dataframe), cardo_dataframes)
     unioned = self.__read_from_elastic(cardo_context)
     return CardoDataFrame(unioned, 'unioned')
Example #13
 def process(self, cardo_context, cardo_dataframe):
     cardo_context.spark.udf.registerJavaFunction(self.config.udf_name,
                                                  self.config.location)
     df = cardo_dataframe.dataframe
     df = df.withColumn(
         CONFIG_OUTPUT_COLUMN,
         F.expr("{udf_name}{udf_params}".format(
             udf_name=self.udf_name,
             udf_params=self.generate_udf_parameters())))
     df = self.extract_columns_and_drop(df)
     return CardoDataFrame(df)
Example #14
 def process(self, cardo_context, *cardo_dataframes):
     # type: (CardoContextBase, *CardoDataFrame) -> CardoDataFrame
     assert len(cardo_dataframes) > 0
     result = self._rename_columns(cardo_dataframes[0])
     for cardo_dataframe in cardo_dataframes[1:]:
         df = self._rename_columns(cardo_dataframe)
         result = result.join(df, self.index_col, how='outer')
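     # record the joined result under this step's hash so it can be saved later with the other steps' results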
     hash_to_df_dict[step_to_hash_dict[self]] = result
     if cardo_context.spark.conf.get(SAVE_RESULTS_AT_RUNTIME_CONFIG, 'True') != 'True':
         self._save_all_steps_results(cardo_context)
     return CardoDataFrame(result)
Example #15
    def test_step_without_unique_tests_should_only_count(self):
        class Test(IStep):
            def process(self, cardo_context, cardo_dataframe):
                return cardo_dataframe

        dataset = CardoDataFrame(
            self.context.spark.createDataFrame([['a']], 'col1: string'))
        result = RuntimeTestableStep(Test()).process(self.context, dataset)
        result.dataframe.collect()
        self.assertEqual(1, self.get_pm_summary('count'))
        self.assertEqual(2, len(self.log_handler.records))
Example #16
 def process(self, cardo_context, cardo_dataframe=None):
     # type: (CardoContext, None) -> CardoDataFrame
     sheet = utils.find_sheet(cardo_context,
                              utils.read_file(cardo_context, self.filename),
                              self.sheet_name)
     table = self.__table_from_sheet(sheet)
     headers = utils.filter_invisible(sheet.row_values(0))
     schema = str([str(utils.clean_invisible(cell))
                   for cell in headers]).replace("'", "")[1:-1]
     dataframe = self.__create_dataframe(cardo_context, schema, table)
     return CardoDataFrame(dataframe, sheet.name)
Example #17
    def process(self, cardo_context, *dataframes):
        # type: (CardoContextBase, *CardoDataFrame) -> CardoDataFrame
        df = union_dataframes(dataframes)
        df = df.dataframe

        df = self.get_log_of_grades(df)
        df = self.combine_sources(df)
        df = self.convert_grade_back_to_normal(df)
        if not self.allow_ones:
            df = self.fix_ones(df)

        return CardoDataFrame(df)
Example #18
 def send_df_to_qsm(self, cardo_context, splited_dataframe):
     with JDBCQuerier(cardo_context, self.connection_string,
                      ORACLE_DRIVER) as querier:
         qsm_queue_config = self._get_queue_config(cardo_context)
         stage_table_name = self.prepare_stage_table(
             cardo_context, CardoDataFrame(splited_dataframe),
             qsm_queue_config, querier)
     if self.send_stage_to_qsm:
         with JDBCQuerier(cardo_context, self.qsm_connection_string,
                          ORACLE_DRIVER) as qsm_querier:
             qsm_querier.execute(
                 self._create_main_query(cardo_context, qsm_queue_config,
                                         stage_table_name))
             self._log_added_stage_table_to_qsm(cardo_context,
                                                stage_table_name)
Example #19
	def process(self, cardo_context, cardo_dataframe=None):
		# type: (CardoContext, CardoDataFrame) -> CardoDataFrame
		df = cardo_context.spark.read.jdbc(
				self.connection_string,
				self.table_name,
				column=self.parallel_col,
				lowerBound=self.lower_bound,
				upperBound=self.upper_bound,
				numPartitions=self.num_parallel,
				properties=self.properties
		)
		cardo_context.logger.info(
				u'read data with {reader} from table {table_name} using connection string: {connection_string}, properties: {properties}'.format(
					reader=self.__class__.__name__, table_name=self.table_name, connection_string=self.connection_string, properties=self.properties))
		return CardoDataFrame(dataframe=df, table_name=self.table_name)
Example #20
    def test_multiple_inputs_and_specific_test_on_specific_table(self):
        class Test(IStep):
            def process(self,
                        cardo_context,
                        cardo_dataframe,
                        another_dataframe=None):
                cardo_dataframe.dataframe = cardo_dataframe.dataframe.union(
                    another_dataframe.dataframe)
                return cardo_dataframe

            @IStep.pm_input(cardo_dataframe_index=1)
            def is_null(self, value):
                return value is None

        dataset = CardoDataFrame(
            self.context.spark.createDataFrame([['a']], 'col1: string'))
        another_dataset = CardoDataFrame(
            self.context.spark.createDataFrame([[None]], 'col1: string'))
        result = RuntimeTestableStep(Test()).process(self.context, dataset,
                                                     another_dataset)
        result.dataframe.collect()
        self.assertEqual(2, self.get_pm_summary('count'))
        self.assertEqual(1, self.get_pm_summary('is_null'))
        self.assertEqual(5, len(self.log_handler.records))
Example #21
    def test_unpersist_rdd(self):
        # Arrange
        rdd = self.context.spark.sparkContext.parallelize([Row(column1='aa')])
        second_rdd = self.context.spark.sparkContext.parallelize(
            [Row(column1='bb')])
        cardo_dataframe = CardoDataFrame(rdd, '')
        cardo_dataframe.persist()
        cardo_dataframe.rdd = second_rdd

        # Act
        cardo_dataframe.unpersist()

        # Assert
        self.assertFalse(rdd.is_cached)
Example #22
    def test_unpersist_df(self):
        # Arrange
        df = self.context.spark.createDataFrame([['a']], 'column1: string')
        second_df = self.context.spark.createDataFrame([['b']],
                                                       'column1: string')
        cardo_dataframe = CardoDataFrame(df, '')
        cardo_dataframe.persist()
        cardo_dataframe.dataframe = second_df

        # Act
        cardo_dataframe.unpersist()

        # Assert
        self.assertFalse(df.is_cached)
Example #23
    def test_step_with_unique_after_test_also_counts(self):
        class Test(IStep):
            def process(self, cardo_context, cardo_dataframe):
                return cardo_dataframe

            @IStep.pm_output()
            def is_null(self, value):
                return value is None

        dataset = CardoDataFrame(
            self.context.spark.createDataFrame([['a']], 'col1: string'))
        result = RuntimeTestableStep(Test()).process(self.context, dataset)
        result.dataframe.collect()
        self.assertEqual(1, self.get_pm_summary('count'))
        self.assertEqual(0, self.get_pm_summary('is_null'))
        self.assertEqual(3, len(self.log_handler.records))
Example #24
	def process(self, cardo_context, cardo_dataframe=None):
		hash_column = ROWID
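		# when the source is a view, skip partition-based reads and derive the hash column from the view instead of ROWID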
		if self.check_view and self._check_table_or_view(cardo_context):
			self.use_partitions = False
			hash_column = self._get_view_hash_column(cardo_context)
		query_frame = GET_ROWS_FOR_PARTITION_QUERY.format(table_owner=self.table_owner, table_name=self.table_name,
		                                                  where_clause=self.where_clause,
		                                                  select_clause=self.select_clause,
		                                                  table_divider_clause=TABLE_DIVIDER_CLAUSE.format(
			                                                  row_index=hash_column))
		partitions_name_num_rows_list = None
		if self.use_partitions:
			subpartitions_list = self._get_table_partitions(cardo_context, get_subpartitions=True)
			partitions_list = self._get_table_partitions(cardo_context, get_subpartitions=False)
			partitions_name_num_rows_list = subpartitions_list or partitions_list
			if subpartitions_list:
				query_frame = query_frame.format(use_partition=USE_SUBPARTITION)
			else:
				query_frame = query_frame.format(
					use_partition=USE_PARTITION if partitions_list else DONT_USE_PARTITION)
		else:
			query_frame = query_frame.format(use_partition=DONT_USE_PARTITION)
		if partitions_name_num_rows_list:
			readers = self._create_readers_by_partitions(partitions_name_num_rows_list, query_frame)
		else:
			readers = [OracleReader(query_frame.format(num_parallel=self.num_parallel, index=reader_index),
			                        self.connection_string, fetchsize=self.fetchsize) for reader_index in
			           range(self.num_parallel)]
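		# fan the slice readers out over a worker pool, temporarily raising the log level so each reader does not log separately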
		pool = Pool(min(len(readers) // 10, 100) + 1)
		prev_log_level = cardo_context.logger.level
		cardo_context.logger.setLevel('ERROR')
		dataframes = pool.map(lambda reader: reader.process(cardo_context).dataframe, readers)
		cardo_context.logger.setLevel(prev_log_level)
		df_united = OracleParallelReader.merge_reduce(lambda df1, df2: df1.union(df2), dataframes)
		cardo_context.logger.info(
			u'read data from Oracle from {table_owner}.{table_name} using OracleParallelReader successfully'.format(
				table_owner=self.table_owner, table_name=self.table_name))
		return CardoDataFrame(df_united, table_name=self.table_name)
Example #25
def union_dataframes(*dataframes: Union[CardoDataFrame, List[CardoDataFrame],
                                        Tuple[CardoDataFrame]]):
    """
    :param dataframes:
    :return: union of all those dataframes
    """
    if isinstance(dataframes[0], list) or isinstance(dataframes[0], tuple):
        return union_dataframes(*[
            dataframe for many_dataframe in dataframes
            for dataframe in many_dataframe
        ])

    else:
        if dataframes[0].payload_type in ['dataframe', 'rdd']:
            unioned = functools.reduce(
                lambda df1, df2: CardoDataFrame(
                    df1.dataframe.union(
                        df2.dataframe.select(df1.dataframe.columns))),
                dataframes)
            return unioned
        if dataframes[0].payload_type == 'pandas':
            unioned = pd.concat(dataframes, axis=0)
            return unioned
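A minimal usage sketch for union_dataframes (hypothetical: it assumes an existing SparkSession named spark plus the CardoDataFrame wrapper shown in the other examples), showing that the inputs may be passed either as varargs or as a single list, which the function flattens before the union:

# sketch only: spark is assumed to be an existing SparkSession
first = CardoDataFrame(spark.createDataFrame([['a']], 'col1: string'))
second = CardoDataFrame(spark.createDataFrame([['b']], 'col1: string'))
unioned = union_dataframes(first, second)          # varargs form
also_unioned = union_dataframes([first, second])   # list form, flattened internally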
Example #26
    def test_step_with_unique_test_on_specific_column(self):
        class Test(IStep):
            def process(self, cardo_context, cardo_dataframe):
                cardo_dataframe.dataframe = cardo_dataframe.dataframe.where(
                    'col1 is not null')
                return cardo_dataframe

            @IStep.pm_input(['col1'])
            def is_null_before(self, value):
                return value is None

            @IStep.pm_output(['col1'])
            def is_null_after(self, value):
                return value is None

        dataset = CardoDataFrame(
            self.context.spark.createDataFrame([[1, 'a'], [2, None]],
                                               'num: int, col1: string'))
        result = RuntimeTestableStep(Test()).process(self.context, dataset)
        result.dataframe.collect()
        self.assertEqual(1, self.get_pm_summary('count'))
        self.assertEqual(1, self.get_pm_summary('is_null_before'))
        self.assertEqual(0, self.get_pm_summary('is_null_after'))
        self.assertEqual(5, len(self.log_handler.records))
Example #27
 def test_created_with_rdd_returns_pandas_correctly(self):
     rdd = self.context.spark.sparkContext.parallelize([Row(column1='a')])
     cardo_dataframe = CardoDataFrame(rdd, '6')
     self.assertIsInstance(cardo_dataframe.pandas, pandas.DataFrame)
     self.assertItemsEqual(rdd.collect()[0][0],
                           cardo_dataframe.pandas.values[0][0])
Example #28
 def test_created_with_pandas_returns_pandas_correctly(self):
     pandas_df = self.context.spark.createDataFrame(
         [['a']], 'column1: string').toPandas()
     cardo_dataframe = CardoDataFrame(pandas_df)
     self.assertIsInstance(cardo_dataframe.pandas, pandas.DataFrame)
     self.assertTrue(pandas_df.equals(cardo_dataframe.pandas))
Example #29
 def test_created_with_rdd_returns_rdd_correctly(self):
     rdd = self.context.spark.sparkContext.parallelize([Row(column1='a')])
     cardo_dataframe = CardoDataFrame(rdd, '6')
     self.assertIsInstance(cardo_dataframe.rdd, RDD)
     self.assertItemsEqual(rdd.collect(), cardo_dataframe.rdd.collect())
Example #30
 def process(self, cardo_context: CardoContextBase,
             cardo_dataframe: CardoDataFrame = None) -> CardoDataFrame:
     if self.reader:
         logic_dataframe = self.reader.process(cardo_context, cardo_dataframe)
         return CardoDataFrame(logic_dataframe, snake_case(self.__class__.__name__))
     else:
         raise NotImplementedError