# Example #1
# 0
def test_op_type_check_with_correct_type2():
    """
    With input type TYPE_TEXT, check if input_type and output_type are correct.
    """
    table_meta = TM({
        "tables": [
            {"fields": [{'name': 'col', 'type': TM.TYPE_TEXT}]}
        ]})
    fake_op = FakeOp('col')
    checked_meta = fake_op.op_type_check(table_meta)
    # A successful type check returns the (updated) metadata object.
    assert checked_meta == table_meta
    assert table_meta.get_type('col') == TM.TYPE_BOOL
    assert fake_op.input_type == TM.TYPE_TEXT
    assert fake_op.output_type == TM.TYPE_BOOL
# Example #2
# 0
def test_op_type_check_with_wrong_type():
    """
    with input type TYPE_IDENTIFIER, check if None is returned by op_type_check.
    """
    table_meta = TM({
        "tables": [
            {"fields": [{'name': 'col', 'type': TM.TYPE_IDENTIFIER}]}
        ]})
    fake_op = FakeOp('col')
    checked_meta = fake_op.op_type_check(table_meta)
    # A failed type check returns None and leaves the metadata untouched.
    assert checked_meta is None
    assert table_meta.get_type('col') == TM.TYPE_IDENTIFIER
    assert fake_op.input_type is TM.TYPE_IDENTIFIER
    assert fake_op.output_type is None
# Example #3
# 0
def test_labeler_apply():
    """Serialize one prediction problem to JSON and run the Labeler over
    per-entity data with constant cutoff times."""
    table_meta = TableMeta.from_json(meta_json_str)
    entity_id_column = "taxi_id"
    time_column = "trip_id"
    label_generating_column = "fare"
    filter_column = "fare"

    # Split the dataframe per entity and attach constant (0, 0) cutoffs.
    entity_to_data_dict = trane.df_group_by_entity_id(
        dataframe, entity_id_column)
    entity_id_to_data_and_cutoff_dict = trane.ConstantCutoffTime(
        0, 0).generate_cutoffs(entity_to_data_dict, time_column)

    prediction_problem = PredictionProblem([
        AllFilterOp(label_generating_column),
        IdentityRowOp(label_generating_column),
        IdentityTransformationOp(label_generating_column),
        LastAggregationOp(label_generating_column),
    ])

    # Validate the problem before writing it out; the returned type traces
    # are not needed by this test.
    prediction_problem.is_valid_prediction_problem(
        table_meta, filter_column, label_generating_column)

    filename = "prediction_problem.json"
    prediction_problems_to_json_file([prediction_problem], table_meta,
                                     entity_id_column, label_generating_column,
                                     time_column, filename)
    Labeler().execute(entity_id_to_data_and_cutoff_dict, filename)
    os.remove(filename)
def test_order_of_operations():
    """Every generated problem holds exactly four operations, in the order
    filter -> row -> transformation -> aggregation."""
    table_meta = TableMeta.from_json(meta_json_str)
    entity_id_column = "taxi_id"
    label_generating_column = "fare"
    time_column = "trip_id"
    filter_column = "taxi_id"
    ppg = PredictionProblemGenerator(table_meta, entity_id_column,
                                     label_generating_column, time_column,
                                     filter_column)

    logging.debug(
        "Dataframe in test_prediction_problem_generator.py: \n{}\n".format(
            dataframe))

    # Expected base class for each position in the operation pipeline.
    expected_bases = (FilterOpBase, RowOpBase,
                      TransformationOpBase, AggregationOpBase)

    # Iterate the generator directly rather than materializing a list.
    for problem in ppg.generate(dataframe):
        ops = problem.operations
        assert len(ops) == len(expected_bases)
        # isinstance(x, B) is the idiomatic equivalent of
        # issubclass(x.__class__, B).
        for op, base in zip(ops, expected_bases):
            assert isinstance(op, base)
# Example #5
# 0
    def prep_for_integration(self):
        '''
        Creates a full fledged prediction problem generator without
        a mocked out ensure_valid_inputs method.

        Builds an inline JSON schema and a matching three-row synthetic
        taxi dataframe, then stores both plus a real generator on self
        (self.table_meta, self.df, self.generator).
        '''
        # Schema for the synthetic taxi table; field names/types must match
        # the dataframe columns created below.
        meta_json_str = ' \
            {"path": "", \
             "tables": [ \
                {"path": "synthetic_taxi_data.csv",\
                 "name": "taxi_data", \
                 "fields": [ \
                {"name": "vendor_id", "type": "id"},\
                {"name": "taxi_id", "type": "id"}, \
                {"name": "trip_id", "type": "datetime"}, \
                {"name": "distance", "type": "number", "subtype": "float"}, \
                {"name": "duration", "type": "number", "subtype": "float"}, \
                {"name": "fare", "type": "number", "subtype": "float"}, \
                {"name": "num_passengers", "type": "number", \
                    "subtype": "float"} \
                 ]}]}'

        self.table_meta = TableMeta.from_json(meta_json_str)
        # Three trips for a single entity (vendor_id=0, taxi_id=0).
        self.df = pd.DataFrame([(0, 0, 0, 5.32, 19.7, 53.89, 1),
                                (0, 0, 1, 1.08, 6.78, 18.89, 2),
                                (0, 0, 2, 4.69, 14.11, 41.35, 4)],
                               columns=[
                                   "vendor_id", "taxi_id", "trip_id",
                                   "distance", "duration", "fare",
                                   "num_passengers"
                               ])

        # NOTE(review): no time_col keyword is passed here, unlike the
        # five-argument positional constructions elsewhere in this file —
        # confirm the keyword-based signature makes it optional.
        self.generator = PredictionProblemGenerator(table_meta=self.table_meta,
                                                    entity_col=self.entity_col,
                                                    label_col=self.label_col,
                                                    filter_col=self.filter_col)
def test_write_then_read():
    """Write a prediction problem to JSON, read it back, and verify every
    piece survives the round trip unchanged."""
    table_meta = TableMeta.from_json(json_str)
    entity_id_column = "taxi_id"
    time_column = "trip_id"
    label_generating_column = "fare"

    prediction_problem = PredictionProblem([
        AllFilterOp(label_generating_column),
        IdentityRowOp(label_generating_column),
        IdentityTransformationOp(label_generating_column),
        LastAggregationOp(label_generating_column),
    ])

    filename = "prediction_problem.json"
    prediction_problems_to_json_file([prediction_problem], table_meta,
                                     entity_id_column, label_generating_column,
                                     time_column, filename)

    (problems_back, meta_back, entity_back,
     label_back, time_back) = prediction_problems_from_json_file(filename)
    os.remove(filename)

    assert prediction_problem == problems_back[0]
    assert entity_id_column == entity_back
    assert time_column == time_back
    assert label_generating_column == label_back
    assert table_meta == meta_back
def test_number_of_problems_generated():
    """The generator must yield exactly the expected number of problems."""
    table_meta = TableMeta.from_json(meta_json_str)
    entity_id_column = "taxi_id"
    label_generating_column = "fare"
    time_column = "trip_id"
    filter_column = "taxi_id"
    ppg = PredictionProblemGenerator(table_meta, entity_id_column,
                                     label_generating_column, time_column,
                                     filter_column)

    # THIS NUMBER WILL CHANGE IF MORE OPERATIONS ARE ADDED OR DECREASE BASED ON TYPE CHECKING
    expected = 66
    found = sum(1 for _ in ppg.generate(dataframe))
    assert expected == found
def test_generated_types():
    """Everything the generator yields is a PredictionProblem instance."""
    table_meta = TableMeta.from_json(meta_json_str)
    entity_id_column = "taxi_id"
    label_generating_column = "fare"
    time_column = "trip_id"
    filter_column = "taxi_id"
    ppg = PredictionProblemGenerator(table_meta, entity_id_column,
                                     label_generating_column, time_column,
                                     filter_column)

    for problem in ppg.generate(dataframe):
        assert isinstance(problem, PredictionProblem)
def test_op_type_check():
    """A type-correct op chain yields float type traces; a type-incorrect
    one fails validation and yields None traces."""
    filter_column = "fare"
    table_meta = TableMeta.from_json(json_str)

    # Chain whose ops all accept/produce floats for the "fare" column.
    good_label_column = "fare"
    problem_with_correct_types = PredictionProblem([
        AllFilterOp(filter_column),
        IdentityRowOp(good_label_column),
        IdentityTransformationOp(good_label_column),
        LastAggregationOp(good_label_column),
    ])

    # Same chain pointed at an id column, ending in LMF aggregation:
    # the types cannot line up.
    bad_label_column = "vendor_id"
    problem_with_incorrect_types = PredictionProblem([
        AllFilterOp(filter_column),
        IdentityRowOp(bad_label_column),
        IdentityTransformationOp(bad_label_column),
        LMFAggregationOp(bad_label_column),
    ])

    correct_is_valid, filter_types_a, label_types_a = \
        problem_with_correct_types.is_valid_prediction_problem(
            table_meta, filter_column, "fare")
    incorrect_is_valid, filter_types_b, label_types_b = \
        problem_with_incorrect_types.is_valid_prediction_problem(
            table_meta, filter_column, bad_label_column)

    assert filter_types_a == ['float', 'float']
    assert label_types_a == ['float', 'float', 'float', 'float']

    assert filter_types_b is None
    assert label_types_b is None

    assert correct_is_valid
    assert not incorrect_is_valid