Example 1
def get_categorical_imputers(df, features):
    feature_defs = []
    for col_name in features:
        feature_defs.append((col_name, CategoricalImputer()))
    multi_imputer = DataFrameMapper(feature_defs, input_df=True, df_out=True)
    multi_imputer.fit(df[features])
    return multi_imputer
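A minimal usage sketch for the helper above (assumes CategoricalImputer from an older sklearn_pandas release; the frame and column names below are hypothetical):

import pandas as pd
from sklearn_pandas import CategoricalImputer, DataFrameMapper

df = pd.DataFrame({"color": ["red", None, "blue", "red"],
                   "size": ["S", "M", None, "M"]})
imputer = get_categorical_imputers(df, ["color", "size"])
filled = imputer.transform(df)  # missing cells become each column's most frequent value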
Example 2
def mapper(df):
    x_mapper = DataFrameMapper([
        (u'action_type', LabelBinarizer()),
        (u'combined_shot_type', LabelBinarizer()),
        (u'loc_x', None),
        (u'loc_y', None),
        (u'minutes_remaining', None),
        (u'period', LabelBinarizer()),
        (u'playoffs', LabelBinarizer()),
        (u'seconds_remaining', None),
        (u'shot_distance', None),
        (u'shot_type', LabelBinarizer()),
        (u'shot_zone_area', LabelBinarizer()),
        (u'shot_zone_basic', LabelBinarizer()),
        (u'shot_zone_range', LabelBinarizer()),
        (u'matchup', LabelBinarizer()),
        (u'shot_id', None),
        (u'time_remaining', None),
        (u'opponent_num', LabelBinarizer()),
        (u'game_id_num', LabelBinarizer()),
    ])
    x_mapper.fit(df)
    y_mapper = DataFrameMapper([
        (u'shot_made_flag', None),
    ])
    y_mapper.fit(df)
    return x_mapper, y_mapper
Example 3
def fg_categorical_categorical(train_categorical, test_categorical):
    # Mapping from column index to column name
    # index_to_name = dict(zip(
    #     list(range(len(train_categorical.columns))),
    #     list(train_categorical.columns)
    # ))

    # Cache length and header up front, because train_categorical.columns changes inside the loop
    columns_length = len(train_categorical.columns)
    columns_header = list(train_categorical.columns)
    for r in range(2, columns_length+1):
        # itertools.combinations over the column names
        for combination in itertools.combinations(columns_header, r):
            train_categorical["_".join(combination)] = train_categorical[combination[0]].astype(str)
            test_categorical["_".join(combination)] = test_categorical[combination[0]].astype(str)
            for c in combination[1:]:
                train_categorical["_".join(combination)] = \
                    train_categorical["_".join(combination)] + "_" + train_categorical[c].astype(str)
                test_categorical["_".join(combination)] = \
                    test_categorical["_".join(combination)] + "_" + test_categorical[c].astype(str)

    # Apply a LabelEncoder to each column. Caveat: building categorical features
    # this way makes it easy for the test set to contain levels that never
    # appeared in the training set, which makes transform raise an error.
    mapper = DataFrameMapper([(i, LabelEncoder()) for i in train_categorical.columns])
    mapper.fit(train_categorical)
    train_categorical_new = pd.DataFrame(mapper.transform(train_categorical), columns=train_categorical.columns)
    test_categorical_new = pd.DataFrame(mapper.transform(test_categorical), columns=test_categorical.columns)

    return train_categorical_new, test_categorical_new
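The caveat noted in the comment above (test-set levels unseen at fit time make LabelEncoder.transform raise) is commonly worked around with a tolerant encoder. A minimal sketch, assuming a -1 sentinel for unseen levels; the class name is illustrative, not from the original code:

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

class SafeLabelEncoder(BaseEstimator, TransformerMixin):
    """Label-encode one column; levels unseen at fit time map to -1."""

    def fit(self, X, y=None):
        values = pd.Series(np.asarray(X).ravel())
        self.mapping_ = {v: i for i, v in enumerate(values.unique())}
        return self

    def transform(self, X):
        return np.array([self.mapping_.get(v, -1) for v in np.asarray(X).ravel()])

# Drop-in replacement inside the mapper above:
# mapper = DataFrameMapper([(i, SafeLabelEncoder()) for i in train_categorical.columns])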
Example 4
def mapper(df):
    df["time_remaining"] = df["minutes_remaining"] * 60 + df[
        "seconds_remaining"]
    x_mapper = DataFrameMapper([
        (u'loc_x', None), (u'loc_y', None), (u'minutes_remaining', None),
        (u'period', None), (u'seconds_remaining', None),
        (u'shot_distance', None), (u'playoffs', LabelBinarizer()),
        (u'action_type_num', None), (u'combined_shot_type_num', None),
        (u'season_num', None), (u'shot_type_num', None),
        (u'shot_zone_area_num', None), (u'shot_zone_basic_num', None),
        (u'shot_zone_range_num', None), (u'matchup', LabelBinarizer()),
        (u'shot_id', None), (u'opponent_num', None), (u'time_remaining', None),
        (u'last_moment', None), (u'hna', None), (u'action1', None),
        (u'action2', None), (u'action3', None), (u'action4', None),
        (u'action5', None), (u'action6', None), (u'action8', None),
        (u'action9', None), (u'action10', None), (u'action11', None),
        (u'action13', None), (u'action14', None), (u'action15', None),
        (u'action16', None)
    ])
    x_mapper.fit(df)
    y_mapper = DataFrameMapper([
        (u'shot_made_flag', None),
    ])
    y_mapper.fit(df)
    return x_mapper, y_mapper
Example 5
def get_mapper(data_all):
    param_list = [
        ('id', None), ('major', LabelEncoder()), ('age', None),
        ('gender', LabelEncoder()), ('isenglish', None), ('isjunior', None),
        ('isbachelor', None), ('ismaster', None), ('isintern', None),
        ('total_previous_job', None), ('last_type', LabelEncoder()),
        ('last_type1', LabelEncoder()), ('last_department', LabelEncoder()),
        ('last_size', None), ('last_salary', None),
        ('last_industry', LabelEncoder()),
        ('last_position_name', LabelEncoder()), ('last_start_year', None),
        ('last_start_month', None), ('last_end_year', None),
        ('last_end_month', None), ('last_interval_month', None),
        ('third_type', LabelEncoder()), ('third_type1', LabelEncoder()),
        ('third_department', LabelEncoder()), ('third_size', None),
        ('third_salary', None), ('third_industry', LabelEncoder()),
        ('third_position_name', LabelEncoder()), ('third_start_year', None),
        ('third_start_month', None), ('third_end_year', None),
        ('third_end_month', None), ('third_interval_month', None),
        ('first_type', LabelEncoder()), ('first_type1', LabelEncoder()),
        ('first_department', LabelEncoder()), ('first_size', None),
        ('first_salary', None), ('first_industry', LabelEncoder()),
        ('first_position_name', LabelEncoder()), ('first_start_year', None),
        ('first_start_month', None), ('first_end_year', None),
        ('first_end_month', None), ('first_interval_month', None),
        ('last3_interval_month', None), ('diff_last3_salary', LabelEncoder()),
        ('diff_last3_size', LabelEncoder()),
        ('diff_last3_industry', LabelEncoder()),
        ('diff_last3_position_name', LabelEncoder()),
        ('total_interval_month', None), ('diff_salary', LabelEncoder()),
        ('diff_size', LabelEncoder()), ('diff_industry', LabelEncoder()),
        ('diff_position_name', LabelEncoder()), ('major_1', LabelEncoder()),
        ('last_position_name_1', LabelEncoder()),
        ('last_department_1', LabelEncoder()),
        ('third_position_name_1', LabelEncoder()),
        ('third_department_1', LabelEncoder()),
        ('first_position_name_1', LabelEncoder()),
        ('first_department_1', LabelEncoder()), ('major_2', LabelEncoder()),
        ('last_position_name_2', LabelEncoder()),
        ('last_department_2', LabelEncoder()),
        ('third_position_name_2', LabelEncoder()),
        ('third_department_2', LabelEncoder()),
        ('first_position_name_2', LabelEncoder()),
        ('first_department_2', LabelEncoder()), ('start_working_age', None),
        ('rev_working_age', None), ('pre_working_month', None),
        ('pre_interval_month', None), ("pre_largest_size", None),
        ("pre_largest_salary", None), ("pre_least_size", None),
        ("pre_least_salary", None), ("pre_size1", None), ("pre_size2", None),
        ("pre_size3", None), ("pre_size4", None), ("pre_size5", None),
        ("pre_size6", None), ("pre_size7", None), ("pre_salary1", None),
        ("pre_salary2", None), ("pre_salary3", None), ("pre_salary4", None),
        ("pre_salary5", None), ("pre_salary6", None), ("pre_salary7", None),
        ("promotion_size", None), ("promotion_salary", None),
        ("decrease_size", None), ("decrease_salar", None)
    ]
    print "the mapper's param list is %s" % (len(param_list))
    mapper = DataFrameMapper(param_list)
    mapper.fit(data_all)
    return mapper
Example 6
def test_fit_with_optional_y_arg(complex_dataframe):
    """
    Transformers with an optional y argument in the fit method
    are handled correctly
    """
    df = complex_dataframe
    mapper = DataFrameMapper([(['feat1', 'feat2'], MockTClassifier())])
    # doesn't fail
    mapper.fit(df[['feat1', 'feat2']], df['target'])
Example 7
def test_fit_with_optional_y_arg(complex_dataframe):
    """
    Transformers with an optional y argument in the fit method
    are handled correctly
    """
    df = complex_dataframe
    mapper = DataFrameMapper([(['feat1', 'feat2'], MockTClassifier())])
    # doesn't fail
    mapper.fit(df[['feat1', 'feat2']], df['target'])
Example 8
    def pre_processing(self):
        self.__numeric_header = [
            i for i in self.__train_feature.columns
            if i not in self.__categorical_header
        ]

        self.__train_categorical = self.__train_feature[
            self.__categorical_header]
        self.__train_numeric = self.__train_feature[self.__numeric_header]

        self.__validation_categorical = self.__validation_feature[
            self.__categorical_header]
        self.__validation_numeric = self.__validation_feature[
            self.__numeric_header]

        self.__test_categorical = self.__test_feature[
            self.__categorical_header]
        self.__test_numeric = self.__test_feature[self.__numeric_header]

        self.__train_categorical = self.__train_categorical.astype(str)
        self.__validation_categorical = self.__validation_categorical.astype(
            str)
        self.__test_categorical = self.__test_categorical.astype(str)

        self.__train_categorical = self.__train_categorical.fillna("missing")
        self.__validation_categorical = self.__validation_categorical.fillna(
            "missing")
        self.__test_categorical = self.__test_categorical.fillna("missing")

        # A DataFrame produced by DataFrameMapper drops the original index, so set_index is needed
        mapper = DataFrameMapper([(i, LabelEncoder())
                                  for i in self.__train_categorical.columns])
        mapper.fit(self.__train_categorical)
        self.__train_categorical = pd.DataFrame(
            mapper.transform(self.__train_categorical),
            columns=self.__train_categorical.columns).set_index(
                self.__train_categorical.index)
        self.__validation_categorical = pd.DataFrame(
            mapper.transform(self.__validation_categorical),
            columns=self.__validation_categorical.columns).set_index(
                self.__validation_categorical.index)
        self.__test_categorical = pd.DataFrame(
            mapper.transform(self.__test_categorical),
            columns=self.__test_categorical.columns).set_index(
                self.__test_categorical.index)

        self.__train_numeric = self.__train_numeric.fillna(-999)
        self.__validation_numeric = self.__validation_numeric.fillna(-999)
        self.__test_numeric = self.__test_numeric.fillna(-999)

        self.__train_feature = pd.concat(
            [self.__train_categorical, self.__train_numeric], axis=1).values
        self.__validation_feature = pd.concat(
            [self.__validation_categorical, self.__validation_numeric],
            axis=1).values
        self.__test_feature = pd.concat(
            [self.__test_categorical, self.__test_numeric], axis=1).values
Example 9
def transform_cat_to_cont(df, cat_features, cont_features):
    feature_defs = []
    for col_name in cat_features:
        feature_defs.append((col_name, MyLabelBinarizer()))

    for col_name in cont_features:
        feature_defs.append((col_name, None))

    mapper = DataFrameMapper(feature_defs, input_df=True, df_out=True)
    mapper.fit(df)
    return mapper.transform(df)
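MyLabelBinarizer is not defined in this snippet. A common definition, and only an assumption about what the author used, wraps LabelBinarizer so it tolerates the extra y argument that mapper/pipeline code may pass to fit and transform:

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelBinarizer

class MyLabelBinarizer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.encoder = LabelBinarizer()

    def fit(self, X, y=None):
        self.encoder.fit(X)   # LabelBinarizer itself takes a single argument
        return self

    def transform(self, X, y=None):
        return self.encoder.transform(X)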
Example 10
def test_cols_list_column_vector(simple_dataframe):
    """
    If a one-element list is specified as the columns, the transformer
    is called with a column vector as input.
    """
    df = simple_dataframe
    mock_transformer = Mock()
    mapper = DataFrameMapper([(["a"], mock_transformer)])

    mapper.fit(df)
    args, kwargs = mock_transformer.fit.call_args
    assert args[0].shape == (3, 1)
Example 11
def test_cols_string_array(simple_dataframe):
    """
    If a string is specified as the columns, the transformer
    is called with a 1-d array as input.
    """
    df = simple_dataframe
    mock_transformer = Mock()
    mapper = DataFrameMapper([("a", mock_transformer)])

    mapper.fit(df)
    args, kwargs = mock_transformer.fit.call_args
    assert args[0].shape == (3,)
Example 12
def test_cols_list_column_vector(simple_dataframe):
    """
    If a one-element list is specified as the columns, the transformer
    is called with a column vector as input.
    """
    df = simple_dataframe
    mock_transformer = Mock()
    mapper = DataFrameMapper([(["a"], mock_transformer)])

    mapper.fit(df)
    args, kwargs = mock_transformer.fit.call_args
    assert args[0].shape == (3, 1)
Example 13
def test_cols_string_array(simple_dataframe):
    """
    If a string is specified as the columns, the transformer
    is called with a 1-d array as input.
    """
    df = simple_dataframe
    mock_transformer = Mock()
    mapper = DataFrameMapper([("a", mock_transformer)])

    mapper.fit(df)
    args, kwargs = mock_transformer.fit.call_args
    assert args[0].shape == (3, )
Example 14
def test_numerical_transformer_serialization(simple_dataset):
    """
    Test if you can serialize transformer
    """
    transformer = DataFrameMapper([('feat1', NumericalTransformer('log'))])

    df = simple_dataset
    transformer.fit(df)
    f = tempfile.NamedTemporaryFile(delete=True)
    joblib.dump(transformer, f.name)
    transformer2 = joblib.load(f.name)
    assert np.array_equal(transformer.transform(df), transformer2.transform(df))
    f.close()
Example 15
def test_exception_column_context_fit(simple_dataframe):
    """
    If an exception is raised when fit a column,
    the exception includes the name of the column being fitted
    """
    class FailingFitter(object):
        def fit(self, X):
            raise Exception('Some exception')

    df = simple_dataframe
    mapper = DataFrameMapper([('a', FailingFitter())])

    with pytest.raises(Exception, match='a: Some exception'):
        mapper.fit(df)
Example 16
    def test_mapper(self):
        data = lib.load_titanic()

        transformation_list = [(['name'],
                                [EmbeddingVectorizer(max_sequence_length=12)])]

        mapper = DataFrameMapper(transformation_list, df_out=True)

        mapper.fit(data)

        data_transformed = mapper.transform(data)

        assert_array_equal([2, 3, 4, 5, 1, 1, 1, 1, 1, 1, 1, 1],
                           data_transformed.values[0, :])
Example 17
def test_exception_column_context_fit(simple_dataframe):
    """
    If an exception is raised when fit a column,
    the exception includes the name of the column being fitted
    """
    class FailingFitter(object):
        def fit(self, X):
            raise Exception('Some exception')

    df = simple_dataframe
    mapper = DataFrameMapper([('a', FailingFitter())])

    with pytest.raises(Exception, match='a: Some exception'):
        mapper.fit(df)
Example 18
    def transform_features(self):
        totransform = []
        for index, item in enumerate(self.feat_head):
            field = item[0]
            func_name = item[1]
            transform = item[2]
            is_enable = item[3]

            if is_enable:
                if field not in self.stumble_data.get_features():
                    print('field not in feature..generating: ' + field)
                    func_name(field)
                totransform.append((field, transform))

        if len(totransform):
            mapper = DataFrameMapper(totransform)
            mapper.fit(self.stumble_data.all_pd[:self.stumble_data.len_train])
            #
            X_transformed_train = mapper.transform(
                self.stumble_data.all_pd[:self.stumble_data.len_train])
            X_transformed_test = mapper.transform(
                self.stumble_data.all_pd[self.stumble_data.len_train:])

            for index, item in enumerate(self.feat_head):
                field = item[0]
                is_enable = item[3]
                if is_enable and field in self.stumble_data.get_features():
                    del self.stumble_data.all_pd[field]

            # Leftover debug breakpoint from the original source:
            # import pdb; pdb.set_trace()

            from scipy.sparse import hstack

            X_train = X_transformed_train
            X_test = X_transformed_test
            y_train = self.stumble_data.all_pd[:self.stumble_data.
                                               len_train]['label']
            #            print 'Dumping train in SVMLight.'
            dump_svmlight_file(X_train, y_train, output_train_libsvm_file)


#            print 'Dumping test in SVMLight.'
#            dump_svmlight_file(X_test, pred, output_test_libsvm_file )

        else:
            # NOTE: X_train/X_test are undefined on this branch in the original
            # code; .values replaces the removed DataFrame.as_matrix().
            X_train = X_train.values
            X_test = X_test.values

        return X_train, y_train, X_test
Example 19
class Preprocessor:
    mapper: DataFrameMapper

    def __init__(self):
        self.mapper = DataFrameMapper([(encoding_fields, [
            SimpleImputer(strategy="most_frequent"),
            preprocessing.OrdinalEncoder()
        ]), (scale_fields, preprocessing.StandardScaler())])

    def train(self, x: pd.DataFrame):
        self.mapper.fit(x)

    def transform(self, x: pd.DataFrame):
        return self.mapper.transform(x)
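encoding_fields and scale_fields are free variables in the class above; they need to be plain lists of column names defined before the constructor runs. A hypothetical setup and call (all column and frame names invented for illustration):

encoding_fields = ["gender", "city"]   # categorical columns (hypothetical)
scale_fields = ["age", "income"]       # numeric columns (hypothetical)

pre = Preprocessor()
pre.train(train_df)               # fits imputer, ordinal encoder and scaler
X_train = pre.transform(train_df)
X_test = pre.transform(test_df)   # same fitted mapping applied to new data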
Example 20
    def transform_features(self):
        totransform = []
        for index, item in enumerate(self.feat_head):
            field = item[0]
            func_name = item[1]
            transform = item[2]
            is_enable = item[3]

            if is_enable:
                if field not in self.stumble_data.get_features():
                    print('field not in feature..generating: ' + field)
                    func_name(field)
                totransform.append((field, transform))

        if len(totransform):
            mapper = DataFrameMapper(totransform)
            mapper.fit(self.stumble_data.all_pd[:self.stumble_data.len_train])

            X_transformed_train = mapper.transform(
                self.stumble_data.all_pd[:self.stumble_data.len_train])
            X_transformed_test = mapper.transform(
                self.stumble_data.all_pd[self.stumble_data.len_train:])

            for index, item in enumerate(self.feat_head):
                field = item[0]
                is_enable = item[3]
                if is_enable and field in self.stumble_data.get_features():
                    del self.stumble_data.all_pd[field]

            # Leftover debug breakpoint from the original source:
            # import pdb; pdb.set_trace()

            from scipy.sparse import hstack

            X_train = X_transformed_train
            X_test = X_transformed_test
            y_train = self.stumble_data.all_pd[:self.stumble_data.len_train]['label']
            # print('Dumping train in SVMLight.')
            dump_svmlight_file(X_train, y_train, output_train_libsvm_file)

            # print('Dumping test in SVMLight.')
            # dump_svmlight_file(X_test, pred, output_test_libsvm_file)

        else:
            # NOTE: X_train/X_test are undefined on this branch in the original
            # code; .values replaces the removed DataFrame.as_matrix().
            X_train = X_train.values
            X_test = X_test.values

        return X_train, y_train, X_test
Example 21
def test_build_features_old_unpickle(simple_dataframe):
    """
    Fitted mappers pickled before the built_features and built_default
    attributes can correctly transform
    """
    df = simple_dataframe
    mapper = DataFrameMapper([('a', None)])
    mapper.fit(df)

    # simulate the mapper was pickled before the attributes existed
    del mapper.built_features
    del mapper.built_default

    mapper_pickled = pickle.dumps(mapper)
    loaded_mapper = pickle.loads(mapper_pickled)
    loaded_mapper.transform(simple_dataframe)  # doesn't fail
Example 22
def test_build_features_old_unpickle(simple_dataframe):
    """
    Fitted mappers pickled before the built_features and built_default
    attributes can correctly transform
    """
    df = simple_dataframe
    mapper = DataFrameMapper([('a', None)])
    mapper.fit(df)

    # simulate the mapper was pickled before the attributes existed
    del mapper.built_features
    del mapper.built_default

    mapper_pickled = pickle.dumps(mapper)
    loaded_mapper = pickle.loads(mapper_pickled)
    loaded_mapper.transform(simple_dataframe)  # doesn't fail
Example 23
def train_fn(args):
    print("loading data")
    train_df = pd.read_csv(args.train_data + "/train.csv", engine='python')
    test_df = pd.read_csv(args.test_data + "/test.csv", engine='python')

    TARGET = 'SeriousDlqin2yrs'
    X_train = train_df.drop(TARGET, axis=1)
    y_train = train_df[TARGET]
    X_test = test_df.drop(TARGET, axis=1)
    y_test = test_df[TARGET]

    print("Imputing missing values")
    transformer = DataFrameMapper([
        (['MonthlyIncome'], DFImputer(strategy="constant", fill_value=-1)),
        (['age'], DFImputer(strategy="median")),
        (['NumberOfDependents'], DFImputer(strategy="median")),
        (['DebtRatio'], DFImputer(strategy="median")),
        (['RevolvingUtilizationOfUnsecuredLines'], DFImputer(strategy="median")),
        (['NumberRealEstateLoansOrLines'], DFImputer(strategy="median")),
        (['NumberOfOpenCreditLinesAndLoans'], DFImputer(strategy="median")),
        (['NumberOfTime30-59DaysPastDueNotWorse'], DFImputer(strategy="median")),
        (['NumberOfTime60-89DaysPastDueNotWorse'], DFImputer(strategy="median")),
        (['NumberOfTimes90DaysLate'], DFImputer(strategy="median")),
    ], input_df=True, df_out=True)
    transformer.fit(X_train)
    X_train = transformer.transform(X_train)
    X_test = transformer.transform(X_test)

    print("Building model...")
    model = RandomForestClassifier(n_estimators=50,
                                   max_depth=6,
                                   max_leaf_nodes=30)
    model.fit(X_train, y_train)
    explainer = shap.TreeExplainer(model)

    print("Saving artifacts...")
    model_dir = Path(args.model_dir)
    model_dir.mkdir(exist_ok=True, parents=True)

    joblib.dump(transformer, open(str(model_dir / "transformer.joblib"), "wb"))
    joblib.dump(model, open(str(model_dir / "model.joblib"), "wb"))
    joblib.dump(explainer, open(str(model_dir / "explainer.joblib"), "wb"))
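DFImputer is not defined in this snippet. One plausible implementation, purely an assumption about the helper, is a thin wrapper around sklearn's SimpleImputer that returns a DataFrame so column names survive the mapping:

import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer

class DFImputer(BaseEstimator, TransformerMixin):
    def __init__(self, strategy="mean", fill_value=None):
        self.strategy = strategy
        self.fill_value = fill_value

    def fit(self, X, y=None):
        self.imputer_ = SimpleImputer(strategy=self.strategy, fill_value=self.fill_value)
        self.imputer_.fit(X)
        return self

    def transform(self, X):
        return pd.DataFrame(self.imputer_.transform(X), index=X.index, columns=X.columns)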
Example 24
def test_input_df_true_next_transformers(simple_dataframe, monkeypatch):
    """
    If input_df is True, the subsequent transformers get passed pandas
    objects instead of numpy arrays (given the previous transformers
    output pandas objects as well)
    """
    df = simple_dataframe
    monkeypatch.setattr(MockTClassifier, 'fit', Mock())
    monkeypatch.setattr(MockTClassifier, 'transform',
                        Mock(return_value=pd.Series([1, 2, 3])))
    mapper = DataFrameMapper(
        [('a', [MockXTransformer(), MockTClassifier()])], input_df=True)
    mapper.fit(df)
    out = mapper.transform(df)

    args, _ = MockTClassifier().fit.call_args
    assert isinstance(args[0], pd.Series)

    assert_array_equal(out, np.array([1, 2, 3]).reshape(-1, 1))
Example 25
def test_fit_transform_equiv_mock(simple_dataframe):
    """
    Check for equivalent results for code paths fit_transform
    versus fit and transform in DataFrameMapper using the mock
    transformer which does not implement a custom fit_transform.
    """
    df = simple_dataframe
    mapper = DataFrameMapper([('a', MockXTransformer())])
    transformed_combined = mapper.fit_transform(df)
    transformed_separate = mapper.fit(df).transform(df)
    assert np.all(transformed_combined == transformed_separate)
Example 26
def test_fit_transform_equiv_mock(simple_dataframe):
    """
    Check for equivalent results for code paths fit_transform
    versus fit and transform in DataFrameMapper using the mock
    transformer which does not implement a custom fit_transform.
    """
    df = simple_dataframe
    mapper = DataFrameMapper([('a', MockXTransformer())])
    transformed_combined = mapper.fit_transform(df)
    transformed_separate = mapper.fit(df).transform(df)
    assert np.all(transformed_combined == transformed_separate)
Example 27
def test_input_df_true_next_transformers(simple_dataframe, monkeypatch):
    """
    If input_df is True, the subsequent transformers get passed pandas
    objects instead of numpy arrays (given the previous transformers
    output pandas objects as well)
    """
    df = simple_dataframe
    monkeypatch.setattr(MockTClassifier, 'fit', Mock())
    monkeypatch.setattr(MockTClassifier, 'transform',
                        Mock(return_value=pd.Series([1, 2, 3])))
    mapper = DataFrameMapper([
        ('a', [MockXTransformer(), MockTClassifier()])
    ], input_df=True)
    mapper.fit(df)
    out = mapper.transform(df)

    args, _ = MockTClassifier().fit.call_args
    assert isinstance(args[0], pd.Series)

    assert_array_equal(out, np.array([1, 2, 3]).reshape(-1, 1))
Example 28
def encode_cat_var(df, cat_var_dict):
    """Encode categorical features : return the fit """

    cat_vars = [
        o[0] for o in sorted(
            cat_var_dict.items(), key=operator.itemgetter(1), reverse=True)
    ]
    cat_maps = [(o, LabelEncoder()) for o in cat_vars]
    cat_mapper = DataFrameMapper(cat_maps)
    cat_map_fit = cat_mapper.fit(df)

    return cat_map_fit
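A hypothetical follow-up showing how the fitted mapper might be consumed; mirroring Example 32, the fitted LabelEncoders stay reachable through .features (the column names and cardinalities below are invented):

cat_var_dict = {"store": 1115, "day_of_week": 7}
cat_map_fit = encode_cat_var(df, cat_var_dict)
encoded = cat_map_fit.transform(df)
emb_sizes = [len(o[1].classes_) for o in cat_map_fit.features]  # per-column level counts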
Example 29
def extract_features(dataset1, vect, prom_filename1, prom_filename2):
    # Add three more features: the tf-idf vector and two relevance scores
    dataset1['relevance'] = np.nan
    dataset1['relevance_quesn'] = np.nan
    dataset1['word_count'] = np.nan
    # Add tf-idf vectors as features. Currently set to [essay], to be
    # transformed using the mapper; kept as a list since tf-idf needs an
    # iterable over text.
    dataset1['tf-idf'] = dataset1['essay']
    tokenizer_words = TweetTokenizer()
    dataset1['word_count'] = list(map(len, dataset1['essay'].apply(tokenizer_words.tokenize)))
    dataset1['distinct_word_count'] = list(map(len, map(set, dataset1['essay'].apply(tokenizer_words.tokenize))))
    dataset1['sentence_count'] = list(map(len, dataset1['essay'].apply(nltk.sent_tokenize)))
    dataset1['charslength'] = list(map(len, map("".join, dataset1['essay'].apply(tokenizer_words.tokenize))))
    dataset1['avg_word_length'] = np.divide(dataset1['charslength'], dataset1['word_count'])
  
    # Calculate the similarity between prompt and essay and add a relevance feature
    prm1_vect, prm1_quesn_vect = extract_prompt(prom_filename1, prom_filename2, vect)
    
    essay_vect = vect.transform(dataset1['essay'])
    sim1 = cosine_similarity(essay_vect, prm1_vect)  # avoid element-wise computation; broadcasting reduces time
    sim2 = cosine_similarity(essay_vect, prm1_quesn_vect)
    dataset1['relevance'] = sim1
    dataset1['relevance_quesn'] = sim2
   
    # Note: DataFrameMapper raises a "NoneType is not iterable" error if it is
    # not fitted to the data. The mapper must be fitted before transforming;
    # otherwise it receives no data, hence the NoneType error.
    
    from sklearn_pandas import DataFrameMapper
    mapper = DataFrameMapper([
        # ('tf-idf', Vectorize(vect),{'input_df': True}), #transforming tf-idf using vect
         ('relevance', None),            # no transformation required
         ('relevance_quesn', None),      # no transformation required
         ('essay_set', None),
         ('word_count', None),
         ('distinct_word_count', None),
         ('sentence_count', None),
         ('avg_word_length', None),

         # If we included the essay column it would break the final stacking of
         # columns, since tf-idf is sparse and raw text cannot be converted to
         # a sparse format. The essay text is not needed anyway.
        
     ])
    # Create the feature matrix combining tf-idf and relevance features
    mapp = mapper.fit(dataset1)  # applies the per-column transformations defined in the mapper

    features = mapp.transform(dataset1)
    
    return features
Example 30
    def pre_processing(self):
        self.__numeric_header = [i for i in self.__train_feature.columns if i not in self.__categorical_header]
        self.__train_categorical = self.__train_feature[self.__categorical_header]
        self.__train_numeric = self.__train_feature[self.__numeric_header]
        self.__test_categorical = self.__test_feature[self.__categorical_header]
        self.__test_numeric = self.__test_feature[self.__numeric_header]

        self.__train_categorical = self.__train_categorical.astype(str)
        self.__test_categorical = self.__test_categorical.astype(str)
        self.__train_categorical = self.__train_categorical.fillna("missing")
        self.__test_categorical = self.__test_categorical.fillna("missing")
        mapper = DataFrameMapper([(i, LabelEncoder()) for i in self.__train_categorical.columns])
        mapper.fit(self.__train_categorical)
        self.__train_categorical = pd.DataFrame(mapper.transform(self.__train_categorical), columns=self.__train_categorical.columns)
        self.__test_categorical = pd.DataFrame(mapper.transform(self.__test_categorical), columns=self.__test_categorical.columns)

        self.__train_numeric = self.__train_numeric.fillna(-999)
        self.__test_numeric = self.__test_numeric.fillna(-999)

        self.__train_feature = pd.concat([self.__train_numeric, self.__train_categorical], axis=1)
        self.__test_feature = pd.concat([self.__test_numeric, self.__test_categorical], axis=1)
        self.__train_feature = self.__train_feature.values
        self.__test_feature = self.__test_feature.values
Example 31
class MyMapper:
    def __init__(self):
        pass

    def fit(self, X, y=None):
        self.ncols = []
        self.scols = []
        #         print("mapping features")
        for col in X:
            if X[col].dtype == float:
                # print("numerical col: %s" % col)
                self.ncols.append([col])
            else:
                # print("categorical col: %s" % col)
                self.scols.append([col])
        nfeats = gen_features(columns=self.ncols,
                              classes=[{
                                  'class':
                                  sklearn.preprocessing.MinMaxScaler,
                              }])
        sfeats = gen_features(columns=self.scols,
                              classes=[{
                                  'class': LabelBinarizer2
                              }])
        self.mapper = DataFrameMapper(nfeats + sfeats, df_out=True)
        self.mapper.fit(X)
        #         print("features mapped")
        return self

    def transform(self, X, y=None):
        X = X.copy()
        X = self.mapper.transform(X)
        return X

    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)
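LabelBinarizer2 is not shown here. A widely used variant, and only a guess at what this helper does, pads the single column LabelBinarizer emits for binary features into two complementary columns, keeping the output width predictable:

import numpy as np
from sklearn.preprocessing import LabelBinarizer

class LabelBinarizer2(LabelBinarizer):
    def transform(self, y):
        Y = super().transform(y)
        if self.y_type_ == 'binary':
            return np.hstack((Y, 1 - Y))  # add the complementary column
        return Y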
Example 32
def type_map(data_df, cat_vars, contin_vars, cache_dir, use_cache=True):
    " type_map "
    if use_cache:
        cat_map_fit = pickle.load(
            open('{}/cat_maps.pickle'.format(cache_dir), 'rb'))
        contin_map_fit = pickle.load(
            open('{}/contin_maps.pickle'.format(cache_dir), 'rb'))
    else:
        cat_maps = [(o, LabelEncoder()) for o in cat_vars]
        contin_maps = [([o], StandardScaler()) for o in contin_vars]
        cat_mapper = DataFrameMapper(cat_maps)
        cat_map_fit = cat_mapper.fit(data_df)
        contin_mapper = DataFrameMapper(contin_maps)
        contin_map_fit = contin_mapper.fit(data_df)
        pickle.dump(contin_map_fit,
                    open('{}/contin_maps.pickle'.format(cache_dir), 'wb'))
        pickle.dump(cat_map_fit,
                    open('{}/cat_maps.pickle'.format(cache_dir), 'wb'))
    cat_cols = len(cat_map_fit.features)
    contin_cols = len(contin_map_fit.features)
    logger.info("cat_cols: {}, contin_cols: {}".format(cat_cols, contin_cols))
    logger.info("cat_map_fit.features: {}".format(
        [len(o[1].classes_) for o in cat_map_fit.features]))
    return cat_map_fit, contin_map_fit, contin_cols
Example 33
def test_fit_transform_equiv_pca(complex_dataframe):
    """
    Check for equivalent results for code paths fit_transform
    versus fit and transform in DataFrameMapper and transformer
    using PCA which implements a custom fit_transform. The
    equivalence of both paths in the transformer only can be
    asserted since this is tested in the sklearn tests
    scikit-learn/sklearn/decomposition/tests/test_pca.py
    """
    df = complex_dataframe
    mapper = DataFrameMapper(
        [(['feat1', 'feat2'], sklearn.decomposition.PCA(2))], df_out=True)
    transformed_combined = mapper.fit_transform(df)
    transformed_separate = mapper.fit(df).transform(df)
    assert np.allclose(transformed_combined, transformed_separate)
Example 34
def test_fit_transform_equiv_pca(complex_dataframe):
    """
    Check for equivalent results for code paths fit_transform
    versus fit and transform in DataFrameMapper and transformer
    using PCA which implements a custom fit_transform. The
    equivalence of both paths in the transformer only can be
    asserted since this is tested in the sklearn tests
    scikit-learn/sklearn/decomposition/tests/test_pca.py
    """
    df = complex_dataframe
    mapper = DataFrameMapper(
        [(['feat1', 'feat2'], sklearn.decomposition.PCA(2))],
        df_out=True)
    transformed_combined = mapper.fit_transform(df)
    transformed_separate = mapper.fit(df).transform(df)
    assert np.allclose(transformed_combined, transformed_separate)
Example 35
def test_fit_with_required_y_arg(complex_dataframe):
    """
    Transformers with a required y argument in the fit method
    are handled and perform correctly
    """
    df = complex_dataframe
    mapper = DataFrameMapper([(['feat1', 'feat2'], SelectKBest(chi2, k=1))])

    # fit, doesn't fail
    ft_arr = mapper.fit(df[['feat1', 'feat2']], df['target'])

    # fit_transform
    ft_arr = mapper.fit_transform(df[['feat1', 'feat2']], df['target'])
    assert_array_equal(ft_arr, df[['feat1']].values)

    # transform
    t_arr = mapper.transform(df[['feat1', 'feat2']])
    assert_array_equal(t_arr, df[['feat1']].values)
Example 36
def test_fit_with_required_y_arg(complex_dataframe):
    """
    Transformers with a required y argument in the fit method
    are handled and perform correctly
    """
    df = complex_dataframe
    mapper = DataFrameMapper([(['feat1', 'feat2'], SelectKBest(chi2, k=1))])

    # fit, doesn't fail
    ft_arr = mapper.fit(df[['feat1', 'feat2']], df['target'])

    # fit_transform
    ft_arr = mapper.fit_transform(df[['feat1', 'feat2']], df['target'])
    assert_array_equal(ft_arr, df[['feat1']].values)

    # transform
    t_arr = mapper.transform(df[['feat1', 'feat2']])
    assert_array_equal(t_arr, df[['feat1']].values)
Example 37
def test_make_column_selector(iris_dataframe):
    t = DataFrameMapper([
        (make_column_selector(dtype_include=float), None, {'alias': 'x'}),
        ('sepal length (cm)', None),
    ], df_out=True, default=False)

    xt = t.fit(iris_dataframe).transform(iris_dataframe)
    expected = ['x_0', 'x_1', 'x_2', 'x_3', 'sepal length (cm)']
    assert list(xt.columns) == expected

    pickled = pickle.dumps(t)
    t2 = pickle.loads(pickled)
    xt2 = t2.transform(iris_dataframe)
    assert np.array_equal(xt.values, xt2.values)
Example 38
    def Standard_(self, dfa, scale='S'):
        Scalers = {
            'S' : StandardScaler(),
            'R' : RobustScaler(quantile_range=tuple(self.arg.QuantileRange)),
            'M' : MinMaxScaler(),
            'MA': MaxAbsScaler(),
            'OE': OrdinalEncoder(),
            'OH': OneHotEncoder(),
            'NL': Normalizer(),
            'QT': QuantileTransformer(),
            'PT': PowerTransformer(),
            'N' : FunctionTransformer(validate=False),
        }
        Sca_map = [Scalers[i] for i in scale]
        Xa = list(dfa.columns)

        mapper = DataFrameMapper([(Xa, Sca_map)])
        clfit = mapper.fit(dfa)

        self.log.CIF('Standardization Processing'.center(45, '-'))
        self.log.NIF('Scale parameters:\n%s' % clfit)
        self.log.CIF(45 * '-')

        return clfit
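Each character of scale indexes one entry in the Scalers dictionary, so a multi-character string chains transformers over the same columns. A hypothetical call, with self and dfa as in the surrounding class:

clfit = self.Standard_(dfa, scale='S')   # StandardScaler only
clfit = self.Standard_(dfa, scale='QS')  # QuantileTransformer, then StandardScaler
Xs = clfit.transform(dfa)                # apply the fitted mapping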
Example 39
def create_mapper(df, cat_vars=list(), cont_vars=list(), date_vars=list(), no_transform_vars=list(), response_vars=list()):
    logging.info('Creating mapper')

    # TODO Add support for datetime variables

    # Reference variables
    transformation_list = list()

    # Copy df, to avoid 'inplace' transformation
    df = df.copy(deep=True)

    # TODO Check if any df variables are not listed in cat_vars or cont_vars. If so, remove them.

    # Check if any df variables are listed in cat_vars and cont_vars. If so, raise an error.
    # list() needed: in Python 3, filter() returns an iterator with no len()
    intersection = [x for x in cont_vars if x in cat_vars]
    if len(intersection) > 0:
        raise AssertionError('Columns appear in both cat_vars and cont_vars: {}'.format(intersection))

    # Convert continuous variables to float32
    for cont_var in cont_vars + response_vars:
        logging.debug('Converting cont_var data type: {}'.format(cont_var))
        df[cont_var] = df[cont_var].astype(numpy.float32)

    for date_var in date_vars:
        logging.info('Enriching for datetime var: {}'.format(date_var))
        df, date_cat_vars, date_cont_vars = add_datetime_vars(df, date_var)
        cat_vars.extend(date_cat_vars)
        cont_vars.extend(date_cont_vars)

    # Add continuous variable transformations for cont_vars
    for cont_var in cont_vars + response_vars:
        logging.debug('Creating transformation list for cont_var: {}'.format(cont_var))
        transformations = [Imputer(strategy='mean'), StandardScaler()]
        var_tuple = ([cont_var], transformations)
        transformation_list.append(var_tuple)

    # Add categorical variable transformations for cat_vars
    for cat_var in cat_vars:
        logging.debug('Creating transformation list for cat_var: {}'.format(cat_var))
        # TODO Replace LabelEncoder with CategoricalEncoder, to better handle unseen cases
        transformations = [LabelEncoder()]
        var_tuple = (cat_var, transformations)
        transformation_list.append(var_tuple)

    for no_transform_var in no_transform_vars:
        logging.debug('Creating transformation list for no_transform_var: {}'.format(no_transform_var))
        transformations = [Imputer(strategy='most_frequent')]
        var_tuple = ([no_transform_var], transformations)
        transformation_list.append(var_tuple)

    # Create mapper
    logging.info('Creating mapper')
    mapper = DataFrameMapper(features=transformation_list, df_out=True)

    # Train mapper
    logging.info('Training newly created mapper')
    mapper.fit(df)

    # Throw away transformation, to set up mapper
    logging.info('Transforming data set with newly created mapper, to initialize mapper internals')
    mapper.transform(df.sample(1000))

    return mapper
Example 40
    (["CompetitionOpen"], LabelBinarizer()),
    (["WillClosedTomorrow_TodayIsSat"], LabelBinarizer()),
    (["WillClosedTomorrow_TodayIsNotSat"], LabelBinarizer()),
    (["WasClosedYesterday_TodayIsMon"], LabelBinarizer()),
    (["WasClosedYesterday_TodayIsNotMon"], LabelBinarizer()),
    (["SchoolHoliday"], LabelBinarizer()), (["StateHoliday"], OneHotEncoder()),
    (["Year"], OneHotEncoder()), (["Month"], OneHotEncoder()),
    (["Tenday"], OneHotEncoder()), (["Day"], OneHotEncoder()),
    (["DayStr"], OneHotEncoder()), (["DayOfWeek"], OneHotEncoder()),
    (["WeekOfYearStr"], OneHotEncoder()),
    (["DayOfYearOutlier"], OneHotEncoder()),
    (["DayOfYearSlopeStr"], OneHotEncoder())
], default=False)

mapper_fit = mapper.fit(df_train)
df_train_transform = mapper_fit.transform(df_train)
df_valid_transform = mapper_fit.transform(df_valid)
df_test_transform = mapper_fit.transform(df_test)

x_train = df_train_transform[:, 1:]
y_train = df_train_transform[:, 0]
y_train = np.log1p(y_train)

x_valid = df_valid_transform[:, 1:]
y_valid = df_valid_transform[:, 0]
y_valid = np.log1p(y_valid)

x_test = df_test_transform[:, 1:]

# Model training, validation, and testing
Example 41
def get_mapper(data_all):
    param_list = [
        ('id', None),
        ('major', LabelEncoder()),
        ('age', None),
        ('gender', LabelEncoder()),
        ('isenglish', None),
        ('isjunior', None),
        ('isbachelor', None),
        ('ismaster', None),
        ('isintern', None),
        ('total_previous_job', None),
        ('last_type', LabelEncoder()),
        ('last_type1', LabelEncoder()),
        ('last_department', LabelEncoder()),
        ('last_size', None),
        ('last_salary', None),
        ('last_industry', LabelEncoder()),
        ('last_position_name', LabelEncoder()),
        ('last_start_year', None),
        ('last_start_month', None),
        ('last_end_year', None),
        ('last_end_month', None),
        ('last_interval_month', None),
        ('third_type', LabelEncoder()),
        ('third_type1', LabelEncoder()),
        ('third_department', LabelEncoder()),
        ('third_size', None),
        ('third_salary', None),
        ('third_industry', LabelEncoder()),
        ('third_position_name', LabelEncoder()),
        ('third_start_year', None),
        ('third_start_month', None),
        ('third_end_year', None),
        ('third_end_month', None),
        ('third_interval_month', None),
        ('first_type', LabelEncoder()),
        ('first_type1', LabelEncoder()),
        ('first_department', LabelEncoder()),
        ('first_size', None),
        ('first_salary', None),
        ('first_industry', LabelEncoder()),
        ('first_position_name', LabelEncoder()),
        ('first_start_year', None),
        ('first_start_month', None),
        ('first_end_year', None),
        ('first_end_month', None),
        ('first_interval_month', None),
        ('last3_interval_month', None),
        ('diff_last3_salary', LabelEncoder()),
        ('diff_last3_size', LabelEncoder()),
        ('diff_last3_industry', LabelEncoder()),
        ('diff_last3_position_name', LabelEncoder()),
        ('total_interval_month', None),
        ('diff_salary', LabelEncoder()),
        ('diff_size', LabelEncoder()),
        ('diff_industry', LabelEncoder()),
        ('diff_position_name', LabelEncoder()),
        ('major_1', LabelEncoder()),
        ('last_position_name_1', LabelEncoder()),
        ('last_department_1', LabelEncoder()),
        ('third_position_name_1', LabelEncoder()),
        ('third_department_1', LabelEncoder()),
        ('first_position_name_1', LabelEncoder()),
        ('first_department_1', LabelEncoder()),
        ('major_2', LabelEncoder()),
        ('last_position_name_2', LabelEncoder()),
        ('last_department_2', LabelEncoder()),
        ('third_position_name_2', LabelEncoder()),
        ('third_department_2', LabelEncoder()),
        ('first_position_name_2', LabelEncoder()),
        ('first_department_2', LabelEncoder()),
        ('start_working_age', None),
        ('rev_working_age', None),
        ('pre_working_month', None),
        ('pre_interval_month', None),
        ("pre_largest_size", None),
        ("pre_largest_salary", None),
        ("pre_least_size", None),
        ("pre_least_salary", None),
        ("pre_size1", None),
        ("pre_size2", None),
        ("pre_size3", None),
        ("pre_size4", None),
        ("pre_size5", None),
        ("pre_size6", None),
        ("pre_size7", None),
        ("pre_salary1", None),
        ("pre_salary2", None),
        ("pre_salary3", None),
        ("pre_salary4", None),
        ("pre_salary5", None),
        ("pre_salary6", None),
        ("pre_salary7", None),

        ("promotion_size", None),
        ("promotion_salary", None),
        ("decrease_size", None),
        ("decrease_salar", None)
    ]
    print "the mapper's param list is %s" % (len(param_list))
    mapper = DataFrameMapper(param_list)
    mapper.fit(data_all)
    return mapper
Example 42
    def __load_coupons(self, validation_timedelta):
        train_coupon_df = pd.read_csv(path.join(self.datadir, "coupon_list_train.csv"),
                                           parse_dates=["DISPFROM","DISPEND"])
        test_coupon_df = pd.read_csv(path.join(self.datadir, "coupon_list_test.csv"))

        train_coupon_df["DISPFROM"].fillna(pd.Timestamp("19000101"), inplace=True)
        train_coupon_df = train_coupon_df.sort_values(by=["DISPFROM"]).reset_index(drop=True)

        if validation_timedelta:
            max_date = train_coupon_df["DISPFROM"].max()
            valid_start = max_date - validation_timedelta
            valid_coupon_df = train_coupon_df[(train_coupon_df["DISPFROM"] > valid_start)]
            train_coupon_df = train_coupon_df[~ (train_coupon_df["DISPFROM"] > valid_start)]
        else:
            valid_coupon_df = train_coupon_df[np.zeros(len(train_coupon_df), dtype=bool)].copy()

        # remove outlier data from the validation-set
        if len(valid_coupon_df) > 0:
            very_low_price = valid_coupon_df[valid_coupon_df.DISCOUNT_PRICE <= 100].COUPON_ID_hash
            very_long_time_display = valid_coupon_df[valid_coupon_df.DISPPERIOD > 20].COUPON_ID_hash
            valid_coupon_df = valid_coupon_df[~valid_coupon_df.COUPON_ID_hash.isin(very_long_time_display)]
            valid_coupon_df = valid_coupon_df[~valid_coupon_df.COUPON_ID_hash.isin(very_low_price)].reset_index(drop=True)

        # remove outlier data from the training-set
        very_long_time_display = train_coupon_df[train_coupon_df.DISPPERIOD > 20].COUPON_ID_hash
        train_coupon_df = train_coupon_df[~train_coupon_df.COUPON_ID_hash.isin(very_long_time_display)].reset_index(drop=True)

        # coupon features
        coupon_mapper = DataFrameMapper([
                ('CATEGORY_NAME', LabelBinarizer()),
                ('PRICE_RATE', None),
                ('CATALOG_PRICE_LOG', None),
                ('DISCOUNT_PRICE_LOG', None),
                ('REDUCE_PRICE_LOG', None),
                ('DISPPERIOD_C', LabelBinarizer()),
                ('VALIDPERIOD_NA', LabelBinarizer()),
                ('USABLE_DATE_SUM', None),
                ('LARGE_AREA_NAME', LabelBinarizer()),
                ('PREF_NAME', LabelBinarizer()),
                ('SMALL_AREA_NAME', LabelBinarizer()),
                ])
        config = {}
        self.__coupon_preproc(train_coupon_df)
        self.__coupon_preproc(valid_coupon_df)
        self.__coupon_preproc(test_coupon_df)
        
        coupon_mapper.fit(pd.concat([train_coupon_df, valid_coupon_df, test_coupon_df]))
        
        train_coupon_vec = coupon_mapper.transform(train_coupon_df.copy())
        if len(valid_coupon_df) > 0:
            valid_coupon_vec = coupon_mapper.transform(valid_coupon_df.copy())
        else:
            valid_coupon_vec = np.array([])
        test_coupon_vec = coupon_mapper.transform(test_coupon_df.copy())

        self.train_coupon_vec = train_coupon_vec
        self.valid_coupon_vec = valid_coupon_vec
        self.test_coupon_vec = test_coupon_vec
        self.train_coupon_df = train_coupon_df
        self.valid_coupon_df = valid_coupon_df
        self.test_coupon_df = test_coupon_df
Example 43
class Dataprocess(object):
    datafile = "data.csv"
    def __init__(self, datadir="/Users/shintaro/work/kaggle-kobe/data/"):
        self.datadir = datadir

    def read(self):
        self.df_orig = pd.read_csv(self.datadir + self.datafile)
        self.df = self.df_orig.copy()

    def process(self):
        self.read()
        self.preproc()
        self.set_mapper()
        self.split_df()
        train_X = self.vec_X(self.train_df)
        train_y = self.vec_y(self.train_df)
        test_X = self.mapper_X.transform(self.test_df)
        return train_X, train_y, test_X


    def preproc(self):
        self.df["time_remaining"] = self.df["minutes_remaining"] * 60 + self.df["seconds_remaining"]
        self.df['last_5_sec'] = self.df['time_remaining'] < 5
        self.df['latter_half'] = self.df['time_remaining'] < 360
        self.df['first_period'] = self.df['period'] == 1
        self.df['latter_period'] = self.df['period'] > 2
        self.df['last_period'] = self.df['period'] == 4
        self.df['last_quarter'] = self.df['time_remaining'] < 180

        threshold = 3
        anomaly = 14
        self.df['last_moment'] = self.df.apply(lambda row: row['time_remaining'] < threshold or row['time_remaining'] == anomaly, axis=1)
        self.df['away'] = self.df.matchup.str.contains('@')
        self.df['secondsFromStart'] = 60 * (11 - self.df['minutes_remaining']) + (60 - self.df['seconds_remaining'])
        self.df['secondsFromGameStart'] = (self.df['period'] <= 4).astype(int) * (self.df['period'] - 1) * 12 * 60 + (self.df['period'] > 4).astype(int) * ((self.df['period'] - 4) * 5 * 60 + 3 * 12 * 60) + self.df['secondsFromStart']
        numGaussians = 13
        # mixture.GMM and DataFrame.ix are legacy APIs; GaussianMixture and
        # .loc are the modern equivalents (the old 'wmc' params/init_params
        # arguments have no direct counterpart and are dropped here)
        gaussianMixtureModel = mixture.GaussianMixture(
            n_components=numGaussians, covariance_type='full',
            random_state=1, n_init=3, verbose=0)
        gaussianMixtureModel.fit(self.df.loc[:, ['loc_x', 'loc_y']])
        self.df['shotLocationCluster'] = gaussianMixtureModel.predict(
            self.df.loc[:, ['loc_x', 'loc_y']])
        self.df['homeGame'] = self.df['matchup'].apply(lambda x: 1 if (x.find('@') < 0) else 0)

        self.df["game_year"] = pd.Series([int(self.df["game_date"][i][:4]) for i in range(0, len(self.df))])
        self.df["game_month"] = pd.Series([int(self.df["game_date"][i][5:7]) for i in range(0, len(self.df))])
        self.df["game_day"] = pd.Series([int(self.df["game_date"][i][-2:]) for i in range(0, len(self.df))])

        action_type_list = list(set(self.df["action_type"].tolist()))
        self.df["action_type_num"] = pd.Series([action_type_list.index(self.df["action_type"][i]) for i in range(0, len(self.df))])

        combined_shot_type_list = list(set(self.df["combined_shot_type"].tolist()))
        self.df["combined_shot_type_num"] = pd.Series([combined_shot_type_list.index(self.df["combined_shot_type"][i]) for i in range(0, len(self.df))])

        opponent_list = list(set(self.df["opponent"].tolist()))
        self.df["opponent_num"] = pd.Series([opponent_list.index(self.df["opponent"][i]) for i in range(0, len(self.df))])

        game_id_list = list(set(self.df["game_id"].tolist()))
        self.df["game_id_num"] = pd.Series([game_id_list.index(self.df["game_id"][i]) for i in range(0, len(self.df))])

        season_list = list(set(self.df["season"].tolist()))
        season_list.sort()
        self.df["season_num"] = pd.Series([season_list.index(self.df["season"][i]) for i in range(0, len(self.df))])

        self.df["shot_distance"][self.df["shot_distance"] > 45] = 45

        # del self.df["team_id"], self.df["team_name"], self.df["game_event_id"], self.df["lat"], self.df["lon"]
        # return self.df


    def set_mapper(self):
        self.mapper_X = DataFrameMapper([
            (u'action_type', LabelBinarizer()),
            (u'combined_shot_type', LabelBinarizer()),
            (u'loc_x', None),
            (u'loc_y', None),
            (u'minutes_remaining', None),
            (u'period', LabelBinarizer()),

            (u'playoffs', LabelBinarizer()),
            (u'season', LabelBinarizer()),
            (u'seconds_remaining', None),
            (u'shot_distance', None),
            (u'shot_type', LabelBinarizer()),
            (u'shot_zone_area', LabelBinarizer()),
            (u'shot_zone_basic', LabelBinarizer()),
            (u'shot_zone_range', LabelBinarizer()),
            (u'matchup', LabelBinarizer()),
            (u'shot_id', None),

            (u'season_num', None),
            (u'game_year', None),
            (u'game_month', None),
            (u'game_day', None),

            (u'first_period', LabelBinarizer()),
            (u'latter_period', LabelBinarizer()),
            (u'last_period', LabelBinarizer()),
            (u'last_quarter', LabelBinarizer()),
            (u'time_remaining', None),
            (u'latter_half', LabelBinarizer()),
            (u'last_5_sec', LabelBinarizer()),
            (u'opponent_num', LabelBinarizer()),
            (u'game_id_num', LabelBinarizer()),

            (u'last_moment', LabelBinarizer()),
            (u'away', LabelBinarizer()),
            (u'secondsFromStart', None),
            (u'secondsFromGameStart', None),
            (u'shotLocationCluster', LabelBinarizer()),
            (u'homeGame', LabelBinarizer()),
            ])
        self.mapper_y = DataFrameMapper([(u'shot_made_flag', None),])
        self.mapper_X.fit(self.df)
        self.mapper_y.fit(self.df)


    def split_df(self):
        self.train_df = self.df[~np.isnan(self.df["shot_made_flag"])]
        self.test_df = self.df[np.isnan(self.df["shot_made_flag"])]


    def vec_X(self, df):
        return self.mapper_X.transform(df.copy())


    def vec_y(self, df):
        return self.mapper_y.transform(df.copy())
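A hypothetical driver for the class above (the data directory defaults to the path in __init__):

dp = Dataprocess()
train_X, train_y, test_X = dp.process()
print(train_X.shape, train_y.shape, test_X.shape)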
Example 44
y_test = test.Gap_cnt.fillna(0)
x = train[feasible_columns].drop(columns_to_drop, axis=1).fillna(0)
x_test = test[feasible_columns].drop(columns_to_drop, axis=1).fillna(0)
x_result = predict[feasible_columns].drop(columns_to_drop, axis=1).fillna(0)
x_result = x_result.loc[[i in [46, 58, 70, 82, 94, 106, 118, 130, 142]
                         for i in x_result.time_id], :]
x_final = x_result.reset_index(drop=True)

a = x.columns
mapper = []
for j in a:
    if j in ['district_id', 'Day', 'Weekday', 'Workday', 'Yesterday_Workday',
             'Twoday_ago_Workday', 'time_id']:
        mapper.append((j, None))
    else:
        mapper.append((j, StandardScaler()))
b = DataFrameMapper(mapper)
b.fit(pd.concat([x, x_test, x_result]))
x = b.transform(x)
x_test = b.transform(x_test)
x_result_before = x_result
x_result = b.transform(x_result)

# Random Forest
clf = ensemble.RandomForestClassifier(
    n_estimators=20,
    max_features=min(len(feasible_columns) - len(columns_to_drop), 25))
clf.fit(x, y)  # y is defined earlier in the original script (not shown here)
clf_predict = clf.predict(x_test)
clf_score = clf.score(x_test, y_test)


clf_predict.fill(1)

diff = clf_predict - y_test