Ejemplo n.º 1
0
    def _engineered_features(self):
        """Load the precomputed feature tables and join them column-wise.

        Reads the @abhishek, "magic", "magic II" and custom feature CSVs,
        obtains the k-core features from ``KCore_Decomposition`` and
        concatenates each group along the columns (inner join on the index).

        Returns:
            tuple: ``(train_features, test_features)``, the two concatenated
            tables with any remaining NaN replaced by 0.
        """
        # @abhishek's features
        # Thanks to @raddar and @abhishek for the data.
        # See https://www.kaggle.com/c/quora-question-pairs/discussion/31284
        dropped_abhishek_columns = ['jaccard_distance', 'euclidean_distance']

        abhishek_train = pd.read_csv(ABHISHEK_TRAIN, encoding="ISO-8859-1")
        abhishek_test = pd.read_csv(ABHISHEK_TEST, encoding="ISO-8859-1")

        # ``DataFrame.ix`` was removed in pandas 1.0.  On the integer index
        # produced by ``read_csv`` it sliced rows by label (end inclusive) and
        # the string-labelled columns by position, which is spelled out here
        # with ``.loc`` followed by ``.iloc``.
        # NOTE(review): 404176 is presumably the last row label of the
        # training set -- confirm against the training data.
        abhishek_train_features = (
            abhishek_train.loc[:404176]
            .iloc[:, 9:30]
            .replace([np.inf, -np.inf], 0)
            .drop(dropped_abhishek_columns, axis=1)
        )

        abhishek_test_features = (
            abhishek_test.iloc[:, 9:30]
            .replace([np.inf, -np.inf], 0)
            .drop(dropped_abhishek_columns, axis=1)
        )

        # Krzysztof Dziedzic's magic feature II.
        # Data by @Justfor.
        # See https://www.kaggle.com/justfor/edges/code
        # and https://www.kaggle.com/c/quora-question-pairs/discussion/33287
        magic2_train_features = pd.read_csv(MAGIC_II_TRAIN, encoding="utf-8")
        magic2_test_features = pd.read_csv(MAGIC_II_TEST, encoding="utf-8")

        # @tarobxl kcore feature
        # See https://www.kaggle.com/c/quora-question-pairs/discussion/33371
        from kcore_decomposition import KCore_Decomposition
        kd = KCore_Decomposition(train_data_filename=TRAIN_DATA_FILENAME)
        kcore_train_features, kcore_test_features = kd.attach_max_kcore()

        # @jturkewitz's magic feature
        # See https://www.kaggle.com/jturkewitz/magic-features-0-03-gain
        # Only the columns at positions 3 and 4 are kept.
        magic_train_features = pd.read_csv(
            MAGIC_TRAIN, encoding="utf-8").iloc[:, 3:5]
        magic_test_features = pd.read_csv(
            MAGIC_TEST, encoding="utf-8").iloc[:, 3:5]

        custom_train_features = pd.read_csv(CUSTOM_FEATURES_TRAIN,
                                            encoding="utf-8")
        custom_test_features = pd.read_csv(CUSTOM_FEATURES_TEST,
                                           encoding="utf-8")

        # The order of the groups fixes the column order of the result.
        train_features = pd.concat(
            [
                custom_train_features,
                abhishek_train_features,
                magic_train_features,
                magic2_train_features,
                kcore_train_features,
            ],
            axis=1,
            join='inner',
        ).fillna(0)
        test_features = pd.concat(
            [
                custom_test_features,
                abhishek_test_features,
                magic_test_features,
                magic2_test_features,
                kcore_test_features,
            ],
            axis=1,
            join='inner',
        ).fillna(0)

        return (train_features, test_features)
Ejemplo n.º 2
0
    def _engineered_features(self):
        """Load the precomputed feature tables and join them column-wise.

        Reads the "magic" and "magic II" feature CSVs, obtains the k-core
        features from ``KCore_Decomposition`` and concatenates each group
        along the columns (inner join on the index).

        Returns:
            tuple: ``(train_features, test_features)``, the two concatenated
            tables with any remaining NaN replaced by 0.
        """
        # NOTE: @abhishek's features (thanks to @raddar and @abhishek for the
        # data, see
        # https://www.kaggle.com/c/quora-question-pairs/discussion/31284)
        # are disabled here: they are neither loaded nor concatenated.

        # Krzysztof Dziedzic's magic feature II.
        # Data by @Justfor.
        # See https://www.kaggle.com/justfor/edges/code
        # and https://www.kaggle.com/c/quora-question-pairs/discussion/33287
        magic2_train_features = pd.read_csv(MAGIC_II_TRAIN, encoding="utf-8")
        magic2_test_features = pd.read_csv(MAGIC_II_TEST, encoding="utf-8")

        # @tarobxl kcore feature
        # See https://www.kaggle.com/c/quora-question-pairs/discussion/33371
        from kcore_decomposition import KCore_Decomposition
        kd = KCore_Decomposition()
        kcore_train_features, kcore_test_features = kd.attach_max_kcore()

        # @jturkewitz's magic feature
        # See https://www.kaggle.com/jturkewitz/magic-features-0-03-gain
        # ``DataFrame.ix`` was removed in pandas 1.0; with string column
        # labels its integer slice was positional, so ``.iloc`` is the exact
        # replacement.  Only the columns at positions 3 and 4 are kept.
        magic_train_features = pd.read_csv(
            MAGIC_TRAIN, encoding="utf-8").iloc[:, 3:5]
        magic_test_features = pd.read_csv(
            MAGIC_TEST, encoding="utf-8").iloc[:, 3:5]

        # The order of the groups fixes the column order of the result.
        train_features = pd.concat(
            [
                magic_train_features,
                magic2_train_features,
                kcore_train_features,
            ],
            axis=1,
            join='inner',
        ).fillna(0)
        test_features = pd.concat(
            [
                magic_test_features,
                magic2_test_features,
                kcore_test_features,
            ],
            axis=1,
            join='inner',
        ).fillna(0)

        return (train_features, test_features)
Ejemplo n.º 3
0
    def _engineered_features(self):
        """Load the precomputed feature tables and join them column-wise.

        Reads the custom, counts, env, NLTK, wordies, @abhishek, "magic" and
        "magic II" feature CSVs, obtains the k-core features from
        ``KCore_Decomposition`` and concatenates each group along the columns
        (inner join on the index).

        Returns:
            tuple: ``(train_features, test_features)``, the two concatenated
            tables with any remaining NaN replaced by 0.
        """
        # @abhishek's features
        # Thanks to @raddar and @abhishek for the data.
        # See https://www.kaggle.com/c/quora-question-pairs/discussion/31284
        abhishek_train = pd.read_csv(ABHISHEK_TRAIN, encoding="ISO-8859-1")
        abhishek_test = pd.read_csv(ABHISHEK_TEST, encoding="ISO-8859-1")

        # ``DataFrame.ix`` was removed in pandas 1.0; with string column
        # labels its integer slice was positional, so ``.iloc`` is the exact
        # replacement.  Infinite values are replaced by 0.
        abhishek_train_features = (
            abhishek_train.iloc[:, 9:30].replace([np.inf, -np.inf], 0)
        )
        abhishek_test_features = (
            abhishek_test.iloc[:, 9:30].replace([np.inf, -np.inf], 0)
        )

        # Krzysztof Dziedzic's magic feature II.
        # Data by @Justfor.
        # See https://www.kaggle.com/justfor/edges/code
        # and https://www.kaggle.com/c/quora-question-pairs/discussion/33287
        magic2_train_features = pd.read_csv(MAGIC_II_TRAIN, encoding="utf-8")
        magic2_test_features = pd.read_csv(MAGIC_II_TEST, encoding="utf-8")

        # @tarobxl kcore feature
        # See https://www.kaggle.com/c/quora-question-pairs/discussion/33371
        from kcore_decomposition import KCore_Decomposition
        kd = KCore_Decomposition(train_data_filename=TRAIN_DATA_FILENAME)
        kcore_train_features, kcore_test_features = kd.attach_max_kcore()

        # @jturkewitz's magic feature
        # See https://www.kaggle.com/jturkewitz/magic-features-0-03-gain
        # Only the columns at positions 3 and 4 are kept.
        magic_train_features = pd.read_csv(
            MAGIC_TRAIN, encoding="utf-8").iloc[:, 3:5]
        magic_test_features = pd.read_csv(
            MAGIC_TEST, encoding="utf-8").iloc[:, 3:5]

        custom_train_features = pd.read_csv(CUSTOM_FEATURES_TRAIN,
                                            encoding="utf-8")
        custom_test_features = pd.read_csv(CUSTOM_FEATURES_TEST,
                                           encoding="utf-8")

        # Only a subset of the NLTK feature columns is used.
        nltk_columns = ["hypernyms_share", "lemmas_share"]
        nltk_train_features = pd.read_csv(
            NLTK_FEATURES_TRAIN, encoding="utf-8")[nltk_columns]
        nltk_test_features = pd.read_csv(
            NLTK_FEATURES_TEST, encoding="utf-8")[nltk_columns]

        wordies_train_features = pd.read_csv(WORDIES_FEATURES_TRAIN,
                                             encoding="utf-8")
        wordies_test_features = pd.read_csv(WORDIES_FEATURES_TEST,
                                            encoding="utf-8")

        # Only the stem-based columns of the counts features are used.
        counts_columns = [
            'stems_freq', 'stems_share',
            'stems_weighted_difference',
            'stems_tversky_index',
        ]
        counts_train_features = pd.read_csv(
            COUNTS_FEATURES_TRAIN, encoding="utf-8")[counts_columns]
        counts_test_features = pd.read_csv(
            COUNTS_FEATURES_TEST, encoding="utf-8")[counts_columns]

        # NaNs in the env features are zero-filled before column selection.
        env_columns = ["string_similarity", "kendall_p_value"]
        env_train_features = pd.read_csv(
            ENV_FEATURES_TRAIN, encoding="utf-8").fillna(0)[env_columns]
        env_test_features = pd.read_csv(
            ENV_FEATURES_TEST, encoding="utf-8").fillna(0)[env_columns]

        # The order of the groups fixes the column order of the result.
        train_features = pd.concat(
            [
                custom_train_features,
                counts_train_features,
                env_train_features,
                nltk_train_features,
                wordies_train_features,
                abhishek_train_features,
                magic2_train_features,
                kcore_train_features,
                magic_train_features,
            ],
            axis=1,
            join='inner',
        ).fillna(0)

        test_features = pd.concat(
            [
                custom_test_features,
                counts_test_features,
                env_test_features,
                nltk_test_features,
                wordies_test_features,
                abhishek_test_features,
                magic2_test_features,
                kcore_test_features,
                magic_test_features,
            ],
            axis=1,
            join='inner',
        ).fillna(0)

        return (train_features, test_features)