def _engineered_features(self):
    """Load the precomputed feature tables and join them column-wise.

    Reads the @abhishek, "magic", "magic II" and custom feature CSVs from
    the module-level path constants, takes the k-core features from
    ``KCore_Decomposition.attach_max_kcore()``, and concatenates all of
    them along the column axis.

    Returns:
        tuple: ``(train_features, test_features)``. Each is the
        column-wise concatenation of its sources, aligned on the row index
        with ``join='inner'`` (rows missing from any source are dropped),
        with remaining NaN values replaced by 0.
    """
    # @abhishek's features
    # Thanks to @raddar and @abhishek for the data.
    # See https://www.kaggle.com/c/quora-question-pairs/discussion/31284
    abhishek_train = pd.read_csv(ABHISHEK_TRAIN, encoding="ISO-8859-1")
    abhishek_test = pd.read_csv(ABHISHEK_TEST, encoding="ISO-8859-1")

    # These two columns are excluded from the feature set.
    abhishek_excluded = ['jaccard_distance', 'euclidean_distance']

    # `.ix` was removed in pandas 1.0. On the integer row index produced by
    # read_csv it sliced rows by label (end inclusive) and, on string column
    # labels, sliced columns by position -- `.loc` + `.iloc` spell that out.
    # NOTE(review): 404176 is a hard-coded last row label; presumably it
    # trims the file to the training set length -- confirm.
    abhishek_train_features = (
        abhishek_train.loc[:404176]
        .iloc[:, 9:30]
        .replace([np.inf, -np.inf], 0)
        .drop(abhishek_excluded, axis=1)
    )
    abhishek_test_features = (
        abhishek_test.iloc[:, 9:30]
        .replace([np.inf, -np.inf], 0)
        .drop(abhishek_excluded, axis=1)
    )

    # Krzysztof Dziedzic's magic feature II.
    # Data by @Justfor.
    # See https://www.kaggle.com/justfor/edges/code
    # and https://www.kaggle.com/c/quora-question-pairs/discussion/33287
    magic2_train_features = pd.read_csv(MAGIC_II_TRAIN, encoding="utf-8")
    magic2_test_features = pd.read_csv(MAGIC_II_TEST, encoding="utf-8")

    # @tarobxl kcore feature
    # See https://www.kaggle.com/c/quora-question-pairs/discussion/33371
    from kcore_decomposition import KCore_Decomposition
    kd = KCore_Decomposition(train_data_filename=TRAIN_DATA_FILENAME)
    kcore_train_features, kcore_test_features = kd.attach_max_kcore()

    # @jturkewitz's magic feature
    # See https://www.kaggle.com/jturkewitz/magic-features-0-03-gain
    # Only the columns at positions 3 and 4 are used.
    magic_train_features = pd.read_csv(MAGIC_TRAIN, encoding="utf-8")
    magic_train_features = magic_train_features.iloc[:, 3:5]
    magic_test_features = pd.read_csv(MAGIC_TEST, encoding="utf-8")
    magic_test_features = magic_test_features.iloc[:, 3:5]

    custom_train_features = pd.read_csv(CUSTOM_FEATURES_TRAIN,
                                        encoding="utf-8")
    custom_test_features = pd.read_csv(CUSTOM_FEATURES_TEST,
                                       encoding="utf-8")

    train_features = pd.concat([
        custom_train_features,
        abhishek_train_features,
        magic_train_features,
        magic2_train_features,
        kcore_train_features,
    ], axis=1, join='inner').fillna(0)

    test_features = pd.concat([
        custom_test_features,
        abhishek_test_features,
        magic_test_features,
        magic2_test_features,
        kcore_test_features,
    ], axis=1, join='inner').fillna(0)

    return (train_features, test_features)
def _engineered_features(self):
    """Load the "magic", "magic II" and k-core features and join them.

    Reads the magic feature CSVs from the module-level path constants,
    takes the k-core features from
    ``KCore_Decomposition.attach_max_kcore()``, and concatenates them along
    the column axis.

    Returns:
        tuple: ``(train_features, test_features)``. Each is the
        column-wise concatenation of its sources, aligned on the row index
        with ``join='inner'`` (rows missing from any source are dropped),
        with remaining NaN values replaced by 0.
    """
    # NOTE: @abhishek's features (data by @raddar and @abhishek, see
    # https://www.kaggle.com/c/quora-question-pairs/discussion/31284)
    # are deliberately not loaded in this variant.

    # Krzysztof Dziedzic's magic feature II.
    # Data by @Justfor.
    # See https://www.kaggle.com/justfor/edges/code
    # and https://www.kaggle.com/c/quora-question-pairs/discussion/33287
    magic2_train_features = pd.read_csv(MAGIC_II_TRAIN, encoding="utf-8")
    magic2_test_features = pd.read_csv(MAGIC_II_TEST, encoding="utf-8")

    # @tarobxl kcore feature
    # See https://www.kaggle.com/c/quora-question-pairs/discussion/33371
    from kcore_decomposition import KCore_Decomposition
    kd = KCore_Decomposition()
    kcore_train_features, kcore_test_features = kd.attach_max_kcore()

    # @jturkewitz's magic feature
    # See https://www.kaggle.com/jturkewitz/magic-features-0-03-gain
    # `.ix` was removed in pandas 1.0; on string column labels it sliced
    # columns by position, which is what `.iloc` does explicitly.
    # Only the columns at positions 3 and 4 are used.
    magic_train_features = pd.read_csv(MAGIC_TRAIN, encoding="utf-8")
    magic_train_features = magic_train_features.iloc[:, 3:5]
    magic_test_features = pd.read_csv(MAGIC_TEST, encoding="utf-8")
    magic_test_features = magic_test_features.iloc[:, 3:5]

    train_features = pd.concat([
        magic_train_features,
        magic2_train_features,
        kcore_train_features,
    ], axis=1, join='inner').fillna(0)

    test_features = pd.concat([
        magic_test_features,
        magic2_test_features,
        kcore_test_features,
    ], axis=1, join='inner').fillna(0)

    return (train_features, test_features)
def _engineered_features(self):
    """Load every precomputed feature table and join them column-wise.

    Reads the custom, counts, env, NLTK, wordies, @abhishek, "magic" and
    "magic II" feature CSVs from the module-level path constants, takes the
    k-core features from ``KCore_Decomposition.attach_max_kcore()``, and
    concatenates all of them along the column axis.

    Returns:
        tuple: ``(train_features, test_features)``. Each is the
        column-wise concatenation of its sources, aligned on the row index
        with ``join='inner'`` (rows missing from any source are dropped),
        with remaining NaN values replaced by 0.
    """
    # @abhishek's features
    # Thanks to @raddar and @abhishek for the data.
    # See https://www.kaggle.com/c/quora-question-pairs/discussion/31284
    abhishek_train = pd.read_csv(ABHISHEK_TRAIN, encoding="ISO-8859-1")
    abhishek_test = pd.read_csv(ABHISHEK_TEST, encoding="ISO-8859-1")
    # `.ix` was removed in pandas 1.0; on string column labels it sliced
    # columns by position, which is what `.iloc` does explicitly.
    abhishek_train_features = (
        abhishek_train.iloc[:, 9:30].replace([np.inf, -np.inf], 0)
    )
    abhishek_test_features = (
        abhishek_test.iloc[:, 9:30].replace([np.inf, -np.inf], 0)
    )

    # Krzysztof Dziedzic's magic feature II.
    # Data by @Justfor.
    # See https://www.kaggle.com/justfor/edges/code
    # and https://www.kaggle.com/c/quora-question-pairs/discussion/33287
    magic2_train_features = pd.read_csv(MAGIC_II_TRAIN, encoding="utf-8")
    magic2_test_features = pd.read_csv(MAGIC_II_TEST, encoding="utf-8")

    # @tarobxl kcore feature
    # See https://www.kaggle.com/c/quora-question-pairs/discussion/33371
    from kcore_decomposition import KCore_Decomposition
    kd = KCore_Decomposition(train_data_filename=TRAIN_DATA_FILENAME)
    kcore_train_features, kcore_test_features = kd.attach_max_kcore()

    # @jturkewitz's magic feature
    # See https://www.kaggle.com/jturkewitz/magic-features-0-03-gain
    # Only the columns at positions 3 and 4 are used.
    magic_train_features = pd.read_csv(MAGIC_TRAIN, encoding="utf-8")
    magic_train_features = magic_train_features.iloc[:, 3:5]
    magic_test_features = pd.read_csv(MAGIC_TEST, encoding="utf-8")
    magic_test_features = magic_test_features.iloc[:, 3:5]

    custom_train_features = pd.read_csv(CUSTOM_FEATURES_TRAIN,
                                        encoding="utf-8")
    custom_test_features = pd.read_csv(CUSTOM_FEATURES_TEST,
                                       encoding="utf-8")

    # Only a subset of the NLTK feature columns is used.
    nltk_columns = ["hypernyms_share", "lemmas_share"]
    nltk_train_features = pd.read_csv(NLTK_FEATURES_TRAIN, encoding="utf-8")
    nltk_train_features = nltk_train_features[nltk_columns]
    nltk_test_features = pd.read_csv(NLTK_FEATURES_TEST, encoding="utf-8")
    nltk_test_features = nltk_test_features[nltk_columns]

    wordies_train_features = pd.read_csv(WORDIES_FEATURES_TRAIN,
                                         encoding="utf-8")
    wordies_test_features = pd.read_csv(WORDIES_FEATURES_TEST,
                                        encoding="utf-8")

    # Only the stem-based columns of the counts features are used.
    counts_columns = [
        'stems_freq',
        'stems_share',
        'stems_weighted_difference',
        'stems_tversky_index',
    ]
    counts_train_features = pd.read_csv(COUNTS_FEATURES_TRAIN,
                                        encoding="utf-8")[counts_columns]
    counts_test_features = pd.read_csv(COUNTS_FEATURES_TEST,
                                       encoding="utf-8")[counts_columns]

    # NaNs in the env features are zero-filled before column selection.
    env_columns = ["string_similarity", "kendall_p_value"]
    env_train_features = (
        pd.read_csv(ENV_FEATURES_TRAIN, encoding="utf-8")
        .fillna(0)[env_columns]
    )
    env_test_features = (
        pd.read_csv(ENV_FEATURES_TEST, encoding="utf-8")
        .fillna(0)[env_columns]
    )

    train_features = pd.concat([
        custom_train_features,
        counts_train_features,
        env_train_features,
        nltk_train_features,
        wordies_train_features,
        abhishek_train_features,
        magic2_train_features,
        kcore_train_features,
        magic_train_features,
    ], axis=1, join='inner').fillna(0)

    test_features = pd.concat([
        custom_test_features,
        counts_test_features,
        env_test_features,
        nltk_test_features,
        wordies_test_features,
        abhishek_test_features,
        magic2_test_features,
        kcore_test_features,
        magic_test_features,
    ], axis=1, join='inner').fillna(0)

    return (train_features, test_features)