# forked from andylin512/kaggle_quora
# gen_feat.py
from nlp_utils import stopwords, english_stemmer, stem_tokens, \
compute_dist, try_divide, cat_text, getTFV, getBOW, cosine_sim
from sklearn.decomposition import TruncatedSVD
from param_config import config
import cPickle
import ngram
import numpy as np
import pandas as pd
import sys
from scipy.sparse import vstack
# Python 2 hack: re-import sys to re-expose setdefaultencoding (hidden by
# site.py) so implicit str<->unicode coercions use UTF-8 instead of ASCII.
reload(sys)
sys.setdefaultencoding('utf8')
# Component counts for the (currently disabled) TruncatedSVD cosine features.
svd_n_components = [100, 50]
# Tokenize and stem the data
def preprocess_data(line, exclude_stopword=True):
    """Lowercase, whitespace-tokenize and stem one question string.

    The input is passed through str() first (so non-string values such as
    NaN become their string form).  Stopwords are removed after stemming
    unless exclude_stopword is False.  Returns the list of processed tokens.
    """
    lowered = [tok.lower() for tok in str(line).split()]
    stemmed = stem_tokens(lowered, english_stemmer)
    if not exclude_stopword:
        return stemmed
    return [tok for tok in stemmed if tok not in stopwords]
# Generate ngram data (n = 1, 2, 3)
def gen_ngram_data(df):
    """Add stemmed n-gram token columns for both questions of each pair.

    Creates q{1,2}_unigram, q{1,2}_bigram and q{1,2}_trigram columns on df
    (mutated in place) and returns df.
    """
    sep = "_"
    print("generate unigram")
    df["q1_unigram"] = list(df.apply(lambda row: preprocess_data(row["question1"]), axis=1))
    df["q2_unigram"] = list(df.apply(lambda row: preprocess_data(row["question2"]), axis=1))
    print("generate bigram")
    df["q1_bigram"] = list(df.apply(lambda row: ngram.getBigram(row["q1_unigram"], sep), axis=1))
    df["q2_bigram"] = list(df.apply(lambda row: ngram.getBigram(row["q2_unigram"], sep), axis=1))
    print("generate trigram")
    # NOTE(review): trigrams are built from the *bigram* columns, not the
    # unigrams — this mirrors the original pipeline; confirm ngram.getTrigram
    # expects bigram input.
    df["q1_trigram"] = list(df.apply(lambda row: ngram.getTrigram(row["q1_bigram"], sep), axis=1))
    df["q2_trigram"] = list(df.apply(lambda row: ngram.getTrigram(row["q2_bigram"], sep), axis=1))
    return df
# Extract counting features based on ngram generated
def extract_counting_feat(df):
    """Add per-question and cross-question n-gram counting features to df.

    Expects the q{1,2}_{unigram,bigram,trigram} columns produced by
    gen_ngram_data; returns df with the new count/ratio columns added.
    NOTE(review): relies on Python 2 `map` returning a list; under Python 3
    the ratio columns would receive lazy map objects.
    """
    feat_names = ["q1", "q2"]
    grams = ["unigram", "bigram", "trigram"]
    # Count of purely-numeric tokens in a token list.
    count_digit = lambda x: sum([1. for w in x if w.isdigit()])
    ################################
    ## word count and digit count ##
    ################################
    print("generate basic counting features...")
    for feat_name in feat_names:
        for gram in grams:
            ## word count / unique-word count / uniqueness ratio per gram size
            df["count_of_%s_%s" % (feat_name, gram)] = list(df.apply(lambda x: len(x[feat_name+"_"+gram]), axis=1))
            df["count_of_unique_%s_%s" % (feat_name, gram)] = list(df.apply(lambda x: len(set(x[feat_name+"_"+gram])), axis=1))
            # try_divide presumably guards the division when the count is 0.
            df["ratio_of_unique_%s_%s" % (feat_name, gram)] = map(try_divide, df["count_of_unique_%s_%s" % (feat_name, gram)], df["count_of_%s_%s" % (feat_name, gram)])
        ## digit count (computed on unigrams only, once per question)
        df["count_of_digit_in_%s" % feat_name] = list(df.apply(lambda x: count_digit(x[feat_name+"_unigram"]), axis=1))
        df["ratio_of_digit_in_%s" % feat_name] = map(try_divide, df["count_of_digit_in_%s" % feat_name], df["count_of_%s_unigram" % feat_name])
    #########################
    ## interact word count ##
    #########################
    print("generate interact counting features...")
    for gram in grams:
        for obs_name in feat_names:
            for target_name in feat_names:
                if target_name != obs_name:
                    # shared words: how many tokens of obs_name also occur in
                    # target_name (both directions are generated)
                    df["count_of_%s_%s_in_%s" % (obs_name, gram, target_name)] = list(df.apply(
                        lambda x: sum([1. for w in x[obs_name + "_" + gram] if w in set(x[target_name + "_" + gram])]), axis=1))
                    df["ratio_of_%s_%s_in_%s" % (obs_name, gram, target_name)] = map(try_divide,
                        df["count_of_%s_%s_in_%s" % (obs_name, gram, target_name)], df["count_of_%s_%s" % (obs_name, gram)])
    return df
# Extract distance features based on ngram generated
def extract_distance_feat(df):
    """Add jaccard/dice distance features between q1 and q2 n-grams.

    For each distance in {jaccard_coef, dice_dist} and each gram size,
    creates a "<dist>_of_<gram>_between_q1_q2" column via compute_dist.
    Returns df with the new columns added.
    """
    ## jaccard coef/dice dist of n-gram
    # FIX: was a Python 2 `print "..."` statement — inconsistent with the
    # print(...) call form used everywhere else in this file (and invalid
    # under Python 3).  Single-argument call form behaves identically in both.
    print("generate jaccard coef and dice dist for n-gram")
    dists = ["jaccard_coef", "dice_dist"]
    grams = ["unigram", "bigram", "trigram"]
    feat_names = ["q1", "q2"]
    for dist in dists:
        for gram in grams:
            # Generic pairwise loop; with two questions this yields exactly
            # the single (q1, q2) pair.
            for i in range(len(feat_names) - 1):
                for j in range(i + 1, len(feat_names)):
                    target_name = feat_names[i]
                    obs_name = feat_names[j]
                    df["%s_of_%s_between_%s_%s" % (dist, gram, target_name, obs_name)] = \
                        list(df.apply(
                            lambda x: compute_dist(x[target_name + "_" + gram], x[obs_name + "_" + gram], dist), axis=1))
    return df
# Extract tfidf features based on ngram generated
def extract_tfidf_feat(df):
    """Add tfidf/bow cosine-similarity features between question1 and question2.

    Fits each vectorizer once on the concatenated text of both questions,
    then transforms each question column with that same fitted model, so the
    q1 and q2 vectors share both vocabulary AND weighting.  Adds the columns
    "tfidf_cos_of_q1_q2" and "bow_cos_of_q1_q2" and returns df.
    """
    df["all_text"] = list(df.apply(cat_text, axis=1))
    vec_types = ["tfidf", "bow"]
    feat_names = ["question1", "question2"]
    for vec_type in vec_types:
        if vec_type == "tfidf":
            vec = getTFV(ngram_range=(1, 3))
        elif vec_type == "bow":
            vec = getBOW(ngram_range=(1, 3))
        # Fit vocabulary (and, for tfidf, the idf weights) on the combined text.
        vec.fit(df["all_text"])
        print("generate ngram %s feat for %s" % (vec_type, feat_names[0]))
        # BUGFIX: the original re-instantiated the vectorizer with only the
        # shared vocabulary and called fit_transform separately on q1 and q2,
        # which re-estimated idf weights per column and put the two vectors
        # in different weighted spaces before the cosine.  Transforming with
        # the single fitted model keeps them directly comparable.
        q1_vec = vec.transform(df[feat_names[0]])
        q2_vec = vec.transform(df[feat_names[1]])
        print("q1_vec has shape: %s, while q2_vec has shape: %s" % (q1_vec.shape, q2_vec.shape))
        # Cosine similarity between the paired question vectors (row-wise).
        print("generate common %s cosine sim feat for q1 and q2" % vec_type)
        df["%s_cos_of_q1_q2" % vec_type] = np.asarray(map(cosine_sim, q1_vec, q2_vec))[:, np.newaxis]
        # The SVD-reduced cosine features (svd_n_components) were disabled
        # upstream and are intentionally omitted here.
    return df
# Disabled driver script, kept for reference inside a triple-quoted string so
# it never executes.  It would load the train/test CSVs, build the n-gram
# columns, then dump counting, distance and tfidf feature frames as pickles
# under config.processed_data_path.  (Indentation inside the string is a
# best-effort reconstruction — the original formatting was lost.)
'''
if __name__ == "__main__":
    dfTrain_path = "%s/train.csv" % config.data_path
    dfTest_path = "%s/test.csv" % config.data_path
    dfTrain = pd.read_csv(dfTrain_path)
    dfTest = pd.read_csv(dfTest_path)
    print(dfTrain.head())
    # Generate ngram = 1, 2, 3
    dfTrain = gen_ngram_data(dfTrain)
    # dfTest = gen_ngram_data(dfTest)
    # Extract counting features
    dfTrain_count = extract_counting_feat(dfTrain)
    # dfTest_count = extract_counting_feat(dfTest)
    with open("%s/train.gen.count.pkl" % config.processed_data_path, "wb") as f:
        cPickle.dump(dfTrain_count, f, -1)
    # with open("%s/test.gen.count.pkl" % config.processed_data_path, "wb") as f:
    #     cPickle.dump(dfTest_count, f, -1)
    print("Dumped counting features to df...")
    print(dfTrain_count.head())
    # Extract distance features
    dfTrain_dist = extract_distance_feat(dfTrain_count)
    with open("%s/train.gen.dist.pkl" % config.processed_data_path, "wb") as f:
        cPickle.dump(dfTrain_dist, f, -1)
    # with open("%s/test.gen.dist.pkl" % config.processed_data_path, "wb") as f:
    #     cPickle.dump(dfTest_dist, f, -1)
    print("Dumped distance features to df...")
    # Extract tfidf features
    dfTrain_dist_tfidf = extract_tfidf_feat(dfTrain_dist)
    with open("%s/train.gen.tfidf.pkl" % config.processed_data_path, "wb") as f:
        cPickle.dump(dfTrain_dist_tfidf, f, -1)
    print(dfTrain_dist_tfidf.head())
'''