def str2vec(self, string):
    """Embed a SQL string as a flat, fixed-length float vector.

    The string is templatized, split on whitespace, and each token is
    replaced by its word2vec embedding (or a zero vector of the model's
    embedding size for out-of-vocabulary tokens). The concatenation is
    then truncated or zero-padded to exactly ``self.max_len`` floats.

    :param string: raw SQL text.
    :return: list of floats of length ``self.max_len``.
    """
    template = templatize_sql(string)
    # Width of one token embedding; used to pad unknown tokens.
    embed_dim = self.params.get('size')
    vector = []
    for token in template.strip().split():
        if token in self.model:
            vector.extend(self.model[token])
        else:
            # Out-of-vocabulary token: contribute a zero embedding.
            vector.extend([0.0] * embed_dim)
    # Force the output to exactly max_len entries.
    if len(vector) >= self.max_len:
        vector = vector[:self.max_len]
    else:
        vector.extend([0.0] * (self.max_len - len(vector)))
    return vector
def fit(self, data):
    """Train the duration-regression model on (sql, duration) pairs.

    Builds the word2vec vocabulary from *data*, vectorizes each legal
    SQL template, min-max scales the durations to [0, 1], and fits the
    regression network for ``self.epoch`` epochs.

    :param data: iterable of (sql, duration_time) pairs used to build
        the word2vec model.
    """
    self.build_word2vector(data)
    feature_rows = []
    durations = []
    # NOTE(review): iterates self.data rather than the `data` argument —
    # presumably build_word2vector stored it on self; confirm upstream.
    for sql, duration_time in self.data:
        # Skip statements the project deems unparsable/illegal.
        if check_illegal_sql(sql):
            continue
        template = templatize_sql(sql)
        feature_rows.append(self.w2v.str2vec(template))
        durations.append(duration_time)
    features = np.array(feature_rows)
    # Column vector of targets, scaled into [0, 1] for training.
    labels = np.array(durations).reshape(-1, 1)
    self.scaler = MinMaxScaler(feature_range=(0, 1))
    self.scaler.fit(labels)
    labels = self.scaler.transform(labels)
    self.regression.fit(features, labels, epochs=self.epoch)
def transform(self, data):
    """Predict execution durations for a batch of SQL statements.

    Illegal statements are skipped during prediction and re-inserted
    afterwards with a sentinel score of -1 so the output stays aligned
    with the input order.

    :param data: iterable of SQL strings.
    :return: list of [sql, predicted_duration] pairs, one per input.
    """
    sql_list = list(data)
    vectors = []
    illegal_positions = []
    for position, sql in enumerate(sql_list):
        if check_illegal_sql(sql):
            # Remember where the illegal statement sat in the input.
            illegal_positions.append(position)
            continue
        vectors.append(self.w2v.str2vec(templatize_sql(sql)))
    features = np.array(vectors)
    predictions = np.abs(self.regression.predict(features))
    # Undo the min-max scaling applied during fit().
    score = self.scaler.inverse_transform(predictions)
    if illegal_positions:
        # Positions are ascending, so each insert lands at the original
        # input index even as earlier inserts shift later elements.
        for position in illegal_positions:
            score = np.insert(score, position, -1)
    paired = np.hstack(
        (np.array(sql_list).reshape(-1, 1), score.reshape(-1, 1)))
    return paired.tolist()
def __iter__(self):
    """Yield tokenized SQL templates, one token list per statement.

    Makes the object a streaming corpus: each (sql, _) pair in
    ``self.data`` is templatized and split on whitespace.
    """
    return (templatize_sql(statement).split()
            for statement, _ in self.data)
def mapper_function(value):
    """Map a raw (sql, microseconds) record to (template, seconds).

    :param value: 2-sequence of (sql text, duration in microseconds).
    :return: tuple of (templatized sql, duration in seconds as float).
    """
    sql_text, raw_duration = value[0], value[1]
    # Durations arrive in microseconds; convert to seconds.
    seconds = float(raw_duration) / 1000000
    return templatize_sql(sql_text), seconds