Esempio n. 1
0
    def str2vec(self, string):
        """Turn a SQL string into a flat, fixed-length list of floats.

        The string is passed through ``templatize_sql`` and split on
        whitespace. Each token contributes its entry from ``self.model``;
        a token missing from the model contributes ``self.params['size']``
        zeros instead. The concatenation is then truncated or zero-padded
        so the result always holds ``self.max_len`` values.

        :param string: SQL text to encode.
        :return: list of ``self.max_len`` numbers.
        """
        tokens = templatize_sql(string).strip().split()

        flat = []
        for token in tokens:
            if token not in self.model:
                # Unknown token: keep the slot, fill it with zeros.
                flat.extend([0.0] * self.params.get('size'))
            else:
                flat.extend(self.model[token])

        shortfall = self.max_len - len(flat)
        if shortfall > 0:
            flat.extend([0.0] * shortfall)
        else:
            # Long enough already: drop whatever exceeds max_len.
            del flat[self.max_len:]

        return flat
Esempio n. 2
0
    def fit(self, data):
        """Train the word-vector model and then the cost regressor.

        ``data`` is handed to ``self.build_word2vector``. Every
        ``(sql, duration)`` pair in ``self.data`` whose SQL is not rejected
        by ``check_illegal_sql`` is encoded with ``self.w2v.str2vec`` and
        used as a training sample. Durations are scaled to [0, 1] with a
        ``MinMaxScaler`` that is kept on ``self.scaler``.

        :param data: training input forwarded to ``build_word2vector``.
        """
        self.build_word2vector(data)

        vectors, durations = [], []
        # NOTE(review): iterates self.data, not the ``data`` argument --
        # presumably populated by build_word2vector; confirm.
        for statement, elapsed in self.data:
            if not check_illegal_sql(statement):
                template = templatize_sql(statement)
                vectors.append(self.w2v.str2vec(template))
                durations.append(elapsed)

        features = np.array(vectors)
        # Column vector: one duration per row.
        labels = np.array(durations).reshape(-1, 1)

        self.scaler = MinMaxScaler(feature_range=(0, 1))
        self.scaler.fit(labels)
        scaled_labels = self.scaler.transform(labels)
        self.regression.fit(features, scaled_labels, epochs=self.epoch)
Esempio n. 3
0
    def transform(self, data):
        """Predict a cost for every SQL statement in ``data``.

        Statements rejected by ``check_illegal_sql`` are never fed to the
        model; they are reported with a cost of -1 at their original
        position. If no statement is legal (including empty ``data``) the
        model and scaler are not called at all.

        :param data: iterable of SQL statements.
        :return: list of ``[sql, cost]`` rows in input order. Both columns
            are stacked into a single NumPy array before ``tolist()``, so
            they share NumPy's common dtype (for string statements the
            cost is therefore rendered as text).
        """
        statements = list(data)
        feature_list = []
        error_list = []
        for position, sql in enumerate(statements):
            if check_illegal_sql(sql):
                error_list.append(position)
                continue
            filter_template = templatize_sql(sql)
            feature_list.append(self.w2v.str2vec(filter_template))

        if feature_list:
            predictions = self.regression.predict(np.array(feature_list))
            # Costs cannot be negative; fold the sign away before unscaling.
            predictions = np.abs(predictions)
            score = self.scaler.inverse_transform(predictions)
        else:
            # Nothing legal to predict: start from an empty score column
            # instead of sending an empty batch through the model.
            score = np.empty((0, 1))

        # error_list is ascending, so inserting at each original index in
        # turn lands every -1 at the row of its rejected statement.
        # np.insert without an axis flattens; reshape below restores it.
        for position in error_list:
            score = np.insert(score, position, -1)

        rows = np.hstack(
            (np.array(statements).reshape(-1, 1), score.reshape(-1, 1)))
        return rows.tolist()
Esempio n. 4
0
 def __iter__(self):
     """Yield the whitespace-split template of each SQL in ``self.data``.

     Each entry of ``self.data`` is unpacked as a pair; only the first
     element (the SQL text) is used.
     """
     for record in self.data:
         statement, _ = record
         template = templatize_sql(statement)
         yield template.split()
def mapper_function(value):
    """Map a raw ``(sql, time)`` record to ``(template, scaled_time)``.

    ``value[0]`` is passed through ``templatize_sql``; ``value[1]`` is
    converted to float and divided by 1,000,000 (presumably microseconds
    to seconds -- confirm against the data source).

    :param value: indexable record with the SQL at 0 and the time at 1.
    :return: tuple ``(templated_sql, time / 1000000)``.
    """
    return templatize_sql(value[0]), float(value[1]) / 1000000