from sklearn.calibration import CalibratedClassifierCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from tensorflow.keras.optimizers import Adam, RMSprop

# Project-local dependencies assumed importable elsewhere in this repo:
# TextCNN_Model, CNN_Model, TextRCNN_Model, RNN_Model (model classes),
# RAdam (optimizer), info (logging), generate_emb_matrix (embedding helper).


class ModelGenerator(object):
    def __init__(self,
                 load_pretrain_emb=False,
                 data_feature=None,
                 meta_data_feature=None,
                 fasttext_embeddings_index=None,
                 multi_label=False):

        self.data_feature = data_feature
        self.load_pretrain_emb = load_pretrain_emb
        self.meta_data_feature = meta_data_feature
        self.oov_cnt = 0
        self.embedding_matrix = None
        self.use_bpe = False
        self.multi_label = multi_label
        self.lr = 0.001
        self.cur_lr = 0.0
        self.emb_size = 300
        self.fasttext_embeddings_index = fasttext_embeddings_index

        self.model_lib = {
            'text_cnn': TextCNN_Model,
            'text_cnn_2d': CNN_Model,
            'text_rcnn': TextRCNN_Model,
            'text_rnn': RNN_Model
        }

        # Supported feature-mode names accepted by select_classifier().
        self.feature_lib = {
            'char-level + 64dim-embedding', 'char-level + 300dim-embedding',
            'word-level + pretrained embedding300dim',
            'word-level + 64dim-embedding'
        }

    def select_classifier(self, model_name, feature_mode, data_feature):
        if feature_mode == 'char-level + 64dim-embedding':
            _feature = {'use_fasttext_emb': False, 'emb_size': 64}
        elif feature_mode == 'char-level + 300dim-embedding':
            _feature = {'use_fasttext_emb': False, 'emb_size': 300}
        elif feature_mode == 'word-level + pretrained embedding300dim':
            _feature = {'use_fasttext_emb': True, 'emb_size': 300}
        elif feature_mode == 'word-level + 64dim-embedding':
            _feature = {'use_fasttext_emb': False, 'emb_size': 64}
        else:
            raise ValueError('Unknown feature_mode: {}'.format(feature_mode))

        data_feature.update(_feature)
        model = self.build_model(model_name, data_feature=data_feature)
        return model

    def _set_model_compile_params(self, optimizer_name, lr, metrics=None):
        # `metrics=None` avoids the mutable-default-argument pitfall.
        optimizer = self._set_optimizer(optimizer_name=optimizer_name, lr=lr)
        loss_fn = self._set_loss_fn()
        if not metrics:
            metrics = ['accuracy']

        return optimizer, loss_fn, metrics

    def _set_model_train_params(self):
        pass

    def build_model(self, model_name, data_feature):
        if model_name == 'svm':
            model = LinearSVC(random_state=0, tol=1e-5, max_iter=500)
            self.model = CalibratedClassifierCV(model)
            if self.multi_label:
                info("use OneVsRestClassifier")
                self.model = OneVsRestClassifier(self.model, n_jobs=-1)

        else:
            if data_feature["use_fasttext_emb"]:
                self.oov_cnt, self.embedding_matrix = self.generate_emb_matrix(
                    num_features=data_feature["num_features"],
                    word_index=data_feature["word_index"])
            else:
                self.embedding_matrix = None

            self.emb_size = data_feature["emb_size"]

            kwargs = {
                'embedding_matrix': self.embedding_matrix,
                'input_shape': data_feature['input_shape'],
                'max_length': data_feature['max_length'],
                'num_features': data_feature['num_features'],
                'num_classes': data_feature['num_class'],
                "filter_num": data_feature["filter_num"],
                "trainable": False,
                "emb_size": self.emb_size
            }
            if self.multi_label:
                kwargs["use_multi_label"] = True

            self.model = self.model_lib[model_name](**kwargs)
            self._set_init_lr(model_name)
            optimizer, loss_fn, metrics = self._set_model_compile_params(
                optimizer_name='RMSprop', lr=self.lr)
            if self.multi_label:
                loss_fn = 'binary_crossentropy'

            self.model.compile(loss=loss_fn,
                               optimizer=optimizer,
                               metrics=metrics)

        return self.model

    def _set_loss_fn(self):
        loss_fn = 'categorical_crossentropy'
        return loss_fn

    def _set_optimizer(self, optimizer_name, lr=0.001):
        if optimizer_name == 'RAdam':
            opt = RAdam(learning_rate=lr)
        elif optimizer_name == 'RMSprop':
            opt = RMSprop(learning_rate=lr)
        elif optimizer_name == 'Adam':
            opt = Adam(learning_rate=lr)
        else:
            raise ValueError('Unsupported optimizer: {}'.format(optimizer_name))
        return opt

    def _set_init_lr(self, model_name):
        if model_name == "text_cnn":
            self.lr = 0.001
        elif model_name == "text_cnn_2d":
            self.lr = 0.016
        elif model_name == "text_rcnn":
            self.lr = 0.025
        elif model_name == "text_rnn":
            self.lr = 0.0035

    def model_pre_select(self, model_name="svm"):
        self.model_name = model_name

    def generate_emb_matrix(self, num_features, word_index):
        return generate_emb_matrix(
            num_features=num_features,
            word_index=word_index,
            fasttext_embeddings_index=self.fasttext_embeddings_index)
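

def _demo_model_generator():
    # Usage sketch, not part of the original class: it shows the data_feature
    # keys that build_model() reads. All concrete values here are placeholder
    # assumptions, not values from this repo.
    generator = ModelGenerator(fasttext_embeddings_index={})
    data_feature = {
        'num_features': 5000,        # vocabulary size
        'word_index': {},            # token -> id map from the tokenizer
        'input_shape': (128,),
        'max_length': 128,
        'num_class': 4,
        'filter_num': 64,
    }
    # The char-level 64-dim mode avoids needing real pretrained vectors.
    return generator.select_classifier('text_cnn',
                                       'char-level + 64dim-embedding',
                                       data_feature)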

# Example 2

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Project-local helpers assumed importable elsewhere in this repo:
# TsimModel, Metrics, TreeSimilarity.


class ClassifierSimilarity:

    def __init__(self, model=None, num_pca_components=6, max_error=3, min_matches=3):
        if model is None:
            model = LogisticRegression()
        self.model = OneVsRestClassifier(model)
        self.num_pca_components = num_pca_components
        self.max_error = max_error
        self.min_matches = min_matches

    def get_similarity(self, data, similarity=None):
        # Leave-one-patient-out: for each pair (p, p2), train on all other
        # patients and predict the probability that p matches p2.
        features = self.get_input_features(data)
        features = (features - features.mean(axis=0)) / features.std(axis=0)
        features[:, 0] = 0  # column 0 is overwritten with tsim scores below
        true_matches = self.get_true_matches(data)
        predicted_matches = np.zeros(true_matches.shape)
        tsim_scores = TsimModel().get_similarity(data)
        for p in range(true_matches.shape[0]):
            train_features = np.delete(features, p, axis=0)
            train_matches = np.delete(true_matches, p, axis=0)
            # Copy the row so the edits below don't write back into `features`.
            predict_features = features[p, :].copy()
            for p2 in range(true_matches.shape[1]):
                if p == p2:
                    continue
                train_features[:, 0] = np.delete(tsim_scores[:, p2], p, axis=0)
                predict_features[0] = tsim_scores[p, p2]
                y = train_matches[:, p2]
                self.model.fit(train_features, y)
                # predict_proba expects a 2-D array; keep P(match) for the pair.
                proba = self.model.predict_proba(predict_features.reshape(1, -1))
                predicted_matches[p, p2] = proba[0, -1]
        return predicted_matches

    def get_true_matches(self, data):
        dose_error = self.get_match_error(data)
        match_matrix = np.zeros(dose_error.shape)
        n_patients = data.get_num_patients()
        for p in range(n_patients):
            errors = dose_error[p, :]
            matches = []
            max_error = self.max_error
            while len(matches) < self.min_matches:
                matches = np.where(errors < max_error)[0]
                max_error = max_error + .2
            match_matrix[p, matches] = 1
        return match_matrix

    def get_match_error(self, data):
        n_patients = data.get_num_patients()
        doses = data.doses
        error_matrix = np.zeros((n_patients, n_patients))
        for p1 in range(n_patients):
            for p2 in range(p1 + 1, n_patients):
                dose_difference = np.abs(doses[p1,:] - doses[p2, :])
                error_matrix[p1, p2] = np.mean(dose_difference)
        error_matrix += error_matrix.transpose()
        return error_matrix

    def get_input_features(self, data):
        num_patients = data.get_num_patients()
        pca = lambda x: Metrics.pca(x, self.num_pca_components)
        distances = pca(data.tumor_distances)
        lymph_nodes = pca(data.lymph_nodes)
        tumor_volumes = np.zeros((num_patients, 2))
        for i in range(num_patients):
            gtvs = data.gtvs[i]
            gtvp_volume = gtvs[0].volume
            gtvn_volume = sum(gtvn.volume for gtvn in gtvs[1:])
            tumor_volumes[i, :] = (gtvp_volume, gtvn_volume)
        laterality = data.lateralities.reshape(num_patients, 1)
        laterality = np.vectorize(TreeSimilarity.laterality_map.__getitem__)(laterality)
        subsites = data.subsites.reshape(num_patients, 1)
        subsites = np.vectorize(TreeSimilarity.subsite_map.__getitem__)(subsites)
        total_doses = data.prescribed_doses.reshape(num_patients, 1)
        clusters = data.classes.reshape(num_patients, 1)
        # Note: laterality and clusters are computed but currently left out of
        # the stacked feature matrix below.
        features = np.hstack([distances, lymph_nodes, tumor_volumes, total_doses, subsites])
        return features
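
# Usage sketch (assumptions): `data` is the repo's patient-data object exposing
# get_num_patients(), doses, gtvs, subsites, lateralities, prescribed_doses,
# and classes; it is not defined in this snippet.
def _demo_classifier_similarity(data):
    sim = ClassifierSimilarity(num_pca_components=6, max_error=3, min_matches=3)
    predicted = sim.get_similarity(data)        # (n_patients, n_patients) match probabilities
    true_matches = sim.get_true_matches(data)   # binary match matrix from dose error
    return predicted, true_matches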

##old code from testing neural nets
from tensorflow.keras import losses, optimizers, regularizers
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model, Sequential


def get_autoencoderish_model(features):
    input_x = Input(shape=(features.shape[1],))
    encoder = Sequential([
            Dense(45, input_dim=features.shape[1], activation='relu'),
            Dense(100, activation='relu'),
            Dense(100, activation='relu'),
            Dense(100, activation='relu'),
            ])(input_x)

    # When a Sequential is called on a tensor its input width is inferred, so
    # no input_dim is needed here; the final layer matches the input width so
    # the model can be trained to reconstruct `features`.
    decoder = Sequential([
            Dense(100, activation='relu',
                  activity_regularizer=regularizers.l2(.01)),
            Dense(features.shape[1], activation='relu'),
            ])(encoder)
    model = Model(input_x, decoder)
    encoder_model = Model(input_x, encoder)
#    optimizer = optimizers.SGD(lr=.01, decay=1e-12, momentum=.1)
    optimizer = optimizers.Adam()
    model.compile(loss=losses.mean_absolute_error,
                  optimizer=optimizer)
    return model, encoder_model
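

def _demo_autoencoderish(features):
    # Training sketch under the assumption that the model is fit to reconstruct
    # its own input (hence the matching output width above); the epoch and
    # batch-size values are arbitrary placeholders.
    model, encoder_model = get_autoencoderish_model(features)
    model.fit(features, features, epochs=10, batch_size=16, verbose=0)
    return encoder_model.predict(features)  # 100-dim encoder representation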