Example #1
# Assumed imports for this snippet; `FastAiBertTokenizer` and `bert_clas_split`
# are project helpers (sketched after the example).
import gc
import logging
from functools import partial
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
from torch import nn

from fastai.text import *  # Learner, TextDataBunch, Tokenizer, Vocab, ...
# Newer projects would import these from `transformers` instead.
from pytorch_pretrained_bert import BertForSequenceClassification, BertTokenizer

log = logging.getLogger(__name__)


class CoruscantModel:
    type_pretrained = None
    data_root = None
    list_files = None
    model_dir = None

    tokenizer_pretrained_coruscant = None
    coruscant_vocab = None
    coruscant_tokenizer = None

    # data bunch
    data_bunch = None
    batch_size = None

    # data to feed the model
    train = None
    test = None
    val = None

    # model
    bert_model_class = None
    loss_func = None
    acc_02 = None
    model = None
    learner = None

    # constants
    label_cols = None
    text_cols = None

    # constructor
    def __init__(self,
                 type_pretrained='BERT',
                 text_cols="comment_text",
                 list_files=["train.csv", "test.csv"],
                 label_cols=[
                     "toxic", "severe_toxic", "obscene", "threat", "insult",
                     "identity_hate"
                 ],
                 data_root=Path("..") / "api/app/dataset/jigsaw",
                 model_dir='model',
                 batch_size=12):
        self.data_root = data_root
        self.model_dir = model_dir
        self.batch_size = batch_size
        self.label_cols = label_cols
        self.text_cols = text_cols
        self.list_files = list_files
        self.type_pretrained = type_pretrained
        gc.collect()

        log.debug('type_pretrained: ' + type_pretrained)
        if self.type_pretrained == 'BERT':
            self.tokenizer_pretrained_coruscant = BertTokenizer.from_pretrained(
                "bert-base-uncased")

    def make_model(self):
        log.debug('----- set_train_val_data ------')
        self.set_train_val_data()
        log.debug('----- set_vocab_tokenizer ------')
        self.set_vocab_tokenizer()
        log.debug('----- set_data_bunch ------')
        self.set_data_bunch()
        log.debug('----- create_model ------')
        self.create_model()
        log.debug('----- train_save ------')
        self.train_save()

    def set_data_bunch(self):
        self.data_bunch = TextDataBunch.from_df(
            ".",
            self.train,
            self.val,
            tokenizer=self.coruscant_tokenizer,
            vocab=self.coruscant_vocab,
            include_bos=False,
            include_eos=False,
            text_cols=self.text_cols,
            label_cols=self.label_cols,
            bs=self.batch_size,
            collate_fn=partial(pad_collate, pad_first=False, pad_idx=0),
        )

    def set_train_val_data(self):
        self.train, self.test = [
            pd.read_csv(self.data_root / fname) for fname in self.list_files
        ]
        self.train, self.val = train_test_split(self.train,
                                                shuffle=True,
                                                test_size=0.2,
                                                random_state=42)
        # log.info(self.train.head())

    def set_vocab_tokenizer(self):
        # Wrap the pretrained BERT vocab and tokenizer in fastai's Vocab and
        # Tokenizer wrappers so they plug into the fastai data pipeline.
        self.coruscant_vocab = Vocab(
            list(self.tokenizer_pretrained_coruscant.vocab.keys()))
        self.coruscant_tokenizer = Tokenizer(tok_func=FastAiBertTokenizer(
            self.tokenizer_pretrained_coruscant, max_seq_len=256),
                                             pre_rules=[],
                                             post_rules=[])

    def create_model(self):
        # BERT model
        bert_model_class = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased', num_labels=6)
        # The loss is binary cross-entropy with logits, the standard choice
        # for multi-label classification.
        loss_func = nn.BCEWithLogitsLoss()
        # Since this is a multi-label problem, plain accuracy is not a useful
        # metric; use accuracy_thresh with a threshold of 0.25 instead.
        acc_02 = partial(accuracy_thresh, thresh=0.25)
        self.model = bert_model_class

        # learner function
        self.learner = Learner(self.data_bunch,
                               self.model,
                               loss_func=loss_func,
                               model_dir=self.model_dir,
                               metrics=acc_02)

    def train_save(self):
        # Split the model into six layer groups so discriminative learning
        # rates can be applied during fine-tuning.
        x = bert_clas_split(self.model)
        self.learner.split([x[0], x[1], x[2], x[3], x[5]])
        self.learner.lr_find()
        self.learner.fit_one_cycle(2,
                                   max_lr=slice(1e-5, 5e-4),
                                   moms=(0.8, 0.7),
                                   pct_start=0.2,
                                   wd=(1e-7, 1e-5, 1e-4, 1e-3, 1e-2))

        self.learner.save(self.type_pretrained + '_first')
        self.learner.load(self.type_pretrained + '_first')

        # Unfreeze the last two layer groups and train again
        self.learner.freeze_to(-2)
        self.learner.fit_one_cycle(2,
                                   max_lr=slice(1e-5, 5e-4),
                                   moms=(0.8, 0.7),
                                   pct_start=0.2,
                                   wd=(1e-7, 1e-5, 1e-4, 1e-3, 1e-2))

        self.learner.save(self.type_pretrained + '_final')
        self.learner.load(self.type_pretrained + '_final')

        # We will now unfreeze the entire model and train it
        self.learner.unfreeze()
        self.learner.lr_find()
        self.learner.fit_one_cycle(2,
                                   slice(5e-6, 5e-5),
                                   moms=(0.8, 0.7),
                                   pct_start=0.2,
                                   wd=(1e-7, 1e-5, 1e-4, 1e-3, 1e-2))

    def test_prediction(self):
        # Sanity-check the model's predictions on a few sample comments
        text = 'you are so sweet'
        log.info(text)
        log.info(self.learner.predict(text))

        text = 'you are pathetic piece of shit'
        log.info(text)
        log.info(self.learner.predict(text))

        text = "what’s so great about return of the jedi?  the special effects are abysmal,  and the acting is " \
               "horrible. it’s like they phoned it in.  it’s a mess."
        log.info(text)
        log.info(self.learner.predict(text))

        text = "i hate myself for being too human.  how do i liberate my soul ?"
        log.info(text)
        log.info(self.learner.predict(text))

        text = "why was guru arjun singh killed by jahangir?"
        log.info(text)
        log.info(self.learner.predict(text))

        text = "funny how the person that bullies you in elementary is ugly as f**k in high school, and your high " \
               "school bull1, a loser in college..."
        log.info(text)
        log.info(self.learner.predict(text))

        text = "stop making fun of amy winehouse and michael jackso2, #rickcastellano is a bully."
        log.info(text)
        log.info(self.learner.predict(text))
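
Example #1 relies on two project-specific helpers that the listing does not show: `FastAiBertTokenizer`, which adapts the pretrained BERT tokenizer to fastai's `BaseTokenizer` interface, and `bert_clas_split`, which cuts the BERT classifier into layer groups. The sketches below are assumptions inferred from how they are called above, not the project's actual definitions:

from fastai.text import BaseTokenizer


class FastAiBertTokenizer(BaseTokenizer):
    """Drive a pretrained BERT tokenizer from fastai's Tokenizer."""

    def __init__(self, tokenizer, max_seq_len=128, **kwargs):
        self._pretrained_tokenizer = tokenizer
        self.max_seq_len = max_seq_len

    def __call__(self, *args, **kwargs):
        return self

    def tokenizer(self, t):
        # Truncate and add BERT's special tokens by hand, which is why the
        # data bunch is built with include_bos=False and include_eos=False.
        return (["[CLS]"] +
                self._pretrained_tokenizer.tokenize(t)[:self.max_seq_len - 2] +
                ["[SEP]"])


def bert_clas_split(model):
    # Cut BertForSequenceClassification into six groups: embeddings, three
    # encoder slices, the pooler, and the classification head.
    encoder = model.bert.encoder
    n = len(encoder.layer) // 3
    return [[model.bert.embeddings],
            list(encoder.layer[:n]),
            list(encoder.layer[n:2 * n]),
            list(encoder.layer[2 * n:]),
            [model.bert.pooler],
            [model.dropout, model.classifier]]

End to end, the class would then be used roughly as follows (hypothetical driver code, not part of the original):

cm = CoruscantModel(type_pretrained='BERT', batch_size=12)
cm.make_model()        # data prep, tokenizer setup, training, checkpointing
cm.test_prediction()   # log predictions for the sample comments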
Example #2
# Assumed imports for this snippet. `folder_path`, `chexpert_folder`,
# `data_tfms`, `BS`, `IMG_SZ`, `ALL_LBLS`, `EVAL_LBLS`, `model_path`,
# `model_names` and `ensemble_method` are module-level definitions not shown
# here (`ensemble_method` is sketched after the example).
import numpy as np
import pandas as pd
import torch
from torch import nn

from fastai.vision import *  # ImageDataBunch, cnn_learner, models, Learner, DatasetType
from pretrainedmodels import resnext101_64x4d


def save_preds(input_csv, output_csv):
    df = pd.read_csv(input_csv)
    # Keep whichever path/study column the CSV provides.
    try:
        df = df[['Study']]
    except KeyError:
        try:
            df = df[['Path']]
        except KeyError:
            raise ValueError('csv has no attribute for path/study.')

    # Dummy label columns so the databunch can be constructed.
    for lbl in ALL_LBLS:
        df[lbl] = np.zeros(len(df))

    # valid_pct=1 sends every row to the validation set so predictions can be
    # read back with get_preds(ds_type=DatasetType.Valid).
    test = ImageDataBunch.from_df(
        path=folder_path,
        df=df,
        folder=chexpert_folder,
        seed=0,
        label_col=ALL_LBLS,
        suffix='',
        valid_pct=1,
        ds_tfms=data_tfms,
        bs=BS,
        size=IMG_SZ)  #.normalize([IMG_MEAN, IMG_STD])

    IDs, outputs = test.valid_ds.x.items, []

    # Ensemble members 1-2: DenseNet-121 loaded from two checkpoints.
    learn = cnn_learner(test,
                        models.densenet121,
                        model_dir=model_path,
                        pretrained=False)
    learn.load(model_names[0])
    output, y, _ = learn.get_preds(ds_type=DatasetType.Valid, with_loss=True)
    outputs.append(output)

    learn.load(model_names[1])
    output, y, _ = learn.get_preds(ds_type=DatasetType.Valid, with_loss=True)
    outputs.append(output)

    # Members 3-4: ResNet-152 checkpoints.
    learn = cnn_learner(test,
                        models.resnet152,
                        model_dir=model_path,
                        pretrained=False)
    learn.load(model_names[2])
    output, y, _ = learn.get_preds(ds_type=DatasetType.Valid, with_loss=True)
    outputs.append(output)

    learn.load(model_names[3])
    output, y, _ = learn.get_preds(ds_type=DatasetType.Valid, with_loss=True)
    outputs.append(output)

    # Member 5: ResNeXt-101 64x4d with a custom 14-way classification head.
    model = resnext101_64x4d(pretrained=None)
    model.last_linear = nn.Sequential(nn.Linear(32768, 2048), nn.ReLU(True),
                                      nn.Dropout(), nn.Linear(2048, 14))
    learn = Learner(test, model, model_dir=model_path)
    learn.load(model_names[4])
    output, y, _ = learn.get_preds(ds_type=DatasetType.Valid, with_loss=True)
    outputs.append(output)

    # Members 6-7: VGG-19 (batch-norm) checkpoints.
    learn = cnn_learner(test,
                        models.vgg19_bn,
                        model_dir=model_path,
                        pretrained=False)
    learn.load(model_names[5])
    output, y, _ = learn.get_preds(ds_type=DatasetType.Valid, with_loss=True)
    outputs.append(output)

    learn.load(model_names[6])
    output, y, _ = learn.get_preds(ds_type=DatasetType.Valid, with_loss=True)
    outputs.append(output)

    # Member 8: one more DenseNet-121 checkpoint.
    learn = cnn_learner(test,
                        models.densenet121,
                        model_dir=model_path,
                        pretrained=False)
    learn.load(model_names[7])
    output, y, _ = learn.get_preds(ds_type=DatasetType.Valid, with_loss=True)
    outputs.append(output)

    # Average the eight members' predictions into a single tensor.
    output = ensemble_method(outputs, mode='avg')
    if torch.cuda.is_available():
        output = output.cpu()
    output = output.numpy()

    # Keep only the five evaluation labels (columns 1-5 of the 14 outputs).
    df = pd.DataFrame({
        'Path': IDs,
        EVAL_LBLS[0]: output[:, 1],
        EVAL_LBLS[1]: output[:, 2],
        EVAL_LBLS[2]: output[:, 3],
        EVAL_LBLS[3]: output[:, 4],
        EVAL_LBLS[4]: output[:, 5]
    })

    df.to_csv(output_csv, index=False)
    print('submission saved.')
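
`ensemble_method` is defined elsewhere in the project; given the `mode='avg'` call above, a plausible sketch is plain averaging over the stacked member predictions (an assumption, not the original definition):

import torch


def ensemble_method(outputs, mode='avg'):
    # outputs: list of [n_samples, n_classes] prediction tensors, one per model.
    stacked = torch.stack(outputs)  # -> [n_models, n_samples, n_classes]
    if mode == 'avg':
        return stacked.mean(dim=0)
    raise ValueError('unknown mode: {}'.format(mode))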
Example #3
# Assumed imports in addition to the two project-local modules below:
import os

import numpy as np
import torch
from fastai.basic_train import Learner
from fastai.metrics import accuracy

from utils import get_databunch
from make_vgg_resnet import VGG_ResNet

if __name__ == "__main__":
    datasetdir = os.path.join(os.path.dirname(__file__), 'kuzu_mnist')
    datasetdir = os.path.abspath(datasetdir)

    # Load dataset
    databunch = get_databunch(datasetdir)

    # Create VGG + ResNet model
    learn = Learner(databunch, VGG_ResNet(), metrics=accuracy)

    # Load
    learn.load('vgg_resnet_model_with_norm')

    # Validate
    loss, acc = learn.validate()
    print('val_loss: {}, val_acc: {}'.format(loss, acc))

    # Build a 10x10 confusion matrix (rows: true class, columns: prediction).
    mat = np.zeros((10, 10))
    learn.model.eval()  # disable dropout/batch-norm updates during inference
    with torch.no_grad():
        for images, labels in databunch.valid_ds:
            images = images.reshape((1, 1, 28, 28))
            outputs = learn.model(images)
            _, predicted = torch.max(outputs, 1)
            mat[labels][int(predicted)] += 1

    print(mat)
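
Per-class accuracy follows directly from the matrix's diagonal, since each row holds the true-class counts from the loop above; a short follow-up sketch:

# Per-class accuracy from the confusion matrix: diagonal over row sums.
row_totals = mat.sum(axis=1)
per_class_acc = np.diag(mat) / np.maximum(row_totals, 1)  # avoid divide-by-zero
for cls, acc_c in enumerate(per_class_acc):
    print('class {}: acc {:.3f}'.format(cls, acc_c))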