Example #1
    def _train_classifier(self, data_class, encoder_name, classifier_name):
        learn = text_classifier_learner(data_class, AWD_LSTM, drop_mult=0.5, pretrained=False)
        try:
            path = classifier_name
            learn.load(path)
            self.log(f'Loaded pretrained classifier from {path}')
            # return learn
        except FileNotFoundError:
            self.log(f'Training classifier `{classifier_name}`')
            learn.load_encoder(encoder_name)
            lr = 1e-1/LR_DIV_FACTOR
            learn.fit_one_cycle(1, lr, moms=(0.8, 0.7))

            # TODO remove
            # learn.save(classifier_name)
            # return learn
            learn.freeze_to(-2)
            lr /= 2
            learn.fit_one_cycle(1, slice(lr/(2.6**4), lr), moms=(0.8, 0.7))

            learn.freeze_to(-3)
            lr /= 2
            learn.fit_one_cycle(1, slice(lr/(2.6**4), lr), moms=(0.8, 0.7))

            learn.unfreeze()
            lr /= 5
            learn.fit_one_cycle(2, slice(lr/(2.6**4), lr), moms=(0.8, 0.7))
            self.log(f"Saving classifier `{classifier_name}`")
            learn.save(classifier_name)
        return learn
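A minimal usage sketch for the method above. The `trainer` instance, databunch, and the encoder/classifier names are hypothetical; `learn.predict` is the standard fastai v1 inference call:

# hypothetical caller: data_class is a prepared TextClasDataBunch, and
# 'fine_tuned_enc' / 'clas_model' are files under the learner's models/ folder
learn = trainer._train_classifier(data_class, 'fine_tuned_enc', 'clas_model')
pred_class, pred_idx, probs = learn.predict("text of the document to classify")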
Example #2
from pathlib import Path
from fastai.text import load_data, TextList, text_classifier_learner, AWD_LSTM


def train(bs):
    path = Path("./")
    data_lm = load_data(path, 'data_lm.pkl', bs=bs)
    print("data_lm loaded")

    data_clas = (TextList.from_csv(path, 'classifier.csv', cols=["summary", "description"], vocab=data_lm.vocab)
                 .split_from_df(col=3)
                 .label_from_df(cols=0)
                 .databunch(bs=bs))

    print("data_clas loaded")
    data_clas.show_batch()  # preview a sample batch (renders only in a notebook)

    data_clas.save('data_clas.pkl')

    learn = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.6)
    learn.load_encoder('fine_tuned_enc')

    lr_estimate = 1.0e-2

    learn.fit_one_cycle(1, lr_estimate, moms=(0.8,0.7))

    learn.save('first')
    losses_fig = learn.recorder.plot_losses(return_fig=True)
    losses_fig.savefig("losses_001.jpg", dpi=600)
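A hedged continuation inside `train()`, picking up from the 'first' checkpoint saved above; the learning rate, unfreezing schedule, and the 'second' checkpoint name are assumptions modeled on the other examples here:

    # resume from the saved checkpoint and unfreeze the last two layer groups
    learn.load('first')
    learn.freeze_to(-2)
    learn.fit_one_cycle(1, slice(1e-2 / (2.6 ** 4), 1e-2), moms=(0.8, 0.7))
    learn.save('second')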
Example #3
def _fit_class(self, df_train, df_val, data_lm):
    n_data = min(len(df_train), len(df_val))
    # Classifier model data
    data_class = TextClasDataBunch.from_df(
        path="",
        train_df=df_train,
        valid_df=df_val,
        vocab=data_lm.train_ds.vocab,
        bs=self.batch_size if self.batch_size < n_data else n_data // 2,
    )
    # train the learner object
    class_learner = text_classifier_learner(
        data_class, self.arch, drop_mult=self.dropout_lm
    )
    class_learner.load_encoder(self.path_lm.name)
    class_learner.fit_one_cycle(1, self.lr_class)
    class_learner.export(self.path_class)
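Because the method exports the classifier rather than returning it, inference would reload it with fastai v1's `load_learner`; the directory and file name below are assumptions standing in for whatever `self.path_class` pointed to:

from fastai.text import load_learner

# assumed export location; substitute the actual path passed to export() above
clf = load_learner('models', 'classifier.pkl')
pred_class, pred_idx, probs = clf.predict("text to score")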
Example #4
def _create_classifier_learner(self, data_lm, df_train, df_val, tokenizer,
                               encoder_name):
    data_class = ftext.TextClasDataBunch.from_df(
        '',
        vocab=data_lm.train_ds.vocab,
        bs=32,
        train_df=df_train,
        valid_df=df_val,
        text_cols=0,
        label_cols=1,
        tokenizer=tokenizer,
        label_delim=' ')
    config = ftext.awd_lstm_clas_config.copy()
    config['n_hid'] = 1150
    learn = ftext.text_classifier_learner(
        data_class, ftext.AWD_LSTM, config=config, drop_mult=0.5)
    learn.load_encoder(encoder_name)
    return learn
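Since `label_delim=' '` makes this a multi-label databunch, a hedged continuation might train the returned learner and read one probability per label; the call site and learning rates are assumptions:

# hypothetical call from inside the owning class
learn = self._create_classifier_learner(data_lm, df_train, df_val, tokenizer, 'fine_tuned_enc')
learn.fit_one_cycle(1, 1e-2, moms=(0.8, 0.7))
pred_labels, pred_mask, probs = learn.predict("some text")  # one score per label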
Example #5
import ujson as json
import spacy
import fastai
from pathlib import Path
from fastai.text import load_data, text_classifier_learner, AWD_LSTM
from typing import List, Tuple

bs=48
path=Path('/app/model/sentiment')
data_clas = load_data(path, 'data_clas.pkl', bs=bs)
learn = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.5)
learn.load('third')
learn.model.eval()
nlp = spacy.load('en_core_web_md')

def message_to_sentences(message:str, minimum_characters:int=5) -> List[str]:
    # split on newlines and periods, keeping only fragments long enough to score
    sentences = []
    for line in message.split('\n'):
        sentences += line.split('.')
    return [s for s in sentences if len(s) > minimum_characters]

def predict_sentiment(sentence:str) -> Tuple[str, float]:
    categorical, class_id, scores = learn.predict(sentence)
    score = round(scores[class_id].item(), 4)
    return "negative" if class_id == 0 else "positive", score

def model(message):
    sentence = json.loads(message)
    doc = nlp(sentence)
    entities = doc.ents
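The two helpers above can be composed for per-sentence scoring; a hedged sketch, where the function name and the list-of-tuples output format are assumptions rather than part of the original:

def score_message(message: str):
    # hypothetical helper: split the raw text into sentences and score each one
    sentences = message_to_sentences(message)
    return [(s, *predict_sentiment(s)) for s in sentences]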
Example #6
SAMPLES_PER_CLASS = 12500

print('loading data')
texts = []
target = []

for class_index, classname in enumerate(CLASS_NAMES):

    for n, line in enumerate(open(DATA_FOLDER+classname+'.txt')):

        texts.append(preprocess_string(line,False))
        target.append(class_index)

        # stop once SAMPLES_PER_CLASS lines have been collected for this class
        if n + 1 >= SAMPLES_PER_CLASS:
            break

df = DataFrame({'label':target,'text':texts})
df_train, df_val = train_test_split(df, stratify = df['label'], test_size = 0.4, random_state = 12)

data_lm = TextLMDataBunch.from_df(train_df = df_train, valid_df = df_val, path = "")
data_clas = TextClasDataBunch.from_df(path = "", train_df = df_train, valid_df = df_val, vocab=data_lm.train_ds.vocab, bs=32)

learn = language_model_learner(data_lm, pretrained_model=URLs.WT103, drop_mult=0.7)
learn.fit_one_cycle(1, 1e-2)
learn.save_encoder('ft_enc')

learn = text_classifier_learner(data_clas, drop_mult=0.7)
learn.load_encoder('ft_enc')
learn.fit_one_cycle(1, 1e-2)
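The two learner constructors above use the older fastai v1 signature, where the pretrained model was selected with `pretrained_model=URLs.WT103`; later fastai v1 releases take the architecture as the second argument. A hedged equivalent of the same two calls under the newer API:

learn = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.7)  # downloads the WT103 weights by default
learn.fit_one_cycle(1, 1e-2)
learn.save_encoder('ft_enc')

learn = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.7)
learn.load_encoder('ft_enc')
learn.fit_one_cycle(1, 1e-2)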

Example #7
    lm_learner.recorder.plot(suggestion=True)

    lm_learner.fit_one_cycle(1, lm_learner.recorder.min_grad_lr)

    lm_learner.save_encoder(model)

    text_clas = TextClasDataBunch.from_df(
        train_df=train_df,
        valid_df=valid_df,
        vocab=text_lm.train_ds.vocab,
        path="",
    )

    clf = text_classifier_learner(
        text_clas,
        arch=AWD_LSTM,
        drop_mult=0.2,
    )
    clf.load_encoder(model)

    clf.lr_find()
    clf.recorder.plot(suggestion=True)

    clf.fit_one_cycle(1, clf.recorder.min_grad_lr)

    print(lm_learner.predict("green bell"))

    print(text_clas.train_ds.y.c2i)

    print(clf.predict("wrap avocado beef"))
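A hedged follow-up for the fragment above, persisting the trained classifier for later inference; `export` is the standard fastai v1 call, and the file name is an assumption:

    clf.export('recipe_clas.pkl')  # hypothetical file name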
Example #8
def new_train_clas(data_dir,
                   lang='en',
                   cuda_id=0,
                   pretrain_name='wt103',
                   model_dir='models',
                   qrnn=False,
                   fine_tune=True,
                   max_vocab=30000,
                   bs=20,
                   bptt=70,
                   name='imdb-clas',
                   dataset='imdb',
                   ds_pct=1.0):
    """
    :param data_dir: The path to the `data` directory
    :param lang: the language unicode
    :param cuda_id: The id of the GPU. Uses GPU 0 by default or no GPU when
                    run on CPU.
    :param pretrain_name: name of the pretrained model
    :param model_dir: The path to the directory where the pretrained model is saved
    :param qrnn: Use a QRNN. Requires installing cupy.
    :param fine_tune: Fine-tune the pretrained language model
    :param max_vocab: The maximum size of the vocabulary.
    :param bs: The batch size.
    :param bptt: The back-propagation-through-time sequence length.
    :param name: The name used for both the model and the vocabulary.
    :param dataset: The dataset used for evaluation. Currently only IMDb and
                    XNLI are implemented. Assumes dataset is located in `data`
                    folder and that name of folder is the same as dataset name.
    :param ds_pct: The fraction of each split to keep (useful for quick runs).
    """
    results = {}
    if not torch.cuda.is_available():
        print('CUDA not available. Setting device=-1.')
        cuda_id = -1
    torch.cuda.set_device(cuda_id)

    print(f'Dataset: {dataset}. Language: {lang}.')
    assert dataset in DATASETS, f'Error: {dataset} processing is not implemented.'
    assert (dataset == 'imdb' and lang == 'en') or not dataset == 'imdb',\
        'Error: IMDb is only available in English.'

    data_dir = Path(data_dir)
    assert data_dir.name == 'data',\
        f'Error: Name of data directory should be data, not {data_dir.name}.'
    dataset_dir = data_dir / dataset
    model_dir = Path(model_dir)

    if qrnn:
        print('Using QRNNs...')
    model_name = 'qrnn' if qrnn else 'lstm'
    lm_name = f'{model_name}_{pretrain_name}'
    pretrained_fname = (lm_name, f'itos_{pretrain_name}')

    ensure_paths_exists(data_dir, dataset_dir, model_dir,
                        model_dir / f"{pretrained_fname[0]}.pth",
                        model_dir / f"{pretrained_fname[1]}.pkl")

    tmp_dir = dataset_dir / 'tmp'
    tmp_dir.mkdir(exist_ok=True)
    vocab_file = tmp_dir / f'vocab_{lang}.pkl'

    if not (tmp_dir / f'{TRN}_{lang}_ids.npy').exists():
        print('Reading the data...')
        toks, lbls = read_clas_data(dataset_dir, dataset, lang)

        # create the vocabulary
        counter = Counter(word for example in toks[TRN] for word in example)
        itos = [word for word, count in counter.most_common(n=max_vocab)]
        itos.insert(0, PAD)
        itos.insert(0, UNK)
        vocab = Vocab(itos)
        stoi = vocab.stoi
        with open(vocab_file, 'wb') as f:
            pickle.dump(vocab, f)

        ids = {}
        for split in [TRN, VAL, TST]:
            ids[split] = np.array([([stoi.get(w, stoi[UNK]) for w in s])
                                   for s in toks[split]])
            np.save(tmp_dir / f'{split}_{lang}_ids.npy', ids[split])
            np.save(tmp_dir / f'{split}_{lang}_lbl.npy', lbls[split])
    else:
        print('Loading the pickled data...')
        ids, lbls = {}, {}
        for split in [TRN, VAL, TST]:
            ids[split] = np.load(tmp_dir / f'{split}_{lang}_ids.npy')
            lbls[split] = np.load(tmp_dir / f'{split}_{lang}_lbl.npy')
        with open(vocab_file, 'rb') as f:
            vocab = pickle.load(f)

    print(f'Train size: {len(ids[TRN])}. Valid size: {len(ids[VAL])}. '
          f'Test size: {len(ids[TST])}.')

    if ds_pct < 1.0:
        print(f"Makeing the dataset smaller {ds_pct}")
        for split in [TRN, VAL, TST]:
            ids[split] = ids[split][:int(len(ids[split]) * ds_pct)]

    data_lm = TextLMDataBunch.from_ids(path=tmp_dir,
                                       vocab=vocab,
                                       train_ids=ids[TRN],
                                       valid_ids=ids[VAL],
                                       bs=bs,
                                       bptt=bptt)

    # TODO TextClasDataBunch allows tst_ids as input, but not tst_lbls?
    data_clas = TextClasDataBunch.from_ids(path=tmp_dir,
                                           vocab=vocab,
                                           train_ids=ids[TRN],
                                           valid_ids=ids[VAL],
                                           train_lbls=lbls[TRN],
                                           valid_lbls=lbls[VAL],
                                           bs=bs)

    if qrnn:
        emb_sz, nh, nl = 400, 1550, 3
    else:
        emb_sz, nh, nl = 400, 1150, 3
    learn = language_model_learner(data_lm,
                                   bptt=bptt,
                                   emb_sz=emb_sz,
                                   nh=nh,
                                   nl=nl,
                                   qrnn=qrnn,
                                   pad_token=PAD_TOKEN_ID,
                                   pretrained_fnames=pretrained_fname,
                                   path=model_dir.parent,
                                   model_dir=model_dir.name)
    lm_enc_finetuned = f"{lm_name}_{dataset}_enc"
    if fine_tune and not (model_dir / f"{lm_enc_finetuned}.pth").exists():
        print('Fine-tuning the language model...')
        learn.unfreeze()
        learn.fit(2, slice(1e-4, 1e-2))

        # save encoder
        learn.save_encoder(lm_enc_finetuned)

    print("Starting classifier training")
    learn = text_classifier_learner(data_clas,
                                    bptt=bptt,
                                    pad_token=PAD_TOKEN_ID,
                                    path=model_dir.parent,
                                    model_dir=model_dir.name,
                                    qrnn=qrnn,
                                    emb_sz=emb_sz,
                                    nh=nh,
                                    nl=nl)

    learn.load_encoder(lm_enc_finetuned)

    learn.fit_one_cycle(1, 2e-2, moms=(0.8, 0.7), wd=1e-7)

    learn.freeze_to(-2)
    learn.fit_one_cycle(1,
                        slice(1e-2 / (2.6**4), 1e-2),
                        moms=(0.8, 0.7),
                        wd=1e-7)

    learn.freeze_to(-3)
    learn.fit_one_cycle(1,
                        slice(5e-3 / (2.6**4), 5e-3),
                        moms=(0.8, 0.7),
                        wd=1e-7)

    learn.unfreeze()
    learn.fit_one_cycle(2,
                        slice(1e-3 / (2.6**4), 1e-3),
                        moms=(0.8, 0.7),
                        wd=1e-7)
    results['accuracy'] = learn.validate()[1]
    print(f"Saving models at {learn.path / learn.model_dir}")
    learn.save(f'{model_name}_{name}')
    return results
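A hedged invocation sketch, assuming the layout the docstring describes: a `data/imdb` folder plus the pretrained `lstm_wt103.pth` and `itos_wt103.pkl` files under `models/`:

# hypothetical quick run on 10% of IMDb
results = new_train_clas('data', lang='en', pretrain_name='wt103',
                         model_dir='models', dataset='imdb', ds_pct=0.1)
print(results['accuracy'])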