Example #1
def __init__(self, lang: str = 'en'):
    default_post_rules = [replace_all_caps, deal_caps]
    default_pre_rules = [
        fix_html, replace_rep, replace_wrep, spec_add_spaces,
        rm_useless_spaces
    ]
    self.tok = Tokenizer(lang=lang,
                         pre_rules=default_pre_rules,
                         post_rules=default_post_rules)
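The constructor above only builds the tokenizer (Example #7 shows the same code inside a TextTokenizer class that also exposes process_all). A minimal usage sketch, assuming fastai v1's text module, which exports Tokenizer and the rule functions referenced here:

from fastai.text import (Tokenizer, fix_html, replace_rep, replace_wrep,
                         spec_add_spaces, rm_useless_spaces,
                         replace_all_caps, deal_caps)

# Same configuration as in the constructor above.
tok = Tokenizer(lang='en',
                pre_rules=[fix_html, replace_rep, replace_wrep,
                           spec_add_spaces, rm_useless_spaces],
                post_rules=[replace_all_caps, deal_caps])

# process_all tokenizes a batch of strings and returns one token list per text,
# with fastai's special markers (e.g. xxup before formerly all-caps words).
tokens = tok.process_all(["I LOVED this movie <br /> so much!!!"])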
Example #2
class TextPreprocessor(object):
    """
    Normalizes and tokenizes text.
    """

    def __init__(self, ulmfit_preprocessing=True, lang="en"):
        self._spacy_tagger = spacy.load(lang, disable=["parser", "ner"])
        self._spacy_tagger.add_pipe(self._spacy_tagger.create_pipe("sentencizer"))

        self.ulmfit_preprocessing = ulmfit_preprocessing
        if ulmfit_preprocessing:
            self.ulmfit_tokenizer = ULMFiTTokenizer()
            self.s = SpacyTokenizer(lang)  # used by ulmfit tokenizer

    def tokenize_text(self, text: str):
        text = fixup(text)
        if not self.ulmfit_preprocessing:  # standard preprocessing
            text = lowercase_and_remove_accent(text)
            text = remove_non_printing_char(text)
            text = replace_unicode_punct(text)

        else:  # ULMFiT-specific preprocessing
            text = " ".join(self.ulmfit_tokenizer.process_text(text, self.s))

        return [[str(t) for t in sent] for sent in self._spacy_tagger(text).sents]
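A hedged usage sketch of the class above; fixup, lowercase_and_remove_accent, remove_non_printing_char, replace_unicode_punct, ULMFiTTokenizer and SpacyTokenizer are project-specific helpers assumed to be importable, and the spaCy model for the chosen language must be installed:

# Standard (non-ULMFiT) path of tokenize_text: clean-up, then spaCy sentence
# splitting and tokenization.
pre = TextPreprocessor(ulmfit_preprocessing=False, lang="en")
sentences = pre.tokenize_text("Hello world. This is a SECOND sentence.")
# Returns one list of token strings per detected sentence, roughly:
# [['hello', 'world', '.'], ['this', 'is', 'a', 'second', 'sentence', '.']]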
Example #3
    def fit(self):

        self.create_data()

        tokenizer = Tokenizer(lang='xx')
        data_lm = fastai.text.data.TextLMDataBunch.from_df(
            self.path,
            tokenizer=tokenizer,
            bs=16,
            train_df=self.df_train,
            valid_df=self.df_val,
            text_cols=0)
        print('batches formed')

        data_test_clas = fastai.text.data.TextClasDataBunch.from_df(
            self.path,
            vocab=data_lm.train_ds.vocab,
            bs=32,
            train_df=self.df_train,
            valid_df=self.df_val,
            text_cols=0,
            label_cols=1,
            tokenizer=tokenizer)

        config = fastai.text.models.awd_lstm_clas_config.copy()
        config['n_hid'] = 1150
        self.learn_test = text_classifier_learner(data_test_clas,
                                                  AWD_LSTM,
                                                  config=config,
                                                  drop_mult=0.5)

        self.learn_test.load_encoder('/home/victor/fb/ft_enc')
        self.learn_test.load('/home/victor/fb/tw_lstm')

        print('model learned')
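After fit has loaded the fine-tuned encoder and classifier weights, predictions go through the standard fastai v1 Learner API. The class that owns fit() is not shown in this example, so the name below is a placeholder:

# Hypothetical driver around the fit() method above.
model = TweetClassifier()          # placeholder for the class defining fit()
model.fit()

# Learner.predict returns (predicted category, class index, probability tensor).
category, _, probs = model.learn_test.predict("some text to classify")
print(category, probs)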
Example #4
def get_sentencepiece(path: PathOrStr,
                      trn_path: Path,
                      name: str,
                      pre_rules: ListRules = None,
                      post_rules: ListRules = None,
                      vocab_size: int = 30000,
                      model_type: str = 'unigram',
                      input_sentence_size: int = 1E7,
                      pad_idx: int = PAD_TOKEN_ID):
    try:
        import sentencepiece as spm
    except ImportError:
        raise Exception(
            'sentencepiece module is missing: run `pip install sentencepiece`')

    path = pathlib.Path(path)
    cache_name = 'tmp'
    os.makedirs(path / cache_name, exist_ok=True)
    os.makedirs(path / 'models', exist_ok=True)
    pre_rules = pre_rules if pre_rules is not None else []
    post_rules = post_rules if post_rules is not None else []

    # load the text from the train tokens file
    text = [line.rstrip('\n') for line in open(trn_path)]
    text = list(filter(None, text))

    if not os.path.isfile(path / 'models' / 'spm.model') or not os.path.isfile(
            path / 'models' / f'itos_{name}.pkl'):
        raw_text = reduce(lambda t, rule: rule(t), pre_rules, '\n'.join(text))
        raw_text_path = path / cache_name / 'all_text.txt'
        with open(raw_text_path, 'w') as f:
            f.write(raw_text)

        sp_params = f"--input={raw_text_path} --pad_id={pad_idx} --unk_id=0 " \
                    f"--character_coverage=1.0 --bos_id=-1 --eos_id=-1 " \
                    f"--input_sentence_size={int(input_sentence_size)} " \
                    f"--model_prefix={path / 'models' / 'spm'} " \
                    f"--vocab_size={vocab_size} --model_type={model_type} "
        spm.SentencePieceTrainer.Train(sp_params)

        with open(path / 'models' / 'spm.vocab', 'r') as f:
            vocab = [line.split('\t')[0] for line in f.readlines()]
            vocab[0] = UNK
            vocab[pad_idx] = PAD

        pickle.dump(vocab, open(path / 'models' / f'itos_{name}.pkl', 'wb'))
    # todo add post rules
    vocab = Vocab(pickle.load(open(path / 'models' / f'itos_{name}.pkl',
                                   'rb')))
    # We cannot use lambdas or local methods here, since `tok_func` needs to be
    # pickle-able in order to be called in subprocesses when multithread tokenizing
    tokenizer = Tokenizer(tok_func=SentencepieceTokenizer,
                          lang=str(path / 'models'),
                          pre_rules=pre_rules,
                          post_rules=post_rules)

    clear_cache_directory(path, cache_name)

    return {'tokenizer': tokenizer, 'vocab': vocab}
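An illustrative call, assuming trn_path points at a plain-text file with one document per line; the directory and name below are placeholders:

from pathlib import Path

# Trains (or reuses) a SentencePiece model under <path>/models/ and returns a
# fastai Tokenizer together with the matching Vocab.
artifacts = get_sentencepiece(path='data',
                              trn_path=Path('data/train_tokens.txt'),
                              name='wiki',
                              vocab_size=30000,
                              model_type='unigram')
tokenizer, vocab = artifacts['tokenizer'], artifacts['vocab']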
Example #5
    def __init__(self, ulmfit_preprocessing=True, lang="en"):
        self._spacy_tagger = spacy.load(lang, disable=["parser", "ner"])
        self._spacy_tagger.add_pipe(self._spacy_tagger.create_pipe("sentencizer"))

        self.ulmfit_preprocessing = ulmfit_preprocessing
        if ulmfit_preprocessing:
            self.ulmfit_tokenizer = ULMFiTTokenizer()
            self.s = SpacyTokenizer(lang)  # used by ulmfit tokenizer
Example #6
def get_processors_for_clas(vocab):
    tokenizer = Tokenizer(post_rules=[
        replace_all_caps, deal_caps, DashInsights.limit_tokens
    ])
    return [
        TokenizeProcessor(tokenizer=tokenizer),
        NumericalizeProcessor(vocab=vocab)
    ]
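A hedged sketch of how these processors might feed a fastai v1 data pipeline; DashInsights.limit_tokens comes from the surrounding project, and df, path and data_lm are placeholders standing in for a labelled DataFrame, a working directory and a language-model DataBunch (as in Example #3):

# Tokenize first, then numericalize with the shared vocabulary so the
# classifier reuses the language model's token ids.
processors = get_processors_for_clas(data_lm.train_ds.vocab)
data_clas = (TextList.from_df(df, path, cols='text', processor=processors)
             .split_by_rand_pct(0.1)
             .label_from_df(cols='label')
             .databunch(bs=32))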
Example #7
class TextTokenizer:
    def __init__(self, lang: str = 'en'):
        default_post_rules = [replace_all_caps, deal_caps]
        default_pre_rules = [
            fix_html, replace_rep, replace_wrep, spec_add_spaces,
            rm_useless_spaces
        ]
        self.tok = Tokenizer(lang=lang,
                             pre_rules=default_pre_rules,
                             post_rules=default_post_rules)

    def process_all(self, texts: list):
        return self.tok.process_all(texts)