class TextPreprocessor(object):
    """Normalizes and tokenizes text."""

    def __init__(self, ulmfit_preprocessing=True, lang="en"):
        self._spacy_tagger = spacy.load(lang, disable=["parser", "ner"])
        self._spacy_tagger.add_pipe(self._spacy_tagger.create_pipe("sentencizer"))
        self.ulmfit_preprocessing = ulmfit_preprocessing
        if ulmfit_preprocessing:
            self.ulmfit_tokenizer = ULMFiTTokenizer()
            self.s = SpacyTokenizer(lang)  # used by the ULMFiT tokenizer

    def tokenize_text(self, text: str):
        text = fixup(text)
        if not self.ulmfit_preprocessing:
            # standard preprocessing
            text = lowercase_and_remove_accent(text)
            text = remove_non_printing_char(text)
            text = replace_unicode_punct(text)
        else:
            # ULMFiT-specific preprocessing
            text = " ".join(self.ulmfit_tokenizer.process_text(text, self.s))
        return [[str(t) for t in sent] for sent in self._spacy_tagger(text).sents]
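# Hedged usage sketch for TextPreprocessor. It assumes the project-specific
# helpers it calls (fixup, lowercase_and_remove_accent, remove_non_printing_char,
# replace_unicode_punct, ULMFiTTokenizer) are importable, that SpacyTokenizer
# comes from fastai v1 (fastai.text), and that the spaCy English model is
# installed (python -m spacy download en).
import spacy
from fastai.text import SpacyTokenizer

preprocessor = TextPreprocessor(ulmfit_preprocessing=True, lang="en")
sentences = preprocessor.tokenize_text("First sentence here. And a second one!")
# `sentences` is a list of sentences, each a list of string tokens.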
def fit(self):
    self.create_data()
    tokenizer = Tokenizer(lang='xx')
    data_lm = fastai.text.data.TextLMDataBunch.from_df(
        self.path, tokenizer=tokenizer, bs=16,
        train_df=self.df_train, valid_df=self.df_val, text_cols=0)
    print('batches formed')
    data_test_clas = fastai.text.data.TextClasDataBunch.from_df(
        self.path, vocab=data_lm.train_ds.vocab, bs=32,
        train_df=self.df_train, valid_df=self.df_val,
        text_cols=0, label_cols=1, tokenizer=tokenizer)
    config = fastai.text.models.awd_lstm_clas_config.copy()
    config['n_hid'] = 1150
    self.learn_test = text_classifier_learner(data_test_clas, AWD_LSTM,
                                              config=config, drop_mult=0.5)
    self.learn_test.load_encoder('/home/victor/fb/ft_enc')
    self.learn_test.load('/home/victor/fb/tw_lstm')
    print('model learned')
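# Hedged note on what fit() above relies on: the fastai v1 names resolve as
# below, while self.path, self.df_train, self.df_val, create_data() and the
# weight paths passed to load_encoder()/load() come from the surrounding class
# and from a previous fine-tuning run.
import fastai.text
from fastai.text import AWD_LSTM, Tokenizer, text_classifier_learner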
def get_sentencepiece(path: PathOrStr, trn_path: Path, name: str,
                      pre_rules: ListRules = None, post_rules: ListRules = None,
                      vocab_size: int = 30000, model_type: str = 'unigram',
                      input_sentence_size: int = 1E7, pad_idx: int = PAD_TOKEN_ID):
    try:
        import sentencepiece as spm
    except ImportError:
        raise Exception(
            'sentencepiece module is missing: run `pip install sentencepiece`')

    path = pathlib.Path(path)
    cache_name = 'tmp'
    os.makedirs(path / cache_name, exist_ok=True)
    os.makedirs(path / 'models', exist_ok=True)
    pre_rules = pre_rules if pre_rules is not None else []
    post_rules = post_rules if post_rules is not None else []

    # load the text from the train tokens file
    text = [line.rstrip('\n') for line in open(trn_path)]
    text = list(filter(None, text))

    if not os.path.isfile(path / 'models' / 'spm.model') or not os.path.isfile(
            path / 'models' / f'itos_{name}.pkl'):
        raw_text = reduce(lambda t, rule: rule(t), pre_rules, '\n'.join(text))
        raw_text_path = path / cache_name / 'all_text.txt'
        with open(raw_text_path, 'w') as f:
            f.write(raw_text)

        sp_params = f"--input={raw_text_path} --pad_id={pad_idx} --unk_id=0 " \
                    f"--character_coverage=1.0 --bos_id=-1 --eos_id=-1 " \
                    f"--input_sentence_size={int(input_sentence_size)} " \
                    f"--model_prefix={path / 'models' / 'spm'} " \
                    f"--vocab_size={vocab_size} --model_type={model_type} "
        spm.SentencePieceTrainer.Train(sp_params)

        with open(path / 'models' / 'spm.vocab', 'r') as f:
            vocab = [line.split('\t')[0] for line in f.readlines()]
        vocab[0] = UNK
        vocab[pad_idx] = PAD
        pickle.dump(vocab, open(path / 'models' / f'itos_{name}.pkl', 'wb'))
        # todo add post rules

    vocab = Vocab(pickle.load(open(path / 'models' / f'itos_{name}.pkl', 'rb')))
    # We cannot use lambdas or local methods here, since `tok_func` needs to be
    # pickle-able in order to be called in subprocesses when multithread tokenizing
    tokenizer = Tokenizer(tok_func=SentencepieceTokenizer, lang=str(path / 'models'),
                          pre_rules=pre_rules, post_rules=post_rules)
    clear_cache_directory(path, cache_name)
    return {'tokenizer': tokenizer, 'vocab': vocab}
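# Hedged usage sketch. Standard-library pieces (os, pathlib, pickle,
# functools.reduce) and fastai v1 names (Tokenizer, Vocab, UNK, PAD) are
# importable as shown; SentencepieceTokenizer, clear_cache_directory,
# PAD_TOKEN_ID, PathOrStr and ListRules are assumed to be defined elsewhere in
# this project or by fastai. The paths and the corpus name are placeholders.
import os, pathlib, pickle
from functools import reduce
from fastai.text import Tokenizer, Vocab, UNK, PAD

artifacts = get_sentencepiece(path='data/wiki',
                              trn_path=pathlib.Path('data/wiki/train.txt'),
                              name='wiki')
sp_tokenizer, sp_vocab = artifacts['tokenizer'], artifacts['vocab']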
def get_processors_for_clas(vocab):
    tokenizer = Tokenizer(post_rules=[
        replace_all_caps, deal_caps, DashInsights.limit_tokens
    ])
    return [
        TokenizeProcessor(tokenizer=tokenizer),
        NumericalizeProcessor(vocab=vocab)
    ]
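# Hedged wiring sketch: these processors plug into the fastai v1 data-block API
# through the `processor` argument. DashInsights.limit_tokens is project-specific
# and must be importable; the tiny DataFrame and vocab below are illustrative only.
import pandas as pd
from fastai.text import TextList, Vocab

df = pd.DataFrame({'text': ['great movie', 'bad movie'], 'label': [1, 0]})
vocab = Vocab(['xxunk', 'xxpad', 'great', 'bad', 'movie'])
data_clas = (TextList.from_df(df, path='.', cols='text',
                              processor=get_processors_for_clas(vocab))
             .split_by_rand_pct(0.5)
             .label_from_df(cols='label')
             .databunch(bs=2))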
class TextTokenizer:
    def __init__(self, lang: str = 'en'):
        default_post_rules = [replace_all_caps, deal_caps]
        default_pre_rules = [
            fix_html, replace_rep, replace_wrep, spec_add_spaces, rm_useless_spaces
        ]
        self.tok = Tokenizer(lang=lang, pre_rules=default_pre_rules,
                             post_rules=default_post_rules)

    def process_all(self, texts: list):
        return self.tok.process_all(texts)
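# Minimal usage sketch, assuming fastai v1: Tokenizer and the pre/post rules
# referenced in TextTokenizer come from fastai.text, and the spaCy English model
# is installed for the default SpacyTokenizer backend.
from fastai.text import (Tokenizer, fix_html, replace_rep, replace_wrep,
                         spec_add_spaces, rm_useless_spaces,
                         replace_all_caps, deal_caps)

tok = TextTokenizer(lang='en')
tokens = tok.process_all(["I LOVED this movie <br /> sooo much!!"])
# -> one token list per input text, with markers such as xxup / xxmaj inserted
# by the post rules.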