def symspell(
    max_edit_distance_dictionary: int = 2,
    prefix_length: int = 7,
    term_index: int = 0,
    count_index: int = 1,
    top_k: int = 10,
    **kwargs
):
    """
    Load a symspell Spell Corrector for Malay.

    Parameters
    ----------
    max_edit_distance_dictionary: int, optional (default=2)
        first positional argument given to the `SymSpell` constructor.
    prefix_length: int, optional (default=7)
        second positional argument given to the `SymSpell` constructor.
    term_index: int, optional (default=0)
        forwarded to `SymSpell.load_dictionary`.
    count_index: int, optional (default=1)
        forwarded to `SymSpell.load_dictionary`.
    top_k: int, optional (default=10)
        forwarded to `Symspell` as `k`.
    **kwargs:
        forwarded to `check_file`.

    Returns
    -------
    result: malaya.spell.Symspell class

    Raises
    ------
    ModuleNotFoundError
        if `symspellpy` cannot be imported.
    """
    # Only import failures are translated; KeyboardInterrupt / SystemExit
    # must not be swallowed by this guard.
    try:
        from symspellpy.symspellpy import SymSpell, Verbosity
    except ImportError as e:
        raise ModuleNotFoundError(
            'symspellpy not installed. Please install it and try again.'
        ) from e

    dictionary_path = check_file(
        PATH_NGRAM['symspell'], S3_PATH_NGRAM['symspell'], **kwargs
    )
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    sym_spell.load_dictionary(dictionary_path['model'], term_index, count_index)

    corpus_path = check_file(PATH_NGRAM[1], S3_PATH_NGRAM[1], **kwargs)
    with open(corpus_path['model']) as fopen:
        corpus = json.load(fopen)
    return Symspell(sym_spell, Verbosity.ALL, corpus, k=top_k)
def fasttext(quantization: bool = True, **kwargs):
    """
    Load Fasttext language detection model.

    Parameters
    ----------
    quantization: bool, optional (default=True)
        if True, load quantized fasttext model. Else, load original fasttext model.
    **kwargs:
        forwarded to `check_file`.

    Returns
    -------
    result : malaya.model.ml.LANGUAGE_DETECTION class

    Raises
    ------
    ModuleNotFoundError
        if the `fasttext` package cannot be imported.
    ValueError
        if `fasttext.load_model` fails on the model file.
    """
    # Inside this function `fasttext` refers to the imported package, not to
    # this loader.
    try:
        import fasttext
    except ImportError as e:
        raise ModuleNotFoundError(
            'fasttext not installed. Please install it by `pip install fasttext` and try again.'
        ) from e

    model = 'fasttext-quantized' if quantization else 'fasttext-original'
    check_file(PATH_LANG_DETECTION[model], S3_PATH_LANG_DETECTION[model], **kwargs)

    try:
        model_fasttext = fasttext.load_model(PATH_LANG_DETECTION[model]['model'])
    except Exception as e:
        # Keep the original cause attached so the real failure stays visible.
        raise ValueError(
            f"model corrupted due to some reasons, please run malaya.clear_cache('language-detection/{model}') and try again"
        ) from e
    return LANGUAGE_DETECTION(model_fasttext, lang_labels)
def multinomial(path, s3_path, class_name, label, **kwargs):
    """
    Load a multinomial classifier together with its vectorizer and BPE encoder.

    Parameters
    ----------
    path: dict
        local paths; `path['multinomial']` must hold `model`, `vector` and `bpe`.
    s3_path: dict
        remote paths; `s3_path['multinomial']` is handed to `check_file`.
    class_name: str
        only used to build the `malaya.clear_cache` hint in the error message.
    label: list
        class labels; more than two selects `MULTICLASS_BAYES`, else `BINARY_BAYES`.
    **kwargs:
        forwarded to `check_file`.

    Returns
    -------
    result: MULTICLASS_BAYES or BINARY_BAYES instance

    Raises
    ------
    Exception
        if the pickled model or vectorizer cannot be loaded.
    """
    files = path['multinomial']
    check_file(files, s3_path['multinomial'], **kwargs)

    # NOTE: pickle executes arbitrary code on load; these files must come
    # from a trusted source.
    try:
        with open(files['model'], 'rb') as fopen:
            model = pickle.load(fopen)
        with open(files['vector'], 'rb') as fopen:
            vectorize = pickle.load(fopen)
    except Exception as e:
        raise Exception(
            f"model corrupted due to some reasons, please run malaya.clear_cache('{class_name}/multinomial') and try again"
        ) from e

    bpe, subword_mode = load_yttm(files['bpe'])

    from malaya.stem import _classification_textcleaning_stemmer

    selected_class = MULTICLASS_BAYES if len(label) > 2 else BINARY_BAYES
    return selected_class(
        multinomial=model,
        label=label,
        vectorize=vectorize,
        bpe=bpe,
        subword_mode=subword_mode,
        cleaning=_classification_textcleaning_stemmer,
    )
def segmenter(max_split_length: int = 20, validate: bool = True):
    """
    Load Segmenter class.

    Parameters
    ----------
    max_split_length: int, (default=20)
        max length of words in a sentence to segment
    validate: bool, optional (default=True)
        if True, malaya will check model availability and download if not available.

    Returns
    -------
    _Segmenter : malaya.preprocessing._Segmenter class
    """
    # Two preprocessing resources (keys 1 and 2) are required; handle them in
    # that order with the same validate / availability rule.
    for key in (1, 2):
        if validate:
            check_file(PATH_PREPROCESSING[key], S3_PATH_PREPROCESSING[key])
        elif not check_available(PATH_PREPROCESSING[key]):
            raise Exception(
                'preprocessing is not available, please `validate = True`'
            )
    return _Segmenter(max_split_length=max_split_length)
def load_lm(path, s3_path, model, model_class, quantized=False, **kwargs):
    """
    Load a frozen language-model graph and wrap it in `model_class`.

    Parameters
    ----------
    path: dict
        local paths; `path[model]` must hold `model` / `quantized` and `vocab`.
    s3_path: dict
        remote paths; `s3_path[model]` is handed to `check_file`.
    model: str
        key selecting the entry inside `path` and `s3_path`.
    model_class: callable
        called with the graph tensors, the session and the tokenizer.
    quantized: bool, optional (default=False)
        if True, the `quantized` graph file is loaded instead of `model`.
    **kwargs:
        forwarded to `check_file`, `load_graph` and `generate_session`.

    Returns
    -------
    result: whatever `model_class` returns
    """
    check_file(path[model], s3_path[model], quantized=quantized, **kwargs)

    files = path[model]
    graph = load_graph(files['quantized' if quantized else 'model'], **kwargs)

    # Keyword name expected by `model_class` -> tensor name inside the graph.
    tensor_names = {
        'X': 'import/Placeholder:0',
        'top_p': 'import/Placeholder_2:0',
        'greedy': 'import/greedy:0',
        'beam': 'import/beam:0',
        'nucleus': 'import/nucleus:0',
    }
    tensors = {
        keyword: graph.get_tensor_by_name(name)
        for keyword, name in tensor_names.items()
    }
    tokenizer = SentencePieceEncoder(files['vocab'])

    return model_class(
        **tensors,
        sess=generate_session(graph=graph, **kwargs),
        tokenizer=tokenizer,
    )
def transformer(model, sentence_piece: bool = False, **kwargs):
    """
    Load a Transformer Spell Corrector. Right now only supported BERT and ALBERT.

    Parameters
    ----------
    model: object
        must expose a `_log_vectorize` attribute.
    sentence_piece: bool, optional (default=False)
        if True, a sentence piece tokenizer is loaded and given to the corrector.
    **kwargs:
        forwarded to `check_file`.

    Returns
    -------
    _TransformerCorrector: malaya.spell._TransformerCorrector class

    Raises
    ------
    ValueError
        if `model` has no `_log_vectorize` attribute.
    """
    if not hasattr(model, '_log_vectorize'):
        raise ValueError('model must has `_log_vectorize` method')

    check_file(PATH_NGRAM[1], S3_PATH_NGRAM[1], **kwargs)

    if sentence_piece:
        check_file(
            PATH_NGRAM['sentencepiece'], S3_PATH_NGRAM['sentencepiece'], **kwargs
        )
        sp_files = PATH_NGRAM['sentencepiece']
        # NOTE(review): other loaders in this file call
        # `load_sentencepiece(vocab_model, vocab)`; here the vocab comes
        # first. Confirm the argument order against the helper's signature.
        tokenizer = load_sentencepiece(sp_files['vocab'], sp_files['model'])
    else:
        tokenizer = None

    with open(PATH_NGRAM[1]['model']) as fopen:
        corpus = json.load(fopen)
    return _TransformerCorrector(model, corpus, tokenizer)
def multinomial(path, s3_path, class_name, label, sigmoid=False, **kwargs):
    """
    Load a multinomial classifier together with its vectorizer and BPE encoder.

    Parameters
    ----------
    path: dict
        local paths; `path['multinomial']` must hold `model`, `vector` and `bpe`.
    s3_path: dict
        remote paths; `s3_path['multinomial']` is handed to `check_file`.
    class_name: str
        only used to build the `malaya.clear_cache` hint in the error message.
    label: list
        class labels; when `sigmoid` is False, more than two labels selects
        `MulticlassBayes`, otherwise `BinaryBayes`.
    sigmoid: bool, optional (default=False)
        if True, `MultilabelBayes` is selected regardless of `label`.
    **kwargs:
        forwarded to `check_file`.

    Returns
    -------
    result: MultilabelBayes, MulticlassBayes or BinaryBayes instance

    Raises
    ------
    Exception
        if the pickled model or vectorizer cannot be loaded.
    """
    files = path['multinomial']
    check_file(files, s3_path['multinomial'], **kwargs)

    # NOTE: pickle executes arbitrary code on load; these files must come
    # from a trusted source.
    try:
        with open(files['model'], 'rb') as fopen:
            model = pickle.load(fopen)
        with open(files['vector'], 'rb') as fopen:
            vectorize = pickle.load(fopen)
    except Exception as e:
        raise Exception(
            f"model corrupted due to some reasons, please run `malaya.clear_cache('{class_name}/multinomial')` and try again"
        ) from e

    bpe, subword_mode = load_yttm(files['bpe'])

    stemmer = naive()
    cleaning = partial(_classification_textcleaning_stemmer, stemmer=stemmer)

    if sigmoid:
        selected_class = MultilabelBayes
    elif len(label) > 2:
        selected_class = MulticlassBayes
    else:
        selected_class = BinaryBayes

    return selected_class(
        multinomial=model,
        label=label,
        vectorize=vectorize,
        bpe=bpe,
        subword_mode=subword_mode,
        cleaning=cleaning,
    )
def transformer(model, sentence_piece: bool = False, **kwargs):
    """
    Load a Transformer Spell Corrector. Right now only supported BERT and ALBERT.

    Parameters
    ----------
    model: object
        must expose a `_log_vectorize` attribute.
    sentence_piece: bool, optional (default=False)
        if True, reduce possible augmentation states using sentence piece.
    **kwargs:
        forwarded to `check_file`.

    Returns
    -------
    result: malaya.spell.TRANSFORMER class

    Raises
    ------
    ValueError
        if `model` has no `_log_vectorize` attribute.
    """
    if not hasattr(model, '_log_vectorize'):
        raise ValueError('model must has `_log_vectorize` method')

    check_file(PATH_NGRAM[1], S3_PATH_NGRAM[1], **kwargs)

    if sentence_piece:
        check_file(
            PATH_NGRAM['sentencepiece'], S3_PATH_NGRAM['sentencepiece'], **kwargs
        )
        sp_files = PATH_NGRAM['sentencepiece']
        vocab, vocab_model = sp_files['vocab'], sp_files['model']
        tokenizer = load_sentencepiece(vocab_model, vocab)
    else:
        tokenizer = None

    with open(PATH_NGRAM[1]['model']) as fopen:
        corpus = json.load(fopen)
    return TRANSFORMER(model, corpus, tokenizer)
def fasttext(quantized: bool = True, **kwargs):
    """
    Load Fasttext language detection model.
    Original size is 353MB, Quantized size 31.1MB.

    Parameters
    ----------
    quantized: bool, optional (default=True)
        if True, load quantized fasttext model. Else, load original fasttext model.
    **kwargs:
        forwarded to `check_file`.

    Returns
    -------
    result : malaya.model.ml.LanguageDetection class

    Raises
    ------
    ModuleNotFoundError
        if the `fasttext` package cannot be imported.
    """
    # Only import failures are translated; KeyboardInterrupt / SystemExit
    # must not be swallowed by this guard. Inside this function `fasttext`
    # refers to the imported package, not to this loader.
    try:
        import fasttext
    except ImportError as e:
        raise ModuleNotFoundError(
            'fasttext not installed. Please install it by `pip install fasttext` and try again.'
        ) from e

    model = 'fasttext-quantized' if quantized else 'fasttext-original'
    check_file(
        PATH_LANG_DETECTION[model], S3_PATH_LANG_DETECTION[model], **kwargs
    )

    model_fasttext = fasttext.load_model(PATH_LANG_DETECTION[model]['model'])
    return LanguageDetection(model_fasttext, lang_labels)
def multinomial(path, s3_path, module, label, sigmoid=False, **kwargs):
    """
    Load a multinomial classifier together with its vectorizer and BPE encoder.

    Parameters
    ----------
    path: dict
        local paths; `path['multinomial']` must hold `model`, `vector` and `bpe`.
    s3_path: dict
        remote paths; `s3_path['multinomial']` is handed to `check_file`.
    module: str
        only used to build the `malaya.clear_cache` hint in the error message.
    label: list
        class labels; when `sigmoid` is False, more than two labels selects
        `MulticlassBayes`, otherwise `BinaryBayes`.
    sigmoid: bool, optional (default=False)
        if True, `MultilabelBayes` is selected regardless of `label`.
    **kwargs:
        forwarded to `check_file`.

    Returns
    -------
    result: MultilabelBayes, MulticlassBayes or BinaryBayes instance

    Raises
    ------
    Exception
        if the pickled model or vectorizer cannot be loaded.
    """
    files = path['multinomial']
    check_file(files, s3_path['multinomial'], **kwargs)

    # NOTE: pickle executes arbitrary code on load; these files must come
    # from a trusted source.
    try:
        with open(files['model'], 'rb') as fopen:
            model = pickle.load(fopen)
        with open(files['vector'], 'rb') as fopen:
            vectorize = pickle.load(fopen)
    except Exception as e:
        # Use a dedicated name: the `path` parameter must stay a dict.
        cache_key = os.path.normpath(f'{module}/multinomial')
        raise Exception(
            f"model corrupted due to some reasons, please run `malaya.clear_cache('{cache_key}')` and try again"
        ) from e

    bpe = YTTMEncoder(vocab_file=files['bpe'])

    stemmer = naive()
    cleaning = partial(_classification_textcleaning_stemmer, stemmer=stemmer)

    if sigmoid:
        selected_model = MultilabelBayes
    elif len(label) > 2:
        selected_model = MulticlassBayes
    else:
        selected_model = BinaryBayes

    return selected_model(
        multinomial=model,
        label=label,
        vectorize=vectorize,
        bpe=bpe,
        cleaning=cleaning,
    )
def symspell(
    max_edit_distance_dictionary: int = 2,
    prefix_length: int = 7,
    term_index: int = 0,
    count_index: int = 1,
    top_k: int = 10,
    **kwargs
):
    """
    Load a symspell Spell Corrector.

    Parameters
    ----------
    max_edit_distance_dictionary: int, optional (default=2)
        first positional argument given to the `SymSpell` constructor.
    prefix_length: int, optional (default=7)
        second positional argument given to the `SymSpell` constructor.
    term_index: int, optional (default=0)
        forwarded to `SymSpell.load_dictionary`.
    count_index: int, optional (default=1)
        forwarded to `SymSpell.load_dictionary`.
    top_k: int, optional (default=10)
        forwarded to `SYMSPELL` as `k`.
    **kwargs:
        forwarded to `check_file`.

    Returns
    -------
    result: malaya.spell.SYMSPELL class

    Raises
    ------
    Exception
        if `symspellpy` cannot be imported.
    """
    # Check the optional dependency first so a missing package fails fast,
    # before `check_file` is asked to validate the resources.
    try:
        from symspellpy.symspellpy import SymSpell, Verbosity
    except ImportError as e:
        raise Exception(
            'symspellpy not installed. Please install it and try again.'
        ) from e

    check_file(PATH_NGRAM['symspell'], S3_PATH_NGRAM['symspell'], **kwargs)
    check_file(PATH_NGRAM[1], S3_PATH_NGRAM[1], **kwargs)

    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    dictionary_path = PATH_NGRAM['symspell']['model']
    sym_spell.load_dictionary(dictionary_path, term_index, count_index)

    with open(PATH_NGRAM[1]['model']) as fopen:
        corpus = json.load(fopen)
    return SYMSPELL(sym_spell, Verbosity.ALL, corpus, k=top_k)
def load(model: str = 'wikipedia', **kwargs):
    """
    Load pretrained word vectors for the requested corpus.

    Parameters
    ----------
    model : str, optional (default='wikipedia')
        Model architecture supported. Allowed values:

        * ``'wikipedia'`` - pretrained on Malay wikipedia word2vec size 256.
        * ``'socialmedia'`` - pretrained on cleaned Malay twitter and Malay instagram size 256.
        * ``'news'`` - pretrained on cleaned Malay news size 256.
        * ``'combine'`` - pretrained on cleaned Malay news + Malay social media + Malay wikipedia size 256.
    **kwargs:
        forwarded to `check_file`.

    Returns
    -------
    vocabulary: indices dictionary for `vector`.
    vector: np.array, 2D.

    Raises
    ------
    ValueError
        if `model` (lower-cased) is not in `_wordvector_availability`.
    """
    model = model.lower()
    if model not in _wordvector_availability:
        raise ValueError(
            'model not supported, please check supported models from `malaya.wordvector.available_wordvector()`.'
        )

    files = PATH_WORDVECTOR[model]
    check_file(files, S3_PATH_WORDVECTOR[model], **kwargs)
    return _load(files['vocab'], files['model'])
def load(path, s3_path, model, encoder, model_class, quantized=False, **kwargs):
    """
    Load a frozen graph plus its text encoder and wrap them in `model_class`.

    Parameters
    ----------
    path: dict
        local paths; `path[model]` must hold `model` / `quantized` and `vocab`.
    s3_path: dict
        remote paths; `s3_path[model]` is handed to `check_file`.
    model: str
        key selecting the entry inside `path` and `s3_path`.
    encoder: str or object
        ``'subword'`` builds a `SubwordTextEncoder`, ``'yttm'`` builds a
        `YTTMEncoder`; any other value is passed through unchanged.
    model_class: callable
        called with the graph tensors, the session and the encoder.
    quantized: bool, optional (default=False)
        if True, the `quantized` graph file is loaded instead of `model`.
    **kwargs:
        forwarded to `check_file`, `load_graph` and `generate_session`.

    Returns
    -------
    result: whatever `model_class` returns
    """
    check_file(path[model], s3_path[model], quantized=quantized, **kwargs)

    files = path[model]
    graph = load_graph(files['quantized' if quantized else 'model'], **kwargs)

    if encoder == 'subword':
        encoder = text_encoder.SubwordTextEncoder(files['vocab'])
    if encoder == 'yttm':
        bpe, subword_mode = load_yttm(files['vocab'], True)
        encoder = YTTMEncoder(bpe, subword_mode)

    def tensor(name):
        return graph.get_tensor_by_name(name)

    return model_class(
        X=tensor('import/Placeholder:0'),
        greedy=tensor('import/greedy:0'),
        beam=tensor('import/beam:0'),
        sess=generate_session(graph=graph, **kwargs),
        encoder=encoder,
    )
def _transformer(model, bert_class, xlnet_class, quantized=False, siamese=False, **kwargs):
    """
    Load a similarity transformer graph and wrap it in the matching class.

    Parameters
    ----------
    model: str
        model name; lower-cased and checked against `_transformer_availability`.
    bert_class: callable
        wrapper used for the bert / albert family.
    xlnet_class: callable
        wrapper used for the xlnet / alxlnet family.
    quantized: bool, optional (default=False)
        if True, the `quantized` graph file is loaded instead of `model`.
    siamese: bool, optional (default=False)
        if True, a fixed per-family vectorizer node is used; otherwise the
        node comes from `_vectorizer_mapping[model]`.
    **kwargs:
        forwarded to `check_file`, `load_graph` and `generate_session`.

    Returns
    -------
    result: instance of `bert_class` or `xlnet_class`

    Raises
    ------
    ValueError
        if `model` is not in `_transformer_availability`.
    """
    model = model.lower()
    if model not in _transformer_availability:
        raise ValueError(
            'model not supported, please check supported models from `malaya.similarity.available_transformer()`.'
        )

    check_file(
        PATH_SIMILARITY[model],
        S3_PATH_SIMILARITY[model],
        quantized=quantized,
        **kwargs
    )
    files = PATH_SIMILARITY[model]
    g = load_graph(files['quantized' if quantized else 'model'], **kwargs)

    is_bert = model in ('bert', 'tiny-bert')
    is_albert = model in ('albert', 'tiny-albert')

    if is_bert or is_albert:
        if is_bert:
            tokenizer = sentencepiece_tokenizer_bert(
                files['tokenizer'], files['vocab']
            )
        if is_albert:
            tokenizer = tokenization.FullTokenizer(
                vocab_file=files['vocab'],
                do_lower_case=False,
                spm_model_file=files['tokenizer'],
            )
        selected_class = bert_class
        if siamese:
            selected_node = 'import/bert/pooler/dense/BiasAdd:0'

    if model in ('xlnet', 'alxlnet'):
        tokenizer = sentencepiece_tokenizer_xlnet(files['tokenizer'])
        selected_class = xlnet_class
        if siamese:
            # The spelling below is the tensor name looked up in the graph;
            # reproduced exactly.
            selected_node = 'import/model_1/sequnece_summary/summary/BiasAdd:0'

    if not siamese:
        selected_node = _vectorizer_mapping[model]

    def tensor(name):
        return g.get_tensor_by_name(name)

    return selected_class(
        X=tensor('import/Placeholder:0'),
        segment_ids=tensor('import/Placeholder_1:0'),
        input_masks=tensor('import/Placeholder_2:0'),
        logits=tensor('import/logits:0'),
        vectorizer=tensor(selected_node),
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
        label=['not similar', 'similar'],
    )
def deep_model(**kwargs):
    """
    Load deep learning language detection model.

    Parameters
    ----------
    **kwargs:
        forwarded to `check_file`.

    Returns
    -------
    result : malaya.model.tf.DEEP_LANG class

    Raises
    ------
    ValueError
        if the pickled vectorizer cannot be loaded.
    """
    import os

    from malaya.text.bpe import load_yttm

    files = PATH_LANG_DETECTION['deep']
    check_file(files, S3_PATH_LANG_DETECTION['deep'], **kwargs)

    # NOTE: pickle executes arbitrary code on load; this file must come from
    # a trusted source.
    try:
        with open(files['vector'], 'rb') as fopen:
            vector = pickle.load(fopen)
    except Exception as e:
        raise ValueError(
            "model corrupted due to some reasons, please run malaya.clear_cache('language-detection/deep') and try again"
        ) from e

    bpe, subword_mode = load_yttm(files['bpe'])

    # DEEP_LANG receives the directory that contains the model file.
    return DEEP_LANG(
        os.path.dirname(files['model']),
        vector,
        lang_labels,
        bpe,
        subword_mode,
    )
def probability(sentence_piece: bool = False, **kwargs):
    """
    Load a Probability Spell Corrector.

    Parameters
    ----------
    sentence_piece: bool, optional (default=False)
        if True, reduce possible augmentation states using sentence piece.
    **kwargs:
        forwarded to `check_file`.

    Returns
    -------
    result: malaya.spell.PROBABILITY class
    """
    check_file(PATH_NGRAM[1], S3_PATH_NGRAM[1], **kwargs)

    if sentence_piece:
        check_file(
            PATH_NGRAM['sentencepiece'], S3_PATH_NGRAM['sentencepiece'], **kwargs
        )
        sp_files = PATH_NGRAM['sentencepiece']
        vocab, vocab_model = sp_files['vocab'], sp_files['model']
        tokenizer = load_sentencepiece(vocab_model, vocab)
    else:
        tokenizer = None

    with open(PATH_NGRAM[1]['model']) as fopen:
        corpus = json.load(fopen)
    return PROBABILITY(corpus, tokenizer)
def probability(sentence_piece: bool = False, **kwargs):
    """
    Load a Probability Spell Corrector.

    Parameters
    ----------
    sentence_piece: bool, optional (default=False)
        if True, reduce possible augmentation states using sentence piece.
    **kwargs:
        forwarded to `check_file`.

    Returns
    -------
    result: malaya.spell.Probability class
    """
    tokenizer = None
    if sentence_piece:
        # The leftover debug `print(path)` was removed; a library loader
        # should not write to stdout.
        sp_path = check_file(
            PATH_NGRAM['sentencepiece'], S3_PATH_NGRAM['sentencepiece'], **kwargs
        )
        tokenizer = SentencePieceTokenizer(
            vocab_file=sp_path['vocab'], spm_model_file=sp_path['model']
        )

    corpus_path = check_file(PATH_NGRAM[1], S3_PATH_NGRAM[1], **kwargs)
    with open(corpus_path['model']) as fopen:
        corpus = json.load(fopen)
    return Probability(corpus, tokenizer)
def multinomial(**kwargs):
    """
    Load multinomial toxicity model.

    Parameters
    ----------
    **kwargs:
        forwarded to `check_file`.

    Returns
    -------
    BAYES : malaya._models._sklearn_model.MULTILABEL_BAYES class

    Raises
    ------
    Exception
        if the pickled model or vectorizer cannot be loaded.
    """
    import pickle

    files = PATH_TOXIC['multinomial']
    check_file(files, S3_PATH_TOXIC['multinomial'], **kwargs)

    # NOTE: pickle executes arbitrary code on load; these files must come
    # from a trusted source.
    try:
        with open(files['model'], 'rb') as fopen:
            models = pickle.load(fopen)
        with open(files['vector'], 'rb') as fopen:
            vectorize = pickle.load(fopen)
    except Exception as e:
        raise Exception(
            "model corrupted due to some reasons, please run malaya.clear_cache('toxic/multinomial') and try again"
        ) from e

    from .stem import _classification_textcleaning_stemmer

    return MULTILABEL_BAYES(
        models=models,
        vectors=vectorize,
        cleaning=_classification_textcleaning_stemmer,
    )
def synonym(
    string: str,
    threshold: float = 0.5,
    top_n=5,
    cleaning=augmentation_textcleaning,
    **kwargs
):
    """
    augmenting a string using synonym, https://github.com/huseinzol05/Malaya-Dataset#90k-synonym

    Parameters
    ----------
    string: str
    threshold: float, optional (default=0.5)
        random selection for a word.
    top_n: int, (default=5)
        number of augmented strings returned.
    cleaning: function, (default=malaya.text.function.augmentation_textcleaning)
        function to clean text.
    **kwargs:
        forwarded to `check_file` the first time the synonym table is built.

    Returns
    -------
    result: List[str]

    Raises
    ------
    ValueError
        if `cleaning` is neither callable nor None.
    """
    if cleaning is not None and not isinstance(cleaning, Callable):
        raise ValueError('cleaning must be a callable type or None')

    global _synonym_dict

    # The synonym table is built once and cached in the module-level
    # `_synonym_dict`.
    if _synonym_dict is None:
        check_file(
            PATH_AUGMENTATION['synonym'], S3_PATH_AUGMENTATION['synonym'], **kwargs
        )
        table = defaultdict(list)
        sources = [
            PATH_AUGMENTATION['synonym']['model'],
            PATH_AUGMENTATION['synonym']['model2'],
        ]
        for source in sources:
            with open(source) as fopen:
                entries = json.load(fopen)
            for entry in entries:
                if not len(entry[1]):
                    continue
                # Register the relation in both directions.
                table[entry[0]].extend(entry[1])
                for related in entry[1]:
                    table[related].append(entry[0])
        # Drop duplicates; only values are replaced, so the key set is stable.
        for word in table:
            table[word] = list(set(table[word]))
        _synonym_dict = table

    original_string = string
    if cleaning:
        string = cleaning(string).split()
    # NOTE(review): when `cleaning` is None, `string` is not split before
    # being passed to `replace` - confirm that is intended.
    # NOTE(review): `original_string` is the raw input here, so
    # `' '.join(original_string)` joins its individual elements - confirm
    # this is what `_make_upper` expects.
    return [
        _make_upper(' '.join(replace(string, threshold)), ' '.join(original_string))
        for _ in range(top_n)
    ]
def t5(model: str = 'base', **kwargs):
    """
    Load T5 model to generate a string given "isu penting" (important issues).

    Parameters
    ----------
    model : str, optional (default='base')
        Model architecture supported. Allowed values:

        * ``'base'`` - T5 Base parameters.
        * ``'small'`` - T5 Small parameters.
    **kwargs:
        forwarded to `check_file`.

    Returns
    -------
    result: malaya.model.t5.GENERATOR class

    Raises
    ------
    Exception
        if `model` is not in `_t5_availability`, or if the tensorflow-text /
        tf-sentencepiece / tensorflow imports fail.
    """
    model = model.lower()
    if model not in _t5_availability:
        raise Exception(
            'model not supported, please check supported models from malaya.generator.available_t5()'
        )

    from malaya.path import PATH_GENERATOR, S3_PATH_GENERATOR

    path = PATH_GENERATOR['sample']
    s3_path = S3_PATH_GENERATOR['sample']

    from malaya.function import check_file

    # `Exception` rather than `ImportError`: the message also covers a
    # tensorflow-text / tensorflow version mismatch, which may not surface
    # as ImportError. KeyboardInterrupt / SystemExit are no longer swallowed.
    try:
        import tensorflow_text
        import tf_sentencepiece
        import tensorflow as tf
    except Exception as e:
        raise Exception(
            'tensorflow-text and tf-sentencepiece not installed. Please install it by `pip install tensorflow-text tf-sentencepiece` and try again. Also, make sure tensorflow-text version same as tensorflow version.'
        ) from e

    files = path[model]
    check_file(files['model'], s3_path[model], **kwargs)

    # Extract the downloaded archive only when the saved model is missing.
    if not os.path.exists(files['directory'] + 'saved_model.pb'):
        import tarfile

        with tarfile.open(files['model']['model']) as tar:
            tar.extractall(path=files['path'])

    sess = tf.InteractiveSession()
    meta_graph_def = tf.compat.v1.saved_model.load(
        sess, ['serve'], files['directory']
    )
    signature_def = meta_graph_def.signature_def['serving_default']

    def pred(x):
        # Run the serving signature: feed `x` to 'input', fetch 'outputs'.
        return sess.run(
            fetches=signature_def.outputs['outputs'].name,
            feed_dict={signature_def.inputs['input'].name: x},
        )

    from malaya.model.t5 import GENERATOR

    return GENERATOR(pred)
def load(path, s3_path, model, model_class, compressed=True, quantized=False, **kwargs):
    """
    Load a T5 model, either from a compressed saved model or a frozen graph.

    Parameters
    ----------
    path: dict
        local paths; must hold the `t5-compressed` and `t5` entries.
    s3_path: dict
        remote paths with the same top-level keys as `path`.
    model: str
        key selecting the model inside the chosen entry.
    model_class: callable
        called as `model_class(X=..., decode=..., sess=..., pred=...)`.
    compressed: bool, optional (default=True)
        if True and `quantized` is False, load the `t5-compressed` saved model.
    quantized: bool, optional (default=False)
        if True, load the `quantized` frozen graph from the `t5` entry.
    **kwargs:
        forwarded to `check_file`, `load_graph` and `generate_session`.

    Returns
    -------
    result: whatever `model_class` returns

    Raises
    ------
    ModuleNotFoundError
        if the tensorflow-text / tf-sentencepiece imports fail.
    """
    # `Exception` rather than `ImportError`: the message also covers a
    # tensorflow-text / tensorflow version mismatch, which may not surface
    # as ImportError. KeyboardInterrupt / SystemExit are no longer swallowed.
    try:
        import tensorflow_text
        import tf_sentencepiece
    except Exception as e:
        raise ModuleNotFoundError(
            'tensorflow-text and tf-sentencepiece not installed. Please install it by `pip install tensorflow-text==1.15.0 tf-sentencepiece==0.1.86` and try again. Also, make sure tensorflow-text version same as tensorflow version.'
        ) from e

    if compressed and not quantized:
        path = path['t5-compressed']
        s3_path = s3_path['t5-compressed']
        files = path[model]
        check_file(files['model'], s3_path[model], **kwargs)

        # Extract the downloaded archive only when the saved model is missing.
        if not os.path.exists(files['directory'] + 'saved_model.pb'):
            import tarfile

            with tarfile.open(files['model']['model']) as tar:
                tar.extractall(path=files['path'])

        X = None
        decode = None
        sess = generate_session(graph=None, **kwargs)
        meta_graph_def = tf.compat.v1.saved_model.load(
            sess, ['serve'], files['directory']
        )
        signature_def = meta_graph_def.signature_def['serving_default']

        def pred(x):
            # Run the serving signature: feed `x` to 'input', fetch 'outputs'.
            return sess.run(
                fetches=signature_def.outputs['outputs'].name,
                feed_dict={signature_def.inputs['input'].name: x},
            )

    else:
        path = path['t5']
        s3_path = s3_path['t5']
        check_file(
            path[model],
            s3_path[model],
            quantized=quantized,
            optimized=True,
            **kwargs
        )
        model_path = 'quantized' if quantized else 'model'
        g = load_graph(path[model][model_path], **kwargs)
        X = g.get_tensor_by_name('import/inputs:0')
        decode = g.get_tensor_by_name(
            'import/SentenceTokenizer_1/SentenceTokenizer/SentencepieceDetokenizeOp:0'
        )
        sess = generate_session(graph=g, **kwargs)
        pred = None

    return model_class(X=X, decode=decode, sess=sess, pred=pred)
def load(model: str = 'alxlnet', pool_mode: str = 'last', **kwargs):
    """
    Load alxlnet model.

    Parameters
    ----------
    model : str, optional (default='alxlnet')
        Model architecture supported. Allowed values:

        * ``'alxlnet'`` - XLNET architecture from google + Malaya.
    pool_mode : str, optional (default='last')
        Model logits architecture supported. Allowed values:

        * ``'last'`` - last of the sequence.
        * ``'first'`` - first of the sequence.
        * ``'mean'`` - mean of the sequence.
        * ``'attn'`` - attention of the sequence.
    **kwargs:
        forwarded to `check_file` and to `Model`.

    Returns
    -------
    result : malaya.transformers.alxlnet.Model class

    Raises
    ------
    Exception
        if `pool_mode` is not one of the supported values.
    """
    model = model.lower()
    pool_mode = pool_mode.lower()

    from malaya.path import PATH_ALXLNET, S3_PATH_ALXLNET
    from malaya.function import check_file
    import sentencepiece as spm

    if pool_mode not in ('last', 'first', 'mean', 'attn'):
        raise Exception(
            "pool_mode not supported, only support ['last', 'first', 'mean', 'attn']"
        )

    files = PATH_ALXLNET[model]
    check_file(files['model'], S3_PATH_ALXLNET[model], **kwargs)

    # Extract the downloaded archive only when its directory is missing.
    if not os.path.exists(files['directory']):
        import tarfile

        with tarfile.open(files['model']['model']) as tar:
            tar.extractall(path=files['path'])

    directory = files['directory']

    sp_model = spm.SentencePieceProcessor()
    sp_model.Load(directory + 'sp10m.cased.v9.model')

    xlnet_config = xlnet_lib.XLNetConfig(json_path=directory + 'config.json')
    xlnet_checkpoint = directory + 'model.ckpt'

    loaded = Model(
        xlnet_config,
        sp_model,
        xlnet_checkpoint,
        pool_mode=pool_mode,
        **kwargs
    )
    loaded._saver.restore(loaded._sess, xlnet_checkpoint)
    return loaded
def transformer(path, s3_path, class_name, model='xlnet', quantized=False, **kwargs):
    """
    Load a tagging transformer graph and wrap it in the matching class.

    Parameters
    ----------
    path: dict
        local paths; `path[model]` must hold `model` / `quantized`, `setting`
        and the tokenizer files.
    s3_path: dict
        remote paths; `s3_path[model]` is handed to `check_file`.
    class_name: str
        only used to build the `malaya.clear_cache` hint in the error message.
    model: str, optional (default='xlnet')
        one of 'bert', 'tiny-bert', 'albert', 'tiny-albert', 'xlnet', 'alxlnet'.
    quantized: bool, optional (default=False)
        if True, the `quantized` graph file is loaded instead of `model`.
    **kwargs:
        forwarded to `check_file`, `load_graph` and `generate_session`.

    Returns
    -------
    result: TAGGING_BERT or TAGGING_XLNET instance; None when `model` matches
        neither family.

    Raises
    ------
    Exception
        if the settings file cannot be read or parsed.
    """
    check_file(path[model], s3_path[model], quantized=quantized, **kwargs)

    files = path[model]
    model_path = 'quantized' if quantized else 'model'
    g = load_graph(files[model_path], **kwargs)

    try:
        with open(files['setting']) as fopen:
            nodes = json.load(fopen)
    except Exception as e:
        # The message used to interpolate an undefined `size`, so this
        # handler raised NameError instead of the intended error.
        raise Exception(
            f"model corrupted due to some reasons, please run malaya.clear_cache('{class_name}/{model}') and try again"
        ) from e

    if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
        if model in ['bert', 'tiny-bert']:
            tokenizer = sentencepiece_tokenizer_bert(
                files['tokenizer'], files['vocab']
            )
        if model in ['albert', 'tiny-albert']:
            from albert import tokenization

            tokenizer = tokenization.FullTokenizer(
                vocab_file=files['vocab'],
                do_lower_case=False,
                spm_model_file=files['tokenizer'],
            )

        return TAGGING_BERT(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=None,
            input_masks=g.get_tensor_by_name('import/Placeholder_1:0'),
            logits=g.get_tensor_by_name('import/logits:0'),
            vectorizer=g.get_tensor_by_name('import/dense/BiasAdd:0'),
            sess=generate_session(graph=g, **kwargs),
            tokenizer=tokenizer,
            settings=nodes,
        )

    if model in ['xlnet', 'alxlnet']:
        tokenizer = sentencepiece_tokenizer_xlnet(files['tokenizer'])
        return TAGGING_XLNET(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'),
            input_masks=g.get_tensor_by_name('import/Placeholder_2:0'),
            logits=g.get_tensor_by_name('import/logits:0'),
            vectorizer=g.get_tensor_by_name('import/transpose_3:0'),
            sess=generate_session(graph=g, **kwargs),
            tokenizer=tokenizer,
            settings=nodes,
        )
def transformer(model: str = 'xlnet', **kwargs):
    """
    Load Transformer Constituency Parsing model, transfer learning Transformer + self attentive parsing.

    Parameters
    ----------
    model : str, optional (default='xlnet')
        Model architecture supported. Allowed values:

        * ``'bert'`` - Google BERT BASE parameters.
        * ``'tiny-bert'`` - Google BERT TINY parameters.
        * ``'albert'`` - Google ALBERT BASE parameters.
        * ``'tiny-albert'`` - Google ALBERT TINY parameters.
        * ``'xlnet'`` - Google XLNET BASE parameters.
    **kwargs:
        forwarded to `check_file`, `load_graph` and `generate_session`.

    Returns
    -------
    result : malaya.model.tf.CONSTITUENCY class

    Raises
    ------
    ValueError
        if `model` (lower-cased) is not in `_transformer_availability`.
    """
    model = model.lower()
    if model not in _transformer_availability:
        raise ValueError(
            'model not supported, please check supported models from malaya.constituency.available_transformer()'
        )

    files = PATH_CONSTITUENCY[model]
    check_file(files, S3_PATH_CONSTITUENCY[model], **kwargs)
    g = load_graph(files['model'], **kwargs)

    with open(files['dictionary']) as fopen:
        dictionary = json.load(fopen)

    # The bert and albert variants share the bert-style tokenizer here.
    if model in ('bert', 'tiny-bert', 'albert', 'tiny-albert'):
        tokenizer = sentencepiece_tokenizer_bert(
            files['tokenizer'], files['vocab']
        )
        mode = 'bert'
    elif model in ('xlnet',):
        tokenizer = sentencepiece_tokenizer_xlnet(files['tokenizer'])
        mode = 'xlnet'

    from malaya.model.tf import CONSTITUENCY

    def tensor(name):
        return g.get_tensor_by_name(name)

    return CONSTITUENCY(
        input_ids=tensor('import/input_ids:0'),
        word_end_mask=tensor('import/word_end_mask:0'),
        charts=tensor('import/charts:0'),
        tags=tensor('import/tags:0'),
        sess=generate_session(graph=g, **kwargs),
        tokenizer=tokenizer,
        dictionary=dictionary,
        mode=mode,
    )
def load(model: str = 'xlnet', pool_mode: str = 'last', **kwargs):
    """
    Load xlnet model.

    Parameters
    ----------
    model : str, optional (default='xlnet')
        Model architecture supported. Allowed values:

        * ``'xlnet'`` - XLNET architecture from google.
    pool_mode : str, optional (default='last')
        Model logits architecture supported. Allowed values:

        * ``'last'`` - last of the sequence.
        * ``'first'`` - first of the sequence.
        * ``'mean'`` - mean of the sequence.
        * ``'attn'`` - attention of the sequence.
    **kwargs:
        forwarded to `check_file` and to `Model`.

    Returns
    -------
    result : malaya.transformers.xlnet.Model class

    Raises
    ------
    Exception
        if `pool_mode` is not one of the supported values.
    """
    model = model.lower()
    pool_mode = pool_mode.lower()

    if pool_mode not in ('last', 'first', 'mean', 'attn'):
        raise Exception(
            "pool_mode not supported, only support ['last', 'first', 'mean', 'attn']"
        )

    files = PATH_XLNET[model]
    check_file(files['model'], S3_PATH_XLNET[model], **kwargs)

    # Extract the downloaded archive only when the checkpoint is missing.
    if not os.path.exists(files['directory'] + 'model.ckpt'):
        import tarfile

        with tarfile.open(files['model']['model']) as tar:
            tar.extractall(path=files['path'])

    directory = files['directory']
    vocab_model = directory + 'sp10m.cased.v9.model'
    vocab = directory + 'sp10m.cased.v9.vocab'
    tokenizer = SentencePieceTokenizer(
        vocab_file=vocab, spm_model_file=vocab_model
    )

    xlnet_config = xlnet_lib.XLNetConfig(json_path=directory + 'config.json')
    xlnet_checkpoint = directory + 'model.ckpt'

    loaded = Model(
        xlnet_config,
        tokenizer,
        xlnet_checkpoint,
        pool_mode=pool_mode,
        **kwargs
    )
    loaded._saver.restore(loaded._sess, xlnet_checkpoint)
    return loaded
def load_news():
    """
    Return malaya pretrained local malaysia news word2vec size 256.
    https://github.com/huseinzol05/Malaya/tree/master/pretrained-model/wordvector

    Returns
    -------
    result: tuple(vocabulary, vector)
    """
    files = PATH_WORDVECTOR['news']
    check_file(files, S3_PATH_WORDVECTOR['news'])
    vocab_path, vector_path = files['vocab'], files['model']
    return _load(vocab_path, vector_path)
def load_wiki_news_social_media():
    """
    Return malaya pretrained local malaysia Wikipedia + Social media + News word2vec size 256.
    https://github.com/huseinzol05/Malaya/tree/master/pretrained-model/wordvector

    Returns
    -------
    tuple: (vocabulary, vector)
    """
    files = PATH_WORDVECTOR['combine']
    check_file(files, S3_PATH_WORDVECTOR['combine'])
    vocab_path, vector_path = files['vocab'], files['model']
    return _load(vocab_path, vector_path)
def lexicon(**kwargs):
    """
    Load Lexicon NSFW model.

    Parameters
    ----------
    **kwargs:
        forwarded to `check_file`.

    Returns
    -------
    result : malaya.text.lexicon.nsfw.Lexicon class
    """
    files = PATH_NSFW['lexicon']
    check_file(files, S3_PATH_NSFW['lexicon'], **kwargs)

    # The lexicon is stored as JSON and handed to the Lexicon wrapper as-is.
    with open(files['model']) as fopen:
        corpus = json.load(fopen)
    return nsfw.Lexicon(corpus)
def load_news():
    """
    Return malaya pretrained local malaysia news word2vec size 256.
    https://github.com/huseinzol05/Malaya/tree/master/pretrained-model/wordvector

    Returns
    -------
    vocabulary: indices dictionary for `vector`.
    vector: np.array, 2D.
    """
    files = PATH_WORDVECTOR['news']
    check_file(files, S3_PATH_WORDVECTOR['news'])
    vocab_path, vector_path = files['vocab'], files['model']
    return _load(vocab_path, vector_path)
def load_wiki():
    """
    Return malaya pretrained wikipedia word2vec size 256.
    https://github.com/huseinzol05/Malaya/tree/master/pretrained-model/wordvector

    Returns
    -------
    tuple: (vocabulary, vector)
    """
    files = PATH_WORDVECTOR['wikipedia']
    check_file(files, S3_PATH_WORDVECTOR['wikipedia'])
    vocab_path, vector_path = files['vocab'], files['model']
    return _load(vocab_path, vector_path)