def make_fa_tokenize(lang_dir: Path) -> typing.Optional[TokenizeFunc]:
    """Tokenize Persian/Farsi"""
    try:
        import hazm
    except ImportError:
        _LOGGER.warning("hazm is highly recommended for language 'fa'")
        _LOGGER.warning("pip install 'hazm>=0.7.0'")
        return None

    normalizer = hazm.Normalizer()

    # Load part of speech tagger
    model_path = lang_dir / "postagger.model"
    if not model_path.is_file():
        _LOGGER.warning("Missing model: %s", model_path)
        return None

    _LOGGER.debug("Using hazm tokenizer (model=%s)", model_path)
    tagger = hazm.POSTagger(model=str(model_path))

    def do_tokenize(text: str, **kwargs) -> typing.List[typing.List[Token]]:
        """Normalize, tokenize, and recognize part of speech"""
        sentences_tokens = []
        sentences = hazm.sent_tokenize(normalizer.normalize(text))
        for sentence in sentences:
            sentence_tokens = []
            for word, pos in tagger.tag(hazm.word_tokenize(sentence)):
                sentence_tokens.append(Token(text=word, pos=pos))

            sentences_tokens.append(sentence_tokens)

        return sentences_tokens

    return do_tokenize
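A minimal usage sketch (not from the original source) for the factory above, assuming `lang_dir` points at a directory that contains `postagger.model` and that `Token` exposes `.text` and `.pos` as in the snippet; the helper name, the path, and the sample sentence are illustrative only:

def _example_fa_tokenize():  # hypothetical helper, for illustration only
    tokenize = make_fa_tokenize(Path("/path/to/fa"))  # assumed model directory
    if tokenize is None:
        return  # hazm is not installed or the model file is missing
    for sentence_tokens in tokenize("سلام. حال شما چطور است؟"):
        # Each sentence is a list of Token objects carrying the word and its POS tag
        print([(token.text, token.pos) for token in sentence_tokens])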
def make_fa_tokenize() -> TOKENIZE_FUNC:
    """Tokenize Persian/Farsi"""
    import hazm

    normalizer = hazm.Normalizer()

    # Load part of speech tagger
    model_path = _DATA_DIR / "fa" / "postagger.model"
    if not model_path.is_file():
        # Unzip
        model_gzip_path = Path(str(model_path) + ".gz")
        if model_gzip_path.is_file():
            _LOGGER.debug("Unzipping %s", model_gzip_path)
            with open(model_path, "wb") as out_file:
                with gzip.open(model_gzip_path, "rb") as in_file:
                    shutil.copyfileobj(in_file, out_file)

    _LOGGER.debug("Using hazm tokenizer (model=%s)", model_path)
    tagger = hazm.POSTagger(model=str(model_path))

    def do_tokenize(text: str) -> typing.List[typing.List[Token]]:
        """Normalize, tokenize, and recognize part of speech"""
        sentences_tokens = []
        sentences = hazm.sent_tokenize(normalizer.normalize(text))
        for sentence in sentences:
            sentence_tokens = []
            for word, pos in tagger.tag(hazm.word_tokenize(sentence)):
                sentence_tokens.append(Token(text=word, pos=pos))

            sentences_tokens.append(sentence_tokens)

        return sentences_tokens

    return do_tokenize
def get_grammatical_features(dataset):
    """Compute grammatical features (marker-word counts and POS-tag counts) for each text in the dataset."""
    features = []
    for key, text in tqdm(dataset.items()):
        feature = {}
        normalized_text = utils.normalize_text(text)

        # POS-tag the normalized text with hazm
        tagger = hazm.POSTagger(model=data_path.POSTAGGER_MODEL_PATH)
        tags = tagger.tag(hazm.word_tokenize(normalized_text))
        tags_list = [i[1] for i in tags]

        # Normalize the marker-word lists from the config
        sounds = [utils.normalize_text(sound) for sound in config.sounds]
        group_pros = [
            utils.normalize_text(group_pro) for group_pro in config.group_pro
        ]
        conjunctions = [
            utils.normalize_text(conjunction) for conjunction in config.conjunctions
        ]
        subjective_pronounces = [
            utils.normalize_text(subjective_pronounce)
            for subjective_pronounce in config.subjective_pronounce
        ]

        feature['GRM_F30'] = utils.count_chars(text, subjective_pronounces)
        feature['GRM_F31'] = utils.count_chars(text, config.question)
        feature['GRM_F32'] = utils.count_chars(text, conjunctions)
        feature['GRM_F33'] = utils.count_chars(text, group_pros)
        feature['GRM_F34'] = utils.count_chars(text, sounds)
        feature['GRM_F35'] = tags_list.count('P') + tags_list.count('POSTP')
        feature['GRM_F40'] = tags_list.count('AJ')
        feature['GRM_F41'] = tags_list.count('ADV')
        feature['GRM_F42'] = tags_list.count('PRO')
        feature['GRM_F51'] = tags_list.count('NUM')
        feature['number'] = key
        features.append(feature)

    return pd.DataFrame(features)
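A hedged usage sketch (not from the original source), assuming `dataset` maps document identifiers to raw Persian strings; the sample key and text are illustrative only:

# df has one row per document, with the columns assigned above:
# GRM_F30..GRM_F35, GRM_F40..GRM_F42, GRM_F51, and 'number' (the document key).
df = get_grammatical_features({"1": "متن نمونه"})
print(df.columns.tolist())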
def text_to_tokens(
    self, text: str
) -> typing.Iterable[typing.Tuple[typing.List[str], typing.List[Token]]]:
    """
    Process text into words and sentence tokens using hazm.

    Returns: (original_words, sentence_tokens) for each sentence
    """
    try:
        import hazm
    except ImportError:
        _LOGGER.warning("hazm is highly recommended for language 'fa'")
        _LOGGER.warning("pip install 'hazm>=0.7.0'")

        # Fall back to parent implementation and stop here
        yield from super().text_to_tokens(text)
        return

    # Load normalizer (cached on the instance)
    if not hasattr(self, "normalizer"):
        normalizer = hazm.Normalizer()
        setattr(self, "normalizer", normalizer)

    # Load tagger (cached on the instance)
    if not hasattr(self, "tagger"):
        # Load part of speech tagger
        model_path = self.lang_dir / "postagger.model"
        tagger = hazm.POSTagger(model=str(model_path))
        setattr(self, "tagger", tagger)

    sentences = hazm.sent_tokenize(self.normalizer.normalize(text))
    for sentence in sentences:
        original_words = []
        sentence_tokens = []
        for word, pos in self.tagger.tag(hazm.word_tokenize(sentence)):
            original_words.append(word)
            sentence_tokens.append(
                Token(text=word, features={TokenFeatures.PART_OF_SPEECH: pos})
            )

        yield original_words, sentence_tokens
# coding: utf-8

# Modules
import hazm as hz
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
import xml.etree.ElementTree as et
from os import listdir
from os.path import isfile, join
from collections import defaultdict

# Parameters
normalizer = hz.Normalizer()
tagger = hz.POSTagger(model='resources/postagger.model')
stemmer = hz.Stemmer()
lemmatizer = hz.Lemmatizer()
lexicon_file_name = 'final_lexi'
data_path = './data/'
lexicon = None


# Make bag_of_words
def bow(text):
    global normalizer
    global tagger
    global stemmer
    global lemmatizer
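The body of bow() is truncated in the excerpt above and is left as-is. The sketch below (not from the original source) only illustrates how the module-level hazm objects set up above are typically chained together, using documented hazm calls; the helper name is hypothetical:

def _example_hazm_pipeline(text):  # hypothetical helper, for illustration only
    text = normalizer.normalize(text)      # character/spacing normalization
    words = hz.word_tokenize(text)         # split into words
    tagged = tagger.tag(words)             # [(word, POS tag), ...]
    stems = [stemmer.stem(word) for word, _pos in tagged]
    lemmas = [lemmatizer.lemmatize(word) for word, _pos in tagged]
    return tagged, stems, lemmas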
def __init__(self, train_input_path, test_input_path):
    self.pos_tag = hazm.POSTagger(
        model="../data/Hazm_resources/resources-0.5/postagger.model")
    self.train_input_path = train_input_path
    self.test_input_path = test_input_path