Example #1
    def make_fa_tokenize(lang_dir: Path) -> typing.Optional[TokenizeFunc]:
        """Tokenize Persian/Farsi"""
        try:
            import hazm
        except ImportError:
            _LOGGER.warning("hazm is highly recommended for language 'fa'")
            _LOGGER.warning("pip install 'hazm>=0.7.0'")
            return None

        normalizer = hazm.Normalizer()

        # Load part of speech tagger
        model_path = lang_dir / "postagger.model"

        if not model_path.is_file():
            _LOGGER.warning("Missing model: %s", model_path)
            return None

        _LOGGER.debug("Using hazm tokenizer (model=%s)", model_path)
        tagger = hazm.POSTagger(model=str(model_path))

        def do_tokenize(text: str, **kwargs) -> typing.List[typing.List[Token]]:
            """Normalize, tokenize, and recognize part of speech"""
            sentences_tokens = []
            sentences = hazm.sent_tokenize(normalizer.normalize(text))
            for sentence in sentences:
                sentence_tokens = []
                for word, pos in tagger.tag(hazm.word_tokenize(sentence)):
                    sentence_tokens.append(Token(text=word, pos=pos))

                sentences_tokens.append(sentence_tokens)

            return sentences_tokens

        return do_tokenize
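The factory above returns a closure, so typical use is two steps: build the tokenizer once for a language directory, then call it on text. A minimal usage sketch, assuming a hypothetical lang_dir containing postagger.model and the Token class from the surrounding module (with .text and .pos attributes, as the constructor call above implies):

    from pathlib import Path

    tokenize = make_fa_tokenize(Path("/data/fa"))  # hypothetical directory
    if tokenize is not None:
        # Each element is one sentence; each sentence is a list of Token objects
        for sentence in tokenize("این یک جمله است."):
            print([(token.text, token.pos) for token in sentence])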
Example #2
    def make_fa_tokenize() -> TOKENIZE_FUNC:
        """Tokenize Persian/Farsi"""
        import hazm

        normalizer = hazm.Normalizer()

        # Load part of speech tagger
        model_path = _DATA_DIR / "fa" / "postagger.model"
        if not model_path.is_file():
            # Unzip
            model_gzip_path = Path(str(model_path) + ".gz")
            if model_gzip_path.is_file():
                _LOGGER.debug("Unzipping %s", model_gzip_path)
                with open(model_path, "wb") as out_file:
                    with gzip.open(model_gzip_path, "rb") as in_file:
                        shutil.copyfileobj(in_file, out_file)

        _LOGGER.debug("Using hazm tokenizer (model=%s)", model_path)
        tagger = hazm.POSTagger(model=str(model_path))

        def do_tokenize(text: str) -> typing.List[typing.List[Token]]:
            """Normalize, tokenize, and recognize part of speech"""
            sentences_tokens = []
            sentences = hazm.sent_tokenize(normalizer.normalize(text))
            for sentence in sentences:
                sentence_tokens = []
                for word, pos in tagger.tag(hazm.word_tokenize(sentence)):
                    sentence_tokens.append(Token(text=word, pos=pos))

                sentences_tokens.append(sentence_tokens)

            return sentences_tokens

        return do_tokenize
Example #3
def get_grammatical_features(dataset):
    """Count POS tags and occurrences of configured word lists for each text; return one row per document as a DataFrame."""
    features = []
    for key, text in tqdm(dataset.items()):
        feature = {}
        normalized_text = utils.normalize_text(text)
        tagger = hazm.POSTagger(model=data_path.POSTAGGER_MODEL_PATH)
        tags = tagger.tag(hazm.word_tokenize(normalized_text))
        tags_list = [i[1] for i in tags]

        sounds = [utils.normalize_text(sound) for sound in config.sounds]
        group_pros = [
            utils.normalize_text(group_pro) for group_pro in config.group_pro
        ]
        conjunctions = [
            utils.normalize_text(conjunction)
            for conjunction in config.conjunctions
        ]
        subjective_pronounces = [
            utils.normalize_text(subjective_pronounce)
            for subjective_pronounce in config.subjective_pronounce
        ]

        feature['GRM_F30'] = utils.count_chars(text, subjective_pronounces)

        feature['GRM_F31'] = utils.count_chars(text, config.question)

        feature['GRM_F32'] = utils.count_chars(text, conjunctions)

        feature['GRM_F33'] = utils.count_chars(text, group_pros)

        feature['GRM_F34'] = utils.count_chars(text, sounds)

        feature['GRM_F35'] = tags_list.count('P') + tags_list.count('POSTP')

        feature['GRM_F40'] = tags_list.count('AJ')
        feature['GRM_F41'] = tags_list.count('ADV')
        feature['GRM_F42'] = tags_list.count('PRO')
        feature['GRM_F51'] = tags_list.count('NUM')

        feature['number'] = key

        features.append(feature)
    return pd.DataFrame(features)
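The function expects a mapping from a document id to raw Persian text and returns one row of counts per document. A minimal calling sketch, assuming the project's utils, config, and data_path modules (plus hazm, tqdm, and pandas) are importable; the sample text is illustrative only:

    dataset = {"doc-1": "متن نمونه برای استخراج ویژگی"}
    df = get_grammatical_features(dataset)
    print(df[["number", "GRM_F40", "GRM_F41", "GRM_F42"]])

Note that the POS tagger is re-created inside the loop for every document; constructing it once before the loop would avoid reloading the model repeatedly without changing the output.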
Example #4
    def text_to_tokens(
        self, text: str
    ) -> typing.Iterable[typing.Tuple[typing.List[str], typing.List[Token]]]:
        """
        Process text into words and sentence tokens using hazm.

        Returns: (original_words, sentence_tokens) for each sentence
        """

        try:
            import hazm
        except ImportError:
            _LOGGER.warning("hazm is highly recommended for language 'fa'")
            _LOGGER.warning("pip install 'hazm>=0.7.0'")

            # Fall back to parent implementation and stop here
            yield from super().text_to_tokens(text)
            return

        # Load normalizer (cached on the instance)
        if not hasattr(self, "normalizer"):
            setattr(self, "normalizer", hazm.Normalizer())

        # Load part of speech tagger (cached on the instance)
        if not hasattr(self, "tagger"):
            model_path = self.lang_dir / "postagger.model"
            setattr(self, "tagger", hazm.POSTagger(model=str(model_path)))

        sentences = hazm.sent_tokenize(self.normalizer.normalize(text))
        for sentence in sentences:
            original_words = []
            sentence_tokens = []
            for word, pos in self.tagger.tag(hazm.word_tokenize(sentence)):
                original_words.append(word)
                sentence_tokens.append(
                    Token(text=word,
                          features={TokenFeatures.PART_OF_SPEECH: pos}))

            yield original_words, sentence_tokens
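text_to_tokens is a generator, so callers iterate sentence by sentence. A minimal usage sketch, where tokenizer stands in for an instance of the class this method belongs to (hypothetical here) and TokenFeatures comes from the same module as in the example:

    for original_words, sentence_tokens in tokenizer.text_to_tokens("سلام دنیا"):
        for token in sentence_tokens:
            print(token.text, token.features[TokenFeatures.PART_OF_SPEECH])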
Example #5
# coding: utf-8

# Modules
import hazm as hz
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
import xml.etree.ElementTree as et
from os import listdir
from os.path import isfile, join
from collections import defaultdict

# Parameters
normalizer = hz.Normalizer()
tagger = hz.POSTagger(model='resources/postagger.model')
stemmer = hz.Stemmer()
lemmatizer = hz.Lemmatizer()

lexicon_file_name = 'final_lexi'
data_path = './data/'

lexicon = None


# Make bag_of_words
def bow(text):
    global normalizer
    global tagger
    global stemmer
    global lemmatizer
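The snippet above only sets up hazm's normalizer, tagger, stemmer, and lemmatizer; the body of bow() is truncated. As an illustration only (not the original body), this is the kind of per-word processing those objects support:

    def tag_and_lemmatize(text):
        """Normalize, tokenize, POS-tag, then stem and lemmatize each word."""
        words = hz.word_tokenize(normalizer.normalize(text))
        return [(word, pos, stemmer.stem(word), lemmatizer.lemmatize(word))
                for word, pos in tagger.tag(words)]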
Example #6
    def __init__(self, train_input_path, test_input_path):
        self.pos_tag = hazm.POSTagger(
            model="../data/Hazm_resources/resources-0.5/postagger.model")
        self.train_input_path = train_input_path
        self.test_input_path = test_input_path