Example #1
    def make_fa_tokenize(lang_dir: Path) -> typing.Optional[TokenizeFunc]:
        """Tokenize Persian/Farsi"""
        try:
            import hazm
        except ImportError:
            _LOGGER.warning("hazm is highly recommended for language 'fa'")
            _LOGGER.warning("pip install 'hazm>=0.7.0'")
            return None

        normalizer = hazm.Normalizer()

        # Load part of speech tagger
        model_path = lang_dir / "postagger.model"

        if not model_path.is_file():
            _LOGGER.warning("Missing model: %s", model_path)
            return None

        _LOGGER.debug("Using hazm tokenizer (model=%s)", model_path)
        tagger = hazm.POSTagger(model=str(model_path))

        def do_tokenize(text: str, **kwargs) -> typing.List[typing.List[Token]]:
            """Normalize, tokenize, and recognize part of speech"""
            sentences_tokens = []
            sentences = hazm.sent_tokenize(normalizer.normalize(text))
            for sentence in sentences:
                sentence_tokens = []
                for word, pos in tagger.tag(hazm.word_tokenize(sentence)):
                    sentence_tokens.append(Token(text=word, pos=pos))

                sentences_tokens.append(sentence_tokens)

            return sentences_tokens

        return do_tokenize
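
For reference, the normalize → sentence-split → word-tokenize → POS-tag pipeline used above can be written without the Token wrapper. This is a minimal sketch, assuming hazm is installed and that a "postagger.model" file exists at the illustrative path below:

import hazm

normalizer = hazm.Normalizer()
# illustrative path; hazm does not ship a tagger model, it has to be downloaded separately
tagger = hazm.POSTagger(model="postagger.model")

def tag_text(text):
    """Return one list of (word, pos) tuples per sentence."""
    tagged_sentences = []
    for sentence in hazm.sent_tokenize(normalizer.normalize(text)):
        tagged_sentences.append(tagger.tag(hazm.word_tokenize(sentence)))
    return tagged_sentences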
Example #2
def stem_data(dat):
    # `tagger` and `stop_words` are module-level objects defined elsewhere in the source project
    normalizer = hazm.Normalizer()
    lemmatizer = hazm.Lemmatizer()
    stemmer = hazm.Stemmer()

    dat = normalizer.normalize(dat)
    sent = hazm.sent_tokenize(dat)

    words = []

    for s in sent:
        tagged = list(tagger.tag(hazm.word_tokenize(s)))

        # drop stop words before lemmatizing
        new_tag = [token for token in tagged if token[0] not in stop_words]

        for token in new_tag:
            stemmed = lemmatizer.lemmatize(token[0], pos=token[1])
            stemmed = stemmer.stem(stemmed)
            if len(stemmed) > 0 and ('#' not in stemmed):
                words.append(stemmed)

    return words
Example #3
 def preProcessing(self, doc, level=0):
     """
     Remove punctuation, some common prepositions/conjunctions, and pronouns; return a list of words.
     """
     junkList = [
         ".", "-", "]", "[", "،", "؛", ":", ")", "(", "!", "؟", "»", "«",
         "ْ"
     ]
     junkWords = [
         "که", "از", "با", "برای", "با", "به", "را", "هم", "و", "در", "تا",
         "یا", "هر", "می", "بر"
     ]
     pronouns = [
         "من", "تو", "او", "ما", "شما", "ایشان", "آن‌ها", "این‌ها", "آن",
         "این", "اونجا", "آنجا", "انجا", "اینها", "آنها", "اینکه"
     ]
     for char in junkList:
         doc = doc.replace(char, " ")
     result = []
     doc = hazm.Normalizer().normalize(doc)
     doc = hazm.word_tokenize(doc)
     for word in doc:
         word = word.strip()  # strip() returns a new string; keep the result
         if word not in junkWords and word not in pronouns:
             result.append(word)
     return result
Example #4
def not_map_farsnet_kg_ontology():
    input_ontology_filename = DataUtils.join(Config.farsnet_ontology,
                                             Config.farsnet_ontology_filename)
    input_farsnet_map_ontology_filename = DataUtils.join(
        Config.farsnet_ontology, Config.farsnet_map_ontology_filename)
    output_farsnet_not_map_ontology_filename = DataUtils.join(
        Config.farsnet_ontology, Config.farsnet_not_map_ontology_filename)

    normalizer = hazm.Normalizer()
    flag_find = False
    item = 'word'
    with open(input_ontology_filename, 'r') as input_file_ontology, \
            open(output_farsnet_not_map_ontology_filename, 'a') as output_file:
        csv_reader_ontology, csv_writer = csv.reader(
            input_file_ontology), csv.writer(output_file)
        for line_ontology in csv_reader_ontology:
            if not flag_find:
                csv_writer.writerow([item])
                print(item)
            item = normalizer.normalize(line_ontology[0])
            flag_find = False
            with open(input_farsnet_map_ontology_filename,
                      'r') as input_file_map:
                csv_reader_graph = csv.reader(input_file_map)

                for line_map in csv_reader_graph:
                    if item == normalizer.normalize(line_map[1]):
                        flag_find = True
                        break
Example #5
def get_ambiguaty_abstract():
    abstract_filename = os.listdir(Config.extracted_texts_dir)
    input_ambiguate_word_filename = join(
        Config.article_names_dir, Config.farsnet_ambiguate_word_filename)
    output_ambiguate_abstract_filename = join(
        Config.article_names_dir, Config.farsnet_ambiguate_abstract_filename)

    temp_list = []
    count = 0
    max_number = 0
    min_number = 1000
    normalizer = hazm.Normalizer()
    with open(output_ambiguate_abstract_filename, 'w') as output_file:
        csv_writer = csv.writer(output_file)
        for filename in abstract_filename:
            # if count == 1:
            #     break;
            count += 1
            print('file ' + str(count) + ' is running ' + filename)
            dict_abstract = DataUtils.load_json(Config.extracted_texts_dir,
                                                filename)
            for abstract_item in dict_abstract:
                with open(input_ambiguate_word_filename,
                          'r') as ambiguate_word:
                    csv_reader = csv.reader(ambiguate_word)

                    for line in csv_reader:
                        item = normalizer.normalize(line[1])
                        if item == abstract_item:
                            print('find ' + line[1] + ' in file.')
                            del temp_list[:]
                            temp_list.append(line[0])
                            temp_list.append(normalizer.normalize(line[1]))
                            temp_list.append(line[2])
                            temp_list.append(normalizer.normalize(line[3]))
                            temp_list.append(normalizer.normalize(line[4]))
                            temp_list.append(normalizer.normalize(line[5]))
                            temp_list.append(
                                normalizer.normalize(
                                    dict_abstract[abstract_item]))

                            sentence_snapshot = str(line[3]).replace(
                                ',', ' ').replace('،', ' ') + ' '
                            gloss_sentence = str(line[4]).replace(
                                ',', ' ').replace('،', ' ') + ' '
                            example = gloss = str(line[5]).replace(
                                ',', ' ').replace('،', ' ') + ' '
                            sentence1 = sentence_snapshot + gloss_sentence + example
                            sentence2 = str(temp_list[6]).replace(
                                ',', ' ').replace('،', ' ').replace('.', ' ')

                            diff = similar(sentence1, sentence2)
                            if diff > max_number:
                                max_number = diff
                            if diff < min_number:
                                min_number = diff
                            temp_list.append(diff)
                            csv_writer.writerow(temp_list)

    return [max_number, min_number]
Example #6
def cleaning(text):
    text = text.strip()

    # regular cleaning
    text = clean(text,
                 fix_unicode=True,
                 to_ascii=False,
                 lower=True,
                 no_line_breaks=True,
                 no_urls=True,
                 no_emails=True,
                 no_phone_numbers=True,
                 no_numbers=False,
                 no_digits=False,
                 no_currency_symbols=True,
                 no_punct=False,
                 replace_with_url="",
                 replace_with_email="",
                 replace_with_phone_number="",
                 replace_with_number="",
                 replace_with_digit="0",
                 replace_with_currency_symbol="",
                 )

    # normalizing
    normalizer = hazm.Normalizer()
    text = normalizer.normalize(text)

    # removing weird patterns (emoji, symbols, and control characters)
    wierd_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u'\U00010000-\U0010ffff'
                               u"\u200d"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\u3030"
                               u"\ufe0f"
                               u"\u2069"
                               u"\u2066"
                               # u"\u200c"
                               u"\u2068"
                               u"\u2067"
                               "]+", flags=re.UNICODE)

    text = wierd_pattern.sub(r'', text)

    # removing extra spaces, hashtags
    text = re.sub("#", "", text)
    text = re.sub(r"\s+", " ", text)
    text = re.sub("a", "ِ", text)
    return text
Example #7
    def make_fa_tokenize() -> TOKENIZE_FUNC:
        """Tokenize Persian/Farsi"""
        import hazm

        normalizer = hazm.Normalizer()

        # Load part of speech tagger
        model_path = _DATA_DIR / "fa" / "postagger.model"
        if not model_path.is_file():
            # Unzip
            model_gzip_path = Path(str(model_path) + ".gz")
            if model_gzip_path.is_file():
                _LOGGER.debug("Unzipping %s", model_gzip_path)
                with open(model_path, "wb") as out_file:
                    with gzip.open(model_gzip_path, "rb") as in_file:
                        shutil.copyfileobj(in_file, out_file)

        _LOGGER.debug("Using hazm tokenizer (model=%s)", model_path)
        tagger = hazm.POSTagger(model=str(model_path))

        def do_tokenize(text: str) -> typing.List[typing.List[Token]]:
            """Normalize, tokenize, and recognize part of speech"""
            sentences_tokens = []
            sentences = hazm.sent_tokenize(normalizer.normalize(text))
            for sentence in sentences:
                sentence_tokens = []
                for word, pos in tagger.tag(hazm.word_tokenize(sentence)):
                    sentence_tokens.append(Token(text=word, pos=pos))

                sentences_tokens.append(sentence_tokens)

            return sentences_tokens

        return do_tokenize
Example #8
def prepareText(text):
    normalizer = hazm.Normalizer()
    text = normalizer.normalize(text)
    tokens = hazm.word_tokenize(text)
    stemmer = hazm.Stemmer()
    words = [stemmer.stem(token) for token in tokens]
    return words
Example #9
def countTextWords(text):
    normalizer = hazm.Normalizer()
    text = normalizer.normalize(text)
    tokens = hazm.word_tokenize(text)
    stemmer = hazm.Stemmer()
    words = [stemmer.stem(token) for token in tokens]
    return len(words)
Example #10
 def __init__(self):
     self.preprocessed_docs = []
     self.normalizer = hazm.Normalizer()
     self.word_tokenizer = hazm.WordTokenizer()
     self.stemmer = hazm.Stemmer()
     self.stop_words = hazm.stopwords_list()
     self.persian_garbage = {
         u'÷': u'',
         u'ٰ': u'',
         u'،': ' ',
         u'؟': ' ',
         u'؛': '',
         u'َ': '',
         u'ُ': '',
         u'ِ': '',
         u'ّ': '',
         u'ٌ': '',
         u'ٍ': '',
         u'ئ': u'ی',
         u'ي': u'ی',
         u'ة': u'ه',
         u'ء': u'',
         u'ك': u'ک',
         u'ْ': u'',
         u'أ': u'ا',
         u'إ': u'ا',
         u'ؤ': u'و',
         u'×': u'',
         u'٪': u'',
         u'٬': u'',
         u'آ': u'ا',
         u'●': u''
     }
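
A single-character map like persian_garbage above can be applied with str.translate. A minimal sketch (the helper name is made up; it assumes every key in the map is a single character, as in the dictionary shown):

def scrub_persian(text, char_map):
    # str.maketrans accepts a {single_character: replacement_string} dict directly
    return text.translate(str.maketrans(char_map))

# e.g. cleaned = scrub_persian(raw_text, self.persian_garbage) inside the class above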
Example #11
    def __call__(self, text):
        # preprocessing
        text = unicode(text)
        text = normalize_numbers(text)
        # text = ''.join(char for char in unicodedata.normalize('NFD', text)
        #                if unicodedata.category(char) != 'Mn')  # Strip accents
        # text = re.sub("[^ a-z'.,?!\-]", "", text)

        normalizer = hazm.Normalizer()
        text = normalizer.normalize(text)
        # tokenization
        words = hazm.word_tokenize(text)
        # tokens = pos_tag(words)  # tuples of (word, tag)

        # steps
        prons = []
        for word in words:
            if not any(letter in word for letter in self.graphemes):
                pron = [word]

            # elif word in self.homograph2features:  # Check homograph
            #     pron1, pron2, pos1 = self.homograph2features[word]
            #     if pos.startswith(pos1):
            #         pron = pron1
            #     else:
            #         pron = pron2
            elif word in self.tihu:  # lookup tihu dict
                pron = self.tihu[word]
            else: # predict for oov
                pron = self.predict(word)

            prons.extend(pron)
            prons.extend([" "])

        return prons[:-1]
Example #12
def normalizing_validation_set():
    with open('data/valid.json', 'r', encoding='utf-8') as json_file:
        validation_data = json.load(json_file)

    with open('data/most_frequent_words.json', 'r',
              encoding='utf-8') as json_file:
        most_frequent_words = json.load(json_file)

    parsivar_normalizer = parsivar.Normalizer()
    hazm_normalizer = hazm.Normalizer()
    sentence_tokenizer = hazm.SentenceTokenizer()
    word_tokenizer = hazm.WordTokenizer(join_verb_parts=False)

    all_sentence_tokens = []
    for text in validation_data:
        text = parsivar_normalizer.sub_alphabets(text)
        text = hazm_normalizer.normalize(text)
        text = remove_english_characters(text)
        text = mask_numbers(text)
        text = remove_punctuations(text)
        text = remove_diacritics(text)
        text = remove_emojis(text)

        text = text.replace('\n', ' ')
        text = text.replace('?', '؟')
        text = text.replace('؟', ' ؟ ')
        text = text.replace('.', ' . ')
        text = text.replace('  ', ' ')
        sentences = sentence_tokenizer.tokenize(text)

        for sentence in sentences:
            words = word_tokenizer.tokenize(sentence)

            if words[-1] == '.' or words[-1] == '؟':
                words = words[:-1]

            if len(words) == 0:
                continue

            final_sentence_tokens = []
            for ind, word in enumerate(words):
                if word == 'NUM':
                    if len(final_sentence_tokens
                           ) == 0 or final_sentence_tokens[-1] != 'NUM':
                        final_sentence_tokens.append(word)
                elif word not in most_frequent_words:
                    if len(final_sentence_tokens
                           ) == 0 or final_sentence_tokens[-1] != 'UNK':
                        final_sentence_tokens.append(word)
                else:
                    final_sentence_tokens.append(word)

            all_sentence_tokens.append(final_sentence_tokens)

    with open('data/validation_sentences.json', 'w') as json_file:
        json.dump(all_sentence_tokens, json_file, ensure_ascii=False)
Example #13
def bagify(doc):
    normalizer = hazm.Normalizer()
    tokenize = hazm.word_tokenize
    word_list = re.sub(r"(&...;|&....;|(\d))|'|{|}|!", " ", doc)
    #stemmer = hazm.Stemmer()
    tokens = tokenize(normalizer.normalize(word_list))
    #tokens = [stemmer.stem(x) for x in tokens ]
    doc_list = [x for x in tokens if x not in stop_words]
    doc_set = set(doc_list)
    doc_bag = Counter({k:doc_list.count(k) for k in doc_set})
    return doc_bag
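
The bag above can also be built by handing the filtered token list straight to Counter, which avoids the per-word count() calls. A minimal equivalent sketch (assuming the same module-level stop_words set, and skipping the regex pre-clean for brevity):

from collections import Counter
import hazm

def bagify_direct(doc, stop_words):
    tokens = hazm.word_tokenize(hazm.Normalizer().normalize(doc))
    return Counter(token for token in tokens if token not in stop_words)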
Example #14
def pre_process_data():
    f_1 = open("../../Data/label1.txt", "r", encoding="utf-8")
    f1 = f_1.readlines()
    f1_preproc = open("../label1.txt", "w+", encoding="utf-8")
    for j in f1:
        normalized_f1 = Hazm1.Normalizer().normalize(j)
        tokenized_f1 = Hazm1.word_tokenize(normalized_f1)
        for i in tokenized_f1:
            f1_preproc.write(i)
            f1_preproc.write(' ')
        f1_preproc.write('\n')
    f_1.close()
    f1_preproc.close()

    f_2 = open("../../Data/label2.txt", "r", encoding="utf-8")
    f2 = f_2.readlines()
    f2_preproc = open("../label2.txt", "w+", encoding="utf-8")
    for j in f2:
        normalized_f2 = Hazm2.Normalizer().normalize(j)
        tokenized_f2 = Hazm2.word_tokenize(normalized_f2)
        for i in tokenized_f2:
            f2_preproc.write(i)
            f2_preproc.write(' ')
        f2_preproc.write('\n')
    f_2.close()
    f2_preproc.close()
Example #15
def textNormalizer(lousyCollection):
    docs = list()
    normalizer = hz.Normalizer()
    lemmatizer = hz.Lemmatizer()
    stemmer = hz.Stemmer()
    for i in range(len(lousyCollection)):
        normalized = normalizer.normalize(lousyCollection[i])
        docs.append(delete_Punc(normalized))
    for i, doc in enumerate(docs):
        tokens = hz.word_tokenize(doc)
        # use enumerate so repeated tokens are not mis-indexed by list.index()
        for j, token in enumerate(tokens):
            tokens[j] = lemmatizer.lemmatize(stemmer.stem(token))
        docs[i] = tokens
    return docs
Example #16
    def __init__(self, feature_set, orientations=None, language='english'):
        self.language = language
        self.normalizer[language] = hazm.Normalizer()
        if language == 'persian':
            self.stopwords[language] = hazm.stopwords_list()
            self.regex_words[language] = r"[\w']+|[.,!?;،؟؛]"
        else:
            self.stopwords[language] = set(stopwords.words('english'))
            self.regex_words[language] = r"[\w']+|[.,!?;]"

        if orientations:
            self.orientations = orientations

        self.feature_set = feature_set
        self.weights = {}
        self.hash_dictionary[self.language] = {}
Example #17
    def text_normalazation(self, raw_text):

        normalizer = hazm.Normalizer()
        clean_text = normalizer.normalize(raw_text)
        # clean_text = re.sub ( r'\n\s*\n', '\n' , clean_text )
        # clean_text = re.sub ( r'\r\s*\r', '\n', clean_text )
        clean_text = clean_text.replace('\n', ' ').replace('\r', '')  # keep working on the normalized text
        clean_text = re.sub(' +', ' ', clean_text)
        # clean_text = re.compile ( '<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});' )
        clean_text = clean_text.replace(u'\xa0', u' ')
        # clean_text = re.sub ( clean_text , ' ' , raw_text )
        # clean_text = ' '.join(clean_text.split())

        # clean_text = "\n".join(clean_text.split("\n"))

        return (clean_text)
Example #18
def similar(s1, s2):
    normalizer = hazm.Normalizer()
    s1 = normalizer.normalize(s1)
    s2 = normalizer.normalize(s2)

    stop_words = set(hazm.stopwords_list())
    list_s1 = [word for word in s1.split(" ") if word not in stop_words]
    list_s2 = [word for word in s2.split(" ") if word not in stop_words]

    stemmer = hazm.Stemmer()
    stem_s1 = [stemmer.stem(word) for word in list_s1]  # note: not used below

    same_words = set.intersection(set(list_s1), set(list_s2))
    return len(same_words)
Example #19
 def preProcessingVW(self, doc):
     junkList = [
         ".", "-", "]", "[", "،", "؛", ":", ")", "(", "!", "؟", "»", "«",
         "ْ"
     ]
     junkWords = [
         "که", "از", "با", "برای", "با", "به", "را", "هم", "و", "در", "تا",
         "یا", "هر", "می", "بر"
     ]
     pronouns = [
         "من", "تو", "او", "ما", "شما", "ایشان", "آن‌ها", "این‌ها", "آن",
         "این", "اونجا", "آنجا", "انجا", "اینها", "آنها", "اینکه"
     ]
     for char in junkList:
         doc = doc.replace(char, "")
     doc = doc.strip()  # strip() returns a new string; keep the result
     doc = hazm.Normalizer().normalize(doc)
     return doc
Example #20
    def __call__(self, text, tidy=False, secret=False):

        # preprocessing
        text = unicode(text)
        text = normalize_numbers(text)
        # text = ''.join(char for char in unicodedata.normalize('NFD', text)
        #                if unicodedata.category(char) != 'Mn')  # Strip accents
        # text = re.sub("[^ a-z'.,?!\-]", "", text)

        normalizer = hazm.Normalizer()
        text = normalizer.normalize(text)
        # tokenization
        words = hazm.word_tokenize(text)
        # tokens = pos_tag(words)  # tuples of (word, tag)

        # steps
        prons = []
        for word in words:

            if not any(letter in word for letter in self.graphemes):
                pron = [word]

            # elif word in self.homograph2features:  # Check homograph
            #     pron1, pron2, pos1 = self.homograph2features[word]
            #     if pos.startswith(pos1):
            #         pron = pron1
            #     else:
            #         pron = pron2
            elif word in self.tihu:  # lookup tihu dict
                pron = [self.tihu[word].replace(' ', '')
                        ] if secret else [' ', self.tihu[word], ' ']
            else:  # predict for oov
                pron = self.predict(word)

            prons.extend(pron)
            prons.extend([" "])

        result = ''.join(prons[:-1])

        if tidy:
            return Persian_g2p_converter.convert_from_native_to_good(result)

        return result
Example #21
 def __init__(self,
              mask=None,
              size=900,
              stop_words_addr=default_stop_words_path,
              mask_addr=None):
     self.hazm_normalizer = hazm.Normalizer()
     self.parsivar_normalizer = parsivar.Normalizer()
     self.stemmer = hazm.Stemmer()
     self.lemmatizer = hazm.Lemmatizer()
     self.stop_words = set(hazm.stopwords_list(stop_words_addr))
     mask = np.array(
         Image.open(mask_addr)) if mask_addr is not None else None
     self.generator = WordCloud(width=size,
                                height=size,
                                include_numbers=False,
                                persian_normalize=False,
                                collocations=True,
                                mask=mask,
                                background_color='white')
Example #22
 def word_counter(self, text: str) -> (float, dict):
     text = text.lower()
     text = text.translate(str.maketrans(
         {'#': ' ', '$': ' ', '/': ' ', '+': ' ', '=': ' ', ':': ' ', ',': ' ', ';': ' ', '؛': ' ', '،': ' ',
          '.': ' ', '!': ' ', '؟': ' ', '?': ' ', '«': ' ', '»': ' ', '(': ' ', ')': ' ', '_': ' ', '-': ' ',
          '@': ' '}))
     text = hazm.Normalizer().normalize(text)
     text = hazm.word_tokenize(text)
     stemmer = hazm.Stemmer()
     keywords_dic = {word: 0 for word in self.keywords.keys()}
     value = 0.0
     for i in range(len(text)):
         stemmed_word = stemmer.stem(text[i])
         if stemmed_word in keywords_dic:
             keywords_dic[stemmed_word] += 1
             if keywords_dic[stemmed_word] == 1:  # count each word only once
                 value += self.keywords[stemmed_word]
         if stemmed_word in self.filter_words:
             return 0, {}
     return value, keywords_dic
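
The same keyword-scoring idea works outside a class. A rough standalone sketch (keywords and filter_words stand in for the instance attributes above and are assumptions; the punctuation stripping is omitted for brevity):

import hazm

def score_text(text, keywords, filter_words):
    """keywords maps a stemmed word to a weight; returns (score, per-keyword counts)."""
    stemmer = hazm.Stemmer()
    tokens = hazm.word_tokenize(hazm.Normalizer().normalize(text))
    counts = {word: 0 for word in keywords}
    score = 0.0
    for token in tokens:
        stem = stemmer.stem(token)
        if stem in filter_words:
            return 0.0, {}
        if stem in counts:
            counts[stem] += 1
            if counts[stem] == 1:  # weight each keyword only once
                score += keywords[stem]
    return score, counts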
Example #23
    def text_to_tokens(
        self, text: str
    ) -> typing.Iterable[typing.Tuple[typing.List[str], typing.List[Token]]]:
        """
        Process text into words and sentence tokens using hazm.

        Returns: (original_words, sentence_tokens) for each sentence
        """

        try:
            import hazm
        except ImportError:
            _LOGGER.warning("hazm is highly recommended for language 'fa'")
            _LOGGER.warning("pip install 'hazm>=0.7.0'")

            # Fall back to parent implementation
            yield from super().text_to_tokens(text)
            return

        # Load normalizer
        if not hasattr(self, "normalizer"):
            normalizer = hazm.Normalizer()
            setattr(self, "normalizer", normalizer)

        # Load tagger
        if not hasattr(self, "tagger"):
            # Load part of speech tagger
            model_path = self.lang_dir / "postagger.model"
            tagger = hazm.POSTagger(model=str(model_path))
            setattr(self, "tagger", tagger)

        sentences = hazm.sent_tokenize(self.normalizer.normalize(text))
        for sentence in sentences:
            original_words = []
            sentence_tokens = []
            for word, pos in self.tagger.tag(hazm.word_tokenize(sentence)):
                original_words.append(word)
                sentence_tokens.append(
                    Token(text=word,
                          features={TokenFeatures.PART_OF_SPEECH: pos}))

            yield original_words, sentence_tokens
Example #24
def map_farsnet_kg_ontology(input_filename):
    input_ontology_filename = DataUtils.join(Config.farsnet_ontology,
                                             Config.farsnet_ontology_filename)
    output_farsnet_map_ontology_filename = DataUtils.join(
        Config.farsnet_ontology, Config.farsnet_map_ontology_filename)

    normalizer = hazm.Normalizer()
    print('input file ' + input_filename)

    with open(input_ontology_filename, 'r') as input_file_ontology, \
            open(output_farsnet_map_ontology_filename, 'a') as output_file:
        csv_reader_ontology, csv_writer = csv.reader(
            input_file_ontology), csv.writer(output_file)
        for line_ontology in csv_reader_ontology:
            with open(input_filename, 'r') as input_file_graph:
                csv_reader_graph = csv.reader(input_file_graph)
                for line_graph in csv_reader_graph:
                    item = normalizer.normalize(line_graph[1])
                    if normalizer.normalize(line_ontology[0]) == item:
                        print(item)
                        csv_writer.writerow(
                            [line_graph[0], item, line_graph[3]])
Example #25
def preprocess_farsi(text):
    prohibitedWords = ['[[', ']]', '{{', '}}', '{|', '|', '*', '==', '=', "'''", '_']
    big_regex = re.compile('|'.join(map(re.escape, prohibitedWords)))
    new_text = big_regex.sub(" ", text)
    # print(new_text)
    ### Remove English characters
    new_text = re.sub(r'[a-zA-Z]','', new_text)
    ### Remove punctuation
    new_text = re.sub(r'[^\w\s]', ' ', new_text)
    normalizer = hazm.Normalizer(remove_extra_spaces=True, persian_style=True, persian_numbers=True, remove_diacritics=True, affix_spacing=True, token_based=False, punctuation_spacing=True)
    new_text = normalizer.normalize(new_text)
    ### Remove numbers
    new_text = re.sub(r'[۱۲۳۴۵۶۷۸۹۰]', ' ', new_text)
    ### Not in HAZM
    # new_text = new_text.replace('گی','ه')
    tokens = hazm.word_tokenize(new_text)
    stemmer = hazm.Stemmer()

    tokens = [word.replace('\u200c', '‌') for word in tokens ]
    tokens = [stemmer.stem(word) for word in tokens]
    tokens = [word for word in tokens if word != '' ]
    return tokens
Example #26
import hazm
from cleantext import clean
import re

normalizer = hazm.Normalizer()


def cleanize(text):
    """ Clean the text from redundant and useless items """
    text = clean(
        text,
        fix_unicode=True,
        to_ascii=False,
        lower=False,
        no_line_breaks=True,
        no_urls=True,
        no_emails=True,
        no_phone_numbers=True,
        no_numbers=False,
        no_digits=False,
        no_currency_symbols=True,
        no_punct=False,
        replace_with_url="",
        replace_with_email="",
        replace_with_phone_number="",
        replace_with_number="",
        replace_with_digit=".",
        replace_with_currency_symbol=""
    )
    text = text.strip()
    text = re.sub(r"\s+", " ", text)
    return text
Example #27
def CleanPersianText(text):
    _normalizer = hazm.Normalizer()
    text = _normalizer.normalize(text)
    return text
Example #28
# coding: utf-8

# Modules
import hazm as hz
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
import xml.etree.ElementTree as et
from os import listdir
from os.path import isfile, join
from collections import defaultdict

# Parameters
normalizer = hz.Normalizer()
tagger = hz.POSTagger(model='resources/postagger.model')
stemmer = hz.Stemmer()
lemmatizer = hz.Lemmatizer()

lexicon_file_name = 'final_lexi'
data_path = './data/'

lexicon = None


# Make bag_of_words
def bow(text):
    global normalizer
    global tagger
    global stemmer
    global lemmatizer
Example #29
 def normalize(self, input):
     return hazm.Normalizer().normalize(input)
Example #30
def normalize_text(text):
    normalizer = hazm.Normalizer()
    normalized_text = normalizer.normalize(text)
    return normalized_text
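
A quick usage sketch for a wrapper like normalize_text; the sample string mixes Arabic-presentation characters and irregular spacing, which hazm's Normalizer unifies with its default options (the exact output depends on the hazm version):

raw = "ميخواهم   به  كتابخانه بروم"  # Arabic ي and ك plus extra spaces
print(normalize_text(raw))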