Code Example #1
def prepare():
    normalizer = Normalizer()
    stemmer = Stemmer()

    string = '''ویکی پدیای انگلیسی در تاریخ ۱۵ ژانویه ۲۰۰۱ (۲۶ دی ۱۳۷۹) به صورت مکملی برای دانشنامهٔ تخصصی نیوپدیا نوشته شد. بنیان گذاران آن «جیمی ویلز» و «لری سنگر» هستند. هم اکنون بنیاد غیرانتفاعی ویکی مدیا پروژهٔ ویکی پدیا را پشتیبانی می کند. میزبان های اینترنتی اصلی این وبگاه در شهر تامپای فلوریدا هستند؟ همچنین میزبان های اضافی دیگری هم در شهرهای آمستردام، شیراز و سئول به این وبگاه یاری می رسانند؟'''

    tokenizer = WordTokenizer(join_verb_parts=True,
                              separate_emoji=True,
                              replace_links=True,
                              replace_IDs=True,
                              replace_emails=True,
                              replace_numbers=True,
                              replace_hashtags=True)

    # Map the punctuation marks we want to annotate to entity labels.
    labels = {'،': 'COMMA', '.': 'DOT', '؟': 'QMARK'}
    normal_string = normalizer.normalize(string)
    for label in labels.keys():
        print(normal_string.find(label))

    records = []
    for i, sent in enumerate(sent_tokenize(normal_string)):
        # Entity spans use the (start, end, label) format, e.g. (10, 15, 'PrdName').
        entities = []
        for label in labels.keys():
            print(f'{label} in {i}', label in sent)
            start = sent.find(label)
            while start != -1:
                entities.append((start, start + 1, labels[label]))
                start = sent.find(label, start + 1)
        record = (sent, {'entities': entities})
        records.append(record)
        print()

    return records
Code Example #2
    def score(self, sentences):
        # Predict
        pos, neg, neu = 0, 0, 0
        total_words = 0
        stemmer = Stemmer()
        classifier = self.__get_model()
        normalizer = Normalizer()

        sentences = sent_tokenize(sentences)

        for sentence in sentences:
            sentence = normalizer.normalize(sentence)
            words = word_tokenize(sentence)

            for word in words:
                total_words += 1
                word = stemmer.stem(word)  # classify the stemmed form of each token
                class_result = classifier.classify(self.__word_feats(word))
                if class_result == 'neg':
                    neg = neg + 1
                if class_result == 'pos':
                    pos = pos + 1
                if class_result == 'neu':
                    neu = neu + 1

        # Ratios are taken over every token seen, not just the last sentence's words.
        positive_sentiment = float(pos) / total_words
        # print('Positive: ' + str(positive_sentiment))
        neutral_sentiment = float(neu) / total_words
        # print('Neutral: ' + str(neutral_sentiment))
        negative_sentiment = -float(neg) / total_words
        # print('Negative: ' + str(negative_sentiment))

        total_sentiment = (positive_sentiment + negative_sentiment) / 2
        # print('Total (Avg): ' + str(total_sentiment))

        return total_sentiment
Code Example #3
    def process_text(self, text: str) -> Dict[str, int]:
        """
        Splits a long text into words.
        If the `persian_normalize` attribute has been set to True, normalizes `text` with the Hazm Normalizer.
        If the `include_numbers` attribute has been set to False, removes all Persian, English and Arabic numbers
        from `text`.
        :param text: The text we want to process
        :return: a dictionary whose keys are words and whose values are their frequencies.
        """
        flags = (
            re.UNICODE if version < '3' and type(text) is unicode  # noqa: F821
            else 0)

        if self.persian_normalize:
            normalizer = Normalizer()
            text = normalizer.normalize(text)
        if not self.include_numbers:
            text = re.sub(r"[0-9\u06F0-\u06F9\u0660-\u0669]", "", text)

        if self.regexp:
            words = re.findall(self.regexp, text, flags)
        else:
            words = word_tokenize(text)

        if self.collocations:
            word_counts = unigrams_and_bigrams(words, self.normalize_plurals)
        else:
            word_counts, _ = process_tokens(words, self.normalize_plurals)

        return word_counts
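
A minimal usage sketch for process_text, assuming the word cloud class is importable as WordCloudFa from the wordcloud_fa package and that its constructor accepts the persian_normalize and include_numbers flags referenced above; the printed counts are illustrative.

from wordcloud_fa import WordCloudFa

# Count word frequencies in a short Persian sentence, with normalization on
# and digits removed.
wc = WordCloudFa(persian_normalize=True, include_numbers=False)
frequencies = wc.process_text('سلام دنیا سلام')
print(frequencies)  # e.g. {'سلام': 2, 'دنیا': 1}
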
Code Example #4
def prepare_line(line):
    global normalizer, incorrect, correct, unicode_redundant_chars, whitespace_chars, digits, punct_str, punctuations
    if normalizer is None:
        normalizer = Normalizer()
        incorrect, correct = CorrectCodings.loadCodings("TableCodings.txt")

    line = normalizer.normalize(line)
    line = CorrectCodings.CorrectCodingInLine(line, incorrect, correct)
    # remove URLs
    pat = re.compile(r"https?\S+")
    line = re.sub(pat, r" ", line)

    pat = re.compile(r"\\n")
    line = re.sub(pat, "\n", line)
    pat = re.compile(r"([^\sا-ی۰-۹a-zA-Z\d])")
    line = re.sub(pat, r" \1 ", line)

    # pad punctuation with spaces (compile once; escape any regex metacharacters)
    pat = re.compile(r"([" + re.escape(punct_str) + "])")
    line = re.sub(pat, r" \1 ", line)

    pat = re.compile(r"([" + digits + "]+)")
    line = re.sub(pat, r" \1 ", line)
    pat = re.compile(r" +")
    line = re.sub(pat, r" ", line)
    pat = re.compile(r"\n+")
    line = re.sub(pat, r" \n ", line)
    pat = re.compile("[" + whitespace_chars + "]+")
    line = re.sub(pat, r" ", line)
    line = line.strip()
    return line
Code Example #5
    def handle(self, *args, **options):
        articles = Article.objects.filter(is_vectorized=False)

        N = Normalizer()
        FT = fasttext.load_model(options['path'])

        index = 0
        for article in articles:
            try:
                if index % 100 == 0:
                    print(index)
                text = N.normalize(article.text)
                text = text.translate(str.maketrans('', '', punctuation))
                text = text.split()
                text = [word for word in text if len(word) > 2]
                vector = nan_to_num(
                    mean([FT.get_word_vector(w) for w in text], axis=0))
                vector = vector / (vector.dot(vector))**0.5
                obj = ArticleVector(article=article, embedding=vector.tolist())
                obj.save()
                article.is_vectorized = True
                article.save()
                index += 1
            except Exception as e:
                print(e)
Code Example #6
File: pos.py Project: salehmontazeran/nlp-project
 def __init__(self, inFile, outFile):
     self.inFile = inFile
     self.outFile = outFile
     self.normalizer = Normalizer()
     self.tagger = POSTagger(model='resources/postagger.model')
     self.lemmatizer = Lemmatizer()
     self.stemmer = Stemmer()
Code Example #7
def clean_persianText(txt):
    normalizer = Normalizer()
    txt = normalizer.character_refinement(txt)
    txt = normalizer.affix_spacing(txt)
    txt = normalizer.punctuation_spacing(txt)
    txt = txt.replace('.', '')
    txt = normalizer.normalize(txt)
    return txt
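
A hedged usage sketch for clean_persianText; the output shown in the comment is indicative only, since the exact normalization depends on the installed Hazm version.

# Arabic yeh/kaf are expected to be mapped to their Persian forms by the
# normalizer, and the literal '.' is stripped by the replace() call above.
raw = 'علي كتاب خود را آورد.'
print(clean_persianText(raw))  # e.g. 'علی کتاب خود را آورد'
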
Code Example #8
    def __init__(self, path_dataset, path_stopwords):

        self.path_dataset = path_dataset

        self.path_stopwords = path_stopwords

        self.stopwords = self.file_reader(self.path_stopwords)

        self.normalizer = Normalizer()
Code Example #9
def statement_pre_processing(input_statement):
    normalizer = Normalizer()
    lemmatizer = Lemmatizer()
    input_statement = normalizer.normalize(input_statement)
    input_statement = [
        lemmatizer.lemmatize(word) for word in word_tokenize(input_statement)
        if word not in stops
    ]
    return input_statement
Code Example #10
    def bigram_cleaner(text):
        text = re.sub(Text_cleaner.persian_regex, ' ', text)
        text = re.sub('[ ]+', ' ', text)

        normalizer = Normalizer()
        text = normalizer.normalize(text)

        tokenized = word_tokenize(text)
        return tokenized
Code Example #11
    def __init__(self):
        # persian words normalizer
        self.normalizer = Normalizer()

        # load stopwords
        logger.info(f"Loading stopwords from {DATA_DIR / 'stopwords.txt'}")
        stop_words = open(DATA_DIR / 'stopwords.txt').readlines()
        stop_words = map(str.strip, stop_words)
        self.stop_words = set(map(self.normalizer.normalize, stop_words))
Code Example #12
 def normalize(self):
     """
     :return:
     """
     normalizer = Normalizer()
     for line in self.data.split('\n'):
         if line != "":
             self.normalize_text.append(normalizer.normalize(line))
     return self.normalize_text
Code Example #13
File: find_mul.py Project: mraarabzadeh/internship
def tokenize(paragraph, wanted_list):
    normal = Normalizer(remove_extra_spaces=True,
                        punctuation_spacing=True,
                        persian_style=False,
                        persian_numbers=False,
                        remove_diacritics=False,
                        affix_spacing=False,
                        token_based=False)
    for sentence in sent_tokenize(normal.normalize(paragraph)):
        wanted_list.append(sentence)
Code Example #14
File: WordCloudFa.py Project: sjmars/word_cloud_fa
 def normalize_words(words: Iterable) -> List[str]:
     """
     This method gets an Iterable containing some Farsi words as elements, normalizes them using Hazm and then
     returns a list of normalized words.
     :param words: an iterable including words
     :return: A list of normalized elements of the `words` iterable.
     """
     combined_words: str = "".join(x + "\n" for x in words)
     normalizer: Normalizer = Normalizer()
     normalized_combined_words: str = normalizer.normalize(combined_words)
     return normalized_combined_words.split("\n")
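
A short, hedged example of calling normalize_words. It is assumed to be exposed as a static method of WordCloudFa (the signature above takes no self); the words and the expected output are illustrative.

from wordcloud_fa import WordCloudFa

words = ['كتاب', 'سلام']  # the first word uses an Arabic kaf
print(WordCloudFa.normalize_words(words))
# e.g. ['کتاب', 'سلام', ''] -- a trailing empty string may appear because of
# the final "\n" appended before splitting.
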
Code Example #15
File: tut.py Project: mhbashari/pervis
def test_word_visualization(model_path, some_words):
    normalizer = Normalizer()
    model = word2vec.Word2Vec.load(model_path)
    vectors = [
        model[normalizer.normalize(word)] for word in some_words
        if normalizer.normalize(word) in model.vocab.keys()
    ]
    # print(model[normalizer.normalize('فرهنگ')])
    # print(model.similarity('فرهنگ', 'تمدن'))
    # print(vectors)
    rd = W2VPersianVis(model_path, selected_words=some_words)
    rd.show_plot()
Code Example #16
 def preprocess(self, cm):
     cm = ''.join([c for c in str(cm) if c not in punctuation])
     cm = self._numbers_to_english(cm)
     cm = re.sub(r"[0-9]", '', cm)
     cm = cm.replace('\u200c', ' ').replace('\n',
                                            '').replace('\r', '').replace(
                                                'ي', 'ی').replace('ك', 'ک')
     normalizer = Normalizer()
     cm = normalizer.normalize(cm)
     tokens = word_tokenize(cm)
     cm = ' '.join([x for x in tokens if x not in self.stopwords])
     return cm
Code Example #17
def tokenizer(input_var):
    tokenized = []
    normalizer1 = Normalizer(True, False, False)
    normalizer2 = Normalizer(False, True, False)
    normalizer3 = Normalizer(False, False, True)
    word_tokenizer = WordTokenizer(False)
    input_var = normalizer1.normalize(
        normalizer2.normalize(normalizer3.normalize(input_var)))
    actual = word_tokenizer.tokenize(input_var)
    lemmatizer = Lemmatizer()

    # stemmer = Stemmer

    for x in actual:
        # print(x);
        # Hazm returns verb lemmas as 'past_stem#present_stem';
        # rebuild the infinitive by appending 'ن' to the past stem.
        s = lemmatizer.lemmatize(x)
        if "#" in s and s.split("#")[0] != "":
            tokenized.append(s.split("#")[0] + "ن")
        else:
            tokenized.append(s.replace("#", ""))
    return tokenized
Code Example #18
def pipeline_sentence(sentence, model, tokenizer):
    sentence = change_words(sentence)

    normalizer = Normalizer()
    sentence = normalizer.normalize(sentence)
    sentence_lem = ' '.join([
        Lemmatizer().lemmatize(x)
        for x in word_tokenize(normalizer.normalize(sentence))
    ])
    nlp = pipeline("ner", model=model, tokenizer=tokenizer)
    sentence_ner = nlp(sentence)
    sentence_ner_lem = nlp(sentence_lem)
    return sentence_ner, sentence_ner_lem, sentence_lem, sentence
Code Example #19
def preprocess(doc):
    stemmer = Stemmer()
    lemmatizer = Lemmatizer()
    normalizer = Normalizer()
    doc = normalizer.normalize(doc)
    tokenized = re.split(' |-', doc)
    for w in tokenized[:]:
        if w in stopwords:
            tokenized.remove(w)
    stemmed = [stemmer.stem(w) for w in tokenized]
    new_words = [word for word in stemmed if word.isalnum()]
    lemmatized = [lemmatizer.lemmatize(w) for w in new_words]
    return lemmatized
Code Example #20
def dataset_cleaner(dataset):
    statements = []
    normalizer = Normalizer()
    lemmatizer = Lemmatizer()
    for i in range(len(dataset)):
        normalized_statement = normalizer.normalize(dataset[i])
        # for sentence in sent_tokenize(dataset[i]):
        word_list = [
            lemmatizer.lemmatize(word)
            for word in word_tokenize(normalized_statement)
            if word not in stops
        ]
        statements.append(word_list)
    return statements
Code Example #21
def clean(sentence):
    # trim leading digits
    ind = 0
    for i in range(len(sentence)):
        if (sentence[i] in FARSI_DIGITS or sentence[i] in ENGLISH_DIGITS):
            ind += 1
        else:
            break
    sentence = sentence[ind:]

    # keep only Farsi alphabet characters and Farsi digits
    res = []
    for i in range(len(sentence)):
        if (sentence[i] in FARSI_ALPHABET or sentence[i] in FARSI_DIGITS):
            res.append(sentence[i])
    sentence = "".join(res)
    normalizer = Normalizer()
    sentence = normalizer.normalize(sentence)

    return sentence
Code Example #22
def process_text(text):
    normalizer = Normalizer()
    text = normalizer.normalize(text)
    text = text.replace("_", " ")
    text = text.replace(',', ' ')
    text = text.replace("\u220c", "")
    text = text.replace("\u200c", "")  # strip zero-width non-joiners
    text = text.replace("-", "")
    # text = text.replace('/', ' ')
    text = text.replace('(', ' ')
    text = text.replace(')', ' ')
    text = text.replace('.', ' ')
    text = text.replace("،", " ")
    text = text.replace("«", " ")
    text = text.replace("»", " ")
    # Convert the string to a list of tokens: keep Arabic-script words plus the
    # <S>, </s>, '?' and '//' markers; everything else is dropped.
    t = re.findall(r"[\u0627-\u06FF]+|<S>|</s>|\?|//", text)
    lemma = Lemmatizer()
    text = [lemma.lemmatize(x) for x in t]
    return text
Code Example #23
def prepare_line(line):
    global normalizer, incorrect, correct, unicode_redundant_chars, whitespace_chars, digits, punct_str, punctuations
    if normalizer is None:
        normalizer = Normalizer()
        incorrect, correct = loadCodings("TableCodings.txt")

    line = normalizer.normalize(line)
    line = CorrectCodingInLine(line, incorrect, correct)

    pat = re.compile(r"([" + re.escape(punct_str) + "])")
    line = re.sub(pat, r" \1 ", line)

    pat = re.compile(r"([" + digits + "]+)")
    line = re.sub(pat, r" \1 ", line)
    pat = re.compile(r"\n+")
    line = re.sub(pat, r" \n ", line)
    pat = re.compile("[" + whitespace_chars + "]+")
    line = re.sub(pat, r" ", line)
    line = line.strip()
    return line
Code Example #24
    def prepare_text(text, should_stem=True):
        normalizer = Normalizer()
        text = normalizer.normalize(text)
        tokenized = word_tokenize(text)

        # strip punctuation marks
        def fix_word(w):
            #            for c in Text_cleaner.punct_list:
            #                w = w.replace(c, '')
            w = re.sub(Text_cleaner.punct_regex, '', w).replace('،', '')
            return "$" if w == "" else w

        punc_free = list(filter(lambda x: x != '$', map(fix_word, tokenized)))
        stemmer = Stemmer()
        if should_stem:
            stemmed_list = list(
                filter(lambda x: x != '', map(stemmer.stem, punc_free)))
        else:
            stemmed_list = punc_free

        return stemmed_list
Code Example #25
def clean_tweet(tweet):
    tweet = str(tweet)
    tweet = tweet.lower()
    # remove # so we preserve hashtags for the cloud
    tweet = tweet.replace("#", "")
    tweet = remove_links(tweet)
    tweet = remove_mentions(tweet)
    tweet = remove_emoji(tweet)
    tweet = remove_punctuations(tweet)
    tweet = remove_reserved_words(tweet)
    normalizer = Normalizer()
    tweet = normalizer.normalize(tweet)
    # replace arabic ي with persian
    tweet = tweet.replace('ي', 'ی')
    # removes verbs such as می‌شود or نمی‌گویند
    tweet = re.sub(r'ن?می[‌]\S+', '', tweet)
    tokens = word_tokenize(tweet)
    tokens = [token for token in tokens if not token.isdigit()]
    tokens = [token for token in tokens if token not in stopwords.persian]
    tokens = [token for token in tokens if token not in stopwords.english]
    return " ".join(tokens).strip()
Code Example #26
File: WordCloudFa.py Project: sjmars/word_cloud_fa
    def process_text(self, text: str) -> Dict[str, int]:
        """
        Splits a long text into words.
        If the `persian_normalize` attribute has been set to True, normalizes `text` with the Hazm Normalizer.
        If the `include_numbers` attribute has been set to False, removes all Persian, English and Arabic numbers
        from `text`.
        Attention: this method will not remove stopwords from the input.
        :param text: The text we want to process
        :return: a dictionary whose keys are words and whose values are their frequencies.
        """
        flags = (
            re.UNICODE if version < '3' and type(text) is unicode  # noqa: F821
            else 0)

        if self.remove_unhandled_utf_characters:
            text = WordCloudFa.unhandled_characters_regex.sub(r'', text)

        if self.persian_normalize:
            normalizer = Normalizer()
            text = normalizer.normalize(text)
        if not self.include_numbers:
            text = re.sub(r"[0-9\u06F0-\u06F9\u0660-\u0669]", "", text)

        if self.regexp:
            words = re.findall(self.regexp, text, flags)
        else:
            words = word_tokenize(text)

        if self.collocations:
            # We remove stopwords in the WordCloudFa, so there is no need for passing them in this function.
            word_counts = unigrams_and_bigrams(words, [],
                                               self.normalize_plurals,
                                               self.collocation_threshold)
        else:
            word_counts, _ = process_tokens(words, self.normalize_plurals)

        return word_counts
Code Example #27
def tok(dataTok):
    normalizer = Normalizer()
    tokenizer = WordTokenizer(join_verb_parts=False,
                              replace_links=True,
                              replace_IDs=True,
                              replace_numbers=True,
                              replace_hashtags=True)
    s = time.time()
    ij = 0
    #dataTok.apply (lambda x: dataTok1.append(sent_tokenize(x)) )

    for row in dataTok:
        _sents = sent_tokenize(row)
        _sents = stop_word(_sents)
        for _sent in _sents:
            _temp = _sent.replace(".", "").replace(",", "").replace(
                "،", "").replace("؛", "").strip()
            _normalized = normalizer.normalize(_temp)
            dataTok1.append(tokenizer.tokenize(_normalized))

    print("Data: ", len(dataTok1))
    e = time.time()
    print("Tokenize Done, Time: ", e - s, " !\n")
Code Example #28
File: preprocess.py Project: Gods-of-Bigdata/SS_yab
    def __init__(self,
                 corpus_path='resources/corpus.json',
                 symbols_json_path='resources/symbols.json',
                 persian_lang_path='resources/persian_lang.json',
                 postagger_model_path='resources/postagger.model',
                 max_keyword_num=10, min_keyword_occurrences=0.01, expand_corpus=False):
        self.postagger_model_path = postagger_model_path
        self.symbols_json_path = symbols_json_path
        self.corpus_path = corpus_path
        self.corpus = {}
        self.docs_num = 0
        self.expand_corpus = expand_corpus

        if self.corpus_path is not None:
            with open(corpus_path, encoding='utf-8') as json_file:
                corpus = json.load(json_file)
            self.corpus = corpus['corpus']
            self.docs_num = corpus['docs_num']

        with open(symbols_json_path, encoding='utf-8') as json_file:
            data = json.load(json_file)
        lst = list(data.values())
        self.all_symbols_list = [item for sublist in lst for item in sublist]

        with open(persian_lang_path, encoding='utf-8') as json_file:
            persian_lang = json.load(json_file)

        self.epic_keywords = persian_lang['epic_keywords']
        self.punctuations = persian_lang['punctuations']
        self.persian_alphabet = persian_lang['persian_alphabet']
        self.stop_words = persian_lang['stop_words']

        self.tagger = POSTagger(model=self.postagger_model_path)
        self.normalizer = Normalizer()
        self.max_keyword_num = max_keyword_num
        self.min_keyword_occurrences = min_keyword_occurrences
Code Example #29
stopwords = stopwords_f.readlines()
for i in range(len(stopwords)):
    stopwords[i] = stopwords[i].replace("\n", "")
samewords_f = open('same_words.txt', 'r', encoding='utf-8')
samewords = samewords_f.readlines()
#samewords_tokens = word_tokenize(samewords_f.read(),"\n")
for i in range(len(samewords)):
    samewords[i] = samewords[i].replace("\n", "")
    samewords[i] = word_tokenize(samewords[i])
#print('same=' + str(samewords))
samewords_f.close()
stopwords_f.close()
#print('stop='+str(stopwords))

lemmatizer = Lemmatizer()
normalizer = Normalizer()
#print(query_process("ما تو را کودک،. کتابهای به برای دوست داریم خودرو را هنوز اتومبیل"))


@app.route('/api/dataframe', methods=['GET'])
def df():
    return j

def find_in_dictionary(word, dictionary1):
    if word in dictionary1:
        return dictionary1[word].copy()  # returns a copy of the list of docIDs in which the term occurs
    else:
        return []


 
Code Example #30
 def __init__(self):
     self.stemmer = Stemmer()
     self.normalizer = Normalizer()
     self.punctuations = string.punctuation