Example #1
def calculate_class_score(sentence, class_name, show_details=False):
    score = 0
    stemmer = RussianStemmer()
    # tokenize each word in our new sentence
    for word in nltk.word_tokenize(sentence):
        stem = stemmer.stem(word.lower())
        # check to see if the stem of the word is in any of our classes
        if stem in class_words[class_name]:
            # weight each match by the inverse of its corpus frequency
            score += 1 / corpus_words[stem]

            if show_details:
                print("   match: %s" % stem)
    return score
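A minimal call sketch for the function above, assuming it is in scope together with hypothetical class_words (class name to set of stems) and corpus_words (stem to corpus frequency) dictionaries implied by the snippet; the word lists below are made up:

import nltk
from nltk.stem.snowball import RussianStemmer

stemmer = RussianStemmer()
# hypothetical training state implied by the snippet above
corpus_words = {stemmer.stem("привет"): 2, stemmer.stem("погода"): 1}
class_words = {"greeting": {stemmer.stem("привет")}, "weather": {stemmer.stem("погода")}}

# word_tokenize needs the NLTK tokenizer models, e.g. nltk.download('punkt')
print(calculate_class_score("Привет, как дела?", "greeting", show_details=True))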
Example #2
    def parse(self, fname):
        """
        Парсинг текста файла
        :param fname: имя файла
        :return: (<имя_файла>, тошнота, мошенничество)
        """
        density, fraud = 0, 0
        with codecs.open(fname, "r", encoding="utf-8") as f:
            text = f.read()
        tknz = RegexpTokenizer(pattern="[А-Яа-яA-Za-zё]+")
        txt_list = tknz.tokenize(text)
        if txt_list:
            for i, word in enumerate(txt_list):
                new_word = self.check_word(word)
                if new_word:
                    txt_list[i] = new_word
                    fraud += 1

            txt_list = [
                word.lower() for word in txt_list
                if not (word.lower() in self.sw)
            ]
            stemmer_ru = RussianStemmer()
            txt_list = [
                stemmer_ru.stem(token.lower()) for token in txt_list
                if len(token) > 1
            ]
            dict_w = Counter(txt_list)
            top5 = heapq.nlargest(5, dict_w, key=dict_w.get)
            top5_count = sum([dict_w[word] for word in top5])
            density = top5_count / len(txt_list)
        # the (fraud > 2) threshold was chosen based on testing on the available sample:
        # listings often contain tokens like "ШxДхВ" (WxDxH) that cannot be recognized unambiguously,
        # so this criterion is open to discussion and may need to be adjusted
        return fname, density, fraud > 2
Example #3
    def __init__(self):
        self.words = set()
        self.problems = {}
        self.appearances = {}

        self.filter = Filter()
        self.stemmer = RussianStemmer()
Example #4
def stemming_sent(sent):
    pattern = re.compile('[a-zA-Zа-яА-Я]+')
    words = pattern.findall(sent)
    stemmer = RussianStemmer()
    words = [stemmer.stem(word) for word in words]
    # join the stems back into a sentence; also safe when no words were found
    return ' '.join(words)
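A short, hedged call sketch, assuming stemming_sent from the snippet above is in scope:

import re
from nltk.stem.snowball import RussianStemmer

# prints the sentence with every word replaced by its stem, e.g. "мам мыл рам"
print(stemming_sent("Мама мыла раму"))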
Example #5
    def textrank(self, text, similar='serense'):
        text = treatment_text(text)
        text = text.split('.')
        text = list(filter(lambda x: len(x.split()) > 6, text))
        text = '.'.join(text)

        sentences = sent_tokenize(text)
        tokenizer = RegexpTokenizer(r'\w+')
        lmtzr = RussianStemmer()
        words = [
            set(
                lmtzr.stem(word)
                for word in tokenizer.tokenize(sentence.lower()))
            for sentence in sentences
        ]

        pairs = combinations(range(len(sentences)), 2)
        if similar == 'serense':
            scores = [(i, j, self.similarity_1(words[i], words[j]))
                      for i, j in pairs]
        elif similar == 'cos':
            scores = [(i, j, self.similarity_2(words[i], words[j]))
                      for i, j in pairs]

        scores = filter(lambda x: x[2], scores)

        g = nx.Graph()
        g.add_weighted_edges_from(scores)
        pr = nx.pagerank(g)

        return sorted(
            ((i, pr[i], s) for i, s in enumerate(sentences) if i in pr),
            key=lambda x: pr[x[0]],
            reverse=True)
Example #6
    def parse_text(self, text):
        text = list(text)

        # replace every non-Cyrillic character with a space
        for i in range(len(text)):
            # note: 'ё'/'Ё' fall outside these ranges and are replaced as well
            is_cyrillic_symbol = 'А' <= text[i] <= 'Я' or 'а' <= text[i] <= 'я'
            if not is_cyrillic_symbol:
                text[i] = ' '

        text = ''.join(text)
        text = text.split()
        # build the stopword set once instead of once per word
        russian_stopwords = set(stopwords.words('russian'))
        filtered_words = [
            word for word in text
            if word not in russian_stopwords and word not in self.badwords
        ]

        stemmer = RussianStemmer()

        for i in range(len(filtered_words)):
            filtered_words[i] = stemmer.stem(filtered_words[i])

        return filtered_words
Example #7
    def learn(self, class_name):
        self.classes.add(class_name)
        print(class_name)
        self.words_freq[class_name] = {}
        if class_name == "internet":
            dir_name = learn_internet
        else:
            dir_name = learn_nointernet

        for file_name in os.listdir(dir_name):
            print "processing", file_name
            text = open(dir_name + "/" + file_name, "r").read().decode("utf-8")
            words = [word.lower() for word in tokenizers.extract_words(text)]
            self.docs_number += 1
            self.unique_words_set = self.unique_words_set | set(words)
            stemmer = RussianStemmer()
            for word in words:
                stemmed = stemmer.stem(word)
                if stemmed in self.words_freq[class_name]:
                    self.words_freq[class_name][stemmed] += 1
                else:
                    self.words_freq[class_name][stemmed] = 1

            if class_name in self.words_in_class:
                self.words_in_class[class_name] += len(words)
                self.docs_in_class[class_name] += 1
            else:
                self.words_in_class[class_name] = len(words)
                self.docs_in_class[class_name] = 1
Example #8
    def test(self, mode, bpe_model_path=None):
        while True:
            file_path = input("File path: ").strip()
            file_path = r"C:\Users\lezgy\OneDrive\Рабочий стол\Data_summ\data.txt"
            if file_path == "q":
                break
            try:
                with open(file_path, "r", encoding="utf-8") as r:
                    article = r.read().strip().split("\n")
                    article = " ".join(article)
                    if mode in ["lemm", "stem", "gram", "base"]:
                        article = article.lower()
                        article = word_tokenize(article)
                        article = " ".join(article)
                    print(f"real_text : {article}")

                if mode == "lemm":
                    lemmatizer = mystem.Mystem()
                    article = preprocess_lemm(article, lemmatizer)
                elif mode == "stem":
                    stemmer = RussianStemmer(False)
                    article = preprocess_stemm(article, stemmer)
                elif mode == "gram":
                    token_model = youtokentome.BPE(model=bpe_model_path)
                    article = preprocess_gramm(article, token_model)
                self.test_calc(article)
            except Exception as e:
                print(e)
                print("File not found")
    def prep_stem(self, text):
        """
        Eng:
        ============================================================================
        :param text: Text for preprocessing;

        :return: Preprocessed text with all stemmed words.

        Stem all words with Porter stemmer.
        ============================================================================

        Ru:
        ============================================================================
        :param text: Текст для предобработки;

        :return: Обработанный текст, в котором каждое слово подвергнулось стеммингу.

        Стеммингует все слова с помощью стеммера Портера.
        ============================================================================
        """
        if isinstance(text, str):
            if self.lang == "ru":
                return " ".join(
                    [RussianStemmer().stem(word) for word in text.split()])
            return " ".join(
                [PorterStemmer().stem(word) for word in text.split()])
        else:
            raise TypeError("Argument must be str!")
Example #10
def tokenize(text):
    from nltk.stem.snowball import RussianStemmer

    def is_ok(item, stemmer):
        # keep lower-case, purely alphabetic, non-Latin tokens whose stem is not a stopword
        return (
            item.lower() == item
            and all(elem.isalpha() and elem not in string.ascii_letters for elem in item)
            and stemmer.stem(item) not in stopwords
        )

    stemmer = RussianStemmer(ignore_stopwords=True)
    tokens = word_tokenize(text)
    return [item for item in tokens if is_ok(item, stemmer)]
Example #11
def stemming(corpus):
    stemmer = RussianStemmer()
    stems = []
    for comment in corpus:
        comment = comment.split()
        s = [stemmer.stem(word) for word in comment]
        stems.append(' '.join(s))
    return stems
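A hedged usage sketch for the function above, using a tiny made-up corpus:

from nltk.stem.snowball import RussianStemmer

corpus = ["очень хороший фильм", "ужасно скучная книга"]
# returns one space-joined string of stems per comment
print(stemming(corpus))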
Example #12
def method2(tokens):
    print("The way 2")
    stemmer = RussianStemmer(False)
    dictionary = dict()
    for word in tokens:
        normal_form = stemmer.stem(word)
        dictionary[normal_form] = dictionary.get(normal_form, 0) + 1
    printDic(dictionary, 2)
Example #13
def preprocessing(sentence):
    porter = RussianStemmer()
    punctuation = string.punctuation + "«»—•’"
    stop = stopwords.words('russian')

    for p in punctuation:
        sentence = sentence.replace(p, "")
    sentence = [porter.stem(word) for word in sentence.split() if word not in stop]
    return sentence
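A minimal call sketch, assuming the function above is in scope and the NLTK Russian stopword list has been downloaded (nltk.download('stopwords')):

import string
from nltk.corpus import stopwords
from nltk.stem.snowball import RussianStemmer

# returns a list of stems with punctuation and Russian stopwords stripped
print(preprocessing("Это был очень хороший, добрый фильм!"))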
Example #14
 def __init__(self):
     self.stop_words = list(set(stopwords.words('russian')).union(set(stopwords.words('english'))))
     self.vectorizer = CountVectorizer(max_df=0.75)
     self.transformer = TfidfTransformer()
     self.scaler = MaxAbsScaler()
     self.classifier = LogisticRegression()
     self.swearings_list = []
     self.stemmer = RussianStemmer()
     with open('swearings.txt', 'r', encoding='utf-8') as file:
         self.swearings_list = list(map(self.stemmer.stem, file.read().split()))
Example #15
def normailize_text(
        data,
        tok=RegexpTokenizer(r'\w[\w\/\-]+'),
        stemmers=[RussianStemmer(ignore_stopwords=True), PorterStemmer()]
):
    # tokenize text into words
    # sequentially apply all stemmers to tokenized words
    # join stemmed words back to sentences
    return [' '.join([reduce(lambda v,f: f.stem(v), stemmers, w) for w in tok.tokenize(line)])
            for line in data]
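A hedged usage sketch; note that in Python 3 reduce must be imported from functools, and these imports have to precede the function definition because the tokenizer and stemmers are built in its default arguments (RussianStemmer(ignore_stopwords=True) also needs the NLTK stopword corpus):

from functools import reduce
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.stem.snowball import RussianStemmer

# each line is tokenized, every token is run through both stemmers, then re-joined
print(normailize_text(["Новые быстрые серверы", "Старая медленная система"]))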
Example #16
def index(pathh):
    cont = txt_reader(pathh)
    cont = txt_parser(cont)
    stem = RussianStemmer(False)
    stemmed_text = text_stemmer(cont, stem)
    token = stemmed_tokenizer(stemmed_text)
    token.append([])
    vect_tfidf = bool_tf_tfidf(token)[2]
    csv_safe(vect_tfidf)
    return vect_tfidf
Example #17
 def __init__(self, *args, **kwargs):
     super().__init__(*args, **kwargs)
     choices = hay_forms.model_choices()
     self.fields["models"] = forms.ChoiceField(choices=choices,
                                               required=False,
                                               label='Искать',
                                               widget=forms.RadioSelect,
                                               initial=choices[0][0])
     self.stopwords = set(stopwords.words('russian'))
     self.stemmer = RussianStemmer()
     self.tokenizer = RegexpTokenizer(r'\w+')
Example #18
    def __init__(self, lang, make_delimiters=True):
        self.__stemmers = {
            'en': EnglishStemmer(),
            'ru': RussianStemmer(),
        }

        if lang not in ('en', 'ru'):
            raise ValueError("lang must be 'en' or 'ru'")

        self.__active_stemmer = self.__stemmers[lang]
        self.__make_delimiters = make_delimiters
Example #19
 def __init__(self):
     """
     vect_theme - векторизатор для строк-тем
     vect_poem - векторизатор для строк-стихов
     lin_model - обученная модель логрегрессии
     """
     self.vect_theme = None
     self.vect_poem = None
     self.lin_model = None
     self.stemmer = RussianStemmer(True)
     self.stop_w = stopwords.words('russian')
Example #20
def nltk_preprocessor(sentences):
    ''' tokenization + stemming '''

    tokenizer = RegexpTokenizer(r'\w+')
    # stem each word down to its root form
    lmtzr = RussianStemmer()
    words = [set(lmtzr.stem(word)                                # stemming
                for word in tokenizer.tokenize(sentence.lower()) # tokenization
             )
             for sentence in sentences
    ]
    return words
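A brief call sketch for the preprocessor above, assuming it is in scope:

from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import RussianStemmer

sentences = ["Погода сегодня хорошая.", "Завтра будет дождь."]
# returns a list with one set of stems per sentence
print(nltk_preprocessor(sentences))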
Example #21
 def __init__(self, stopwords, ignorechars, docs):
     self.stemmer = RussianStemmer()
     self.wdict = {}
     self.dictionary = []
     self.stopwords = stopwords
     if type(ignorechars) == unicode:
         ignorechars = ignorechars.encode('utf-8')
     self.ignorechars = ignorechars
     self.docss = []
     self.docs = docs
     for doc in docs:
         self.add_doc(doc)
Example #22
def pre_process(string):
    s = lower_case(string)
    s = fix_lt(s)
    s = strip_punctuation(s)
    s = remove_stop_words(s, stop_words_ru)
    s = remove_stop_words(s, stop_words_eng)
    s = compact_whitespace(s)
    s = replace_numeric_with_literal(s)
    stemmer = RussianStemmer()
    s = stem(s, stemmer, stop_words_ru)
    stemmer = EnglishStemmer()
    s = stem(s, stemmer, stop_words_eng)
    return s.strip()
Example #23
    def __init__(self, l):
        self._p = re.compile(r'\W+')

        assert (l == 'r' or l == 'n' or l == 'e')

        if l == 'r':
            self._stemmer = RussianStemmer()
            self._stopWords = stopwords.words('russian')
        elif l == 'e':
            self._stemmer = EnglishStemmer()
            self._stopWords = stopwords.words('english')
        elif l == 'n':
            self._stemmer = NorwegianStemmer()
            self._stopWords = stopwords.words('norwegian')
Example #24
def process_file_stemm(file_name):
    print(file_name)
    cnt = 0
    result = []
    stemmer = RussianStemmer(False)
    with open(file_name, "r", encoding="utf-8") as r:
        for line in r.readlines():
            try:
                line = prepare_stem_data(line, stemmer)
                result.append(line)
            except Exception:
                cnt += 1
    print(f"Bad lines: {cnt}")
    with open(file_name.replace("chunked_lenta_news", "chunked_lenta_news_gramms"), "w", encoding="utf-8") as w:
        w.write("\n".join(result))
Example #25
def detect_cheat_in_text(text):
    """Detect cheats in text"""
    new_text = []
    is_cheat = False
    for word in text:
        is_cheated_word, recovery_token = detect_cheat(word)
        if is_cheated_word:
            is_cheat = True
            new_text.append(recovery_token)
    stop_words = set(stopwords.words('russian'))

    st = RussianStemmer()

    new_text = [word for word in new_text if (word not in stop_words)]
    return is_cheat, [st.stem(word) for word in new_text]
Example #26
def context2vec(context,
                emb,
                max_len=30,
                emb_size=300,
                stemmer=RussianStemmer()):
    assert len(context) == 4
    vecs = None
    for sent in context:
        sent = clear_sent(sent, stemmer)
        vec = sent2vec(sent, emb, max_len, emb_size)
        if vecs is None:
            vecs = vec
        else:
            vecs = np.concatenate([vecs, vec], axis=0)

    return vecs
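Example #27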
 def __init__(self):
     self.model = joblib.load("./models/clf.pkl")
     self.vectorizer = joblib.load("./models/vectorizer.pkl")
     self.classes_dict = {
         0: "отрицательный",
         1: "положительный",
         -1: "ошибка"
     }
     self.numbers_str = '0123456789'
     self.punc_translator = str.maketrans(string.punctuation,
                                          ' ' * len(string.punctuation))
     self.num_translator = str.maketrans(self.numbers_str,
                                         ' ' * len(self.numbers_str))
     self.short_word_len = 1
     self.stemmer = RussianStemmer()
     self.stop_words = stopwords.words('russian') + ['br']
Example #28
    def stem_words(self, words):
        """ Stem words by Porter or Snowball stemmers and join to one string """

        stemmer = None

        if self.lang == 'uk':
            return ' '.join(
                [UkrainianStemmer(word).stem_word() for word in words])

        elif self.lang == 'ru':
            stemmer = RussianStemmer()

        elif self.lang == 'en':
            stemmer = EnglishStemmer()

        return ' '.join([stemmer.stem(word) for word in words])
Example #29
def init():
    global VOCAB_SIZE
    global token_2_idx
    global stem_cache
    global stemer
    global regex

    VOCAB_SIZE = 5000
    stem_vocab = pd.read_json("stem_vocab.json", encoding='utf-8')
    token_2_idx = {stem_vocab.values[i][0]: i for i in range(VOCAB_SIZE)}

    with open('stem_cache.json') as f:
        stem_cache = json.load(f)

    stemer = RussianStemmer()
    regex = re.compile('[^а-яА-Я ]')
Example #30
    def stem_keyword(self):
        """ Stem keyword by Porter or Snowball stemmers """

        if self.language == 'uk':
            self.keyword = UkrainianStemmer(self.keyword).stem_word()
            return

        elif self.language == 'ru':
            stemmer = RussianStemmer()

        elif self.language == 'en':
            stemmer = EnglishStemmer()

        else:
            return

        self.keyword = stemmer.stem(self.keyword)