def learn(self, class_name): self.classes.add(class_name) print class_name self.words_freq[class_name] = {} if class_name is "internet": dir_name = learn_internet else: dir_name = learn_nointernet for file_name in os.listdir(dir_name): print "processing", file_name text = open(dir_name + "/" + file_name, "r").read().decode("utf-8") words = [word.lower() for word in tokenizers.extract_words(text)] self.docs_number += 1 self.unique_words_set = self.unique_words_set | set(words) stemmer = RussianStemmer() for word in words: stemmed = stemmer.stem(word) if stemmed in self.words_freq[class_name]: self.words_freq[class_name][stemmed] += 1 else: self.words_freq[class_name][stemmed] = 1 if class_name in self.words_in_class: self.words_in_class[class_name] += len(words) self.docs_in_class[class_name] += 1 else: self.words_in_class[class_name] = len(words) self.docs_in_class[class_name] = 1
def textrank(self, text, similar='serense'):
    """Rank the sentences of *text* with TextRank.

    :param text: raw input text
    :param similar: similarity measure, 'serense' or 'cos'
    :return: list of (sentence index, pagerank score, sentence),
        best-scoring first
    :raises ValueError: on an unknown *similar* value
    """
    # BUG FIX: an unrecognized `similar` used to leave `scores` unbound and
    # crash later with NameError; fail fast with a clear message instead.
    if similar not in ('serense', 'cos'):
        raise ValueError("unknown similarity measure: %r" % (similar,))
    text = treatment_text(text)
    # Drop sentences of six words or fewer -- too short to rank usefully.
    text = '.'.join(s for s in text.split('.') if len(s.split()) > 6)
    sentences = sent_tokenize(text)
    tokenizer = RegexpTokenizer(r'\w+')
    lmtzr = RussianStemmer()
    words = [
        set(lmtzr.stem(word)
            for word in tokenizer.tokenize(sentence.lower()))
        for sentence in sentences
    ]
    pairs = combinations(range(len(sentences)), 2)
    if similar == 'serense':
        scores = [(i, j, self.similarity_1(words[i], words[j]))
                  for i, j in pairs]
    else:
        scores = [(i, j, self.similarity_2(words[i], words[j]))
                  for i, j in pairs]
    # Keep only sentence pairs with a non-zero similarity.
    scores = filter(lambda x: x[2], scores)
    g = nx.Graph()
    g.add_weighted_edges_from(scores)
    pr = nx.pagerank(g)
    return sorted(
        ((i, pr[i], s) for i, s in enumerate(sentences) if i in pr),
        key=lambda x: pr[x[0]],
        reverse=True)
def fillDicts(self, maxDocs=0): self.classes = set() self.documentsInClass = dict() #количество документов в классе self.documentsNumber = 0 # число документов self.uniqueWords = set() # множество уникальных слов self.wordsInClass = dict() # количество слов в классе self.wordsFreq = dict() # частота появления слова в классе i = 0 for document in self.collection.find(): i += 1 if i > maxDocs and maxDocs > 0: break if i % 100 == 0: print "Processed " + str(i) + " documents" self.classes.add(document['topic']) match = re.findall(re.compile(u"[а-яА-Яa-zA-Z0-9]*"), document['body']) match = [word for word in match if word != ''] self.documentsNumber += 1 self.uniqueWords = self.uniqueWords | set(match) wordsFreq = dict() stemmer = RussianStemmer() for _match in match: stemmed = stemmer.stem(_match) if stemmed in wordsFreq: wordsFreq[stemmed] += 1 else: wordsFreq[stemmed] = 1 if document['topic'] in self.wordsInClass: self.wordsInClass[document['topic']] += len(match) self.wordsFreq[document['topic']].update(wordsFreq) self.documentsInClass[document['topic']] += 1 else: self.wordsInClass[document['topic']] = len(match) self.wordsFreq[document['topic']] = wordsFreq self.documentsInClass[document['topic']] = 1
def parse(self, fname):
    """Parse a text file.

    :param fname: file name
    :return: (file name, top-5 word "density", fraud suspected?)
    """
    density, fraud = 0, 0
    with codecs.open(fname, "r", encoding="utf-8") as f:
        text = f.read()
    # NOTE(review): "A-z" in the character class also matches [\]^_` --
    # kept byte-identical to preserve behavior; confirm it is intended.
    tknz = RegexpTokenizer(pattern="[А-Яа-яA-zё]+")
    txt_list = tknz.tokenize(text)
    if txt_list:
        for i, word in enumerate(txt_list):
            new_word = self.check_word(word)
            if new_word:
                txt_list[i] = new_word
                fraud += 1
        # Drop stop words, then stem the tokens longer than one character.
        txt_list = [
            word.lower() for word in txt_list
            if not (word.lower() in self.sw)
        ]
        stemmer_ru = RussianStemmer()
        txt_list = [
            stemmer_ru.stem(token.lower()) for token in txt_list
            if len(token) > 1
        ]
        # BUG FIX: stop-word/length filtering can empty the list even
        # though the raw token list was non-empty; guard the division.
        if txt_list:
            dict_w = Counter(txt_list)
            top5 = heapq.nlargest(5, dict_w, key=dict_w.get)
            top5_count = sum([dict_w[word] for word in top5])
            density = top5_count / len(txt_list)
    # такой критерий (fraud > 2) был выбран на основании тестирования на имеющейся выборке
    # часто попадается такое, что в объявлении есть слова типа "ШxДхВ" которые мы не можем однозначно распознать
    # готов обсуждать этот критерий, возможно исправить каким то образом
    return fname, density, fraud > 2
def parse_text(self, text):
    """Tokenize, filter and stem Russian text.

    Every character outside А-Я/а-я (note: 'ё' is NOT in that range and is
    replaced too) becomes a space; the result is split into words, stop
    words and self.badwords are removed, and the rest is stemmed.

    :param text: input string
    :return: list of stemmed words
    """
    chars = []
    for ch in text:
        if 'А' <= ch <= 'Я' or 'а' <= ch <= 'я':
            chars.append(ch)
        else:
            chars.append(' ')
    words = ''.join(chars).split()
    # PERF FIX: stopwords.words('russian') used to be re-loaded for every
    # single word inside the comprehension; build the lookup sets once.
    stop_set = set(stopwords.words('russian'))
    bad_set = set(self.badwords)
    filtered_words = [
        word for word in words
        if word not in stop_set and word not in bad_set
    ]
    stemmer = RussianStemmer()
    return [stemmer.stem(word) for word in filtered_words]
def stem_corpus(input_path, output_path):
    """Stream-stem a space-separated corpus file in 1 MiB chunks.

    Reads *input_path*, stems every word with the Snowball Russian stemmer
    and writes the result to *output_path*. The last (possibly incomplete)
    word of each chunk is carried over to the next chunk.
    """
    stem = RussianStemmer()
    last_word = ''
    i = 0
    with open(output_path, 'w', encoding='utf8') as o:
        with open(input_path, 'r', encoding='utf8') as f:
            while True:
                s = f.read(1024 * 1024)
                if not s:
                    # EOF: flush the final carried-over word, stemmed like
                    # all the others (it used to be written raw).
                    o.write(stem.stem(last_word) if last_word else '')
                    break
                words = s.split(' ')
                # BUG FIX: the carry-over must be prepended unconditionally.
                # The old code skipped it when the chunk started with a
                # space, silently dropping that word from the output
                # (words[0] is '' in that case, so this is still correct).
                words[0] = last_word + words[0]
                for word in words[:-1]:
                    o.write(stem.stem(word) + ' ')
                i += 1
                print('Stemmed {} MBs'.format(i))
                last_word = words[-1]
def stemming_sent(sent):
    """Stem every Latin/Cyrillic word of *sent* and re-join with spaces.

    :param sent: input sentence
    :return: space-joined stemmed words ('' when no word matches)
    """
    pattern = re.compile('[a-zA-Zа-яА-Я]+')
    words = pattern.findall(sent)
    stemmer = RussianStemmer()
    # BUG FIX: functools.reduce over an empty word list raised TypeError;
    # str.join handles the empty case and is the idiomatic O(n) join.
    return ' '.join(stemmer.stem(word) for word in words)
def textToWordList(txt):
    """Turn raw (possibly HTML-ish) text into a list of stemmed Russian
    words, dropping stop words and a fixed deny-list of over-common stems.
    """
    stemmer = RussianStemmer()
    tokenizer = RegexpTokenizer(r'\w+')
    # Stop words are compared in stemmed form as well.
    stop_w = [stemmer.stem(item) for item in get_stop_words('ru')]
    cyrillic_only = re.compile('^[а-я]+$')
    badword = [
        'дом', 'город', "дорог", "час", "ноч", "слов", "утр", "стран",
        "пут", "путешеств", "мест", 'нов', "друз", "добр"
    ]
    normalized = txt.lower().replace("<br>", "\n")
    stems = [stemmer.stem(tok) for tok in tokenizer.tokenize(normalized)]
    return [
        s for s in stems
        if not s in stop_w and cyrillic_only.match(s) and not s in badword
    ]
def stemming(corpus):
    """Return *corpus* with every whitespace-separated word stemmed."""
    stemmer = RussianStemmer()
    return [
        ' '.join(stemmer.stem(token) for token in comment.split())
        for comment in corpus
    ]
def method2(tokens):
    """Count token frequencies by Snowball stem and print the result."""
    print("The way 2")
    stemmer = RussianStemmer(False)
    frequencies = dict()
    for token in tokens:
        stem = stemmer.stem(token)
        frequencies[stem] = frequencies.get(stem, 0) + 1
    printDic(frequencies, 2)
def preprocessing(sentence):
    """Strip punctuation, drop Russian stop words, and stem each word.

    :param sentence: raw input string
    :return: list of stemmed tokens
    """
    porter = RussianStemmer()
    punctuation = string.punctuation + "«»—•’"
    # PERF: one C-level translate pass instead of len(punctuation)
    # chained str.replace() calls.
    sentence = sentence.translate(str.maketrans('', '', punctuation))
    # PERF: membership tests against a set, not a list.
    stop = set(stopwords.words('russian'))
    return [porter.stem(word) for word in sentence.split()
            if word not in stop]
def build_stemmer_morphology(data_filename, output_filename):
    # Build a stem -> {surface forms} morphology map from a vocabulary
    # file and persist it via write_morphology.
    vocab = load_vocab(data_filename)
    print 'Total words in vocab: %d' % len(vocab)
    prefix_map = defaultdict(set)
    stemmer = RussianStemmer()
    # Group every vocabulary word under its Snowball stem.
    for w in vocab:
        prefix_map[stemmer.stem(w)].add(w)
    print 'Total lemm groups: %d' % (len(prefix_map))
    write_morphology(prefix_map, output_filename)
def __init__(self):
    """Set up the text-classification pipeline (vectorizer, TF-IDF,
    scaler, logistic regression) and load the stemmed swearing list from
    swearings.txt."""
    self.stop_words = list(
        set(stopwords.words('russian')).union(set(stopwords.words('english'))))
    self.vectorizer = CountVectorizer(max_df=0.75)
    self.transformer = TfidfTransformer()
    self.scaler = MaxAbsScaler()
    self.classifier = LogisticRegression()
    self.stemmer = RussianStemmer()
    self.swearings_list = []
    # Swearings are stored stemmed so lookups match stemmed input.
    with open('swearings.txt', 'r') as swearings_file:
        self.swearings_list = [self.stemmer.stem(word)
                               for word in swearings_file.read().split()]
def __init__(self, *args, **kwargs):
    # Search form: adds a model-choice radio selector and prepares the
    # Russian stop-word set, stemmer and tokenizer used by the search.
    super().__init__(*args, **kwargs)
    choices = hay_forms.model_choices()
    self.fields["models"] = forms.ChoiceField(choices=choices,
                                              required=False,
                                              label='Искать',
                                              widget=forms.RadioSelect,
                                              initial=choices[0][0])
    self.stopwords = set(stopwords.words('russian'))
    self.stemmer = RussianStemmer()
    self.tokenizer = RegexpTokenizer(r'\w+')
def nltk_preprocessor(sentences):
    """Tokenize each sentence and reduce every token to its stem.

    :param sentences: iterable of sentence strings
    :return: list with one set of stems per sentence
    """
    tokenizer = RegexpTokenizer(r'\w+')
    stemmer = RussianStemmer()
    stem_sets = []
    for sentence in sentences:
        tokens = tokenizer.tokenize(sentence.lower())
        stem_sets.append({stemmer.stem(token) for token in tokens})
    return stem_sets
def calculate_class_score(sentence, class_name, show_details=False):
    """Score *sentence* against one class: every word whose stem occurs in
    the class vocabulary contributes 1 / its global corpus frequency.

    :param sentence: text to score
    :param class_name: key into the module-level class_words dict
    :param show_details: print each matching stem when True
    :return: numeric score
    """
    score = 0
    # PERF FIX: the stemmer used to be constructed (and the word
    # re-stemmed) up to three times per token; build it once and stem
    # each token once.
    stemmer = RussianStemmer()
    # tokenize each word in our new sentence
    for word in nltk.word_tokenize(sentence):
        stem = stemmer.stem(word.lower())
        # check to see if the stem of the word is in any of our classes
        if stem in class_words[class_name]:
            # treat each word with same weight
            score += (1 / corpus_words[stem])
            if show_details:
                print(" match: %s" % stem)
    return score
def __init__(self, stopwords, ignorechars, docs):
    # LSA-style model setup: stemmer, word dictionary and per-word
    # document postings; every document in *docs* is indexed immediately.
    self.stemmer = RussianStemmer()
    self.wdict = {}        # word id -> doc ids where it occurs
    self.dictionary = []   # word id -> word (position is the id)
    self.stopwords = stopwords
    # Python 2: keep ignorechars as a byte string so str.translate works.
    if type(ignorechars) == unicode:
        ignorechars = ignorechars.encode('utf-8')
    self.ignorechars = ignorechars
    self.docss = []
    self.docs = docs
    for doc in docs:
        self.add_doc(doc)
def detect_cheat_in_text(text):
    """Detect cheats in text"""
    # Rebuild the word sequence, replacing each word with the recovery
    # token that detect_cheat returns; remember whether anything was
    # flagged as cheated.
    new_text = []
    is_cheat = False
    for word in text:
        is_cheated_word, recovery_token = detect_cheat(word)
        if is_cheated_word:
            is_cheat = True
        new_text.append(recovery_token)
    stop_words = set(stopwords.words('russian'))
    st = RussianStemmer()
    # Drop stop words, then stem what remains.
    new_text = [word for word in new_text if (word not in stop_words)]
    return is_cheat, [st.stem(word) for word in new_text]
def tokenize(text):
    """Tokenize *text* and keep only lower-case, fully non-ASCII alphabetic
    tokens whose stem is not a stop word."""
    from nltk.stem.snowball import RussianStemmer

    def accepted(token, stemmer):
        # Lower-case check first; then every character must be an
        # alphabetic non-ASCII letter and the stem must not be a stop word
        # (all() over an empty token is vacuously true, as before).
        if token.lower() != token:
            return False
        return all(
            ch.isalpha()
            and not ch in string.ascii_letters
            and not stemmer.stem(token) in stopwords
            for ch in token
        )

    stemmer = RussianStemmer(ignore_stopwords=True)
    return [tok for tok in word_tokenize(text) if accepted(tok, stemmer)]
def test(self, mode, bpe_model_path=None):
    """Interactive evaluation loop: read a file path from stdin, preprocess
    its text according to *mode*, and feed it to self.test_calc.

    :param mode: one of "lemm", "stem", "gram", "base"
    :param bpe_model_path: BPE model file, used when mode == "gram"
    """
    while True:
        file_path = input("File path: ").strip()
        # BUG FIX: a leftover hard-coded debug path used to overwrite the
        # user's input right here, which made "q" unreachable and turned
        # the loop infinite.
        if file_path == "q":
            break
        try:
            with open(file_path, "r", encoding="utf-8") as r:
                article = r.read().strip().split("\n")
            article = " ".join(article)
            if mode in ["lemm", "stem", "gram", "base"]:
                article = article.lower()
                article = word_tokenize(article)
                article = " ".join(article)
            print(f"real_text : {article}")
            if mode == "lemm":
                lemmatizer = mystem.Mystem()
                article = preprocess_lemm(article, lemmatizer)
            elif mode == "stem":
                stemmer = RussianStemmer(False)
                article = preprocess_stemm(article, stemmer)
            elif mode == "gram":
                token_model = youtokentome.BPE(model=bpe_model_path)
                article = preprocess_gramm(article, token_model)
            self.test_calc(article)
        except Exception as e:
            print(e)
            print("File not found")
class Tokenizer(object):
    # Splits text into normalized tokens, memoizing per-word results.

    def __init__(self):
        self.cache = {}  # word -> processed form (memoizes process_word)
        self.r_stemmer = RussianStemmer()
        self.e_stemmer = EnglishStemmer()

    def process_word(self, w):
        # Return the normalized form of *w*: stemmed for plain words
        # (English stemmer for pure-ASCII, Russian otherwise), '' for
        # trash/numbers, unchanged for COMPLEX structures.
        if w in self.cache:
            return self.cache[w]
        else:
            struct = check_structure(w)
            if struct == 'TRASH':
                w_proc = ''
            elif struct == 'WORD':
                if is_ascii(w):
                    w_proc = self.e_stemmer.stem(w)
                else:
                    w_proc = self.r_stemmer.stem(w)
            elif struct == 'NUMBER':
                w_proc = ''
            elif struct == 'COMPLEX':
                w_proc = w
            # NOTE(review): if check_structure ever returns a value outside
            # {TRASH, WORD, NUMBER, COMPLEX}, w_proc is unbound here and
            # the next line raises NameError -- confirm the possible
            # return values.
            self.cache[w] = w_proc
            return w_proc

    def tokenize(self, text):
        # Normalize the raw text, process each space-separated word and
        # drop words that normalized to the empty string.
        text = preprosess_text(text)
        words = text.split(' ')
        tokens = []
        for w in words:
            tokens.append(self.process_word(w))
        tokens = [t for t in tokens if len(t)]
        return tokens
def __init__(self):
    # Load the persisted classifier and vectorizer, and prepare the text
    # normalization helpers (punctuation/digit stripping, stemming,
    # stop-word list).
    self.model = joblib.load("./models/clf.pkl")
    self.vectorizer = joblib.load("./models/vectorizer.pkl")
    # Human-readable class labels; -1 marks a processing error.
    self.classes_dict = {
        0: "отрицательный",
        1: "положительный",
        -1: "ошибка"
    }
    self.numbers_str = '0123456789'
    # Translation tables mapping punctuation/digits to spaces.
    self.punc_translator = str.maketrans(string.punctuation,
                                         ' ' * len(string.punctuation))
    self.num_translator = str.maketrans(self.numbers_str,
                                        ' ' * len(self.numbers_str))
    # Words of this length or shorter get filtered out downstream.
    self.short_word_len = 1
    self.stemmer = RussianStemmer()
    # 'br' is appended to catch leftover <br> HTML tags in reviews.
    self.stop_words = stopwords.words('russian') + ['br']
def __init__(self):
    """Initialize the empty word indexes plus the filter and stemmer
    helpers."""
    self.stemmer = RussianStemmer()
    self.filter = Filter()
    self.words = set()
    self.problems = {}
    self.appearances = {}
def stem_words(self, words):
    """
    Stem words by Porter or Snowball stemmers and join to one string

    :param words: iterable of words to stem
    :return: single space-joined string of stems
    :raises ValueError: when self.lang is not 'uk', 'ru' or 'en'
    """
    if self.lang == 'uk':
        # Ukrainian uses a per-word stemmer object with a different API.
        return ' '.join(
            [UkrainianStemmer(word).stem_word() for word in words])
    elif self.lang == 'ru':
        stemmer = RussianStemmer()
    elif self.lang == 'en':
        stemmer = EnglishStemmer()
    else:
        # BUG FIX: an unsupported language used to fall through with
        # stemmer = None and crash below with a cryptic AttributeError.
        raise ValueError("unsupported language: %r" % (self.lang,))
    return ' '.join([stemmer.stem(word) for word in words])
def prep_stem(self, text):
    """Return *text* with every whitespace-separated word stemmed.

    Uses the Snowball Russian stemmer when self.lang == "ru" and the
    Porter stemmer otherwise.

    :param text: text to preprocess
    :return: preprocessed text with all words stemmed
    :raises TypeError: if *text* is not a str
    """
    if not isinstance(text, str):
        raise TypeError("Argument must be str!")
    # PERF FIX: the stemmer used to be re-instantiated for every single
    # word inside the comprehension; create it once per call.
    if self.lang == "ru":
        stemmer = RussianStemmer()
    else:
        stemmer = PorterStemmer()
    return " ".join(stemmer.stem(word) for word in text.split())
class PhraseStemmer(PhraseSplitter):
    """Splits a phrase into tokens and reduces each token to its stem."""

    def __init__(self):
        self.tokenizer = Tokenizer()
        self.stemmer = RussianStemmer()

    def tokenize(self, phrase):
        """Return the stems of all non-blank tokens of *phrase*."""
        stems = []
        for token in self.tokenizer.tokenize(phrase):
            if len(token.strip()) > 0:
                stems.append(self.stemmer.stem(token))
        return stems
def textrank(text):
    """Rank the sentences of *text* with TextRank and return them as
    (index, score, sentence) tuples, best-scoring first."""
    sentences = sent_tokenize(text)
    tokenizer = RegexpTokenizer(r'\w+')
    stemmer = RussianStemmer()
    stem_sets = []
    for sentence in sentences:
        stem_sets.append({stemmer.stem(w)
                          for w in tokenizer.tokenize(sentence.lower())})
    # Weighted edges between sentence pairs with non-zero similarity.
    edges = []
    for i, j in combinations(range(len(sentences)), 2):
        weight = similarity(stem_sets[i], stem_sets[j])
        if weight:
            edges.append((i, j, weight))
    g = nx.Graph()
    g.add_weighted_edges_from(edges)
    pr = nx.pagerank(g)
    ranked = [(i, pr[i], s) for i, s in enumerate(sentences) if i in pr]
    ranked.sort(key=lambda item: item[1], reverse=True)
    return ranked
def stem_keyword(self):
    """
    Stem keyword by Porter or Snowball stemmers
    """
    if self.language == 'uk':
        # Ukrainian stemmer has a different, per-word API.
        self.keyword = UkrainianStemmer(self.keyword).stem_word()
        return
    stemmer_classes = {'ru': RussianStemmer, 'en': EnglishStemmer}
    stemmer_cls = stemmer_classes.get(self.language)
    if stemmer_cls is None:
        # Unsupported language: leave the keyword untouched.
        return
    self.keyword = stemmer_cls().stem(self.keyword)
def __init__(self, vocabulary_size=5000, debug=False):
    """Prepare the preprocessing helpers and empty model placeholders."""
    self.stemmer = RussianStemmer()
    self.stem_count = Counter()
    # Matches every character that is NOT a Russian letter.
    self.validator_regex = re.compile(r'[^А-яЁё]')
    self.cache_stems = {}
    self.vocabulary = None
    self.vocabulary_size = vocabulary_size
    self.debug = debug
    # Data and model slots, filled in during loading/training.
    self.positive_tweets = self.negative_tweets = None
    self.tweets_vectors = self.labels = None
    self.x_train = self.y_train = None
    self.x_test = self.y_test = None
    self.model = None
def wrk_words_wt_no(sent):
    """Making stemming"""
    # Tokenize, keep (stemmed) tokens containing at least one Cyrillic
    # letter, drop Russian stop words, then merge negation particles via
    # No_with_word. Returns None on TypeError (deliberate best-effort).
    stemmer = RussianStemmer()
    words = word_tokenize(sent.lower())
    try:
        # Idiom: comprehension instead of the index-based loop+append.
        arr = [stemmer.stem(word) for word in words
               if re.search(u'[а-яА-Я]', word)]
        words1 = [w for w in arr if w not in russian_stops]
        words1 = No_with_word(words1)
        return words1
    except TypeError:
        pass
def __init__(self):
    """Reset the parsing state and create the tokenizer/stemmer pair."""
    HTMLParser.__init__(self)
    self.tokenizer = WordPunctTokenizer()
    self.stemmer = RussianStemmer()
    self.inside_dd = False
    self.doc_id = 0
    self.token_count = 0
    self.token_sum_len = 0
    self.iindex = {}
    self.paragraphs = []
def cleanText(textToClean):
    """Remove punctuation, digits and Russian stop words, keep only words
    present in the model vocabulary, stem the rest and join them back.
    Returns None when nothing survives the cleaning."""
    exclude = set(string.punctuation + u'–«»—…')
    no_punct = ''.join(ch for ch in textToClean if ch not in exclude)
    no_digits = ''.join([ch for ch in no_punct if not ch.isdigit()])
    stop_words = get_stop_words('ru')
    kept_words = [w for w in no_digits.split()
                  if (not w in stop_words and w in model.vocab)]
    rs = RussianStemmer()
    text_after_cleaning = ' '.join(rs.stem(w) for w in kept_words)
    if text_after_cleaning:
        return text_after_cleaning
def normailize_text(data, tok=None, stemmers=None):
    """Tokenize each line of *data* and run every token through the whole
    stemmer chain, returning one normalized string per line.

    :param data: iterable of text lines
    :param tok: tokenizer (default: word/slash/dash RegexpTokenizer)
    :param stemmers: stemmers applied in order (default: Russian Snowball
        with stop words ignored, then Porter)
    """
    # BUG FIX: the tokenizer and the stemmer list were mutable default
    # arguments, constructed once at import time and shared by every
    # call; use the None-sentinel idiom and build them per call.
    if tok is None:
        tok = RegexpTokenizer(r'\w[\w\/\-]+')
    if stemmers is None:
        stemmers = [RussianStemmer(ignore_stopwords=True), PorterStemmer()]
    # tokenize text into words,
    # sequentially apply all stemmers to each tokenized word,
    # join stemmed words back to sentences
    return [' '.join([reduce(lambda v, f: f.stem(v), stemmers, w)
                      for w in tok.tokenize(line)])
            for line in data]
def index(pathh):
    # Build a TF-IDF vector for the text file at *pathh*: read, parse,
    # stem, tokenize, vectorize, persist to CSV and return the vector.
    cont = txt_reader(pathh)
    cont = txt_parser(cont)
    stem = RussianStemmer(False)
    stemmed_text = text_stemmer(cont, stem)
    token = stemmed_tokenizer(stemmed_text)
    # NOTE(review): an empty document is appended here -- presumably
    # required by bool_tf_tfidf's expected input shape; confirm.
    token.append([])
    # bool_tf_tfidf returns several representations; index 2 is TF-IDF.
    vect_tfidf = bool_tf_tfidf(token)[2]
    csv_safe(vect_tfidf)
    return vect_tfidf
class findSubject:
    # Finds school-subject mentions that follow the word 'по' in a text,
    # matching stemmed words against a subjects CSV.

    # Stemmed subject names, index-aligned with SUBJECTS_REAL_NAME.
    SUBJECTS_NAME = []
    SUBJECTS_REAL_NAME = []
    IS_LOADED_SUBJECTS = False
    regex = 0
    stemer = 0

    def __init__(self):
        self.stemer = RussianStemmer()
        # Strip everything except Cyrillic letters and spaces.
        self.regex = re.compile('[^а-яА-Я ]')
        self.load_subjects('textParsing/data/subjects.csv')

    def get_stem(self, token, checkHash=True):
        # Normalize a token (Cyrillic/space only, lower-case) and stem it.
        # NOTE(review): checkHash is currently unused -- confirm whether a
        # caching path was planned.
        token = self.regex.sub('', token).lower()
        stem = self.stemer.stem(token)
        return stem

    def load_subjects(self, filepath):
        # Load the ';'-separated subjects CSV and pre-stem every name.
        pd_subjects = pd.read_csv(filepath, delimiter=';')
        self.SUBJECTS_NAME = list(np.array(pd_subjects[['name']]))
        self.SUBJECTS_REAL_NAME = list(np.array(pd_subjects[['subject']]))
        for ind in range(len(self.SUBJECTS_NAME)):
            self.SUBJECTS_NAME[ind] = self.get_stem(
                str(self.SUBJECTS_NAME[ind][0]), False)
        self.IS_LOADED_SUBJECTS = True

    def get(self, text):
        # Return the set of real subject names mentioned after the first
        # standalone 'по' in *text*, or None when nothing is found.
        sent = text.split(' ')
        find_fst_po = -1
        for ind, word in enumerate(sent):
            if word == 'по':
                find_fst_po = ind
                break
        if (find_fst_po == -1):
            return None
        subjects = set()
        for ind, word in enumerate(sent):
            if (ind > find_fst_po):
                word = self.get_stem(word, False)
                if (word in self.SUBJECTS_NAME):
                    subjects.add(
                        str(self.SUBJECTS_REAL_NAME[self.SUBJECTS_NAME.index(
                            word)]))
        if (len(subjects) == 0):
            return None
        return subjects
def textrank(text):
    """
    TextRank algorithm for text summarization.
    https://gist.github.com/igor-shevchenko/5821166
    """
    sentences = sent_tokenize(text)
    word_pattern = RegexpTokenizer(r'\w+')
    stemmer = RussianStemmer()
    # One set of stems per sentence.
    stem_sets = [
        set(map(stemmer.stem, word_pattern.tokenize(sentence.lower())))
        for sentence in sentences
    ]
    weighted = [
        (i, j, similarity(stem_sets[i], stem_sets[j]))
        for i, j in combinations(range(len(sentences)), 2)
    ]
    graph = nx.Graph()
    # Only pairs with a non-zero similarity become edges.
    graph.add_weighted_edges_from(edge for edge in weighted if edge[2])
    ranks = nx.pagerank(graph)
    return sorted(((i, ranks[i], s)
                   for i, s in enumerate(sentences) if i in ranks),
                  key=lambda item: item[1],
                  reverse=True)
def __init__(self):
    """
    vect_theme - vectorizer for the theme strings
    vect_poem - vectorizer for the poem strings
    lin_model - trained logistic-regression model
    """
    self.stemmer = RussianStemmer(True)
    self.stop_w = stopwords.words('russian')
    # Model components are filled in later, during training.
    self.vect_theme = None
    self.vect_poem = None
    self.lin_model = None
def learn(self, class_name): dir_name = "." file_name = "tweets_by_trend.xml" self.classes.add(class_name) self.words_freq[class_name] = {} if class_name is "negative": code = 0 else: code = 1 print "processing", file_name tree = ET.parse(dir_name + "/" + file_name) root = tree.getroot() for tweet in root.findall('tweet'): sent = int(tweet.find('sent').text) if sent == code: text = tweet.find('text').text words = [word.lower() for word in tokenizers.extract_words(text)] self.docs_number += 1 self.unique_words_set = self.unique_words_set | set(words) stemmer = RussianStemmer() for word in words: stemmed = stemmer.stem(word) if stemmed in self.words_freq[class_name]: self.words_freq[class_name][stemmed] += 1 else: self.words_freq[class_name][stemmed] = 1 if class_name in self.words_in_class: self.words_in_class[class_name] += len(words) self.docs_in_class[class_name] += 1 else: self.words_in_class[class_name] = len(words) self.docs_in_class[class_name] = 1
def cleanText(text):
    '''
    Function checks and repairs words with hidden latin characters in and vv.
    Function assuming that there are only latin and cyrillic characters in text.
    '''
    alphabet = AlphabetDetector()
    stemmer = RussianStemmer()
    is_broken = False
    repaired = []
    for word in text:
        # A word entirely in one alphabet is fine; a mixed-alphabet word
        # gets its look-alike letters swapped to a single alphabet.
        if (alphabet.only_alphabet_chars(word, 'CYRILLIC')
                or alphabet.only_alphabet_chars(word, 'LATIN')):
            repaired.append(word)
        else:
            is_broken = True
            repaired.append(letterSwap(word))
    return [stemmer.stem(word) for word in repaired], is_broken
class LSI(object):
    # Latent semantic indexing over a document collection: builds a
    # term-document matrix, decomposes it with SVD and supports a
    # distance-based document search for a single word. (Python 2 code.)

    def __init__(self, stopwords, ignorechars, docs):
        self.stemmer = RussianStemmer()
        self.wdict = {}        # word id -> doc ids where it occurs
        self.dictionary = []   # word id -> word (position is the id)
        self.stopwords = stopwords
        # Python 2: str.translate needs a byte string of chars to drop.
        if type(ignorechars) == unicode:
            ignorechars = ignorechars.encode('utf-8')
        self.ignorechars = ignorechars
        self.docss = []
        self.docs = docs
        for doc in docs:
            self.add_doc(doc)

    def prepare(self):
        # Build the term-document matrix, then run the SVD.
        self.build()
        self.calc()

    def dic(self, word, add = False):
        # Normalize *word* (strip ignorechars, lower-case, stem) and return
        # its index in the dictionary; when add=True unknown words are
        # appended, otherwise None is returned.
        if type(word) == unicode:
            word = word.encode('utf-8')
        word = word.lower().translate(None, self.ignorechars)
        word = word.decode('utf-8')
        word = self.stemmer.stem(word)
        if word in self.dictionary:
            return self.dictionary.index(word)
        else:
            if add:
                self.dictionary.append(word)
                return len(self.dictionary) - 1
            else:
                return None

    def add_doc(self, doc):
        # Register one document: map its words to ids and record a posting
        # for every non-stop word.
        # NOTE(review): the posting is always len(self.docs) - 1, and
        # self.docs is the FULL list assigned before indexing starts, so
        # every posting gets the same doc id -- looks like a bug; confirm.
        words = [self.dic(word, True) for word in doc.lower().split()]
        self.docss.append(words)
        for word in words:
            if word in self.stopwords:
                continue
            elif word in self.wdict:
                self.wdict[word].append(len(self.docs) - 1)
            else:
                self.wdict[word] = [len(self.docs) - 1]

    def build(self):
        # Term-document count matrix A over words that occur at least once.
        self.keys = [k for k in self.wdict.keys() if len(self.wdict[k]) > 0]
        self.keys.sort()
        self.A = np.zeros([len(self.keys), len(self.docs)])
        for i, k in enumerate(self.keys):
            for d in self.wdict[k]:
                self.A[i,d] += 1

    def calc(self):
        # Singular value decomposition of the term-document matrix.
        self.U, self.S, self.Vt = svd(self.A)

    def TFIDF(self):
        # Reweight A in place with TF-IDF.
        wordsPerDoc = sum(self.A, axis=0)
        docsPerWord = sum(np.asarray(self.A > 0, 'i'), axis=1)
        rows, cols = self.A.shape
        for i in range(rows):
            for j in range(cols):
                self.A[i,j] = (self.A[i,j] / wordsPerDoc[j]) * log(float(cols) / docsPerWord[i])

    def dump_src(self):
        # Print the raw count matrix row by row.
        self.prepare()
        print 'Здесь представлен расчет матрицы '
        for i, row in enumerate(self.A):
            print self.dictionary[i], row

    def print_svd(self):
        # Print singular values and the leading columns/rows of U and Vt.
        self.prepare()
        print 'Здесь сингулярные значения'
        print self.S
        print 'Здесь первые 3 колонки U матрица '
        for i, row in enumerate(self.U):
            print self.dictionary[self.keys[i]], row[0:3]
        print 'Здесь первые 3 строчки Vt матрица'
        print -1*self.Vt[0:3, :]

    def find(self, word):
        # Locate *word* in the reduced space and return all documents
        # sorted by Euclidean distance to it in latent dimensions 1-2.
        # NOTE(review): `if not idx` also triggers when idx == 0, i.e. for
        # the very first dictionary word -- confirm this is intended.
        self.prepare()
        idx = self.dic(word)
        if not idx:
            print 'слово невстерчается'
            return []
        if not idx in self.keys:
            print 'слово отброшено как не имеющее значения которое через stopwords'
            return []
        idx = self.keys.index(idx)
        print 'word --- ', word, '=', self.dictionary[self.keys[idx]], '.\n'
        # Word coordinates in latent dimensions 1-2.
        wx, wy = (-1 * self.U[:, 1:3])[idx]
        print 'word {}\t{:0.2f}\t{:0.2f}\t{}\n'.format(idx, wx, wy, word)
        arts = []
        # Document coordinates in the same two dimensions.
        xx, yy = -1 * self.Vt[1:3, :]
        for k, v in enumerate(self.docs):
            ax, ay = xx[k], yy[k]
            dx, dy = float(wx - ax), float(wy - ay)
            arts.append((k, v, ax, ay, sqrt(dx * dx + dy * dy)))
        return sorted(arts, key = lambda a: a[4])
new_d.pop(key) return new_d # load pazans pazans_groups = None pazans_file_name = sys.argv[1] with open(pazans_file_name, "r") as pazans_file: pazans_groups = json.loads(pazans_file.read()) # analyze statues status_stats = dict() tokenizer = RegexpTokenizer(r"[A-Za-zА-Яа-я]+") stemmer = RussianStemmer() users_file_name = sys.argv[2] with open(users_file_name, "r") as users_file: for line in users_file: user = json.loads(line) uid = str(user["_id"]) if uid in pazans_groups: pazan_groups = pazans_groups[uid] status_text = user.get("status", "") filtered_status_text = "".join([stemmer.stem(token).lower() for token in tokenizer.tokenize(status_text)]) if len(filtered_status_text) > 1: status_stats_item = status_stats.get(filtered_status_text, { "full": status_text, "count-boys": 0, "count-girls": 0,
# Dependencies for the ticket-queue classification experiment.
from sklearn.neighbors import NearestNeighbors
from sklearn.svm import SVC,NuSVC,LinearSVC
from sklearn.neural_network import BernoulliRBM
from sklearn.linear_model import LogisticRegression,Perceptron
from sklearn.tree import DecisionTreeClassifier,ExtraTreeClassifier
from sklearn.ensemble import AdaBoostClassifier,BaggingClassifier,RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.decomposition import TruncatedSVD,NMF,FactorAnalysis,PCA
from nltk.stem.snowball import RussianStemmer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from nltk import word_tokenize
from nltk.tokenize.api import StringTokenizer
from nltk.corpus import stopwords
import numpy

st=RussianStemmer()
# Load ticket bodies and queue labels from the spreadsheet.
libra=pd.read_excel('libra.xls')[['body','ticket_queue_id']].dropna()
# NOTE(review): stem() is applied to the WHOLE body string, not word by
# word; the Snowball stemmer only normalizes the trailing word of such a
# string -- confirm this is intended.
libra.body=pd.Series(st.stem(x) for x in libra.body)
libra=libra.dropna()
classifier=SVC(probability=True,kernel='linear')

from nltk.stem.snowball import RussianStemmer
import nltk
st = RussianStemmer()
from decimal import *

# Python 2: force UTF-8 as the process-wide default string encoding.
reload(sys)
sys.setdefaultencoding("utf-8")
from stop_words import get_stop_words

# next line delete file content
open('text_after_cleaning.csv', 'w').close()
with open('text_after_cleaning.csv', 'w') as data_csv:
    fieldnames = ['post_text', 'stars']
    writer = csv.DictWriter(data_csv, fieldnames=fieldnames)
    writer.writeheader()
    with open('items.csv') as csvfile:
        reader = csv.DictReader(csvfile)
        myPunctuation = u'–«»'
        # NOTE(review): `exclude` is built but never applied below --
        # punctuation is NOT actually stripped; confirm whether a
        # filtering step went missing.
        exclude = set(string.punctuation+myPunctuation)
        for row in reader:
            text_before_cleaning = row['post_text']
            post_text = row['post_text']
            post_text = unicode(post_text, "utf-8")
            # Drop all digits before tokenizing.
            post_text = ''.join([i for i in post_text if not i.isdigit()])
            post_words = post_text.split()
            stop_words = get_stop_words('ru')
            words_after_deleting_stop_words = [w for w in post_text.split() if not w in stop_words]
            rs = RussianStemmer()
            words_after_stemming = [rs.stem(w) for w in words_after_deleting_stop_words]
            text_after_cleaning = ' '.join(words_after_stemming)
            # Skip rows that end up empty after cleaning.
            if text_after_cleaning:
                writer.writerow({'post_text': text_after_cleaning, 'stars': row['stars']})
# -*- coding: utf-8 -*- """ Created on Thu Jul 21 18:10:41 2016 @author: asamoylov """ from nltk.stem.snowball import RussianStemmer mystem = RussianStemmer() str0 = "поздно" print mystem.stem(str0.decode("utf-8"))
class KareninaParser(HTMLParser):
    # HTML parser that builds an inverted index over the <dd> paragraphs
    # of the book: tokenizes, stems, records term -> doc-id postings and
    # collects token/term length statistics. (Python 2 code.)

    def __init__(self):
        HTMLParser.__init__(self)
        self.inside_dd = False   # True while handling data inside <dd>
        self.doc_id = 0          # id of the current <dd> paragraph
        self.token_count = 0
        self.token_sum_len = 0
        self.iindex = {}         # term -> posting list of doc ids
        self.paragraphs = []
        self.tokenizer = WordPunctTokenizer()
        self.stemmer = RussianStemmer()

    def handle_starttag(self, tag, attrs):
        # Only text directly inside a <dd> element gets indexed.
        if tag == "dd":
            self.inside_dd = True
            self.doc_id += 1
        else:
            self.inside_dd = False

    def handle_data(self, data):
        if self.inside_dd:
            self.paragraphs.append(data)
            # Terms already posted for this paragraph (post each term
            # at most once per document).
            terms = set()
            for token in self.tokenizer.tokenize(unicode(data.lower(), 'utf-8')):
                # Skip punctuation tokens.
                if token[0] in string.punctuation:
                    continue
                self.token_count += 1
                self.token_sum_len += len(token)
                term = self.stemmer.stem(token)
                if not term in terms:
                    terms.add(term)
                    if self.iindex.has_key(term):
                        self.iindex[term].append(self.doc_id)
                    else:
                        self.iindex[term] = [ self.doc_id ]

    def dump_iindex(self, output_name):
        # Pickle the inverted index to *output_name*.
        output = open(output_name, 'wb')
        pickle.dump(self.iindex, output)
        output.close()

    def dump_paragraphs(self, output_name):
        # Pickle the collected paragraphs to *output_name*.
        output = open(output_name, 'wb')
        pickle.dump(self.paragraphs, output)
        output.close()

    def get_stat(self):
        # Compute token/term counts and average lengths; empty dict when
        # nothing was indexed (avoids division by zero).
        term_sum_len = 0
        for term in self.iindex.keys():
            term_sum_len += len(term)
        term_count = len(self.iindex.keys())
        if not (term_count and self.token_count):
            self.stat = {}
        else:
            self.stat = {
                'token_count': self.token_count,
                'token_avg_len': self.token_sum_len/float(self.token_count),
                'term_count': term_count,
                'term_avg_len': term_sum_len/float(term_count)
            }
        return self.stat

    def print_iindex(self):
        # Print every term with its posting list, sorted alphabetically.
        for term in sorted(self.iindex.keys()):
            posting_list = self.iindex[term]
            print term
            print len(posting_list)
            print posting_list
            print '---------------------'
def stemData(posts): global happy global sad global invert global shouldStemData statHap = {} statSad = {} statAll = {} from nltk.stem.snowball import RussianStemmer from nltk import word_tokenize, sent_tokenize from gensim.models.doc2vec import LabeledSentence stemmer = RussianStemmer() toRet = [] curI = 0 if shouldStemData: # renew smiles happy = stemmer.stem(happy) sad = stemmer.stem(sad) positives = [] negatives = [] for i in range(0, len(posts)): if i % 10000 == 0: print i sentences = sent_tokenize(posts[i]) for j in range(0, len(sentences)): words = word_tokenize(sentences[j]) import string for k in range(0, len(words)): try: if shouldStemData and words[k] not in invert: words[k] = unicode(stemmer.stem(words[k])) # words[k] = cyr_to_r(words[k]).encode('utf8') letters = u'абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ' words[k] = filter(lambda x: x in letters + string.letters + string.digits + '.!?', words[k]) except Exception: print 'failed word: ' + words[k] raise Exception('') try: if words == []: del sentences[j] continue if words == [happy, '.']: sentences[j] = LabeledSentence(words=words, tags=[happy]) if j > 0: positives += [curI - 1] elif words == [sad, '.']: sentences[j] = LabeledSentence(words=words, tags=[sad]) if j > 0: negatives += [curI - 1] else: for word in words: if word in statAll: statAll[word] += 1 else: statAll[word] = 1 if happy in words: positives += [curI] while happy in words: words.remove(happy) for word in words: if word in statHap: statHap[word] += 1 else: statHap[word] = 1 if sad in words: negatives += [curI] while sad in words: words.remove(sad) for word in words: if word in statSad: statSad[word] += 1 else: statSad[word] = 1 sentences[j] = LabeledSentence(words=words, tags=[str(curI)]) curI += 1 except Exception, e: print words sentences[j] = [''] raise e toRet += sentences