import json

import nltk
from spacy.lang.ru import Russian
from spacy_russian_tokenizer import RussianTokenizer, MERGE_PATTERNS


def count_simple_stats():
    # The corpus is stored as a single JSON object on the first line.
    with open('data/articles.json', 'r', encoding='utf8') as f:
        json_str = f.readlines()[0]
    articles = json.loads(json_str)
    # spaCy 2.x pipeline: the Russian tokenizer is added as a pipe component.
    nlp = Russian()
    russian_tokenizer = RussianTokenizer(nlp, MERGE_PATTERNS)
    nlp.add_pipe(russian_tokenizer, name='russian_tokenizer')
    texts_count = 0
    sent_count = 0
    words_count = 0
    symbols_count = 0
    for title in articles:
        text = articles[title][0].strip()
        texts_count += 1
        # Sentence segmentation via NLTK, word tokenization via spaCy.
        sents = nltk.sent_tokenize(text, language="russian")
        sent_count += len(sents)
        tokens = nlp(text)
        words_count += len(tokens)
        symbols = [symb for symb in text if symb != ' ' and symb != '\n']
        symbols_count += len(symbols)
        # print([token.text for token in tokens])
    print("Texts count:", texts_count)
    print("Sentences count:", sent_count)
    print("Words count:", words_count)
    print("Symbols count:", symbols_count)
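# Minimal driver sketch (added, not from the original snippet): assumes
# data/articles.json exists and that the NLTK "punkt" models (which include
# Russian in recent NLTK releases) are available.
if __name__ == '__main__':
    nltk.download('punkt', quiet=True)
    count_simple_stats()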
class SpacyTokenizer:
    def __init__(self):
        self.nlp = Russian()
        self.nlp.add_pipe(RussianTokenizer(self.nlp, MERGE_PATTERNS),
                          name="russian_tokenizer")

    def tokenize(self, text):
        return [token.text for token in self.nlp(text) if token.text.strip()]
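# Usage sketch (added for illustration): a quick check that MERGE_PATTERNS
# keeps hyphenated words such as "какой-то" as single tokens; the exact
# output depends on the installed spacy_russian_tokenizer version.
tokenizer = SpacyTokenizer()
print(tokenizer.tokenize('Не ветер, а какой-то ураган!'))
# e.g. ['Не', 'ветер', ',', 'а', 'какой-то', 'ураган', '!']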
def tokenize():
    with open('data/articles.json', 'r', encoding='utf8') as f:
        json_str = f.readlines()[0]
    articles = json.loads(json_str)
    nlp = Russian()
    russian_tokenizer = RussianTokenizer(nlp, MERGE_PATTERNS)
    nlp.add_pipe(russian_tokenizer, name='russian_tokenizer')
    tokenized = {}
    for title in articles:
        text = articles[title][0].strip()
        # Sentences via NLTK, word tokens via the spaCy pipeline.
        sents = nltk.sent_tokenize(text, language="russian")
        tokens = [token.text for token in nlp(text)]
        tokenized[title] = (sents, tokens)
    return tokenized
class TimofeevTokenizer:
    label = 'aatimofeev/spacy_russian_tokenizer'

    def __init__(self):
        from spacy.lang.ru import Russian
        from spacy_russian_tokenizer import (RussianTokenizer,
                                             MERGE_PATTERNS,
                                             SYNTAGRUS_RARE_CASES)

        self.nlp = Russian()
        self.nlp.add_pipe(
            RussianTokenizer(self.nlp, MERGE_PATTERNS + SYNTAGRUS_RARE_CASES),
            name='russian_tokenizer')

    def __call__(self, text):
        doc = self.nlp(text)
        chunks = (token.text for token in doc)
        return find_substrings(chunks, text)
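# Usage sketch (added, not from the original source). find_substrings is an
# external helper; it is assumed here to align each token text back to
# character offsets in the original string, naeval-style.
tokenizer = TimofeevTokenizer()
for substring in tokenizer('Не ветер, а какой-то ураган!'):
    print(substring)  # expected: spans covering 'Не', 'ветер', ',', ...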
def text_decomposition(text, lang='de'):
    if lang == 'de':
        nlp = spacy.load('de_core_news_md')
    elif lang == 'en':
        nlp = spacy.load("en_core_web_md")
    elif lang == 'ru':
        nlp = Russian()
        sentencizer = nlp.create_pipe("sentencizer")
        nlp.add_pipe(sentencizer)
    else:
        print("Unsupported language. Choose from ['en', 'de', 'ru']")
        return
    doc = nlp(text)
    sentences = list()
    for sent in doc.sents:
        sentences.append(sent.text)
    return sentences
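# Example call (added for illustration): the 'ru' branch uses a blank
# pipeline plus the rule-based sentencizer, so splitting happens on
# sentence-final punctuation.
sentences = text_decomposition('Был сильный ветер. Потом пошёл дождь.',
                               lang='ru')
print(sentences)  # expected: ['Был сильный ветер.', 'Потом пошёл дождь.']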
NLP2 = None  # module-level cache so the pipeline is built only once


def spacy_tokenize2(text):
    from spacy.lang.ru import Russian
    from spacy_russian_tokenizer import (
        RussianTokenizer, MERGE_PATTERNS, SYNTAGRUS_RARE_CASES
    )
    global NLP2
    if not NLP2:
        NLP2 = Russian()
        NLP2.add_pipe(
            RussianTokenizer(NLP2, MERGE_PATTERNS + SYNTAGRUS_RARE_CASES),
            name='russian_tokenizer'
        )
    doc = NLP2(text)
    chunks = [token.text for token in doc]
    return find_substrings(chunks, text)
class RusWordTokenizer(PreProcesser):
    def __init__(self):
        self.rus_word_tokenizer = Russian()
        pipe = RussianTokenizer(self.rus_word_tokenizer,
                                MERGE_PATTERNS + SYNTAGRUS_RARE_CASES)
        self.rus_word_tokenizer.add_pipe(pipe, name='russian_tokenizer')

    def transform_text(self, text):
        return [Token(token_id, token.text)
                for token_id, token
                in enumerate(self.rus_word_tokenizer(text), 1)]

    def transform_sent(self, sent):
        sent = sent.copy()
        sent.tokens = self.transform_text(sent.text)
        return sent

    def transform_item(self, x):
        return [self.transform_sent(sent) for sent in x]
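# Illustrative only (added): PreProcesser, Token and the sentence objects
# come from the surrounding project, so this sketch just exercises
# transform_text, which only needs a raw string.
tokenizer = RusWordTokenizer()
for tok in tokenizer.transform_text('какой-то ураган'):
    print(tok)  # expected: Token(1, 'какой-то'), Token(2, 'ураган') or similar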
def spacy_sentence_scores(self) -> Dict[str, float]:
    nlp = Russian()
    sentencizer = nlp.create_pipe('sentencizer')
    nlp.add_pipe(sentencizer)

    raw_text = self.text
    docx = nlp(raw_text)
    stopwords = list(STOP_WORDS)

    # Count frequencies of lemmatised non-stopword tokens, skipping
    # prepositions, conjunctions, particles and interjections.
    word_frequencies = {}
    for word in docx:
        if word.text not in stopwords:
            word = MORPH.parse(word.text)[0].normalized
            if not ('PREP' in word.tag or 'CONJ' in word.tag
                    or 'PRCL' in word.tag or 'INTJ' in word.tag):
                if word.word not in word_frequencies.keys():
                    word_frequencies[word.word] = 1
                else:
                    word_frequencies[word.word] += 1

    maximum_frequency = max(word_frequencies.values())
    for word in word_frequencies.keys():
        word_frequencies[word] = word_frequencies[word] / maximum_frequency

    # Score each sentence as the sum of its words' normalised frequencies.
    sentence_list = [sentence for sentence in docx.sents]
    sentence_scores = {}
    for sent in sentence_list:
        for word in sent:
            word = MORPH.parse(word.text)[0].normalized
            if not ('PREP' in word.tag or 'CONJ' in word.tag
                    or 'PRCL' in word.tag or 'INTJ' in word.tag):
                if word.word in word_frequencies.keys():
                    if sent not in sentence_scores.keys():
                        sentence_scores[sent] = word_frequencies[word.word]
                    else:
                        sentence_scores[sent] += word_frequencies[word.word]

    return sentence_scores
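# A common follow-up step, added here as a sketch rather than taken from
# the original code: pick the top-scoring sentences as an extractive
# summary. heapq.nlargest works directly on the dict returned above.
import heapq


def top_sentences(sentence_scores, n=3):
    # Keys are spaCy Span objects; rank them by their accumulated score.
    best = heapq.nlargest(n, sentence_scores, key=sentence_scores.get)
    return ' '.join(sent.text for sent in best)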
def tokenizer(inp):
    # NB: builds a fresh pipeline on every call; fine for one-off use,
    # wasteful inside a loop.
    nlp = Russian()
    russian_tokenizer = RussianTokenizer(nlp, MERGE_PATTERNS)
    nlp.add_pipe(russian_tokenizer, name='russian_tokenizer')
    return nlp(inp)
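# A cached variant (added for illustration): functools.lru_cache keeps a
# single pipeline instance across calls, avoiding the per-call setup cost.
from functools import lru_cache


@lru_cache(maxsize=1)
def _get_nlp():
    nlp = Russian()
    nlp.add_pipe(RussianTokenizer(nlp, MERGE_PATTERNS),
                 name='russian_tokenizer')
    return nlp


def tokenizer_cached(inp):
    return _get_nlp()(inp)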
class SpacyRulesRussianTokenizer():
    """
    Tokenizer based on https://github.com/aatimofeev/spacy_russian_tokenizer.git

    The tokenizer is built on spaCy and uses spaCy's standard tokenization
    pipeline. You can read more about it here:
    * https://spacy.io/usage/linguistic-features#section-tokenization
    * https://spacy.io/usage/rule-based-matching

    Installation instructions:
    1) pip install spacy
    2) pip install git+https://github.com/aatimofeev/spacy_russian_tokenizer.git
    """

    def __init__(
            self,
            regexp_suffixes=BASE_SUFFIXES_REGEXPS,
            regexp_prefixes=BASE_PREFIXES_REGEXPS,
            regexp_infixes=BASE_INFIXES_REGEXPS,
            regexp_base_token_matches=BASE_TOKEN_MATCH,
            merge_patterns=tuple(MERGE_PATTERNS + SYNTAGRUS_RARE_CASES),
            terminal_patterns=tuple(NO_TERMINAL_PATTERNS),
    ):
        """
        Parameters
        ----------
        regexp_suffixes : list of dict
            Dicts in spacy format. See above for an explanation of the format.
        regexp_prefixes : list of dict
            Dicts in spacy format.
        regexp_infixes : list of dict
            Dicts in spacy format.
        regexp_base_token_matches : list of dict
            Dicts in spacy format.
        merge_patterns : list of dict
            Dicts in spacy format.
        terminal_patterns : list of dict
            Dicts in spacy format.
        """
        merge_patterns = list(merge_patterns)
        terminal_patterns = list(terminal_patterns)
        self.nlp_pipeline = Russian()
        self.nlp_pipeline.tokenizer = self.create_custom_pretokenizer(
            nlp_model=self.nlp_pipeline,
            prefix_regexp=regexp_prefixes,
            suffix_regexp=regexp_suffixes,
            infix_regexp=regexp_infixes,
            token_match_regexp=regexp_base_token_matches,
        )
        self.tokenizer_postprocesser = RussianTokenizer(
            self.nlp_pipeline,
            merge_patterns=merge_patterns,
            terminal_patterns=terminal_patterns)
        self.nlp_pipeline.add_pipe(self.tokenizer_postprocesser,
                                   name='russian_tokenizer_postprocesser')

    @staticmethod
    def create_custom_pretokenizer(nlp_model, prefix_regexp, suffix_regexp,
                                   infix_regexp, token_match_regexp):
        custom_pretokenizer = SpacyBaseTokenizer(
            nlp_model.vocab,
            prefix_search=prefix_regexp.search,
            suffix_search=suffix_regexp.search,
            infix_finditer=infix_regexp.finditer,
            token_match=token_match_regexp.match,
        )
        return custom_pretokenizer

    def transform_element(self, element):
        """
        Get the tokenization of a single element.

        Parameters
        ----------
        element : str
            String, supposed to be a sentence, one document or something
            analogous.

        Returns
        -------
        tokens_array : list of str
            Tokenized string.
        """
        if not isinstance(element, str):
            raise TypeError(
                f"Cannot tokenize {type(element)} instead of {type('')}!")
        tokens_array = [token.text for token in self.nlp_pipeline(element)]
        return tokens_array

    def transform(self, elements_collection):
        """
        Apply the transformer to a collection of elements (objects).

        Parameters
        ----------
        elements_collection : iterable
            Collection of objects to be transformed.

        Returns
        -------
        transformed_elements : list
            Collection of transformed objects.
        """
        transformed_elements = [
            self.transform_element(element)
            for element in elements_collection
        ]
        return transformed_elements
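# Usage sketch (added): the defaults already include the SynTagRus merge
# patterns, so hyphenated pronouns should come out as single tokens.
tokenizer = SpacyRulesRussianTokenizer()
print(tokenizer.transform(['Не ветер, а какой-то ураган!']))
# e.g. [['Не', 'ветер', ',', 'а', 'какой-то', 'ураган', '!']]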
from nltk.corpus import stopwords
import stanza
from spacy_stanza import StanzaLanguage

# nltk.download("stopwords")
# stanza.download('ru')  # will take a while

# russian_stopwords = stopwords.words("russian")
russian_stopwords = spacy.lang.ru.stop_words.STOP_WORDS

# ================================================== EXAMPLE ===================================================
text = "Не ветер, а какой-то ураган!"

nlp = Russian()
doc = nlp(text)

russian_tokenizer = RussianTokenizer(nlp, MERGE_PATTERNS)
nlp.add_pipe(russian_tokenizer, name='russian_tokenizer')
doc = nlp(text)
print([token.text for token in doc])
# ==============================================================================================================

toxic_rus = pd.read_csv("./__DATA/NLP_Datasets/toxic_russian.csv")
toxic_rus.head()
toxic_rus.info()

# Removing punctuation; regex flags are passed explicitly so the literal
# replacements (parentheses in particular) are not parsed as regexes.
toxic_rus['comment'] = toxic_rus['comment'].str.replace(r'[.,!?<>-]', '',
                                                        regex=True)
toxic_rus['comment'] = toxic_rus['comment'].str.replace("\n", " ", regex=False)
toxic_rus['comment'] = toxic_rus['comment'].str.replace("\t", " ", regex=False)
toxic_rus['comment'] = toxic_rus['comment'].str.replace("(", "", regex=False)
toxic_rus['comment'] = toxic_rus['comment'].str.replace(")", "", regex=False)
class CrazyTokenizer(object):
    """
    Tokenizer with Reddit- and Twitter-specific options

    Parameters
    ----------
    lowercase : bool, optional
        If True, lowercase all tokens. Defaults to True.

    keepcaps: bool, optional
        If True, keep ALL CAPS WORDS uppercased. Defaults to False.

    normalize: int or bool, optional
        If not False, perform normalization of repeated characters
        ("awesoooooome" -> "awesooome"). The value of the parameter
        determines the number of occurrences to keep. Defaults to 3.

    ignore_quotes: bool, optional
        If True, ignore tokens contained within double quotes.
        Defaults to False.

    ignore_reddit_quotes: bool, optional
        If True, remove quotes from the Reddit comments. Defaults to False.

    ignore_stopwords: str, list, or boolean, optional
        Whether to ignore stopwords

        - str: language to get a list of stopwords for from NLTK package
        - list: list of stopwords to remove
        - True: use built-in list of the english stop words
        - False: keep all tokens

        Defaults to False

    stem: {False, 'stem', 'lemm'}, optional
        Whether to perform word stemming

        - False: do not perform word stemming
        - 'stem': use PorterStemmer from NLTK package
        - 'lemm': use WordNetLemmatizer from NLTK package

    remove_punct: bool, optional
        If True, remove punctuation tokens. Defaults to True.

    remove_breaks: bool, optional
        If True, remove linebreak tokens. Defaults to True.

    decontract: bool, optional
        If True, attempt to expand certain contractions. Defaults to False.
        Example: "'ll" -> " will"

    numbers, subreddits, reddit_usernames, emails: False or str, optional
        Replacement of the different types of tokens

        - False: leaves these tokens intact
        - str: replacement token
        - '': removes all occurrences of these tokens

    twitter_handles: False, 'realname' or str, optional
        Processing of twitter handles

        - False: do nothing
        - str: replacement token
        - 'realname': replace with the real screen name of Twitter account
        - 'split': split handles using Viterbi algorithm

    hashtags: False or str, optional
        Processing of hashtags

        - False: do nothing
        - str: replacement token
        - 'split': split hashtags using Viterbi algorithm
          Example: "#vladimirputinisthebest" -> "vladimir putin is the best"

    urls: False or str, optional
        Replacement of parsed URLs

        - False: leave URL intact
        - str: replacement token
        - dict: replace all URLs stored in keys with the corresponding values
        - '': removes all occurrences of these tokens
        - 'domain': extract domain ("http://cnn.com" -> "cnn")
        - 'domain_unwrap_fast': extract domain after unwrapping links for
          a list of URL shorteners (goo.gl, t.co, bit.ly, tinyurl.com)
        - 'domain_unwrap': extract domain after unwrapping all links
        - 'title': extract and tokenize title of each link after unwrapping it

        Defaults to False.

    extra_patterns: None or list of tuples, optional
        Replacement of any user-supplied extra patterns.
        Tuples must have the following form: (name, re_pattern, replacement_token):

        - name (str): name of the pattern
        - re_pattern (_sre.SRE_Pattern): compiled re pattern
        - replacement_token (str): replacement token

        Defaults to None

    keep_untokenized: None or list, optional
        List of expressions to keep untokenized
        Example: ["New York", "Los Angeles", "San Francisco"]

    whitespaces_to_underscores: boolean, optional
        If True, replace all whitespace characters with underscores in the
        final tokens. Defaults to True.

    remove_nonunicode: boolean, optional
        If True, remove all non-unicode characters. Defaults to False.

    pos_emojis, neg_emojis, neutral_emojis: None, True, or list, optional
        Replace positive, negative, and neutral emojis with the special tokens

        - None: do not perform replacement
        - True: perform replacement of the default lists of emojis
        - list: list of emojis to replace

    print_url_warnings: bool, optional
        If True, print URL-related warnings. Defaults to False.

    latin_chars_fix: bool, optional
        Try applying this fix if you have a lot of \\xe2\\x80\\x99-like or
        U+1F601-like strings in your data. Defaults to False.

    ngrams: int, optional
        Add ngrams of tokens after tokenizing
    """

    def __init__(self, lowercase=True, keepcaps=False, normalize=3,
                 ignore_quotes=False, ignore_reddit_quotes=False,
                 ignore_stopwords=False, stem=False,
                 remove_punct=True, remove_breaks=True, decontract=False,
                 twitter_handles=False, urls=False, hashtags=False,
                 numbers=False, subreddits=False, reddit_usernames=False,
                 emails=False, extra_patterns=None, keep_untokenized=None,
                 whitespaces_to_underscores=True, remove_nonunicode=False,
                 pos_emojis=None, neg_emojis=None, neutral_emojis=None,
                 print_url_warnings=False, latin_chars_fix=False,
                 ngrams=1):
        self.params = locals()

        # self._nlp = English()
        self._nlp = Russian()
        russian_tokenizer = RussianTokenizer(
            self._nlp, MERGE_PATTERNS + SYNTAGRUS_RARE_CASES)
        self._nlp.add_pipe(russian_tokenizer, name='russian_tokenizer')

        self._merging_matcher = Matcher(self._nlp.vocab)
        self._matcher = Matcher(self._nlp.vocab)

        self._replacements = {}
        self._domains = {}
        self._realnames = {}
        self._stopwords = None

        alpha_digits_flag = self._nlp.vocab.add_flag(alpha_digits_check)
        hashtag_flag = self._nlp.vocab.add_flag(hashtag_check)
        twitter_handle_flag = self._nlp.vocab.add_flag(twitter_handle_check)

        self._merging_matcher.add(
            'HASHTAG', None, [{'ORTH': '#'}, {'IS_ASCII': True}])
        self._merging_matcher.add(
            'SUBREDDIT', None,
            [{'ORTH': '/r'}, {'ORTH': '/'}, {alpha_digits_flag: True}],
            [{'ORTH': 'r'}, {'ORTH': '/'}, {alpha_digits_flag: True}])
        self._merging_matcher.add(
            'REDDIT_USERNAME', None,
            [{'ORTH': '/u'}, {'ORTH': '/'}, {alpha_digits_flag: True}],
            [{'ORTH': 'u'}, {'ORTH': '/'}, {alpha_digits_flag: True}])

        if isinstance(ignore_stopwords, str) and ('nltk' in sys.modules):
            try:
                self._stopwords = stopwords.words(ignore_stopwords)
            except OSError:
                raise ValueError('Language {} was not found by NLTK'.format(
                    ignore_stopwords))
        elif ignore_stopwords is True:
            self._matcher.add('STOPWORDS', self._remove_token,
                              [{'IS_STOP': True}])
        elif isinstance(ignore_stopwords, list):
            self._stopwords = [word.lower() for word in ignore_stopwords]
        elif ignore_stopwords is not False:
            raise TypeError(
                'Type {} is not supported by ignore_stopwords parameter'
                ' or NLTK is not installed'.format(type(ignore_stopwords)))

        if lowercase and (not keepcaps):
            self._matcher.add('LOWERCASE', self._lowercase,
                              [{'IS_LOWER': False}])
        elif lowercase and keepcaps:
            self._matcher.add('LOWERCASE', self._lowercase,
                              [{'IS_LOWER': False, 'IS_UPPER': False}])

        if remove_punct:
            self._matcher.add('PUNCTUATION', self._remove_token,
                              [{'IS_PUNCT': True}])

        if remove_breaks:
            def break_check(text):
                return bool(BREAKS_RE.fullmatch(text))

            break_flag = self._nlp.vocab.add_flag(break_check)
            self._matcher.add('BREAK', self._remove_token,
                              [{break_flag: True}])

        if normalize:
            def normalize_check(text):
                return bool(NORMALIZE_RE.search(text))

            normalize_flag = self._nlp.vocab.add_flag(normalize_check)
            self._matcher.add('NORMALIZE', self._normalize,
                              [{normalize_flag: True}])

        if numbers is not False:
            self._matcher.add('NUMBER', self._replace_token,
                              [{'LIKE_NUM': True}])
            self._replacements['NUMBER'] = numbers

        if urls is not False:
            if urls in ['domain', 'domain_unwrap_fast', 'domain_unwrap',
                        'title']:
                self._urls = urls
                self._matcher.add('URL', self._process_url,
                                  [{'LIKE_URL': True}])
            elif isinstance(urls, dict):
                self._domains = urls
                self._urls = 'domain_unwrap_fast'
                self._matcher.add('URL', self._process_url,
                                  [{'LIKE_URL': True}])
            else:
                self._matcher.add('URL', self._replace_token,
                                  [{'LIKE_URL': True}])
                self._replacements['URL'] = urls

        if emails is not False:
            self._matcher.add('EMAIL', self._replace_token,
                              [{'LIKE_EMAIL': True}])
            self._replacements['EMAIL'] = emails

        if reddit_usernames is not False:
            def reddit_username_check(text):
                return bool(REDDITORS_RE.fullmatch(text))

            reddit_username_flag = self._nlp.vocab.add_flag(
                reddit_username_check)
            self._matcher.add('REDDIT_USERNAME', self._replace_token,
                              [{reddit_username_flag: True}])
            self._replacements['REDDIT_USERNAME'] = reddit_usernames

        if subreddits is not False:
            def subreddit_check(text):
                return bool(SUBREDDITS_RE.fullmatch(text))

            subreddit_flag = self._nlp.vocab.add_flag(subreddit_check)
            self._matcher.add('SUBREDDIT', self._replace_token,
                              [{subreddit_flag: True}])
            self._replacements['SUBREDDIT'] = subreddits

        if twitter_handles is not False:
            self._matcher.add('TWITTER_HANDLE', self._handles_postprocess,
                              [{twitter_handle_flag: True}])

        if hashtags is not False:
            self._matcher.add('HASHTAG', self._hashtag_postprocess,
                              [{hashtag_flag: True}])

        if hashtags == 'split' or twitter_handles == 'split':
            file = os.path.join(DATA_PATH, 'wordsfreq_wiki2.txt')
            with open(file) as f:
                self._words = f.read().split()
            self._wordcost = dict((k, log((i + 1) * log(len(self._words))))
                                  for i, k in enumerate(self._words))
            self._maxword = max(len(x) for x in self._words)

        if twitter_handles == 'realname':
            with open(os.path.join(DATA_PATH, 'realnames.json')) as f:
                self._realnames = json.load(f)

        if ignore_quotes:
            self._merging_matcher.add('QUOTE', None,
                                      [{'ORTH': '"'},
                                       {'OP': '*', 'IS_ASCII': True},
                                       {'ORTH': '"'}])

            def doublequote_check(text):
                return bool(QUOTES_RE.fullmatch(text))

            doublequote_flag = self._nlp.vocab.add_flag(doublequote_check)
            self._matcher.add('DOUBLE_QUOTES', self._remove_token,
                              [{doublequote_flag: True}])

        if self._stopwords:
            def stopword_check(text):
                return bool(text.lower() in self._stopwords)

            stopword_flag = self._nlp.vocab.add_flag(stopword_check)
            self._matcher.add('STOPWORD', self._remove_token,
                              [{stopword_flag: True}])

        if keep_untokenized is not None:
            if not isinstance(keep_untokenized, list):
                raise ValueError(
                    "keep_untokenized has to be either None or a list")
            for i, phrase in enumerate(keep_untokenized):
                phrase_tokens = phrase.split(' ')
                rule = []
                for token in phrase_tokens:
                    rule.append({'LOWER': token.lower()})
                self._merging_matcher.add('RULE_' + str(i), None, rule)

        if pos_emojis:
            if not isinstance(pos_emojis, list):
                pos_emojis = POS_EMOJIS
            pos_patterns = [[{'ORTH': emoji}] for emoji in pos_emojis]
            self._matcher.add('HAPPY', self._replace_token, *pos_patterns)
            self._replacements['HAPPY'] = 'POS_EMOJI'

        if neg_emojis:
            if not isinstance(neg_emojis, list):
                neg_emojis = NEG_EMOJIS
            neg_patterns = [[{'ORTH': emoji}] for emoji in neg_emojis]
            self._matcher.add('SAD', self._replace_token, *neg_patterns)
            self._replacements['SAD'] = 'NEG_EMOJI'

        if neutral_emojis:
            if not isinstance(neutral_emojis, list):
                neutral_emojis = NEUTRAL_EMOJIS
            neutral_patterns = [[{'ORTH': emoji}] for emoji in neutral_emojis]
            self._matcher.add('NEUTRAL', self._replace_token,
                              *neutral_patterns)
            self._replacements['NEUTRAL'] = 'NEUTRAL_EMOJI'

        if isinstance(extra_patterns, list):
            self._flags = {}
            for name, re_pattern, replacement_token in extra_patterns:
                def flag(text):
                    return bool(re_pattern.match(text))

                self._flags[name] = self._nlp.vocab.add_flag(flag)
                self._matcher.add(name, self._replace_token,
                                  [{self._flags[name]: True}])
                self._replacements[name] = replacement_token

        if stem and ('nltk' in sys.modules):
            if stem == 'stem':
                self._stemmer = PorterStemmer()
            elif stem == 'lemm':
                self._stemmer = WordNetLemmatizer()
            # elif stem == 'rus':
            #     self._stemmer = SnowballStemmer("russian")
            else:
                raise ValueError(
                    'Stemming method {} is not supported'.format(stem))
            self._matcher.add('WORD_TO_STEM', self._stem_word,
                              [{'IS_ALPHA': True}])

        retokenize_flag = self._nlp.vocab.add_flag(retokenize_check)
        self._matcher.add('RETOKENIZE', self._retokenize,
                          [{retokenize_flag: True,
                            'IS_PUNCT': False,
                            'LIKE_URL': False,
                            'LIKE_EMAIL': False,
                            'LIKE_NUM': False,
                            hashtag_flag: False,
                            twitter_handle_flag: False}])

        self._nlp.add_pipe(self._merge_doc, name='merge_doc', last=True)
        self._nlp.add_pipe(self._match_doc, name='match_doc', last=True)
        self._nlp.add_pipe(self._postproc_doc, name='postproc_doc', last=True)

    @staticmethod
    def _lowercase(__, doc, i, matches):
        # Lowercase tokens
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            tok._.transformed_text = tok._.transformed_text.lower()

    def _stem_word(self, __, doc, i, matches):
        # Stem tokens
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            if self.params['stem'] == 'stem':
                tok._.transformed_text = self._stemmer.stem(
                    tok._.transformed_text)
            elif self.params['stem'] == 'lemm':
                tok._.transformed_text = self._stemmer.lemmatize(
                    tok._.transformed_text)

    def _normalize(self, __, doc, i, matches):
        # Normalize repeating symbols
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            tok._.transformed_text = NORMALIZE_RE.sub(
                r"\1" * self.params['normalize'], tok._.transformed_text)

    def _process_url(self, __, doc, i, matches):
        # Process found URLs
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            found_urls = URLS_RE.findall(tok.text)
            if found_urls:
                if found_urls[0] in self._domains:
                    tok._.transformed_text = self._domains[found_urls[0]]
                elif self._urls == 'domain':
                    tok._.transformed_text = tldextract.extract(
                        found_urls[0]).domain
                elif self._urls != 'title':
                    if self._urls == 'domain_unwrap':
                        domain = unshorten_url(
                            found_urls[0], None,
                            self.params['print_url_warnings'])
                    else:
                        domain = unshorten_url(
                            found_urls[0], URL_SHORTENERS,
                            self.params['print_url_warnings'])
                    self._domains[found_urls[0]] = domain
                    tok._.transformed_text = domain
                elif self._urls == 'title':
                    domain = unshorten_url(found_urls[0], URL_SHORTENERS)
                    if domain != 'twitter':
                        title = get_url_title(
                            found_urls[0], self.params['print_url_warnings'])
                        title = self.tokenize(URLS_RE.sub('', title))
                    else:
                        title = ''
                    tok._.transformed_text = title
                    self._domains[found_urls[0]] = title

    def _replace_token(self, __, doc, i, matches):
        # Replace tokens with something else
        match_id, start, end = matches[i]
        span = doc[start:end]
        replacement_token = self._replacements[doc.vocab.strings[match_id]]
        for tok in span:
            tok._.transformed_text = replacement_token

    @staticmethod
    def _remove_token(__, doc, i, matches):
        # Remove tokens
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            tok._.transformed_text = ''

    def _retokenize(self, __, doc, i, matches):
        # Retokenize
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            text = tok.text
            text = re.sub(r'([#@])', r' \1', text)
            text = re.sub(r'\s{2,}', ' ', text).strip()
            # text = re.sub(r'\d+', '', text)
            tok._.transformed_text = self.tokenize(text)

    def _infer_spaces(self, text):
        # Infer location of spaces in hashtags
        text = text.lower()
        text = re.sub(r'[^\w\s]', '', text)

        def best_match(i):
            # Find the best match for the first i characters,
            # assuming costs have been built for the first (i - 1) characters
            candidates = enumerate(
                reversed(cost[max(0, i - self._maxword):i]))
            return min(
                (c + self._wordcost.get(text[i - k - 1:i], 9e999), k + 1)
                for k, c in candidates)

        cost = [0]
        for i in range(1, len(text) + 1):
            cur_cost, k = best_match(i)
            cost.append(cur_cost)

        out = []
        i = len(text)
        while i > 0:
            cur_cost, k = best_match(i)
            assert cur_cost == cost[i]
            out.append(text[i - k:i])
            i -= k

        return list(reversed(out))

    def _handles_postprocess(self, __, doc, i, matches):
        # Process twitter handles
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            if self.params['twitter_handles'] == 'realname':
                if tok.text in self._realnames:
                    tok._.transformed_text = self._realnames[tok.text]
                else:
                    handle = get_twitter_realname(tok.text)
                    realname = self.tokenize(TWITTER_HANDLES_RE.sub(
                        '', handle))
                    tok._.transformed_text = realname
                    self._realnames[tok.text] = realname
            elif self.params['twitter_handles'] == 'split':
                poss = self._infer_spaces(tok._.transformed_text[1:])
                if poss:
                    tok._.transformed_text = poss
            else:
                tok._.transformed_text = self.params['twitter_handles']

    def _hashtag_postprocess(self, __, doc, i, matches):
        # Process hashtags
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            if self.params['hashtags'] == 'split':
                poss = self._infer_spaces(tok._.transformed_text[1:])
                if poss:
                    tok._.transformed_text = poss
            else:
                tok._.transformed_text = self.params['hashtags']

    @staticmethod
    def _decontract(text):
        # Expand contractions
        for contraction, decontraction in DECONTRACTIONS.items():
            text = re.sub(contraction, decontraction, text)
        return text

    def _preprocess_text(self, text):
        # Do some preprocessing
        text = re.sub("’", "'", text)
        if self.params['remove_nonunicode']:
            try:
                text = text.encode('utf-8').decode('unicode-escape')
                text = ''.join(
                    filter(lambda x: x in string.printable, text)).strip()
            except UnicodeDecodeError:
                warnings.warn(
                    'UnicodeDecodeError while trying to remove'
                    ' non-unicode characters')
        if self.params['decontract']:
            text = self._decontract(text)
        text = html.unescape(text)

        if self.params['latin_chars_fix']:
            if EMOJIS_UTF_RE.findall(text):
                text = EMOJIS_UTF_NOSPACE_RE.sub(r' \1', text)
                for utf_code, emoji in EMOJIS_UTF.items():
                    text = EMOJIS_UTF_PATS[utf_code].sub(emoji, text)

            if EMOJIS_UNICODE_RE.findall(text):
                text = EMOJIS_UNICODE_NOSPACE_RE.sub(r'\1 \2', text)
                for utf_code, emoji in EMOJIS_UNICODE.items():
                    text = EMOJIS_UNICODE_PATS[utf_code].sub(emoji, text)

            if LATIN_CHARS_RE.findall(text):
                for _hex, _char in LATIN_CHARS.items():
                    text = LATIN_CHARS_PATS[_hex].sub(_char, text)

        if self.params['ignore_reddit_quotes']:
            text = REDDIT_QUOTES_RE.sub(' ', text)

        text = text.replace('.@', '. @')
        text = re.sub(r'([*;,!?\(\)\[\]])', r' \1', text)
        text = re.sub(r'\s{2,}', ' ', text)

        return text.strip()

    def _merge_doc(self, doc):
        # Perform merging for certain types of tokens
        matches = self._merging_matcher(doc)
        spans = []
        for __, start, end in matches:
            spans.append(doc[start:end])
        for span in spans:
            span.merge()
        for tok in doc:
            tok._.transformed_text = tok.text
        return doc

    def _match_doc(self, doc):
        # Perform all additional processing
        self._matcher(doc)
        return doc

    def _postproc_doc(self, doc):
        # Perform postprocessing
        doc._.tokens = []
        for tok in doc:
            if isinstance(tok._.transformed_text, list):
                doc._.tokens.extend(tok._.transformed_text)
            elif tok._.transformed_text.strip() != '':
                if self.params['whitespaces_to_underscores']:
                    tok._.transformed_text = "_".join(
                        tok._.transformed_text.split())
                doc._.tokens.append(tok._.transformed_text.strip())
        return doc

    def tokenize(self, text):
        """
        Tokenize document

        Parameters
        ----------
        text : str
            Document to tokenize

        Returns
        -------
        list
            List of tokens

        Examples
        --------
        >>> from redditscore.tokenizer import CrazyTokenizer
        >>> tokenizer = CrazyTokenizer(hashtags='split')
        >>> tokenizer.tokenize("#makeamericagreatagain")
        ["make", "america", "great", "again"]
        """
        if not isinstance(text, str):
            warnings.warn('Document {} is not a string'.format(text))
            return []
        text = self._preprocess_text(text)
        doc = self._nlp(text)
        tokens = doc._.tokens
        if self.params['ngrams'] > 1:
            if self.params['whitespaces_to_underscores']:
                tokens = word_ngrams(tokens, (1, self.params['ngrams']),
                                     separator='_')
            else:
                tokens = word_ngrams(tokens, (1, self.params['ngrams']))
        return tokens
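# Usage sketch (added): this variant of CrazyTokenizer is wired to the
# Russian spaCy pipeline, so it can be pointed at Russian social-media
# text; the option values below are illustrative.
tokenizer = CrazyTokenizer(lowercase=True, urls='domain', numbers='NUM')
print(tokenizer.tokenize('Смотрите http://cnn.com, там 7 новостей!'))
# e.g. ['смотрите', 'cnn', 'там', 'NUM', 'новостей']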
import logging

import pandas

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

from spacy.lang.ru import Russian
from spacy_russian_tokenizer import RussianTokenizer, MERGE_PATTERNS

parser = Russian()
russian_tokenizer = RussianTokenizer(parser, MERGE_PATTERNS)
parser.add_pipe(russian_tokenizer, name='russian_tokenizer')


def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    print([token.text for token in tokens])
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens
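# Example (added for illustration): URLs and @-mentions are collapsed into
# placeholder tokens before topic modelling.
print(tokenize('Читайте @vesti и https://example.com про какой-то ураган'))
# e.g. ['читайте', 'SCREEN_NAME', 'и', 'URL', 'про', 'какой-то', 'ураган']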
lang1, lang2 = args.dir.split('-')
lang1_sents = []
lang2_sents = []
datasets = {}
idmap = {}

if lang1 == 'fr':
    l1nlp = spacy.load(lang1 + "_core_news_sm")
if lang1 == 'de':
    l1nlp = spacy.load(lang1 + "_core_news_sm")
if lang1 == 'ru':
    l1nlp = Russian()
    tokenizer = RussianTokenizer(l1nlp, MERGE_PATTERNS)
    l1nlp.add_pipe(tokenizer, name='russian_tokenizer')
if lang2 == 'en':
    l2nlp = spacy.load(lang2 + "_core_web_sm")

for fname in files:
    if 'gold' not in fname:
        fsplit = fname.split('.', 1)[1]
        datasets[fsplit] = {}
        with open(os.path.join(args.dir, fname), encoding='utf-8') as fp:
            for line in fp:
                line_id, text = line.split('\t', 1)
                line_id = int(line_id.split('-')[1])
                datasets[fsplit][line_id] = text.strip()
    else:
        with open(os.path.join(args.dir, fname), encoding='utf-8') as fp: