Example #1
import json

import nltk
from spacy.lang.ru import Russian
from spacy_russian_tokenizer import RussianTokenizer, MERGE_PATTERNS


def count_simple_stats():
    with open('data/articles.json', 'r', encoding='utf8') as f:
        json_str = f.readlines()[0]
    articles = json.loads(json_str)
    nlp = Russian()
    russian_tokenizer = RussianTokenizer(nlp, MERGE_PATTERNS)
    nlp.add_pipe(russian_tokenizer, name='russian_tokenizer')
    texts_count = 0
    sent_count = 0
    words_count = 0
    symbols_count = 0
    for title in articles:
        text = articles[title][0].strip()
        texts_count += 1
        sents = nltk.sent_tokenize(text, language="russian")
        sent_count += len(sents)
        tokens = nlp(text)
        words_count += len(tokens)
        symbols = [symb for symb in text if symb != ' ' and symb != '\n']
        symbols_count += len(symbols)
        # print([token.text for token in tokens])
    print("Texts count:", texts_count)
    print("Sentences count:", sent_count)
    print("Words count:", words_count)
    print("Symbols count:", symbols_count)
Example #2
class SpacyTokenizer:
    def __init__(self):
        self.nlp = Russian()
        self.nlp.add_pipe(RussianTokenizer(self.nlp, MERGE_PATTERNS),
                          name="russian_tokenizer")

    def tokenize(self, text):
        return [token.text for token in self.nlp(text) if token.text.strip()]
Example #3
def tokenize():
    with open('data/articles.json', 'r', encoding='utf8') as f:
        json_str = f.readlines()[0]
    articles = json.loads(json_str)
    nlp = Russian()
    russian_tokenizer = RussianTokenizer(nlp, MERGE_PATTERNS)
    nlp.add_pipe(russian_tokenizer, name='russian_tokenizer')
    texts_count = 0
    sent_count = 0
    words_count = 0
    symbols_count = 0
    for title in articles:
        text = articles[title][0].strip()
        texts_count += 1
        sents = nltk.sent_tokenize(text, language="russian")
        sent_count += len(sents)
        tokens = nlp(text)
        words_count += len(tokens)
        symbols = [symb for symb in text if symb != ' ' and symb != '\n']
        symbols_count += len(symbols)
Example #4
class TimofeevTokenizer:
    label = 'aatimofeev/spacy_russian_tokenizer'

    def __init__(self):
        from spacy.lang.ru import Russian
        from spacy_russian_tokenizer import (RussianTokenizer, MERGE_PATTERNS,
                                             SYNTAGRUS_RARE_CASES)

        self.nlp = Russian()
        self.nlp.add_pipe(RussianTokenizer(
            self.nlp, MERGE_PATTERNS + SYNTAGRUS_RARE_CASES),
                          name='russian_tokenizer')

    def __call__(self, text):
        doc = self.nlp(text)
        chunks = (token.text for token in doc)
        return find_substrings(chunks, text)
Example #5
def text_decomposition(text, lang='de'):
    if lang == 'de':
        nlp = spacy.load('de_core_news_md')
    elif lang == 'en':
        nlp = spacy.load("en_core_web_md")
    elif lang == 'ru':
        nlp = Russian()
        sentencizer = nlp.create_pipe("sentencizer")
        nlp.add_pipe(sentencizer)
    else:
        print("Unsupported language. Choose from ['en', 'de', 'ru']")
        return

    doc = nlp(text)
    sentences = list()
    for sent in doc.sents:
        sentences.append(sent.text)
    return sentences
Example #6
def spacy_tokenize2(text):
    from spacy.lang.ru import Russian
    from spacy_russian_tokenizer import (
        RussianTokenizer,
        MERGE_PATTERNS,
        SYNTAGRUS_RARE_CASES
    )

    global NLP2
    if not NLP2:
        NLP2 = Russian()
        NLP2.add_pipe(
            RussianTokenizer(NLP2, MERGE_PATTERNS + SYNTAGRUS_RARE_CASES),
            name='russian_tokenizer'
        )

    doc = NLP2(text)
    chunks = [token.text for token in doc]
    return find_substrings(chunks, text)
Example #7
class RusWordTokenizer(PreProcesser):
    
    def __init__(self):
        
        self.rus_word_tokenizer = Russian()
        
        pipe = RussianTokenizer(self.rus_word_tokenizer, MERGE_PATTERNS + SYNTAGRUS_RARE_CASES)
        self.rus_word_tokenizer.add_pipe(pipe, name='russian_tokenizer')
    
    def transform_text(self, text):
        return [Token(token_id, token.text) for token_id, token in enumerate(self.rus_word_tokenizer(text), 1)]
    
    def transform_sent(self, sent):
        
        sent = sent.copy()
        sent.tokens = self.transform_text(sent.text)
        
        return sent
    
    def transform_item(self, x):
        return [self.transform_sent(sent) for sent in x]
Example #8
    def spacy_sentence_scores(self) -> Dict[str, float]:
        nlp = Russian()
        sentencizer = nlp.create_pipe('sentencizer')
        nlp.add_pipe(sentencizer)

        raw_text = self.text
        docx = nlp(raw_text)
        stopwords = list(STOP_WORDS)

        word_frequencies = {}
        for word in docx:
            if word.text not in stopwords:
                word = MORPH.parse(word.text)[0].normalized
                if not ('PREP' in word.tag or 'CONJ' in word.tag or 'PRCL' in word.tag or 'INTJ' in word.tag):
                    if word.word not in word_frequencies.keys():
                        word_frequencies[word.word] = 1
                    else:
                        word_frequencies[word.word] += 1

        maximum_frequency = max(word_frequencies.values())

        for word in word_frequencies.keys():
            word_frequencies[word] = (word_frequencies[word] / maximum_frequency)
        sentence_list = [sentence for sentence in docx.sents]

        sentence_scores = {}
        for sent in sentence_list:
            for word in sent:
                word = MORPH.parse(word.text)[0].normalized
                if not ('PREP' in word.tag or 'CONJ' in word.tag or 'PRCL' in word.tag or 'INTJ' in word.tag):
                    if word.word in word_frequencies.keys():
                        if sent not in sentence_scores.keys():
                            sentence_scores[sent] = word_frequencies[word.word]
                        else:
                            sentence_scores[sent] += word_frequencies[word.word]

        return sentence_scores
Example #9
def tokenizer(inp):
    nlp = Russian()
    russian_tokenizer = RussianTokenizer(nlp, MERGE_PATTERNS)
    nlp.add_pipe(russian_tokenizer, name='russian_tokenizer')
    return nlp(inp)
Example #10
class SpacyRulesRussianTokenizer():
    """
    Tokenizer based on https://github.com/aatimofeev/spacy_russian_tokenizer.git
    The tokenizer is built on spaCy and uses spaCy's standard tokenization pipeline.
    You can read more about it here:
        * https://spacy.io/usage/linguistic-features#section-tokenization
        * https://spacy.io/usage/rule-based-matching
    Installation instructions:
    1) pip install spacy
    2) pip install git+https://github.com/aatimofeev/spacy_russian_tokenizer.git
    """
    def __init__(
            self,
            regexp_suffixes=BASE_SUFFIXES_REGEXPS,
            regexp_prefixes=BASE_PREFIXES_REGEXPS,
            regexp_infixes=BASE_INFIXES_REGEXPS,
            regexp_base_token_matches=BASE_TOKEN_MATCH,
            merge_patterns=tuple(MERGE_PATTERNS + SYNTAGRUS_RARE_CASES),
            terminal_patterns=tuple(NO_TERMINAL_PATTERNS),
    ):
        """
        Parameters
        ----------
        regexp_suffixes : list of dict
            Dict in spacy format. See above for explanation of spacy format.
        regexp_prefixes : list of dict
            Dict in spacy format.
        regexp_infixes : list of dict
            Dict in spacy format.
        regexp_base_token_matches : list of dict
            Dict in spacy format.
        merge_patterns : list of dict
            Dict in spacy format.
        terminal_patterns : list of dict
            Dict in spacy format.
        """
        merge_patterns = list(merge_patterns)
        terminal_patterns = list(terminal_patterns)

        self.nlp_pipeline = Russian()
        self.nlp_pipeline.tokenizer = self.create_custom_pretokenizer(
            nlp_model=self.nlp_pipeline,
            prefix_regexp=regexp_prefixes,
            suffix_regexp=regexp_suffixes,
            infix_regexp=regexp_infixes,
            token_match_regexp=regexp_base_token_matches,
        )

        self.tokenizer_postprocesser = RussianTokenizer(
            self.nlp_pipeline,
            merge_patterns=merge_patterns,
            terminal_patterns=terminal_patterns)

        self.nlp_pipeline.add_pipe(self.tokenizer_postprocesser,
                                   name='russian_tokenizer_postprocesser')

    @staticmethod
    def create_custom_pretokenizer(nlp_model, prefix_regexp, suffix_regexp,
                                   infix_regexp, token_match_regexp):
        custom_pretokenizer = SpacyBaseTokenizer(
            nlp_model.vocab,
            prefix_search=prefix_regexp.search,
            suffix_search=suffix_regexp.search,
            infix_finditer=infix_regexp.finditer,
            token_match=token_match_regexp.match,
        )
        return custom_pretokenizer

    def transform_element(self, element):
        """
        Get tokenization variant of the element.
        Parameters
        ----------
        element : str
            A string, typically a sentence, a single document, or something analogous.
        Returns
        -------
        tokens_array : list of str
            Tokenized string.
        """
        if not isinstance(element, str):
            raise TypeError(
                f"Cannot tokenize {type(element)} instead of {type('')}!")
        tokens_array = [token.text for token in self.nlp_pipeline(element)]
        return tokens_array

    def transform(self, elements_collection):
        """
        Apply transformer to collection of elements (objects).
        Parameters
        ----------
        elements_collection : iterable of str
            Collection of strings to be transformed.
        Returns
        -------
        transformed_elements : list of list of str
            Collection of tokenized strings.
        """
        transformed_elements = [
            self.transform_element(element) for element in elements_collection
        ]
        return transformed_elements
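
A minimal usage sketch for the class above (illustrative only: it assumes spacy and spacy_russian_tokenizer are installed as described in the docstring and that the module-level names such as MERGE_PATTERNS and the BASE_* regexes are imported; the sample sentence and the shown output are examples, not guaranteed results):

# Hypothetical usage of SpacyRulesRussianTokenizer with its default patterns.
tokenizer = SpacyRulesRussianTokenizer()

# Tokenize a single string.
tokens = tokenizer.transform_element("Не ветер, а какой-то ураган!")
print(tokens)
# Roughly: ['Не', 'ветер', ',', 'а', 'какой-то', 'ураган', '!']

# Tokenize a collection of strings.
batch = tokenizer.transform(["Не ветер, а какой-то ураган!", "Мама мыла раму."])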
Example #11
from nltk.corpus import stopwords

import stanza
from spacy_stanza import StanzaLanguage

#nltk.download("stopwords")
#stanza.download('ru')  # will take a while
#russian_stopwords = stopwords.words("russian")
russian_stopwords = spacy.lang.ru.stop_words.STOP_WORDS

# ================================================== EXAMPLE ===================================================
text = "Не ветер, а какой-то ураган!"
nlp = Russian()
russian_tokenizer = RussianTokenizer(nlp, MERGE_PATTERNS)
nlp.add_pipe(russian_tokenizer, name='russian_tokenizer')
doc = nlp(text)
print([token.text for token in doc])
# =============================================================================================================

toxic_rus = pd.read_csv("./__DATA/NLP_Datasets/toxic_russian.csv")

toxic_rus.head()
toxic_rus.info()

# Removing punctuation
toxic_rus['comment'] = toxic_rus['comment'].str.replace(r'[.,!?<>-]', '', regex=True)
toxic_rus['comment'] = toxic_rus['comment'].str.replace('\n', ' ', regex=False)
toxic_rus['comment'] = toxic_rus['comment'].str.replace('\t', ' ', regex=False)
toxic_rus['comment'] = toxic_rus['comment'].str.replace('(', '', regex=False)
toxic_rus['comment'] = toxic_rus['comment'].str.replace(')', '', regex=False)
Example #12
class CrazyTokenizer(object):
    """
    Tokenizer with Reddit- and Twitter-specific options

    Parameters
    ----------
    lowercase : bool, optional
        If True, lowercase all tokens. Defaults to True.

    keepcaps: bool, optional
        If True, keep ALL CAPS WORDS uppercased. Defaults to False.

    normalize: int or bool, optional
        If not False, perform normalization of repeated characters
        ("awesoooooome" -> "awesooome"). The value of the parameter
        determines the number of occurrences to keep. Defaults to 3.

    ignore_quotes: bool, optional
        If True, ignore tokens contained within double quotes.
        Defaults to False.

    ignore_reddit_quotes: bool, optional
        If True, remove quotes from the Reddit comments. Defaults to False.

    ignore_stopwords: str, list, or boolean, optional
        Whether to ignore stopwords

        - str: language to get a list of stopwords for from the NLTK package
        - list: list of stopwords to remove
        - True: use the built-in list of English stop words
        - False: keep all tokens

        Defaults to False.

    stem: {False, 'stem', 'lemm'}, optional
        Whether to perform word stemming

        - False: do not perform word stemming
        - 'stem': use PorterStemmer from NLTK package
        - 'lemm': use WordNetLemmatizer from NLTK package

    remove_punct: bool, optional
        If True, remove punctuation tokens. Defaults to True.

    remove_breaks: bool, optional
        If True, remove linebreak tokens. Defaults to True.

    decontract: bool, optional
        If True, attempt to expand certain contractions. Defaults to False.
        Example: "'ll" -> " will"

    numbers, subreddits, reddit_usernames, emails:
    False or str, optional
        Replacement of the different types of tokens

        - False: leaves these tokens intact
        - str: replacement token
        - '': removes all occurrences of these tokens

    twitter_handles: False, 'realname' or str, optional
        Processing of twitter handles

        - False: do nothing
        - str: replacement token
        - 'realname': replace with the real screen name of the Twitter account
        - 'split': split handles using the Viterbi algorithm

        Example: "#vladimirputinisthebest" -> "vladimir putin is the best"

    hashtags: False or str, optional
        Processing of hashtags

        - False: do nothing
        - str: replacement token
        - 'split': split hashtags using the Viterbi algorithm

    urls: False or str, optional
        Replacement of parsed URLs

        - False: leave URL intact
        - str: replacement token
        - dict: replace all URLs stored in keys with the corresponding values
        - '': removes all occurrences of these tokens
        - 'domain': extract domain ("http://cnn.com" -> "cnn")
        - 'domain_unwrap_fast': extract domain after unwrapping links
          for a list of URL shorteners (goo.gl, t.co, bit.ly, tinyurl.com)
        - 'domain_unwrap': extract domain after unwrapping all links
        - 'title': extract and tokenize the title of each link after unwrapping it

        Defaults to False.

    extra_patterns: None or list of tuples, optional
        Replacement of any user-supplied extra patterns.
        Tuples must have the following form: (name, re_pattern, replacement_token):

        - name (str): name of the pattern
        - re_pattern (_sre.SRE_Pattern): compiled re pattern
        - replacement_token (str): replacement token

        Defaults to None

    keep_untokenized: None or list, optional
        List of expressions to keep untokenized

        Example: ["New York", "Los Angeles", "San Francisco"]

    whitespaces_to_underscores: boolean, optional
        If True, replace all whitespace characters with
        underscores in the final tokens. Defaults to True.

    remove_nonunicode: boolean, optional
        If True, remove all non-unicode characters. Defaults to False.

    pos_emojis, neg_emojis, neutral_emojis: None, True, or list, optional
        Replace positive, negative, and neutral emojis with the special tokens

        - None: do not perform replacement
        - True: perform replacement of the default lists of emojis
        - list: list of emojis to replace

    print_url_warnings: bool, optional
        If True, print URL-related warnings. Defaults to False.

    latin_chars_fix: bool, optional
        Try applying this fix if you have a lot of \\xe2\\x80\\x99-like
        or U+1F601-like strings in your data. Defaults to False.

    ngrams: int, optional
        Add ngrams of tokens after tokenizing
    """
    def __init__(self,
                 lowercase=True,
                 keepcaps=False,
                 normalize=3,
                 ignore_quotes=False,
                 ignore_reddit_quotes=False,
                 ignore_stopwords=False,
                 stem=False,
                 remove_punct=True,
                 remove_breaks=True,
                 decontract=False,
                 twitter_handles=False,
                 urls=False,
                 hashtags=False,
                 numbers=False,
                 subreddits=False,
                 reddit_usernames=False,
                 emails=False,
                 extra_patterns=None,
                 keep_untokenized=None,
                 whitespaces_to_underscores=True,
                 remove_nonunicode=False,
                 pos_emojis=None,
                 neg_emojis=None,
                 neutral_emojis=None,
                 print_url_warnings=False,
                 latin_chars_fix=False,
                 ngrams=1):
        self.params = locals()

        #self._nlp = English()
        self._nlp = Russian()
        russian_tokenizer = RussianTokenizer(
            self._nlp, MERGE_PATTERNS + SYNTAGRUS_RARE_CASES)
        self._nlp.add_pipe(russian_tokenizer, name='russian_tokenizer')

        self._merging_matcher = Matcher(self._nlp.vocab)
        self._matcher = Matcher(self._nlp.vocab)

        self._replacements = {}
        self._domains = {}
        self._realnames = {}
        self._stopwords = None

        alpha_digits_flag = self._nlp.vocab.add_flag(alpha_digits_check)
        hashtag_flag = self._nlp.vocab.add_flag(hashtag_check)
        twitter_handle_flag = self._nlp.vocab.add_flag(twitter_handle_check)

        self._merging_matcher.add('HASHTAG', None, [{
            'ORTH': '#'
        }, {
            'IS_ASCII': True
        }])
        self._merging_matcher.add('SUBREDDIT', None, [{
            'ORTH': '/r'
        }, {
            'ORTH': '/'
        }, {
            alpha_digits_flag: True
        }], [{
            'ORTH': 'r'
        }, {
            'ORTH': '/'
        }, {
            alpha_digits_flag: True
        }])
        self._merging_matcher.add('REDDIT_USERNAME', None,
                                  [{
                                      'ORTH': '/u'
                                  }, {
                                      'ORTH': '/'
                                  }, {
                                      alpha_digits_flag: True
                                  }], [{
                                      'ORTH': 'u'
                                  }, {
                                      'ORTH': '/'
                                  }, {
                                      alpha_digits_flag: True
                                  }])

        if isinstance(ignore_stopwords, str) and ('nltk' in sys.modules):
            try:
                self._stopwords = stopwords.words(ignore_stopwords)
            except OSError:
                raise ValueError('Language {} was not found by NLTK'.format(
                    ignore_stopwords))
        elif ignore_stopwords is True:
            self._matcher.add('STOPWORDS', self._remove_token, [{
                'IS_STOP': True
            }])
        elif isinstance(ignore_stopwords, list):
            self._stopwords = [word.lower() for word in ignore_stopwords]
        elif ignore_stopwords is not False:
            raise TypeError(
                'Type {} is not supported by ignore_stopwords parameter or NLTK is not installed'
                .format(type(ignore_stopwords)))

        if lowercase and (not keepcaps):
            self._matcher.add('LOWERCASE', self._lowercase, [{
                'IS_LOWER': False
            }])
        elif lowercase and keepcaps:
            self._matcher.add('LOWERCASE', self._lowercase, [{
                'IS_LOWER': False,
                'IS_UPPER': False
            }])

        if remove_punct:
            self._matcher.add('PUNCTUATION', self._remove_token,
                              [{
                                  'IS_PUNCT': True
                              }])

        if remove_breaks:

            def break_check(text):
                return bool(BREAKS_RE.fullmatch(text))

            break_flag = self._nlp.vocab.add_flag(break_check)
            self._matcher.add('BREAK', self._remove_token, [{
                break_flag: True
            }])

        if normalize:

            def normalize_check(text):
                return bool(NORMALIZE_RE.search(text))

            normalize_flag = self._nlp.vocab.add_flag(normalize_check)
            self._matcher.add('NORMALIZE', self._normalize,
                              [{
                                  normalize_flag: True
                              }])

        if numbers is not False:
            self._matcher.add('NUMBER', self._replace_token, [{
                'LIKE_NUM': True
            }])
            self._replacements['NUMBER'] = numbers

        if urls is not False:
            if urls in [
                    'domain', 'domain_unwrap_fast', 'domain_unwrap', 'title'
            ]:
                self._urls = urls
                self._matcher.add('URL', self._process_url, [{
                    'LIKE_URL': True
                }])
            elif isinstance(urls, dict):
                self._domains = urls
                self._urls = 'domain_unwrap_fast'
                self._matcher.add('URL', self._process_url, [{
                    'LIKE_URL': True
                }])
            else:
                self._matcher.add('URL', self._replace_token, [{
                    'LIKE_URL': True
                }])
                self._replacements['URL'] = urls

        if emails is not False:
            self._matcher.add('EMAIL', self._replace_token, [{
                'LIKE_EMAIL': True
            }])
            self._replacements['EMAIL'] = emails

        if reddit_usernames is not False:

            def reddit_username_check(text):
                return bool(REDDITORS_RE.fullmatch(text))

            reddit_username_flag = self._nlp.vocab.add_flag(
                reddit_username_check)
            self._matcher.add('REDDIT_USERNAME', self._replace_token,
                              [{
                                  reddit_username_flag: True
                              }])
            self._replacements['REDDIT_USERNAME'] = reddit_usernames

        if subreddits is not False:

            def subreddit_check(text):
                return bool(SUBREDDITS_RE.fullmatch(text))

            subreddit_flag = self._nlp.vocab.add_flag(subreddit_check)
            self._matcher.add('SUBREDDIT', self._replace_token,
                              [{
                                  subreddit_flag: True
                              }])
            self._replacements['SUBREDDIT'] = subreddits

        if twitter_handles is not False:
            self._matcher.add('TWITTER_HANDLE', self._handles_postprocess,
                              [{
                                  twitter_handle_flag: True
                              }])

        if hashtags is not False:
            self._matcher.add('HASHTAG', self._hashtag_postprocess,
                              [{
                                  hashtag_flag: True
                              }])

        if hashtags == 'split' or twitter_handles == 'split':
            file = os.path.join(DATA_PATH, 'wordsfreq_wiki2.txt')
            with open(file) as f:
                self._words = f.read().split()
            self._wordcost = dict((k, log((i + 1) * log(len(self._words))))
                                  for i, k in enumerate(self._words))
            self._maxword = max(len(x) for x in self._words)

        if twitter_handles == 'realname':
            with open(os.path.join(DATA_PATH, 'realnames.json')) as f:
                self._realnames = json.load(f)

        if ignore_quotes:
            self._merging_matcher.add('QUOTE', None, [{
                'ORTH': '"'
            }, {
                'OP': '*',
                'IS_ASCII': True
            }, {
                'ORTH': '"'
            }])

            def doublequote_check(text):
                return bool(QUOTES_RE.fullmatch(text))

            doublequote_flag = self._nlp.vocab.add_flag(doublequote_check)
            self._matcher.add('DOUBLE_QUOTES', self._remove_token,
                              [{
                                  doublequote_flag: True
                              }])

        if self._stopwords:

            def stopword_check(text):
                return bool(text.lower() in self._stopwords)

            stopword_flag = self._nlp.vocab.add_flag(stopword_check)
            self._matcher.add('STOPWORD', self._remove_token,
                              [{
                                  stopword_flag: True
                              }])

        if keep_untokenized is not None:
            if not isinstance(keep_untokenized, list):
                raise ValueError(
                    "keep_untokenized has to be either None or a list")
            for i, phrase in enumerate(keep_untokenized):
                phrase_tokens = phrase.split(' ')
                rule = []
                for token in phrase_tokens:
                    rule.append({'LOWER': token.lower()})
                self._merging_matcher.add('RULE_' + str(i), None, rule)

        if pos_emojis:
            if not isinstance(pos_emojis, list):
                pos_emojis = POS_EMOJIS
            pos_patterns = [[{'ORTH': emoji}] for emoji in pos_emojis]
            self._matcher.add('HAPPY', self._replace_token, *pos_patterns)
            self._replacements['HAPPY'] = 'POS_EMOJI'

        if neg_emojis:
            if not isinstance(neg_emojis, list):
                neg_emojis = NEG_EMOJIS
            neg_patterns = [[{'ORTH': emoji}] for emoji in neg_emojis]
            self._matcher.add('SAD', self._replace_token, *neg_patterns)
            self._replacements['SAD'] = 'NEG_EMOJI'

        if neutral_emojis:
            if not isinstance(neutral_emojis, list):
                neutral_emojis = NEUTRAL_EMOJIS
            neutral_patterns = [[{'ORTH': emoji}] for emoji in neutral_emojis]
            self._matcher.add('NEUTRAL', self._replace_token,
                              *neutral_patterns)
            self._replacements['NEUTRAL'] = 'NEUTRAL_EMOJI'

        if isinstance(extra_patterns, list):
            self._flags = {}
            for name, re_pattern, replacement_token in extra_patterns:

                def flag(text):
                    return bool(re_pattern.match(text))

                self._flags[name] = self._nlp.vocab.add_flag(flag)
                self._matcher.add(name, self._replace_token,
                                  [{
                                      self._flags[name]: True
                                  }])
                self._replacements[name] = replacement_token

        if stem and ('nltk' in sys.modules):
            if stem == 'stem':
                self._stemmer = PorterStemmer()
            elif stem == 'lemm':
                self._stemmer = WordNetLemmatizer()
            #elif stem == 'rus':
            #    self._stemmer = SnowballStemmer("russian")
            else:
                raise ValueError(
                    'Stemming method {} is not supported'.format(stem))
            self._matcher.add('WORD_TO_STEM', self._stem_word,
                              [{
                                  'IS_ALPHA': True
                              }])

        retokenize_flag = self._nlp.vocab.add_flag(retokenize_check)
        self._matcher.add('RETOKENIZE', self._retokenize,
                          [{
                              retokenize_flag: True,
                              'IS_PUNCT': False,
                              'LIKE_URL': False,
                              'LIKE_EMAIL': False,
                              'LIKE_NUM': False,
                              hashtag_flag: False,
                              twitter_handle_flag: False
                          }])

        self._nlp.add_pipe(self._merge_doc, name='merge_doc', last=True)
        self._nlp.add_pipe(self._match_doc, name='match_doc', last=True)
        self._nlp.add_pipe(self._postproc_doc, name='postproc_doc', last=True)

    @staticmethod
    def _lowercase(__, doc, i, matches):
        # Lowercase tokens
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            tok._.transformed_text = tok._.transformed_text.lower()

    def _stem_word(self, __, doc, i, matches):
        # Stem tokens
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            if self.params['stem'] == 'stem':
                tok._.transformed_text = self._stemmer.stem(
                    tok._.transformed_text)
            elif self.params['stem'] == 'lemm':
                tok._.transformed_text = self._stemmer.lemmatize(
                    tok._.transformed_text)

    def _normalize(self, __, doc, i, matches):
        # Normalize repeating symbols
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            tok._.transformed_text = NORMALIZE_RE.sub(
                r"\1" * self.params['normalize'], tok._.transformed_text)

    def _process_url(self, __, doc, i, matches):
        # Process found URLs
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            found_urls = URLS_RE.findall(tok.text)
            if found_urls:
                if found_urls[0] in self._domains:
                    tok._.transformed_text = self._domains[found_urls[0]]
                elif self._urls == 'domain':
                    tok._.transformed_text = tldextract.extract(
                        found_urls[0]).domain
                elif self._urls != 'title':
                    if self._urls == 'domain_unwrap':
                        domain = unshorten_url(
                            found_urls[0], None,
                            self.params['print_url_warnings'])
                    else:
                        domain = unshorten_url(
                            found_urls[0], URL_SHORTENERS,
                            self.params['print_url_warnings'])
                    self._domains[found_urls[0]] = domain
                    tok._.transformed_text = domain
                elif self._urls == 'title':
                    domain = unshorten_url(found_urls[0], URL_SHORTENERS)
                    if domain != 'twitter':
                        title = get_url_title(
                            found_urls[0], self.params['print_url_warnings'])
                        title = self.tokenize(URLS_RE.sub('', title))
                    else:
                        title = ''
                    tok._.transformed_text = title
                    self._domains[found_urls[0]] = title

    def _replace_token(self, __, doc, i, matches):
        # Replace tokens with something else
        match_id, start, end = matches[i]
        span = doc[start:end]
        replacement_token = self._replacements[doc.vocab.strings[match_id]]
        for tok in span:
            tok._.transformed_text = replacement_token

    @staticmethod
    def _remove_token(__, doc, i, matches):
        # Remove tokens
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            tok._.transformed_text = ''

    def _retokenize(self, __, doc, i, matches):
        # Retokenize
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            text = tok.text
            text = re.sub(r'([#@])', r' \1', text)
            text = re.sub(r'\s{2,}', ' ', text).strip()
            #text = re.sub(r'\d+', '', text)
            tok._.transformed_text = self.tokenize(text)

    def _infer_spaces(self, text):
        # Infer location of spaces in hashtags
        text = text.lower()
        text = re.sub(r'[^\w\s]', '', text)

        def best_match(i):
            # Find the best match for the first i characters
            # assuming costs has been built for the first (i-1) characters
            candidates = enumerate(reversed(cost[max(0, i - self._maxword):i]))
            return min(
                (c + self._wordcost.get(text[i - k - 1:i], 9e999), k + 1)
                for k, c in candidates)

        cost = [0]
        for i in range(1, len(text) + 1):
            cur_cost, k = best_match(i)
            cost.append(cur_cost)

        out = []
        i = len(text)
        while i > 0:
            cur_cost, k = best_match(i)
            assert cur_cost == cost[i]
            out.append(text[i - k:i])
            i -= k

        return list(reversed(out))

    def _handles_postprocess(self, __, doc, i, matches):
        # Process twitter handles
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            if self.params['twitter_handles'] == 'realname':
                if tok.text in self._realnames:
                    tok._.transformed_text = self._realnames[tok.text]
                else:
                    handle = get_twitter_realname(tok.text)
                    realname = self.tokenize(TWITTER_HANDLES_RE.sub(
                        '', handle))
                    tok._.transformed_text = realname
                    self._realnames[tok.text] = realname
            elif self.params['twitter_handles'] == 'split':
                poss = self._infer_spaces(tok._.transformed_text[1:])
                if poss:
                    tok._.transformed_text = poss
            else:
                tok._.transformed_text = self.params['twitter_handles']

    def _hashtag_postprocess(self, __, doc, i, matches):
        # Process hashtags
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            if self.params['hashtags'] == 'split':
                poss = self._infer_spaces(tok._.transformed_text[1:])
                if poss:
                    tok._.transformed_text = poss
            else:
                tok._.transformed_text = self.params['hashtags']

    @staticmethod
    def _decontract(text):
        # Expand contractions
        for contraction, decontraction in DECONTRACTIONS.items():
            text = re.sub(contraction, decontraction, text)
        return text

    def _preprocess_text(self, text):
        # Do some preprocessing
        text = re.sub("’", "'", text)
        if self.params['remove_nonunicode']:
            try:
                text = text.encode('utf-8').decode('unicode-escape')
                text = ''.join(filter(lambda x: x in string.printable,
                                      text)).strip()
            except UnicodeDecodeError:
                warnings.warn(
                    'UnicodeDecodeError while trying to remove non-unicode characters'
                )
        if self.params['decontract']:
            text = self._decontract(text)
        text = html.unescape(text)

        if self.params['latin_chars_fix']:
            if EMOJIS_UTF_RE.findall(text):
                text = EMOJIS_UTF_NOSPACE_RE.sub(r' \1', text)
                for utf_code, emoji in EMOJIS_UTF.items():
                    text = EMOJIS_UTF_PATS[utf_code].sub(emoji, text)

            if EMOJIS_UNICODE_RE.findall(text):
                text = EMOJIS_UNICODE_NOSPACE_RE.sub(r'\1 \2', text)
                for utf_code, emoji in EMOJIS_UNICODE.items():
                    text = EMOJIS_UNICODE_PATS[utf_code].sub(emoji, text)

            if LATIN_CHARS_RE.findall(text):
                for _hex, _char in LATIN_CHARS.items():
                    text = LATIN_CHARS_PATS[_hex].sub(_char, text)

        if self.params['ignore_reddit_quotes']:
            text = REDDIT_QUOTES_RE.sub(' ', text)

        text = text.replace('.@', '. @')
        text = re.sub(r'([*;,!?\(\)\[\]])', r' \1', text)
        text = re.sub(r'\s{2,}', ' ', text)

        return text.strip()

    def _merge_doc(self, doc):
        # Perform merging for certain types of tokens
        matches = self._merging_matcher(doc)
        spans = []
        for __, start, end in matches:
            spans.append(doc[start:end])
        for span in spans:
            span.merge()
        for tok in doc:
            tok._.transformed_text = tok.text

        return doc

    def _match_doc(self, doc):
        # Perform all additional processing
        self._matcher(doc)
        return doc

    def _postproc_doc(self, doc):
        # Perform postprocessing
        doc._.tokens = []
        for tok in doc:
            if isinstance(tok._.transformed_text, list):
                doc._.tokens.extend(tok._.transformed_text)
            elif tok._.transformed_text.strip() != '':
                if self.params['whitespaces_to_underscores']:
                    tok._.transformed_text = "_".join(
                        tok._.transformed_text.split())
                doc._.tokens.append(tok._.transformed_text.strip())
        return doc

    def tokenize(self, text):
        """
        Tokenize document

        Parameters
        ----------
        text : str
            Document to tokenize

        Returns
        -------
        list
            List of tokens

        Examples
        --------
        >>> from redditscore.tokenizer import CrazyTokenizer
        >>> tokenizer = CrazyTokenizer(hashtags='split')
        >>> tokenizer.tokenize("#makeamericagreatagain")
        ["make", "america", "great", "again"]
        """
        if not isinstance(text, str):
            warnings.warn('Document {} is not a string'.format(text))
            return []
        text = self._preprocess_text(text)
        doc = self._nlp(text)
        tokens = doc._.tokens
        if self.params['ngrams'] > 1:
            if self.params['whitespaces_to_underscores']:
                tokens = word_ngrams(tokens, (1, self.params['ngrams']),
                                     separator='_')
            else:
                tokens = word_ngrams(tokens, (1, self.params['ngrams']))
        return tokens
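
A hedged usage sketch for CrazyTokenizer (the option values below are illustrative picks from the documented parameters, not defaults, and assume the module-level imports, regexes, and NLTK stopword data this class relies on are available):

# Illustrative option values; see the class docstring for the full list.
tokenizer = CrazyTokenizer(
    lowercase=True,              # lowercase all tokens
    remove_punct=True,           # drop punctuation tokens
    urls='domain',               # keep only the domain of each URL
    ignore_stopwords='russian',  # drop NLTK's Russian stopwords
)
tokens = tokenizer.tokenize("Смотрите новости на http://example.com !")
# tokens now holds lowercased word tokens with punctuation removed,
# Russian stopwords dropped, and the URL reduced to its domain.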
Example #13
import logging

import pandas

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

from spacy.lang.ru import Russian
from spacy_russian_tokenizer import RussianTokenizer, MERGE_PATTERNS

parser = Russian()
russian_tokenizer = RussianTokenizer(parser, MERGE_PATTERNS)
parser.add_pipe(russian_tokenizer, name='russian_tokenizer')


def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    print([token.text for token in tokens])
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

Example #14
lang1, lang2 = args.dir.split('-')

lang1_sents = []
lang2_sents = []

datasets = {}
idmap = {}

if lang1 == 'fr':
    l1nlp = spacy.load(lang1 + "_core_news_sm")
if lang1 == 'de':
    l1nlp = spacy.load(lang1 + "_core_news_sm")
if lang1 == 'ru':
    l1nlp = Russian()
    tokenizer = RussianTokenizer(l1nlp, MERGE_PATTERNS)
    l1nlp.add_pipe(tokenizer, name='russian_tokenizer')
if lang2 == 'en':
    l2nlp = spacy.load(lang2 + "_core_web_sm")

for fname in files:
    if 'gold' not in fname:
        fsplit = fname.split('.', 1)[1]
        datasets[fsplit] = {}
        with open(os.path.join(args.dir, fname), encoding='utf-8') as fp:
            for line in fp:
                line_id, text = line.split('\t', 1)
                line_id = int(line_id.split('-')[1])
                datasets[fsplit][line_id] = text.strip()

    else:
        with open(os.path.join(args.dir, fname), encoding='utf-8') as fp: