Example #1
def moses_sentenize(text):
    from mosestokenizer import MosesSentenceSplitter

    global MOSES_SENT
    if not MOSES_SENT:
        MOSES_SENT = MosesSentenceSplitter('ru')

    chunks = MOSES_SENT([text])
    return find_substrings(chunks, text)
Example #2
def spacy_tokenize(text):
    from spacy.lang.ru import Russian

    global NLP
    if not NLP:
        NLP = Russian()

    doc = NLP(text)
    chunks = [token.text for token in doc]
    return find_substrings(chunks, text)
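
All of these wrappers follow the same shape: lazily build a heavy tokenizer in a module-level global, produce chunks, then map the chunks back onto the original text with find_substrings. A hypothetical usage sketch, assuming the global is declared as None at module level and find_substrings yields span-like records with character offsets:

# Hypothetical usage; NLP is assumed to be declared as None at module level,
# so the first call builds the spaCy pipeline and later calls reuse it.
NLP = None

for span in spacy_tokenize('Мама мыла раму.'):
    print(span)  # assumed: character offsets plus the matched token text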
Example #3
def moses_tokenize(text):
    from mosestokenizer import MosesTokenizer

    global MOSES_TOK
    if not MOSES_TOK:
        MOSES_TOK = MosesTokenizer('ru')
        # disable escaping of special characters (" -> &quot;)
        MOSES_TOK.argv.append('-no-escape')
        # disable aggressive hyphen splitting (- -> @-@)
        MOSES_TOK.argv.remove('-a')
        MOSES_TOK.restart()

    chunks = MOSES_TOK(text)
    return find_substrings(chunks, text)
Example #4
def mystem_tokenize(text):
    from pymystem3 import Mystem

    global MYSTEM
    if not MYSTEM:
        MYSTEM = Mystem(
            grammar_info=False,
            entire_input=True,
            disambiguation=False,
            weight=False
        )

    data = MYSTEM.analyze(text)
    chunks = parse_mystem(data)
    return find_substrings(chunks, text)
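
parse_mystem is not shown above; one plausible sketch, assuming Mystem.analyze returns a list of dicts whose 'text' field holds the surface form (whitespace-only separators appear because entire_input=True and are skipped here):

def parse_mystem(data):
    # Hypothetical helper: keep the surface text of each analysis record,
    # dropping whitespace-only separators emitted with entire_input=True.
    for item in data:
        text = item.get('text', '')
        if text.strip():
            yield text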
Example #5
def parse_tokens(stream):
    stream = filter_xml(stream, tags={'source', 'tokens', 'token'})
    buffer = []
    for event, node in stream:
        if event == 'end':
            tag = node.tag
            if tag == 'source':
                sent = node.text.strip()
            elif tag == 'token':
                word = node.get('text')
                buffer.append(word)
            elif tag == 'tokens':
                substrings = find_substrings(buffer, sent)
                yield substrings_partition(substrings)
                buffer = []
            node.clear()
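
filter_xml and substrings_partition are defined elsewhere in the module. A hypothetical usage sketch, assuming filter_xml accepts the raw (event, node) pairs produced by xml.etree.ElementTree.iterparse and that the markup is shaped like the tags read above (<source> holds the sentence, <token text="..."/> elements sit inside <tokens>):

# Hypothetical usage; the surrounding <sentences>/<sentence> structure is assumed.
import io
from xml.etree.ElementTree import iterparse

SAMPLE = '''
<sentences>
  <sentence>
    <source>Мама мыла раму.</source>
    <tokens>
      <token text="Мама"/>
      <token text="мыла"/>
      <token text="раму"/>
      <token text="."/>
    </tokens>
  </sentence>
</sentences>
'''

stream = iterparse(io.BytesIO(SAMPLE.encode('utf-8')), events=['end'])
for partition in parse_tokens(stream):
    print(partition)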
Example #6
def parse_tokens(lines):
    lines = iter(lines)
    sent = None
    buffer = []
    for line in lines:
        if not line:
            substrings = find_substrings(buffer, sent)
            yield substrings_partition(substrings)
            buffer = []
        else:
            match = re.match(r'# text = (.+)$', line)
            if match:
                sent = match.group(1)
            if re.match(r'^\d', line):
                parts = line.split('\t')
                word = parts[1]
                buffer.append(word)
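
This variant reads CoNLL-U-style lines: a '# text = ...' comment carries the original sentence, tab-separated rows carry the token forms in the second column, and a blank line closes the sentence. A hypothetical input sketch (substrings_partition is defined elsewhere in the module):

# Hypothetical input in CoNLL-U style; only the second column is used here.
SAMPLE = [
    '# text = Мама мыла раму.',
    '1\tМама\t_',
    '2\tмыла\t_',
    '3\tраму\t_',
    '4\t.\t_',
    '',
]

for partition in parse_tokens(SAMPLE):
    print(partition)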
Example #7
def spacy_tokenize2(text):
    from spacy.lang.ru import Russian
    from spacy_russian_tokenizer import (
        RussianTokenizer,
        MERGE_PATTERNS,
        SYNTAGRUS_RARE_CASES
    )

    global NLP2
    if not NLP2:
        NLP2 = Russian()
        NLP2.add_pipe(
            RussianTokenizer(NLP2, MERGE_PATTERNS + SYNTAGRUS_RARE_CASES),
            name='russian_tokenizer'
        )

    doc = NLP2(text)
    chunks = [token.text for token in doc]
    return find_substrings(chunks, text)
Example #8
def __call__(self, text):
    parts = self.split(text)
    chunks = self.segment(parts)
    if self.post:
        chunks = self.post(chunks)
    return find_substrings(chunks, text)
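
This __call__ is a method taken from a segmenter class that is not shown. A minimal sketch of what such a host class could look like; split, segment and post are hypothetical placeholders standing in for the real pipeline stages, and find_substrings is assumed to be available in the same module:

# Hypothetical host class for the __call__ above.
class Segmenter:
    def __init__(self, post=None):
        self.post = post

    def split(self, text):
        # placeholder: break the text into candidate parts
        return text.split()

    def segment(self, parts):
        # placeholder: merge or filter the candidate parts into final chunks
        return list(parts)

    def __call__(self, text):
        parts = self.split(text)
        chunks = self.segment(parts)
        if self.post:
            chunks = self.post(chunks)
        return find_substrings(chunks, text)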
Example #9
def nltk_tokenize(text):
    from nltk.tokenize import word_tokenize

    chunks = word_tokenize(text, 'russian')
    return find_substrings(chunks, text)
Example #10
def re_tokenize(text):
    chunks = TOKEN.findall(text)
    return find_substrings(chunks, text)
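
TOKEN is a module-level compiled pattern that the snippet does not show; one plausible, deliberately simple definition:

import re

# Hypothetical definition of the TOKEN pattern used by re_tokenize:
# runs of word characters, or a single non-space, non-word symbol.
TOKEN = re.compile(r'\w+|[^\w\s]')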
Example #11
def space_tokenize(text):
    chunks = re.split(r'\s+', text)
    return find_substrings(chunks, text)
Example #12
def segtok_sentenize(text):
    from segtok.segmenter import split_single

    chunks = split_single(text)
    return find_substrings(chunks, text)
Example #13
def nltk_sentenize(text):
    from nltk import sent_tokenize

    chunks = sent_tokenize(text, 'russian')
    return find_substrings(chunks, text)
Example #14
def deepmipt_sentenize(text):
    from rusenttokenize import ru_sent_tokenize

    with no_logger(LOGGER):
        chunks = ru_sent_tokenize(text)
    return find_substrings(chunks, text)
Example #15
def dot_sentenize(text):
    chunks = dot_sentenize_(text)
    return find_substrings(chunks, text)
Example #16
def segtok_tokenize(text):
    from segtok.tokenizer import word_tokenizer

    chunks = word_tokenizer(text)
    return find_substrings(chunks, text)