Example #1
def moses_sentenize(text):
    from mosestokenizer import MosesSentenceSplitter

    global MOSES_SENT
    if not MOSES_SENT:
        MOSES_SENT = MosesSentenceSplitter('ru')

    chunks = MOSES_SENT([text])
    return find_substrings(chunks, text)
Example #2
def spacy_tokenize(text):
    from spacy.lang.ru import Russian

    global NLP
    if not NLP:
        NLP = Russian()

    doc = NLP(text)
    chunks = [token.text for token in doc]
    return find_substrings(chunks, text)
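
All of these wrappers follow the same shape: lazily build a heavy tokenizer in a module-level global, produce chunks, then map the chunks back onto the original text with find_substrings. A hypothetical usage sketch, assuming the global is declared as None at module level and find_substrings yields span-like records with character offsets:

# Hypothetical usage; NLP is assumed to be declared as None at module level,
# so the first call builds the spaCy pipeline and later calls reuse it.
NLP = None

for span in spacy_tokenize('Мама мыла раму.'):
    print(span)  # assumed: character offsets plus the matched token text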
Example #3
def moses_tokenize(text):
    from mosestokenizer import MosesTokenizer

    global MOSES_TOK
    if not MOSES_TOK:
        MOSES_TOK = MosesTokenizer('ru')
        # disable escaping of special characters (" -> &quot;)
        MOSES_TOK.argv.append('-no-escape')
        # disable aggressive hyphen splitting (- -> @-@)
        MOSES_TOK.argv.remove('-a')
        MOSES_TOK.restart()

    chunks = MOSES_TOK(text)
    return find_substrings(chunks, text)
Example #4
def mystem_tokenize(text):
    from pymystem3 import Mystem

    global MYSTEM
    if not MYSTEM:
        MYSTEM = Mystem(
            grammar_info=False,
            entire_input=True,
            disambiguation=False,
            weight=False
        )

    data = MYSTEM.analyze(text)
    chunks = parse_mystem(data)
    return find_substrings(chunks, text)
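
parse_mystem is not shown above; one plausible sketch, assuming Mystem.analyze returns a list of dicts whose 'text' field holds the surface form (whitespace-only separators appear because entire_input=True and are skipped here):

def parse_mystem(data):
    # Hypothetical helper: keep the surface text of each analysis record,
    # dropping whitespace-only separators emitted with entire_input=True.
    for item in data:
        text = item.get('text', '')
        if text.strip():
            yield text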
Example #5
def parse_tokens(stream):
    stream = filter_xml(stream, tags={'source', 'tokens', 'token'})
    buffer = []
    for event, node in stream:
        if event == 'end':
            tag = node.tag
            if tag == 'source':
                sent = node.text.strip()
            elif tag == 'token':
                word = node.get('text')
                buffer.append(word)
            elif tag == 'tokens':
                substrings = find_substrings(buffer, sent)
                yield substrings_partition(substrings)
                buffer = []
            node.clear()
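
filter_xml and substrings_partition are defined elsewhere in the module. A hypothetical usage sketch, assuming filter_xml accepts the raw (event, node) pairs produced by xml.etree.ElementTree.iterparse and that the markup is shaped like the tags read above (<source> holds the sentence, <token text="..."/> elements sit inside <tokens>):

# Hypothetical usage; the surrounding <sentences>/<sentence> structure is assumed.
import io
from xml.etree.ElementTree import iterparse

SAMPLE = '''
<sentences>
  <sentence>
    <source>Мама мыла раму.</source>
    <tokens>
      <token text="Мама"/>
      <token text="мыла"/>
      <token text="раму"/>
      <token text="."/>
    </tokens>
  </sentence>
</sentences>
'''

stream = iterparse(io.BytesIO(SAMPLE.encode('utf-8')), events=['end'])
for partition in parse_tokens(stream):
    print(partition)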
Example #6
def parse_tokens(lines):
    lines = iter(lines)
    sent = None
    buffer = []
    for line in lines:
        if not line:
            substrings = find_substrings(buffer, sent)
            yield substrings_partition(substrings)
            buffer = []
        else:
            match = re.match(r'# text = (.+)$', line)
            if match:
                sent = match.group(1)
            if re.match(r'^\d', line):
                parts = line.split('\t')
                word = parts[1]
                buffer.append(word)
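
This variant reads CoNLL-U-style lines: a '# text = ...' comment carries the original sentence, tab-separated rows carry the token forms in the second column, and a blank line closes the sentence. A hypothetical input sketch (substrings_partition is defined elsewhere in the module):

# Hypothetical input in CoNLL-U style; only the second column is used here.
SAMPLE = [
    '# text = Мама мыла раму.',
    '1\tМама\t_',
    '2\tмыла\t_',
    '3\tраму\t_',
    '4\t.\t_',
    '',
]

for partition in parse_tokens(SAMPLE):
    print(partition)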
Example #7
def spacy_tokenize2(text):
    from spacy.lang.ru import Russian
    from spacy_russian_tokenizer import (
        RussianTokenizer,
        MERGE_PATTERNS,
        SYNTAGRUS_RARE_CASES
    )

    global NLP2
    if not NLP2:
        NLP2 = Russian()
        NLP2.add_pipe(
            RussianTokenizer(NLP2, MERGE_PATTERNS + SYNTAGRUS_RARE_CASES),
            name='russian_tokenizer'
        )

    doc = NLP2(text)
    chunks = [token.text for token in doc]
    return find_substrings(chunks, text)
Example #8
def __call__(self, text):
    parts = self.split(text)
    chunks = self.segment(parts)
    if self.post:
        chunks = self.post(chunks)
    return find_substrings(chunks, text)
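
This __call__ is a method taken from a segmenter class that is not shown. A minimal sketch of what such a host class could look like; split, segment and post are hypothetical placeholders standing in for the real pipeline stages, and find_substrings is assumed to be available in the same module:

# Hypothetical host class for the __call__ above.
class Segmenter:
    def __init__(self, post=None):
        self.post = post

    def split(self, text):
        # placeholder: break the text into candidate parts
        return text.split()

    def segment(self, parts):
        # placeholder: merge or filter the candidate parts into final chunks
        return list(parts)

    def __call__(self, text):
        parts = self.split(text)
        chunks = self.segment(parts)
        if self.post:
            chunks = self.post(chunks)
        return find_substrings(chunks, text)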
Example #9
def nltk_tokenize(text):
    from nltk.tokenize import word_tokenize

    chunks = word_tokenize(text, 'russian')
    return find_substrings(chunks, text)
Example #10
def re_tokenize(text):
    chunks = TOKEN.findall(text)
    return find_substrings(chunks, text)
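
TOKEN is a module-level compiled pattern that the snippet does not show; one plausible, deliberately simple definition:

import re

# Hypothetical definition of the TOKEN pattern used by re_tokenize:
# runs of word characters, or a single non-space, non-word symbol.
TOKEN = re.compile(r'\w+|[^\w\s]')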
Example #11
def space_tokenize(text):
    chunks = re.split(r'\s+', text)
    return find_substrings(chunks, text)
Example #12
def segtok_sentenize(text):
    from segtok.segmenter import split_single

    chunks = split_single(text)
    return find_substrings(chunks, text)
Example #13
def nltk_sentenize(text):
    from nltk import sent_tokenize

    chunks = sent_tokenize(text, 'russian')
    return find_substrings(chunks, text)
Example #14
def deepmipt_sentenize(text):
    from rusenttokenize import ru_sent_tokenize

    with no_logger(LOGGER):
        chunks = ru_sent_tokenize(text)
    return find_substrings(chunks, text)
Example #15
def dot_sentenize(text):
    chunks = dot_sentenize_(text)
    return find_substrings(chunks, text)
Example #16
def segtok_tokenize(text):
    from segtok.tokenizer import word_tokenizer

    chunks = word_tokenizer(text)
    return find_substrings(chunks, text)