Example #1
    def split_to_sent_array(self, text, lang):
        spm_limit = self.spm_limit
        spm_processor = self.spm_processor
        _ = "▁"  # words in sentencepieces start with this weird unicode underscore

        def decode(x):
            """convert sequence of sentencepieces back to the original string"""
            return "".join(x).replace(_, " ")

        def limit_sp(n, s):
            """n: take the first n sentencepieces; rather than splitting inside a word, take fewer sentencepieces.
            s: sequence of sentencepieces
            """
            n -= 1
            while 0 < n < len(s) - 1 and not s[n + 1].startswith(_):
                n -= 1
            return s[:n + 1]

        sent_array = []
        for sent in split_text_into_sentences(text=text, language=lang):
            sp_sent = spm_processor.EncodeAsPieces(sent)
            # splitting to chunks of 100 (default) subwords, at most
            while len(sp_sent) > spm_limit:
                part = limit_sp(spm_limit, sp_sent)
                sent_array.append(decode(part))
                sp_sent = sp_sent[len(part):]
            sent_array.append(decode(sp_sent))
        # print(len(sent_array), [len(x) for x in sent_array], [len(spm_processor.EncodeAsPieces(x)) for x in sent_array])
        return sent_array
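A minimal, self-contained sketch of the word-boundary logic in limit_sp above (the toy pieces are made up; no real SentencePiece model is loaded):

_ = "▁"  # sentencepiece marks the first piece of each word with this character

def limit_sp(n, s):
    # same logic as in the example: keep at most n pieces, never cut inside a word
    n -= 1
    while 0 < n < len(s) - 1 and not s[n + 1].startswith(_):
        n -= 1
    return s[:n + 1]

pieces = ["▁An", "▁un", "break", "able", "▁word"]
print(limit_sp(3, pieces))  # ['▁An'] -- cutting after "▁un" would split "unbreakable"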
Example #2
def on_request():
    if request.method == 'POST':
        data = request.get_json(force=True)
        source_text = preprocess(data.get("text"))
        source_lang = data.get("lang")
    else:
        source_text = request.args.get('text')
        source_lang = request.args.get('lang')
    if not source_text or not source_lang:
        return "Please provide the following parameters: text, lang", 400

    source_sentences = split_text_into_sentences(source_text, language=source_lang)
    target_sentences = []
    # translate each sentence individually
    for source_sent in source_sentences:
        target_sent = translate(source_sent, source_lang)
        target_sentences.append(target_sent)
    
    # merge translated sentences
    paraphrases = {}
    for language in LANGUAGES:
        paraphrase_in_lang = [para[language] for para in target_sentences]
        paraphrase_in_lang = ' '.join(paraphrase_in_lang)
        paraphrases[language] = paraphrase_in_lang

    return jsonify(paraphrases)
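A hedged sketch of how such an endpoint might be called (the URL and route are assumptions; the handler above reads "text" and "lang" either from the JSON body on POST or from the query string on GET):

import requests

# Hypothetical local address; adjust to wherever on_request is actually routed.
resp = requests.post("http://localhost:5000/",
                     json={"text": "Hello there. How are you?", "lang": "en"})
print(resp.json())  # one merged paraphrase per target language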
Example #3
def seg_text(
        text: str,
        lang: Optional[str] = None,
        qmode: bool = False,
        maxlines: int = 1000
) -> List[str]:
    # fmt: on
    """
    Split text to sentences.

    Use sentence_splitter if supported,
    else use polyglot.text.Text.sentences

    qmode: skip split_text_into_sentences if True, default False
        vectors for all books are based on qmode=False.
        qmode=True is for quick test purpose only

    maxlines (default 1000), threhold for turn on tqdm progressbar
        set to <1 or a large number to turn it off
    """
    if lang is None:
        try:
            lang = Detector(text).language.code
        except Exception as exc:
            logger.warning(
                "polyglot.text.Detector exc: %s, setting to 'en'", exc
            )
            lang = "en"

    if not qmode and lang in LANG_S:
        _ = []
        lines = text.splitlines()
        # if maxlines > 1 and len(lines) > maxlines:
        if len(lines) > maxlines > 1:
            for para in tqdm(lines):
                if para.strip():
                    _.extend(split_text_into_sentences(para, lang))
        else:
            for para in lines:
                if para.strip():
                    _.extend(split_text_into_sentences(para, lang))
        return _

        # return split_text_into_sentences(text, lang)

    return [elm.string for elm in Text(text, lang).sentences]
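A hedged usage sketch of seg_text above (assuming "en" is in LANG_S):

print(seg_text("Hello there. How are you?\nFine, thanks.", lang="en"))
# -> ['Hello there.', 'How are you?', 'Fine, thanks.']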
Example #4
def test_split_text_into_sentences():
    input_text = 'This is a paragraph. It contains several sentences. "But why," you ask?'
    expected_sentences = [
        'This is a paragraph.', 'It contains several sentences.',
        '"But why," you ask?'
    ]
    actual_sentences = split_text_into_sentences(text=input_text,
                                                 language='en')
    assert expected_sentences == actual_sentences
Example #5
def seg_text(text: str, lang: str) -> List[str]:
    """Split text into sentences.

    Use sentence_splitter if supported,
    else use polyglot.text.Text.
    """
    if lang in LANG_S:
        return split_text_into_sentences(text, lang)

    return [elm.string for elm in Text(text, lang).sentences]
Example #6
def default_sentence_splitter(text, do_lower_case=True):
    split_sentences = split_text_into_sentences(
        text=text,
        language='en',
        non_breaking_prefix_file=pkg_resources.resource_filename(
            __name__, 'resource/custom_english_non_breaking_prefixes.txt'))
    if do_lower_case:
        return [i.lower() for i in split_sentences if i.strip() != '']
    else:
        return [i for i in split_sentences if i.strip() != '']
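Example #6 depends on a custom non-breaking-prefix file. A hedged sketch of how such a file is built and wired up (the file name, the prefix and the sample sentence are made up; the format, one abbreviation per line, follows the Moses convention used by sentence_splitter):

from sentence_splitter import SentenceSplitter

# A period after a prefix listed in this file is not treated as a sentence end.
with open("my_prefixes.txt", "w", encoding="utf8") as fh:
    fh.write("approx\n")

splitter = SentenceSplitter(language="en", non_breaking_prefix_file="my_prefixes.txt")
print(splitter.split("It costs approx. 10 dollars. Cheap!"))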
Example #7
def split_into_sentences(content, input_language):
    """
        Tasks with takes an input text and splits it into sentence 
        Providing the input_language
    """
    content = " ".join(content)
    sentences = split_text_into_sentences(text=content,
                                          language=input_language)

    for sentence in sentences:
        if sentence:
            print(sentence)
Example #8
def seg_text(text: str, lang: Optional[str] = None) -> List[str]:
    """split text to sentences.

    use sentence_splitter if supported,
    else use polyglot.text.Text
    """
    if lang is None:
        lang = Detector("testt 12 3").language.code

    if lang in LANG_S:
        return split_text_into_sentences(text, lang)

    return [elm.string for elm in Text(text, lang).sentences]
Example #9
def tokenize_text(text_lines):
    sentences_tokens = []

    if not isinstance(text_lines, list):
        text_lines = [text_lines]

    for line in text_lines:
        sentences = split_text_into_sentences(line, language="fr")
        for sentence in sentences:
            tokens = MOSES_TOKENIZER.tokenize(sentence, aggressive_dash_splits=True, escape=False)
            sentences_tokens.append(tokens)

    return sentences_tokens
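MOSES_TOKENIZER is not defined in this snippet; a plausible definition, assuming the sacremoses package (the sample sentence is made up):

from sacremoses import MosesTokenizer

MOSES_TOKENIZER = MosesTokenizer(lang="fr")  # assumed global used by tokenize_text above
print(MOSES_TOKENIZER.tokenize("C'est un porte-bonheur.",
                               aggressive_dash_splits=True, escape=False))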
Example #10
    def _count_it_up(self):
        sentences = split_text_into_sentences(text=self.text, language=self.language)
        sentence_count = len(sentences)
        words = []

        for sentence in sentences:
            sentence_stripped = sentence.translate(str.maketrans('', '', string.punctuation))
            words += sentence_stripped.split()

        long_words_count = 0
        for word in words:
            if len(word) >= 7:
                long_words_count += 1

        self.word_count = len(words)
        self._sentence_count = sentence_count
        self._long_words_count = long_words_count
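The three counters look like the ingredients of a LIX-style readability score (an assumption; the class that owns _count_it_up is not shown). A sketch of how such counts are typically combined:

def lix(word_count, sentence_count, long_words_count):
    # average sentence length plus the percentage of long (7+ letter) words
    return word_count / sentence_count + 100.0 * long_words_count / word_count

print(lix(word_count=120, sentence_count=8, long_words_count=30))  # 15.0 + 25.0 = 40.0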
Example #11
    def split_to_sent_array(self, text, lang):
        charlimit = self.sent_chars_limit
        sent_array = []
        for sent in split_text_into_sentences(text=text, language=lang):
            while len(sent) > charlimit:
                try:
                    # When sent starts with a space, then sent[0:0] was an empty string,
                    # and it caused an infinite loop. This fixes it.
                    beg = 0
                    while sent[beg] == ' ':
                        beg += 1
                    last_space_idx = sent.rindex(" ", beg, charlimit)
                    sent_array.append(sent[0:last_space_idx])
                    sent = sent[last_space_idx:]
                except ValueError:
                    # raised if no space found by rindex
                    sent_array.append(sent[0:charlimit])
                    sent = sent[charlimit:]
            sent_array.append(sent)
        # print(len(sent_array), [len(x) for x in sent_array])
        return sent_array
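The core move of the loop above, in isolation: cut at the last space before the limit so a word is never broken, falling back to a hard cut when no space exists (a hypothetical standalone variant, not the class method itself):

def cut_at_space(sent, charlimit):
    try:
        idx = sent.rindex(" ", 0, charlimit)
    except ValueError:  # no space before the limit: hard cut
        idx = charlimit
    return sent[:idx], sent[idx:]

print(cut_at_space("a very long sentence that exceeds the limit", 20))
# -> ('a very long', ' sentence that exceeds the limit')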
Example #12
def parse_article(url):
    text = get_article(url)

    summary = summarizer.summarize(text)

    lang = detect(text)

    print(f'LANG: {lang}')

    sentences = split_text_into_sentences(text=text, language=lang)
    sentences = [s for s in sentences if s.strip()]

    tagger = pos_taggers[lang]

    words, markups = tagger(sentences)

    _keywords = keywords.keywords(text).split("\n")

    _keywords = [k for k in _keywords if not is_stop_word(k)]

    markups["KEYWORD"] = _keywords

    return words, markups, summary
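detect, summarizer, keywords and pos_taggers are not defined in this snippet; detect is most likely langdetect's detector (an assumption):

from langdetect import detect  # assumed source of detect() used above

print(detect("Bonjour tout le monde, ceci est une phrase."))  # typically 'fr' (langdetect is probabilistic)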
Example #13
def breaksetences(text):
    content['sentences'] = split_text_into_sentences(
        text=text,
        language="pt",
    )
    return content['sentences']
Example #14
from sentence_splitter import SentenceSplitter, split_text_into_sentences

#
# Object interface
#
splitter = SentenceSplitter(language='tr')

with open('test.txt', 'r', encoding="utf8") as file:
    text = file.read().replace('\n', ' ').replace('\r', '')

#print(text)
#print(splitter.split(text=text))
# ['This is a paragraph.', 'It contains several sentences.', '"But why," you ask?']

#
# Functional interface
#
'''
print(split_text_into_sentences(
    text=text,
    language='tr'
))'''

sentences = split_text_into_sentences(text=text, language='tr')

with open('output.txt', 'w', encoding="utf8") as f:
    for sentence in sentences:
        f.write("%s\n" % sentence)
Example #15
import requests
Example #16
# (truncated: the opening lines of this snippet, a generator yielding substring start offsets, were lost)


sentences = find_sentences(text)

for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text)

from sentence_splitter import SentenceSplitter, split_text_into_sentences
splitter = SentenceSplitter(
    language='ro', non_breaking_prefix_file='D:/BusuiocI/Downloads/ro.txt')
sentences = splitter.split(text=textwithoutdiacritics)
sentences2 = split_text_into_sentences(
    text=text,
    language='ro',
    non_breaking_prefix_file='D:/BusuiocI/Downloads/ro.txt')


def show_ents(doc):
    if doc.ents:
        for ent in doc.ents:
            print(ent.text + ' - ' + str(ent.start_char) + ' - ' +
                  str(ent.end_char) + ' - ' + ent.label_ + ' - ' +
                  str(spacy.explain(ent.label_)))
    else:
        print('No named entities found.')


show_ents(doc)
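doc is not defined in this snippet; it is presumably a spaCy Doc. A hedged sketch of how it might be built (the Romanian model name is a guess and has to be downloaded first):

import spacy

nlp = spacy.load("ro_core_news_sm")  # assumed model; e.g. python -m spacy download ro_core_news_sm
doc = nlp("Ion Popescu lucrează la Universitatea din București.")
print([(ent.text, ent.label_) for ent in doc.ents])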
Example #17
import wikipedia

import re
from sentence_splitter import split_text_into_sentences

term = 'Steve Jobs'

summary = wikipedia.summary(term, sentences=7)

summary = re.sub(r"\([^)]*\)", "", summary)

result = split_text_into_sentences(
    text=summary,
    language="en",
)

subtitles_template = """
1
00:00:00,000 --> 00:00:10,000
{0}

2
00:00:10,000 --> 00:00:20,000
{1}

3
00:00:20,000 --> 00:00:30,000
{2}

4
00:00:30,000 --> 00:00:40,000