def get_plain_text(cleaned_html_node, summary_sentences_qty):
    """
    Summarizes text from an html element.

    :param cleaned_html_node: html node to extract text sentences from
    :param summary_sentences_qty: quantity of sentences of summarized text
    :return: summarized text, two-digit language code
    """
    # Tokenizer for splitting text into sentences.
    sent_tokenizer = PunktSentenceTokenizer()

    # Assemble text only from complete sentences, i.e. those ending with
    # terminal punctuation, skipping fragments that are just numbers
    # (e.g. page markers).
    clean_parts = []
    for node in cleaned_html_node.iter('p'):
        if node.text is not None and len(node.text.strip(' \n\b\t')) > 0:
            for sentence in sent_tokenizer.tokenize(node.text):
                sentence = sentence.strip(' \n\b\t')
                if (sentence
                        and sentence[-1] in ('.', '!', '?', '…')
                        and not sentence.strip(' .!?…').isdigit()):
                    clean_parts.append(sentence)

    # str.join avoids the quadratic cost of repeated string concatenation
    # (and the spurious leading space the old `text + ' ' + sentence`
    # accumulation produced).
    clean_text = ' '.join(clean_parts)

    # Creating summary, obtaining language code and total sentences quantity.
    final_result, lang_code, sent_qty = create_referat(clean_text, '',
                                                       summary_sentences_qty)
    return final_result, lang_code
def textrank(text, hdr):
    """
    Rank the sentences of *text* with the TextRank algorithm.

    :param text: full text whose sentences are to be ranked
    :param hdr: header/title, used together with the text for language detection
    :return: list of (sentence index, pagerank score, sentence) tuples sorted
             by score in descending order, and the detected language code
    """
    sentences = PunktSentenceTokenizer().tokenize(text)

    # Find the most probable language of header + body combined, and pick
    # the matching stemmer (falling back to English).
    lang_code = lang_identifier.classify(' '.join([hdr, text]))[0]
    stemmer = snowballstemmer.stemmer(LANG_CODES.get(lang_code, 'english'))

    # Represent each sentence as the set of its stemmed word tokens.
    tokenizer = RegexpTokenizer(r'\w+')
    stem_sets = []
    for sentence in sentences:
        tokens = tokenizer.tokenize(sentence.lower())
        stem_sets.append({stemmer.stemWord(token) for token in tokens})

    # Build the similarity graph: one weighted edge per sentence pair with a
    # non-zero similarity score.
    graph = nx.Graph()
    for i, j in combinations(range(len(sentences)), 2):
        weight = similarity(stem_sets[i], stem_sets[j])
        if weight:
            graph.add_edge(i, j, weight=weight)

    # PageRank only scores sentences that made it into the graph, hence the
    # membership guard below.
    ranks = nx.pagerank(graph)
    ranked = [(i, ranks[i], s) for i, s in enumerate(sentences) if i in ranks]
    ranked.sort(key=lambda item: item[1], reverse=True)
    return ranked, lang_code