Example #1
from sumy.models.dom import Paragraph, ObjectDocumentModel

def build_document_from_string(string):
    """Build a document from plain text: "# " lines become headings and
    blank lines separate paragraphs. build_sentence is a helper (defined
    elsewhere) that wraps sumy's Sentence with a shared tokenizer."""
    sentences = []
    paragraphs = []

    for line in string.strip().splitlines():
        line = line.lstrip()
        if line.startswith("# "):
            # the "# " prefix marks a heading sentence
            sentences.append(build_sentence(line[2:], is_heading=True))
        elif not line:
            # a blank line closes the current paragraph
            paragraphs.append(Paragraph(sentences))
            sentences = []
        else:
            sentences.append(build_sentence(line))

    # flush the final paragraph
    paragraphs.append(Paragraph(sentences))
    return ObjectDocumentModel(paragraphs)
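
For context, a minimal sketch of how this function might be exercised. Here build_sentence is assumed to be a thin wrapper around sumy.models.dom.Sentence with a shared tokenizer, as in sumy's own test helpers; the sample text is illustrative only.

from sumy.models.dom import Sentence
from sumy.nlp.tokenizers import Tokenizer

_TOKENIZER = Tokenizer("english")

def build_sentence(text, is_heading=False):
    # assumed helper: wrap raw text in a sumy Sentence
    return Sentence(text, _TOKENIZER, is_heading)

document = build_document_from_string("""
    # Heading of the first paragraph
    First sentence of the first paragraph.

    A second, single-sentence paragraph.
""")
print(len(document.paragraphs))  # -> 2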
Example #2
    def document(self):
        """Build an ObjectDocumentModel from the article's annotated main text."""
        # Annotation tags that may appear in the article's main text:
        # a abbr acronym b big blink blockquote cite code
        # dd del dfn dir dl dt em h h1 h2 h3 h4
        # h5 h6 i ins kbd li marquee menu ol pre q
        # s samp strike strong sub sup tt u ul var
        headers = 'h1', 'h2', 'h3'
        annotated_text = self._article.main_text
        paragraphs = []

        for paragraph in annotated_text:
            sentences, current_text = [], ''

            for (text, annotations) in paragraph:

                if annotations and any(h_tag in annotations
                                       for h_tag in headers):
                    sentences.append(
                        Sentence(text, self._tokenizer, is_heading=True))

                elif not (annotations and 'pre' in annotations):
                    # accumulate ordinary text; <pre> nodes are skipped
                    current_text += ' ' + text

            # sentence-split the accumulated body text
            new_sentences = self.tokenize_sentences(current_text)
            sentences.extend(
                Sentence(s, self._tokenizer) for s in new_sentences)
            paragraphs.append(Paragraph(sentences))

        return ObjectDocumentModel(paragraphs)
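
Presumably this method lives on a parser like sumy's own HtmlParser, whose document property behaves the same way: heading tags become heading sentences, other text is sentence-tokenized. A hedged usage sketch:

from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.html import HtmlParser

html = "<h1>Title</h1><p>Some body text. Another sentence.</p>"
parser = HtmlParser.from_string(html, None, Tokenizer("english"))

for paragraph in parser.document.paragraphs:
    for heading in paragraph.headings:
        print("heading:", str(heading))
    for sentence in paragraph.sentences:
        print("body:", str(sentence))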
Example #3
import json

import numpy
import scipy.io

def calc_quality(filename):
    # load the response documents and the stop-word list
    with open(filename) as in_file:
        my_data = json.load(in_file)

    with open("stopwords.txt") as s_file:
        my_stopword_list = s_file.read().split()

    # lex_rank_modified and to_sentence are project-local helpers (not shown)
    summarizer = lex_rank_modified.LexRankSummarizer()
    summarizer.stop_words = my_stopword_list

    matrix_dict = {}
    for n_docs, doc in enumerate(my_data):
        # wrap each document's responses in a single-paragraph sumy document
        processed_sents = list(map(to_sentence, doc['responses']))
        my_paragraphs = [Paragraph(processed_sents)]
        my_doc = ObjectDocumentModel(my_paragraphs)

        ratings = summarizer(my_doc)

        # exponentiate the LexRank ratings into a quality column vector
        feat_matrix = numpy.matrix(ratings).transpose()
        qual_matrix = numpy.exp(feat_matrix)
        matrix_dict['q' + str(n_docs)] = qual_matrix

    scipy.io.savemat('qual.mat', matrix_dict)
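
The expected input shape can be inferred from doc['responses']: a JSON list of objects, each carrying a list of response strings. A sketch, with file names illustrative only:

import json

sample = [
    {"responses": ["First answer text.", "A second answer."]},
    {"responses": ["The only response of the second document."]},
]
with open("sample.json", "w") as out_file:
    json.dump(sample, out_file)

calc_quality("sample.json")  # writes qual.mat with variables q0, q1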
Example #4
def create_sumy_dom(text, tokenizer):
    """Create a sumy-style document from a list of sentence lines.

    Assumes continuation lines within a paragraph start with a space;
    any other line begins a new paragraph.
    """
    from sumy.models.dom import Sentence, Paragraph, ObjectDocumentModel

    paragraphs = []
    paragraph = []
    for ii, line in enumerate(text):
        # A non-indented line (other than the first) starts a new paragraph.
        # startswith avoids the IndexError that line[0] raises on empty lines.
        if not line.startswith(' ') and ii > 0:
            paragraphs.append(Paragraph(paragraph))  # dump finished paragraph
            paragraph = []
        paragraph.append(Sentence(line, tokenizer))
        if ii + 1 == len(text):  # last line: flush the final paragraph
            paragraphs.append(Paragraph(paragraph))

    return ObjectDocumentModel(tuple(paragraphs))
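
A minimal sketch of the line convention this function assumes, where continuation lines start with a space:

from sumy.nlp.tokenizers import Tokenizer

lines = [
    "First sentence of paragraph one.",
    " Second sentence of paragraph one.",
    "Only sentence of paragraph two.",
]
document = create_sumy_dom(lines, Tokenizer("english"))
print(len(document.paragraphs))  # -> 2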
Example #5
def build_document(*sets_of_sentences):
    """Build a document with one paragraph per set of sentence strings."""
    paragraphs = []
    for sentences in sets_of_sentences:
        sentence_instances = []
        for sentence_as_string in sentences:
            sentence = build_sentence(sentence_as_string)
            sentence_instances.append(sentence)

        paragraphs.append(Paragraph(sentence_instances))

    return ObjectDocumentModel(paragraphs)
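
Usage sketch, reusing the assumed build_sentence helper from Example #1: each positional argument becomes one paragraph.

document = build_document(
    ("First sentence.", "Second sentence."),
    ("A one-sentence second paragraph.",),
)
print(len(document.paragraphs))  # -> 2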
Example #6
def test_only_instances_of_sentence_allowed():
    # Czech fixture text; the "# " line becomes a heading sentence
    document = build_document_from_string("""
        Nějaký muž šel kolem naší zahrady
        Nějaký jiný muž šel kolem vaší zahrady

        # Nová myšlenka
        Už už abych taky šel
    """)

    with pytest.raises(TypeError):
        Paragraph(list(document.sentences) + ["Last sentence"])
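
The TypeError comes from validation inside Paragraph itself; a sketch of the guard the test relies on, which mirrors (and does not reproduce) sumy's own check:

from sumy.models.dom import Sentence

def check_sentences(items):
    # hypothetical stand-in for Paragraph's constructor validation
    for item in items:
        if not isinstance(item, Sentence):
            raise TypeError("Only instances of Sentence are allowed.")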